-rw-r--r--  Documentation/lguest/lguest.c            155
-rw-r--r--  arch/x86/lguest/boot.c                    48
-rw-r--r--  arch/x86/lguest/i386_head.S                8
-rw-r--r--  drivers/lguest/core.c                      5
-rw-r--r--  drivers/lguest/hypercalls.c               11
-rw-r--r--  drivers/lguest/interrupts_and_traps.c     37
-rw-r--r--  drivers/lguest/lg.h                        4
-rw-r--r--  drivers/lguest/lguest_device.c            11
-rw-r--r--  drivers/lguest/lguest_user.c              23
-rw-r--r--  drivers/lguest/page_tables.c             113
-rw-r--r--  drivers/lguest/segments.c                 48
-rw-r--r--  drivers/lguest/x86/core.c                120
-rw-r--r--  drivers/lguest/x86/switcher_32.S          71
-rw-r--r--  include/asm-x86/lguest_hcall.h            16
-rw-r--r--  include/linux/lguest.h                     4
-rw-r--r--  include/linux/lguest_launcher.h            6
16 files changed, 414 insertions, 266 deletions
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index c91c28ae8290..f2668390e8f7 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -360,8 +360,8 @@ static unsigned long load_bzimage(int fd)
360} 360}
361 361
362/*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels 362/*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels
363 * come wrapped up in the self-decompressing "bzImage" format. With some funky 363 * come wrapped up in the self-decompressing "bzImage" format. With a little
364 * coding, we can load those, too. */ 364 * work, we can load those, too. */
365static unsigned long load_kernel(int fd) 365static unsigned long load_kernel(int fd)
366{ 366{
367 Elf32_Ehdr hdr; 367 Elf32_Ehdr hdr;
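To make the "vmlinux or bzImage" decision concrete, here is a minimal sketch of a loader that peeks at the first bytes and branches on the ELF magic. map_elf() and load_bzimage() stand in for whatever actually maps the kernel image; only the magic check is the point.

#include <elf.h>
#include <err.h>
#include <string.h>
#include <unistd.h>

/* Assumed helpers, not defined here. */
unsigned long map_elf(int fd, const Elf32_Ehdr *hdr);
unsigned long load_bzimage(int fd);

static unsigned long load_kernel_sketch(int fd)
{
    Elf32_Ehdr hdr;

    /* Read in the first few bytes. */
    if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
        err(1, "Reading kernel");

    /* An ELF vmlinux starts with the magic bytes "\177ELF". */
    if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
        return map_elf(fd, &hdr);

    /* Otherwise assume it's the self-decompressing bzImage format. */
    return load_bzimage(fd);
}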
@@ -464,6 +464,7 @@ static unsigned long setup_pagetables(unsigned long mem,
464 * to know where it is. */ 464 * to know where it is. */
465 return to_guest_phys(pgdir); 465 return to_guest_phys(pgdir);
466} 466}
467/*:*/
467 468
468/* Simple routine to roll all the commandline arguments together with spaces 469/* Simple routine to roll all the commandline arguments together with spaces
469 * between them. */ 470 * between them. */
@@ -480,9 +481,9 @@ static void concat(char *dst, char *args[])
480 dst[len] = '\0'; 481 dst[len] = '\0';
481} 482}
482 483
483/* This is where we actually tell the kernel to initialize the Guest. We saw 484/*L:185 This is where we actually tell the kernel to initialize the Guest. We
484 * the arguments it expects when we looked at initialize() in lguest_user.c: 485 * saw the arguments it expects when we looked at initialize() in lguest_user.c:
485 * the base of guest "physical" memory, the top physical page to allow, the 486 * the base of Guest "physical" memory, the top physical page to allow, the
486 * top level pagetable and the entry point for the Guest. */ 487 * top level pagetable and the entry point for the Guest. */
487static int tell_kernel(unsigned long pgdir, unsigned long start) 488static int tell_kernel(unsigned long pgdir, unsigned long start)
488{ 489{
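A minimal sketch of what that initialization write can look like from the Launcher side: pack the arguments into an array headed by LHREQ_INITIALIZE and write it to /dev/lguest. The guest_base and guest_limit globals are illustrative, not definitive.

#include <err.h>
#include <fcntl.h>
#include <unistd.h>
#include <linux/lguest_launcher.h>

/* Illustrative globals: where Guest memory starts in our address space
 * and how large it is. */
static void *guest_base;
static unsigned long guest_limit;

static int tell_kernel_sketch(unsigned long pgdir, unsigned long start)
{
    unsigned long args[] = { LHREQ_INITIALIZE,
                             (unsigned long)guest_base,
                             guest_limit / getpagesize(), pgdir, start };
    int fd = open("/dev/lguest", O_RDWR);

    if (fd < 0 || write(fd, args, sizeof(args)) < 0)
        err(1, "Initializing /dev/lguest");

    /* This file descriptor now controls the Guest. */
    return fd;
}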
@@ -512,13 +513,14 @@ static void add_device_fd(int fd)
512/*L:200 513/*L:200
513 * The Waker. 514 * The Waker.
514 * 515 *
515 * With a console and network devices, we can have lots of input which we need 516 * With console, block and network devices, we can have lots of input which we
516 * to process. We could try to tell the kernel what file descriptors to watch, 517 * need to process. We could try to tell the kernel what file descriptors to
517 * but handing a file descriptor mask through to the kernel is fairly icky. 518 * watch, but handing a file descriptor mask through to the kernel is fairly
519 * icky.
518 * 520 *
519 * Instead, we fork off a process which watches the file descriptors and writes 521 * Instead, we fork off a process which watches the file descriptors and writes
520 * the LHREQ_BREAK command to the /dev/lguest filedescriptor to tell the Host 522 * the LHREQ_BREAK command to the /dev/lguest file descriptor to tell the Host
521 * loop to stop running the Guest. This causes it to return from the 523 * to stop running the Guest. This causes the Launcher to return from the
522 * /dev/lguest read with -EAGAIN, where it will write to /dev/lguest to reset 524 * /dev/lguest read with -EAGAIN, where it will write to /dev/lguest to reset
523 * the LHREQ_BREAK and wake us up again. 525 * the LHREQ_BREAK and wake us up again.
524 * 526 *
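Roughly, the Waker boils down to a select() loop: whenever one of the watched file descriptors becomes readable, it writes LHREQ_BREAK with the value "1" to /dev/lguest. The sketch below is deliberately simplified (no dynamic add or remove of descriptors) just to show the shape of that loop.

#include <sys/select.h>
#include <unistd.h>
#include <linux/lguest_launcher.h>

/* Simplified Waker: watch a fixed fd_set, break the Host out of running
 * the Guest whenever any of them has input. */
static void waker_loop_sketch(int lguest_fd, const fd_set *infds, int max_fd)
{
    for (;;) {
        fd_set rfds = *infds;
        unsigned long args[] = { LHREQ_BREAK, 1 };

        if (select(max_fd + 1, &rfds, NULL, NULL, NULL) < 0)
            break;
        /* Something is readable: stop the Guest so the Launcher can
         * service the input. */
        write(lguest_fd, args, sizeof(args));
    }
}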
@@ -544,7 +546,9 @@ static void wake_parent(int pipefd, int lguest_fd)
544 if (read(pipefd, &fd, sizeof(fd)) == 0) 546 if (read(pipefd, &fd, sizeof(fd)) == 0)
545 exit(0); 547 exit(0);
546 /* Otherwise it's telling us to change what file 548 /* Otherwise it's telling us to change what file
547 * descriptors we're to listen to. */ 549 * descriptors we're to listen to. Positive means
550 * listen to a new one, negative means stop
551 * listening. */
548 if (fd >= 0) 552 if (fd >= 0)
549 FD_SET(fd, &devices.infds); 553 FD_SET(fd, &devices.infds);
550 else 554 else
@@ -559,7 +563,7 @@ static int setup_waker(int lguest_fd)
559{ 563{
560 int pipefd[2], child; 564 int pipefd[2], child;
561 565
562 /* We create a pipe to talk to the waker, and also so it knows when the 566 /* We create a pipe to talk to the Waker, and also so it knows when the
563 * Launcher dies (and closes pipe). */ 567 * Launcher dies (and closes pipe). */
564 pipe(pipefd); 568 pipe(pipefd);
565 child = fork(); 569 child = fork();
@@ -567,7 +571,8 @@ static int setup_waker(int lguest_fd)
567 err(1, "forking"); 571 err(1, "forking");
568 572
569 if (child == 0) { 573 if (child == 0) {
570 /* Close the "writing" end of our copy of the pipe */ 574 /* We are the Waker: close the "writing" end of our copy of the
575 * pipe and start waiting for input. */
571 close(pipefd[1]); 576 close(pipefd[1]);
572 wake_parent(pipefd[0], lguest_fd); 577 wake_parent(pipefd[0], lguest_fd);
573 } 578 }
@@ -578,12 +583,12 @@ static int setup_waker(int lguest_fd)
578 return pipefd[1]; 583 return pipefd[1];
579} 584}
580 585
581/*L:210 586/*
582 * Device Handling. 587 * Device Handling.
583 * 588 *
584 * When the Guest sends DMA to us, it sends us an array of addresses and sizes. 589 * When the Guest gives us a buffer, it sends an array of addresses and sizes.
585 * We need to make sure it's not trying to reach into the Launcher itself, so 590 * We need to make sure it's not trying to reach into the Launcher itself, so
586 * we have a convenient routine which check it and exits with an error message 591 * we have a convenient routine which checks it and exits with an error message
587 * if something funny is going on: 592 * if something funny is going on:
588 */ 593 */
589static void *_check_pointer(unsigned long addr, unsigned int size, 594static void *_check_pointer(unsigned long addr, unsigned int size,
@@ -600,7 +605,9 @@ static void *_check_pointer(unsigned long addr, unsigned int size,
600/* A macro which transparently hands the line number to the real function. */ 605/* A macro which transparently hands the line number to the real function. */
601#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__) 606#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)
602 607
603/* This function returns the next descriptor in the chain, or vq->vring.num. */ 608/* Each buffer in the virtqueues is actually a chain of descriptors. This
609 * function returns the next descriptor in the chain, or vq->vring.num if we're
610 * at the end. */
604static unsigned next_desc(struct virtqueue *vq, unsigned int i) 611static unsigned next_desc(struct virtqueue *vq, unsigned int i)
605{ 612{
606 unsigned int next; 613 unsigned int next;
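To make "chain of descriptors" concrete, here is a minimal sketch of such a walk using the standard vring layout from linux/virtio_ring.h: follow the "next" index while the VRING_DESC_F_NEXT flag is set, and treat the ring size as the end marker. The function name and the decision to return num on a bogus index are ours.

#include <linux/virtio_ring.h>

/* Sketch: return the descriptor index after 'i' in the chain, or 'num'
 * when the chain ends.  'desc' is the descriptor table, 'num' its size. */
static unsigned next_desc_sketch(const struct vring_desc *desc,
                                 unsigned int i, unsigned int num)
{
    /* If this descriptor doesn't chain, we're done. */
    if (!(desc[i].flags & VRING_DESC_F_NEXT))
        return num;

    /* A buggy or malicious other side could point us off the table;
     * real code should treat that as fatal. */
    if (desc[i].next >= num)
        return num;

    return desc[i].next;
}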
@@ -679,13 +686,14 @@ static unsigned get_vq_desc(struct virtqueue *vq,
679 return head; 686 return head;
680} 687}
681 688
682/* Once we've used one of their buffers, we tell them about it. We'll then 689/* After we've used one of their buffers, we tell them about it. We'll then
683 * want to send them an interrupt, using trigger_irq(). */ 690 * want to send them an interrupt, using trigger_irq(). */
684static void add_used(struct virtqueue *vq, unsigned int head, int len) 691static void add_used(struct virtqueue *vq, unsigned int head, int len)
685{ 692{
686 struct vring_used_elem *used; 693 struct vring_used_elem *used;
687 694
688 /* Get a pointer to the next entry in the used ring. */ 695 /* The virtqueue contains a ring of used buffers. Get a pointer to the
696 * next entry in that used ring. */
689 used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num]; 697 used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num];
690 used->id = head; 698 used->id = head;
691 used->len = len; 699 used->len = len;
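The used-ring bookkeeping described here is small enough to show in full: fill in the next vring_used_elem with the chain's head index and the number of bytes consumed, then advance the ring's idx so the other side notices. A hedged sketch:

#include <linux/virtio_ring.h>

/* Sketch: record that descriptor chain 'head' was consumed with 'len'
 * bytes written into it.  'vring' is an already-mapped struct vring. */
static void add_used_sketch(struct vring *vring, unsigned int head,
                            unsigned int len)
{
    struct vring_used_elem *used;

    /* Next free slot in the used ring (it wraps modulo the ring size). */
    used = &vring->used->ring[vring->used->idx % vring->num];
    used->id = head;
    used->len = len;
    /* Make sure the entry is visible before the index update. */
    __sync_synchronize();
    vring->used->idx++;
}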
@@ -699,6 +707,7 @@ static void trigger_irq(int fd, struct virtqueue *vq)
699{ 707{
700 unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; 708 unsigned long buf[] = { LHREQ_IRQ, vq->config.irq };
701 709
710 /* If they don't want an interrupt, don't send one. */
702 if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) 711 if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
703 return; 712 return;
704 713
@@ -715,8 +724,11 @@ static void add_used_and_trigger(int fd, struct virtqueue *vq,
715 trigger_irq(fd, vq); 724 trigger_irq(fd, vq);
716} 725}
717 726
718/* Here is the input terminal setting we save, and the routine to restore them 727/*
719 * on exit so the user can see what they type next. */ 728 * The Console
729 *
730 * Here are the input terminal settings we save, and the routine to restore them
731 * on exit so the user gets their terminal back. */
720static struct termios orig_term; 732static struct termios orig_term;
721static void restore_term(void) 733static void restore_term(void)
722{ 734{
@@ -817,7 +829,10 @@ static void handle_console_output(int fd, struct virtqueue *vq)
817 } 829 }
818} 830}
819 831
820/* Handling output for network is also simple: we get all the output buffers 832/*
833 * The Network
834 *
835 * Handling output for network is also simple: we get all the output buffers
821 * and write them (ignoring the first element) to this device's file descriptor 836 * and write them (ignoring the first element) to this device's file descriptor
822 * (stdout). */ 837 * (stdout). */
823static void handle_net_output(int fd, struct virtqueue *vq) 838static void handle_net_output(int fd, struct virtqueue *vq)
@@ -830,8 +845,9 @@ static void handle_net_output(int fd, struct virtqueue *vq)
830 while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) { 845 while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) {
831 if (in) 846 if (in)
832 errx(1, "Input buffers in output queue?"); 847 errx(1, "Input buffers in output queue?");
833 /* Check header, but otherwise ignore it (we said we supported 848 /* Check header, but otherwise ignore it (we told the Guest we
834 * no features). */ 849 * supported no features, so it shouldn't have anything
850 * interesting). */
835 (void)convert(&iov[0], struct virtio_net_hdr); 851 (void)convert(&iov[0], struct virtio_net_hdr);
836 len = writev(vq->dev->fd, iov+1, out-1); 852 len = writev(vq->dev->fd, iov+1, out-1);
837 add_used_and_trigger(fd, vq, head, len); 853 add_used_and_trigger(fd, vq, head, len);
@@ -882,7 +898,8 @@ static bool handle_tun_input(int fd, struct device *dev)
882 return true; 898 return true;
883} 899}
884 900
885/* This callback ensures we try again, in case we stopped console or net 901/*L:215 This is the callback attached to the network and console input
902 * virtqueues: it ensures we try again, in case we stopped console or net
886 * delivery because Guest didn't have any buffers. */ 903 * delivery because Guest didn't have any buffers. */
887static void enable_fd(int fd, struct virtqueue *vq) 904static void enable_fd(int fd, struct virtqueue *vq)
888{ 905{
@@ -918,7 +935,7 @@ static void handle_output(int fd, unsigned long addr)
918 strnlen(from_guest_phys(addr), guest_limit - addr)); 935 strnlen(from_guest_phys(addr), guest_limit - addr));
919} 936}
920 937
921/* This is called when the waker wakes us up: check for incoming file 938/* This is called when the Waker wakes us up: check for incoming file
922 * descriptors. */ 939 * descriptors. */
923static void handle_input(int fd) 940static void handle_input(int fd)
924{ 941{
@@ -985,8 +1002,7 @@ static struct lguest_device_desc *new_dev_desc(u16 type)
985} 1002}
986 1003
987/* Each device descriptor is followed by some configuration information. 1004/* Each device descriptor is followed by some configuration information.
988 * The first byte is a "status" byte for the Guest to report what's happening. 1005 * Each configuration field looks like: u8 type, u8 len, [... len bytes...].
989 * After that are fields: u8 type, u8 len, [... len bytes...].
990 * 1006 *
991 * This routine adds a new field to an existing device's descriptor. It only 1007 * This routine adds a new field to an existing device's descriptor. It only
992 * works for the last device, but that's OK because that's how we use it. */ 1008 * works for the last device, but that's OK because that's how we use it. */
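In other words, each configuration field is a tiny type/length/value record packed straight after the descriptor. An illustrative layout (the struct name is ours, not from the source):

#include <stdint.h>

struct lguest_config_field_sketch {
    uint8_t type;   /* which field this is */
    uint8_t len;    /* how many bytes of data follow */
    uint8_t data[]; /* the field's contents, 'len' bytes long */
};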
@@ -1043,14 +1059,17 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
1043 /* Link virtqueue back to device. */ 1059 /* Link virtqueue back to device. */
1044 vq->dev = dev; 1060 vq->dev = dev;
1045 1061
1046 /* Set up handler. */ 1062 /* Set the routine to call when the Guest does something to this
1063 * virtqueue. */
1047 vq->handle_output = handle_output; 1064 vq->handle_output = handle_output;
1065
1066 /* Set the "Don't Notify Me" flag if we don't have a handler */
1048 if (!handle_output) 1067 if (!handle_output)
1049 vq->vring.used->flags = VRING_USED_F_NO_NOTIFY; 1068 vq->vring.used->flags = VRING_USED_F_NO_NOTIFY;
1050} 1069}
1051 1070
1052/* This routine does all the creation and setup of a new device, including 1071/* This routine does all the creation and setup of a new device, including
1053 * caling new_dev_desc() to allocate the descriptor and device memory. */ 1072 * calling new_dev_desc() to allocate the descriptor and device memory. */
1054static struct device *new_device(const char *name, u16 type, int fd, 1073static struct device *new_device(const char *name, u16 type, int fd,
1055 bool (*handle_input)(int, struct device *)) 1074 bool (*handle_input)(int, struct device *))
1056{ 1075{
@@ -1059,7 +1078,7 @@ static struct device *new_device(const char *name, u16 type, int fd,
1059 /* Append to device list. Prepending to a single-linked list is 1078 /* Append to device list. Prepending to a single-linked list is
1060 * easier, but the user expects the devices to be arranged on the bus 1079 * easier, but the user expects the devices to be arranged on the bus
1061 * in command-line order. The first network device on the command line 1080 * in command-line order. The first network device on the command line
1062 * is eth0, the first block device /dev/lgba, etc. */ 1081 * is eth0, the first block device /dev/vda, etc. */
1063 *devices.lastdev = dev; 1082 *devices.lastdev = dev;
1064 dev->next = NULL; 1083 dev->next = NULL;
1065 devices.lastdev = &dev->next; 1084 devices.lastdev = &dev->next;
@@ -1103,7 +1122,7 @@ static void setup_console(void)
1103 /* The console needs two virtqueues: the input then the output. When 1122 /* The console needs two virtqueues: the input then the output. When
1104 * they put something in the input queue, we make sure we're listening to 1123 * they put something in the input queue, we make sure we're listening to
1105 * stdin. When they put something in the output queue, we write it to 1124 * stdin. When they put something in the output queue, we write it to
1106 * stdout. */ 1125 * stdout. */
1107 add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd); 1126 add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);
1108 add_virtqueue(dev, VIRTQUEUE_NUM, handle_console_output); 1127 add_virtqueue(dev, VIRTQUEUE_NUM, handle_console_output);
1109 1128
@@ -1251,21 +1270,17 @@ static void setup_tun_net(const char *arg)
1251 verbose("attached to bridge: %s\n", br_name); 1270 verbose("attached to bridge: %s\n", br_name);
1252} 1271}
1253 1272
1254 1273/* Our block (disk) device should be really simple: the Guest asks for a block
1255/* 1274 * number and we read or write that position in the file. Unfortunately, that
1256 * Block device. 1275 * was amazingly slow: the Guest waits until the read is finished before
1276 * running anything else, even if it could have been doing useful work.
1257 * 1277 *
1258 * Serving a block device is really easy: the Guest asks for a block number and 1278 * We could use async I/O, except it's reputed to suck so hard that characters
1259 * we read or write that position in the file. 1279 * actually go missing from your code when you try to use it.
1260 *
1261 * Unfortunately, this is amazingly slow: the Guest waits until the read is
1262 * finished before running anything else, even if it could be doing useful
1263 * work. We could use async I/O, except it's reputed to suck so hard that
1264 * characters actually go missing from your code when you try to use it.
1265 * 1280 *
1266 * So we farm the I/O out to a thread, and communicate with it via a pipe. */ 1281 * So we farm the I/O out to a thread, and communicate with it via a pipe. */
1267 1282
1268/* This hangs off device->priv, with the data. */ 1283/* This hangs off device->priv. */
1269struct vblk_info 1284struct vblk_info
1270{ 1285{
1271 /* The size of the file. */ 1286 /* The size of the file. */
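The "read or write that position in the file" part is as plain as it sounds: multiply the sector number by 512 and pread()/pwrite() the backing file at that offset. A minimal sketch, ignoring bounds checks and the barrier and SCSI cases handled below (build with -D_FILE_OFFSET_BITS=64 for large backing files):

#include <unistd.h>
#include <linux/virtio_blk.h>

/* Sketch: service one request against the backing file 'fd'. */
static ssize_t service_one_sketch(int fd, const struct virtio_blk_outhdr *out,
                                  void *buf, size_t len)
{
    off_t off = out->sector * 512;

    if (out->type & VIRTIO_BLK_T_OUT)   /* Guest -> disk */
        return pwrite(fd, buf, len, off);
    else                                /* disk -> Guest */
        return pread(fd, buf, len, off);
}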
@@ -1281,8 +1296,14 @@ struct vblk_info
1281 * Launcher triggers interrupt to Guest. */ 1296 * Launcher triggers interrupt to Guest. */
1282 int done_fd; 1297 int done_fd;
1283}; 1298};
1299/*:*/
1284 1300
1285/* This is the core of the I/O thread. It returns true if it did something. */ 1301/*L:210
1302 * The Disk
1303 *
1304 * Remember that the block device is handled by a separate I/O thread. We head
1305 * straight into the core of that thread here:
1306 */
1286static bool service_io(struct device *dev) 1307static bool service_io(struct device *dev)
1287{ 1308{
1288 struct vblk_info *vblk = dev->priv; 1309 struct vblk_info *vblk = dev->priv;
@@ -1293,10 +1314,14 @@ static bool service_io(struct device *dev)
1293 struct iovec iov[dev->vq->vring.num]; 1314 struct iovec iov[dev->vq->vring.num];
1294 off64_t off; 1315 off64_t off;
1295 1316
1317 /* See if there's a request waiting. If not, nothing to do. */
1296 head = get_vq_desc(dev->vq, iov, &out_num, &in_num); 1318 head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
1297 if (head == dev->vq->vring.num) 1319 if (head == dev->vq->vring.num)
1298 return false; 1320 return false;
1299 1321
1322 /* Every block request should contain at least one output buffer
1323 * (detailing the location on disk and the type of request) and one
1324 * input buffer (to hold the result). */
1300 if (out_num == 0 || in_num == 0) 1325 if (out_num == 0 || in_num == 0)
1301 errx(1, "Bad virtblk cmd %u out=%u in=%u", 1326 errx(1, "Bad virtblk cmd %u out=%u in=%u",
1302 head, out_num, in_num); 1327 head, out_num, in_num);
@@ -1305,10 +1330,15 @@ static bool service_io(struct device *dev)
1305 in = convert(&iov[out_num+in_num-1], struct virtio_blk_inhdr); 1330 in = convert(&iov[out_num+in_num-1], struct virtio_blk_inhdr);
1306 off = out->sector * 512; 1331 off = out->sector * 512;
1307 1332
1308 /* This is how we implement barriers. Pretty poor, no? */ 1333 /* The block device implements "barriers", where the Guest indicates
1334 * that it wants all previous writes to occur before this write. We
1335 * don't have a way of asking our kernel to do a barrier, so we just
1336 * synchronize all the data in the file. Pretty poor, no? */
1309 if (out->type & VIRTIO_BLK_T_BARRIER) 1337 if (out->type & VIRTIO_BLK_T_BARRIER)
1310 fdatasync(vblk->fd); 1338 fdatasync(vblk->fd);
1311 1339
1340 /* In general the virtio block driver is allowed to try SCSI commands.
1341 * It'd be nice if we supported eject, for example, but we don't. */
1312 if (out->type & VIRTIO_BLK_T_SCSI_CMD) { 1342 if (out->type & VIRTIO_BLK_T_SCSI_CMD) {
1313 fprintf(stderr, "Scsi commands unsupported\n"); 1343 fprintf(stderr, "Scsi commands unsupported\n");
1314 in->status = VIRTIO_BLK_S_UNSUPP; 1344 in->status = VIRTIO_BLK_S_UNSUPP;
@@ -1374,7 +1404,7 @@ static int io_thread(void *_dev)
1374 1404
1375 /* When this read fails, it means Launcher died, so we follow. */ 1405 /* When this read fails, it means Launcher died, so we follow. */
1376 while (read(vblk->workpipe[0], &c, 1) == 1) { 1406 while (read(vblk->workpipe[0], &c, 1) == 1) {
1377 /* We acknowledge each request immediately, to reduce latency, 1407 /* We acknowledge each request immediately to reduce latency,
1378 * rather than waiting until we've done them all. I haven't 1408 * rather than waiting until we've done them all. I haven't
1379 * measured to see if it makes any difference. */ 1409 * measured to see if it makes any difference. */
1380 while (service_io(dev)) 1410 while (service_io(dev))
@@ -1383,12 +1413,14 @@ static int io_thread(void *_dev)
1383 return 0; 1413 return 0;
1384} 1414}
1385 1415
1386/* When the thread says some I/O is done, we interrupt the Guest. */ 1416/* Now we've seen the I/O thread, we return to the Launcher to see what happens
1417 * when the thread tells us it's completed some I/O. */
1387static bool handle_io_finish(int fd, struct device *dev) 1418static bool handle_io_finish(int fd, struct device *dev)
1388{ 1419{
1389 char c; 1420 char c;
1390 1421
1391 /* If child died, presumably it printed message. */ 1422 /* If the I/O thread died, presumably it printed the error, so we
1423 * simply exit. */
1392 if (read(dev->fd, &c, 1) != 1) 1424 if (read(dev->fd, &c, 1) != 1)
1393 exit(1); 1425 exit(1);
1394 1426
@@ -1397,7 +1429,7 @@ static bool handle_io_finish(int fd, struct device *dev)
1397 return true; 1429 return true;
1398} 1430}
1399 1431
1400/* When the Guest submits some I/O, we wake the I/O thread. */ 1432/* When the Guest submits some I/O, we just need to wake the I/O thread. */
1401static void handle_virtblk_output(int fd, struct virtqueue *vq) 1433static void handle_virtblk_output(int fd, struct virtqueue *vq)
1402{ 1434{
1403 struct vblk_info *vblk = vq->dev->priv; 1435 struct vblk_info *vblk = vq->dev->priv;
@@ -1409,7 +1441,7 @@ static void handle_virtblk_output(int fd, struct virtqueue *vq)
1409 exit(1); 1441 exit(1);
1410} 1442}
1411 1443
1412/* This creates a virtual block device. */ 1444/*L:198 This actually sets up a virtual block device. */
1413static void setup_block_file(const char *filename) 1445static void setup_block_file(const char *filename)
1414{ 1446{
1415 int p[2]; 1447 int p[2];
@@ -1425,7 +1457,7 @@ static void setup_block_file(const char *filename)
1425 /* The device responds to return from I/O thread. */ 1457 /* The device responds to return from I/O thread. */
1426 dev = new_device("block", VIRTIO_ID_BLOCK, p[0], handle_io_finish); 1458 dev = new_device("block", VIRTIO_ID_BLOCK, p[0], handle_io_finish);
1427 1459
1428 /* The device has a virtqueue. */ 1460 /* The device has one virtqueue, where the Guest places requests. */
1429 add_virtqueue(dev, VIRTQUEUE_NUM, handle_virtblk_output); 1461 add_virtqueue(dev, VIRTQUEUE_NUM, handle_virtblk_output);
1430 1462
1431 /* Allocate the room for our own bookkeeping */ 1463 /* Allocate the room for our own bookkeeping */
@@ -1447,7 +1479,8 @@ static void setup_block_file(const char *filename)
1447 /* The I/O thread writes to this end of the pipe when done. */ 1479 /* The I/O thread writes to this end of the pipe when done. */
1448 vblk->done_fd = p[1]; 1480 vblk->done_fd = p[1];
1449 1481
1450 /* This is how we tell the I/O thread about more work. */ 1482 /* This is the second pipe, which is how we tell the I/O thread about
1483 * more work. */
1451 pipe(vblk->workpipe); 1484 pipe(vblk->workpipe);
1452 1485
1453 /* Create stack for thread and run it */ 1486 /* Create stack for thread and run it */
@@ -1486,24 +1519,25 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd)
1486 char reason[1024] = { 0 }; 1519 char reason[1024] = { 0 };
1487 read(lguest_fd, reason, sizeof(reason)-1); 1520 read(lguest_fd, reason, sizeof(reason)-1);
1488 errx(1, "%s", reason); 1521 errx(1, "%s", reason);
1489 /* EAGAIN means the waker wanted us to look at some input. 1522 /* EAGAIN means the Waker wanted us to look at some input.
1490 * Anything else means a bug or incompatible change. */ 1523 * Anything else means a bug or incompatible change. */
1491 } else if (errno != EAGAIN) 1524 } else if (errno != EAGAIN)
1492 err(1, "Running guest failed"); 1525 err(1, "Running guest failed");
1493 1526
1494 /* Service input, then unset the BREAK which releases 1527 /* Service input, then unset the BREAK to release the Waker. */
1495 * the Waker. */
1496 handle_input(lguest_fd); 1528 handle_input(lguest_fd);
1497 if (write(lguest_fd, args, sizeof(args)) < 0) 1529 if (write(lguest_fd, args, sizeof(args)) < 0)
1498 err(1, "Resetting break"); 1530 err(1, "Resetting break");
1499 } 1531 }
1500} 1532}
1501/* 1533/*
1502 * This is the end of the Launcher. 1534 * This is the end of the Launcher. The good news: we are over halfway
1535 * through! The bad news: the most fiendish part of the code still lies ahead
1536 * of us.
1503 * 1537 *
1504 * But wait! We've seen I/O from the Launcher, and we've seen I/O from the 1538 * Are you ready? Take a deep breath and join me in the core of the Host, in
1505 * Drivers. If we were to see the Host kernel I/O code, our understanding 1539 * "make Host".
1506 * would be complete... :*/ 1540 :*/
1507 1541
1508static struct option opts[] = { 1542static struct option opts[] = {
1509 { "verbose", 0, NULL, 'v' }, 1543 { "verbose", 0, NULL, 'v' },
@@ -1526,7 +1560,7 @@ int main(int argc, char *argv[])
1526 /* Memory, top-level pagetable, code startpoint and size of the 1560 /* Memory, top-level pagetable, code startpoint and size of the
1527 * (optional) initrd. */ 1561 * (optional) initrd. */
1528 unsigned long mem = 0, pgdir, start, initrd_size = 0; 1562 unsigned long mem = 0, pgdir, start, initrd_size = 0;
1529 /* A temporary and the /dev/lguest file descriptor. */ 1563 /* Two temporaries and the /dev/lguest file descriptor. */
1530 int i, c, lguest_fd; 1564 int i, c, lguest_fd;
1531 /* The boot information for the Guest. */ 1565 /* The boot information for the Guest. */
1532 struct boot_params *boot; 1566 struct boot_params *boot;
@@ -1621,6 +1655,7 @@ int main(int argc, char *argv[])
1621 /* The boot header contains a command line pointer: we put the command 1655 /* The boot header contains a command line pointer: we put the command
1622 * line after the boot header. */ 1656 * line after the boot header. */
1623 boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1); 1657 boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1);
1658 /* We use a simple helper to copy the arguments separated by spaces. */
1624 concat((char *)(boot + 1), argv+optind+2); 1659 concat((char *)(boot + 1), argv+optind+2);
1625 1660
1626 /* Boot protocol version: 2.07 supports the fields for lguest. */ 1661 /* Boot protocol version: 2.07 supports the fields for lguest. */
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index a0179fc6b791..a55b0902f9d3 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -99,7 +99,7 @@ static cycle_t clock_base;
99 * When lazy_mode is set, it means we're allowed to defer all hypercalls and do 99 * When lazy_mode is set, it means we're allowed to defer all hypercalls and do
100 * them as a batch when lazy_mode is eventually turned off. Because hypercalls 100 * them as a batch when lazy_mode is eventually turned off. Because hypercalls
101 * are reasonably expensive, batching them up makes sense. For example, a 101 * are reasonably expensive, batching them up makes sense. For example, a
102 * large mmap might update dozens of page table entries: that code calls 102 * large munmap might update dozens of page table entries: that code calls
103 * paravirt_enter_lazy_mmu(), does the dozen updates, then calls 103 * paravirt_enter_lazy_mmu(), does the dozen updates, then calls
104 * lguest_leave_lazy_mode(). 104 * lguest_leave_lazy_mode().
105 * 105 *
@@ -164,8 +164,8 @@ void async_hcall(unsigned long call,
164/*:*/ 164/*:*/
165 165
166/*G:033 166/*G:033
167 * Here are our first native-instruction replacements: four functions for 167 * After that diversion we return to our first native-instruction
168 * interrupt control. 168 * replacements: four functions for interrupt control.
169 * 169 *
170 * The simplest way of implementing these would be to have "turn interrupts 170 * The simplest way of implementing these would be to have "turn interrupts
171 * off" and "turn interrupts on" hypercalls. Unfortunately, this is too slow: 171 * off" and "turn interrupts on" hypercalls. Unfortunately, this is too slow:
@@ -184,7 +184,7 @@ static unsigned long save_fl(void)
184 return lguest_data.irq_enabled; 184 return lguest_data.irq_enabled;
185} 185}
186 186
187/* "restore_flags" just sets the flags back to the value given. */ 187/* restore_flags() just sets the flags back to the value given. */
188static void restore_fl(unsigned long flags) 188static void restore_fl(unsigned long flags)
189{ 189{
190 lguest_data.irq_enabled = flags; 190 lguest_data.irq_enabled = flags;
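For context, all four interrupt-control replacements reduce to touching one word in the shared lguest_data page instead of making a hypercall; the Host checks that word before delivering anything. A hedged sketch of the enable/disable pair, assuming the Guest kernel context where lguest_data and X86_EFLAGS_IF are visible:

#include <asm/processor-flags.h>

/* Kernel-side sketch: "disabling" interrupts just clears the flag the
 * Host consults; no hypercall, no trap. */
static void irq_disable_sketch(void)
{
    lguest_data.irq_enabled = 0;
}

static void irq_enable_sketch(void)
{
    lguest_data.irq_enabled = X86_EFLAGS_IF;
}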
@@ -357,7 +357,7 @@ static void lguest_cpuid(unsigned int *eax, unsigned int *ebx,
357 * it. The Host needs to know when the Guest wants to change them, so we have 357 * it. The Host needs to know when the Guest wants to change them, so we have
358 * a whole series of functions like read_cr0() and write_cr0(). 358 * a whole series of functions like read_cr0() and write_cr0().
359 * 359 *
360 * We start with CR0. CR0 allows you to turn on and off all kinds of basic 360 * We start with cr0. cr0 allows you to turn on and off all kinds of basic
361 * features, but Linux only really cares about one: the horrifically-named Task 361 * features, but Linux only really cares about one: the horrifically-named Task
362 * Switched (TS) bit at bit 3 (ie. 8) 362 * Switched (TS) bit at bit 3 (ie. 8)
363 * 363 *
@@ -390,7 +390,7 @@ static void lguest_clts(void)
390 current_cr0 &= ~X86_CR0_TS; 390 current_cr0 &= ~X86_CR0_TS;
391} 391}
392 392
393/* CR2 is the virtual address of the last page fault, which the Guest only ever 393/* cr2 is the virtual address of the last page fault, which the Guest only ever
394 * reads. The Host kindly writes this into our "struct lguest_data", so we 394 * reads. The Host kindly writes this into our "struct lguest_data", so we
395 * just read it out of there. */ 395 * just read it out of there. */
396static unsigned long lguest_read_cr2(void) 396static unsigned long lguest_read_cr2(void)
@@ -398,7 +398,7 @@ static unsigned long lguest_read_cr2(void)
398 return lguest_data.cr2; 398 return lguest_data.cr2;
399} 399}
400 400
401/* CR3 is the current toplevel pagetable page: the principle is the same as 401/* cr3 is the current toplevel pagetable page: the principle is the same as
402 * cr0. Keep a local copy, and tell the Host when it changes. */ 402 * cr0. Keep a local copy, and tell the Host when it changes. */
403static void lguest_write_cr3(unsigned long cr3) 403static void lguest_write_cr3(unsigned long cr3)
404{ 404{
@@ -411,7 +411,7 @@ static unsigned long lguest_read_cr3(void)
411 return current_cr3; 411 return current_cr3;
412} 412}
413 413
414/* CR4 is used to enable and disable PGE, but we don't care. */ 414/* cr4 is used to enable and disable PGE, but we don't care. */
415static unsigned long lguest_read_cr4(void) 415static unsigned long lguest_read_cr4(void)
416{ 416{
417 return 0; 417 return 0;
@@ -432,7 +432,7 @@ static void lguest_write_cr4(unsigned long val)
432 * maps virtual addresses to physical addresses using "page tables". We could 432 * maps virtual addresses to physical addresses using "page tables". We could
433 * use one huge index of 1 million entries: each address is 4 bytes, so that's 433 * use one huge index of 1 million entries: each address is 4 bytes, so that's
434 * 1024 pages just to hold the page tables. But since most virtual addresses 434 * 1024 pages just to hold the page tables. But since most virtual addresses
435 * are unused, we use a two level index which saves space. The CR3 register 435 * are unused, we use a two level index which saves space. The cr3 register
436 * contains the physical address of the top level "page directory" page, which 436 * contains the physical address of the top level "page directory" page, which
437 * contains physical addresses of up to 1024 second-level pages. Each of these 437 * contains physical addresses of up to 1024 second-level pages. Each of these
438 * second level pages contains up to 1024 physical addresses of actual pages, 438 * second level pages contains up to 1024 physical addresses of actual pages,
@@ -440,7 +440,7 @@ static void lguest_write_cr4(unsigned long val)
440 * 440 *
441 * Here's a diagram, where arrows indicate physical addresses: 441 * Here's a diagram, where arrows indicate physical addresses:
442 * 442 *
443 * CR3 ---> +---------+ 443 * cr3 ---> +---------+
444 * | --------->+---------+ 444 * | --------->+---------+
445 * | | | PADDR1 | 445 * | | | PADDR1 |
446 * Top-level | | PADDR2 | 446 * Top-level | | PADDR2 |
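It can help to see how a 32-bit virtual address is carved up for that walk: the top 10 bits index the page directory that cr3 points at, the next 10 bits index the second-level page, and the low 12 bits are the offset within the final page. A small illustrative sketch (names are ours):

#define SKETCH_PGDIR_SHIFT 22  /* top 10 bits: page directory index */
#define SKETCH_PAGE_SHIFT  12  /* low 12 bits: offset within the page */

static inline unsigned int pgd_index_sketch(unsigned long vaddr)
{
    return vaddr >> SKETCH_PGDIR_SHIFT;
}

static inline unsigned int pte_index_sketch(unsigned long vaddr)
{
    return (vaddr >> SKETCH_PAGE_SHIFT) & 0x3FF;
}

static inline unsigned long page_offset_sketch(unsigned long vaddr)
{
    return vaddr & 0xFFF;
}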
@@ -498,8 +498,7 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
498 * 498 *
499 * ... except in early boot when the kernel sets up the initial pagetables, 499 * ... except in early boot when the kernel sets up the initial pagetables,
500 * which makes booting astonishingly slow. So we don't even tell the Host 500 * which makes booting astonishingly slow. So we don't even tell the Host
501 * anything changed until we've done the first page table switch. 501 * anything changed until we've done the first page table switch. */
502 */
503static void lguest_set_pte(pte_t *ptep, pte_t pteval) 502static void lguest_set_pte(pte_t *ptep, pte_t pteval)
504{ 503{
505 *ptep = pteval; 504 *ptep = pteval;
@@ -720,10 +719,10 @@ static void lguest_time_init(void)
720 /* Set up the timer interrupt (0) to go to our simple timer routine */ 719 /* Set up the timer interrupt (0) to go to our simple timer routine */
721 set_irq_handler(0, lguest_time_irq); 720 set_irq_handler(0, lguest_time_irq);
722 721
723 /* Our clock structure look like arch/i386/kernel/tsc.c if we can use 722 /* Our clock structure looks like arch/x86/kernel/tsc_32.c if we can
724 * the TSC, otherwise it's a dumb nanosecond-resolution clock. Either 723 * use the TSC, otherwise it's a dumb nanosecond-resolution clock.
725 * way, the "rating" is initialized so high that it's always chosen 724 * Either way, the "rating" is set so high that it's always chosen over
726 * over any other clocksource. */ 725 * any other clocksource. */
727 if (lguest_data.tsc_khz) 726 if (lguest_data.tsc_khz)
728 lguest_clock.mult = clocksource_khz2mult(lguest_data.tsc_khz, 727 lguest_clock.mult = clocksource_khz2mult(lguest_data.tsc_khz,
729 lguest_clock.shift); 728 lguest_clock.shift);
@@ -749,7 +748,7 @@ static void lguest_time_init(void)
749 * to work. They're pretty simple. 748 * to work. They're pretty simple.
750 */ 749 */
751 750
752/* The Guest needs to tell the host what stack it expects traps to use. For 751/* The Guest needs to tell the Host what stack it expects traps to use. For
753 * native hardware, this is part of the Task State Segment mentioned above in 752 * native hardware, this is part of the Task State Segment mentioned above in
754 * lguest_load_tr_desc(), but to help hypervisors there's this special call. 753 * lguest_load_tr_desc(), but to help hypervisors there's this special call.
755 * 754 *
@@ -850,13 +849,16 @@ static __init char *lguest_memory_setup(void)
850 return "LGUEST"; 849 return "LGUEST";
851} 850}
852 851
853/* Before virtqueues are set up, we use LHCALL_NOTIFY on normal memory to 852/* We will eventually use the virtio console device to produce console output,
854 * produce console output. */ 853 * but before that is set up we use LHCALL_NOTIFY on normal memory to produce
854 * console output. */
855static __init int early_put_chars(u32 vtermno, const char *buf, int count) 855static __init int early_put_chars(u32 vtermno, const char *buf, int count)
856{ 856{
857 char scratch[17]; 857 char scratch[17];
858 unsigned int len = count; 858 unsigned int len = count;
859 859
860 /* We use a nul-terminated string, so we have to make a copy. Icky,
861 * huh? */
860 if (len > sizeof(scratch) - 1) 862 if (len > sizeof(scratch) - 1)
861 len = sizeof(scratch) - 1; 863 len = sizeof(scratch) - 1;
862 scratch[len] = '\0'; 864 scratch[len] = '\0';
@@ -883,7 +885,7 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count)
883 * Our current solution is to allow the paravirt back end to optionally patch 885 * Our current solution is to allow the paravirt back end to optionally patch
884 * over the indirect calls to replace them with something more efficient. We 886 * over the indirect calls to replace them with something more efficient. We
885 * patch the four most commonly called functions: disable interrupts, enable 887 * patch the four most commonly called functions: disable interrupts, enable
886 * interrupts, restore interrupts and save interrupts. We usually have 10 888 * interrupts, restore interrupts and save interrupts. We usually have 6 or 10
887 * bytes to patch into: the Guest versions of these operations are small enough 889 * bytes to patch into: the Guest versions of these operations are small enough
888 * that we can fit comfortably. 890 * that we can fit comfortably.
889 * 891 *
@@ -1015,7 +1017,7 @@ __init void lguest_init(void)
1015 asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory"); 1017 asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory");
1016 1018
1017 /* The Host uses the top of the Guest's virtual address space for the 1019 /* The Host uses the top of the Guest's virtual address space for the
1018 * Host<->Guest Switcher, and it tells us how much it needs in 1020 * Host<->Guest Switcher, and it tells us how big that is in
1019 * lguest_data.reserve_mem, set up on the LGUEST_INIT hypercall. */ 1021 * lguest_data.reserve_mem, set up on the LGUEST_INIT hypercall. */
1020 reserve_top_address(lguest_data.reserve_mem); 1022 reserve_top_address(lguest_data.reserve_mem);
1021 1023
@@ -1065,6 +1067,6 @@ __init void lguest_init(void)
1065/* 1067/*
1066 * This marks the end of stage II of our journey, The Guest. 1068 * This marks the end of stage II of our journey, The Guest.
1067 * 1069 *
1068 * It is now time for us to explore the nooks and crannies of the three Guest 1070 * It is now time for us to explore the layer of virtual drivers and complete
1069 * devices and complete our understanding of the Guest in "make Drivers". 1071 * our understanding of the Guest in "make Drivers".
1070 */ 1072 */
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index ebc6ac733899..95b6fbcded63 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -6,7 +6,7 @@
6#include <asm/processor-flags.h> 6#include <asm/processor-flags.h>
7 7
8/*G:020 This is where we begin: head.S notes that the boot header's platform 8/*G:020 This is where we begin: head.S notes that the boot header's platform
9 * type field is "1" (lguest), so calls us here. The boot header is in %esi. 9 * type field is "1" (lguest), so calls us here.
10 * 10 *
11 * WARNING: be very careful here! We're running at addresses equal to physical 11 * WARNING: be very careful here! We're running at addresses equal to physical
12 * addresses (around 0), not above PAGE_OFFSET as most code expects 12 * addresses (around 0), not above PAGE_OFFSET as most code expects
@@ -17,13 +17,15 @@
17 * boot. */ 17 * boot. */
18.section .init.text, "ax", @progbits 18.section .init.text, "ax", @progbits
19ENTRY(lguest_entry) 19ENTRY(lguest_entry)
20 /* Make initial hypercall now, so we can set up the pagetables. */ 20 /* We make the "initialization" hypercall now to tell the Host about
21 * us, and also find out where it put our page tables. */
21 movl $LHCALL_LGUEST_INIT, %eax 22 movl $LHCALL_LGUEST_INIT, %eax
22 movl $lguest_data - __PAGE_OFFSET, %edx 23 movl $lguest_data - __PAGE_OFFSET, %edx
23 int $LGUEST_TRAP_ENTRY 24 int $LGUEST_TRAP_ENTRY
24 25
25 /* The Host put the toplevel pagetable in lguest_data.pgdir. The movsl 26 /* The Host put the toplevel pagetable in lguest_data.pgdir. The movsl
26 * instruction uses %esi implicitly. */ 27 * instruction uses %esi implicitly as the source for the copy we'
28 * about to do. */
27 movl lguest_data - __PAGE_OFFSET + LGUEST_DATA_pgdir, %esi 29 movl lguest_data - __PAGE_OFFSET + LGUEST_DATA_pgdir, %esi
28 30
29 /* Copy first 32 entries of page directory to __PAGE_OFFSET entries. 31 /* Copy first 32 entries of page directory to __PAGE_OFFSET entries.
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index 35d19ae58de7..cb4c67025d52 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -128,9 +128,12 @@ static void unmap_switcher(void)
128 __free_pages(switcher_page[i], 0); 128 __free_pages(switcher_page[i], 0);
129} 129}
130 130
131/*L:305 131/*H:032
132 * Dealing With Guest Memory. 132 * Dealing With Guest Memory.
133 * 133 *
134 * Before we go too much further into the Host, we need to grok the routines
135 * we use to deal with Guest memory.
136 *
134 * When the Guest gives us (what it thinks is) a physical address, we can use 137 * When the Guest gives us (what it thinks is) a physical address, we can use
135 * the normal copy_from_user() & copy_to_user() on the corresponding place in 138 * the normal copy_from_user() & copy_to_user() on the corresponding place in
136 * the memory region allocated by the Launcher. 139 * the memory region allocated by the Launcher.
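Concretely, "use copy_from_user() on the corresponding place" means offsetting the Guest-physical address by the base of the Launcher's mapping and bounds-checking it first. A hedged kernel-side sketch of a read helper along those lines (the real helpers are __lgread()/__lgwrite() in this file plus the macros in lg.h):

#include <linux/uaccess.h>
#include <linux/errno.h>

/* Sketch: copy 'bytes' of Guest memory at Guest-physical 'addr' into 'b'.
 * 'mem_base' is where the Launcher mapped Guest memory in its own address
 * space and 'limit' is the size of that region. */
static int lgread_sketch(void __user *mem_base, unsigned long limit,
                         void *b, unsigned long addr, unsigned bytes)
{
    if (addr + bytes > limit || addr + bytes < addr)    /* catch overflow */
        return -EINVAL;
    if (copy_from_user(b, mem_base + addr, bytes) != 0)
        return -EFAULT;
    return 0;
}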
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
index 9d5184c7c14a..b478affe8f91 100644
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -90,6 +90,7 @@ static void do_hcall(struct lguest *lg, struct hcall_args *args)
90 lg->pending_notify = args->arg1; 90 lg->pending_notify = args->arg1;
91 break; 91 break;
92 default: 92 default:
93 /* It should be an architecture-specific hypercall. */
93 if (lguest_arch_do_hcall(lg, args)) 94 if (lguest_arch_do_hcall(lg, args))
94 kill_guest(lg, "Bad hypercall %li\n", args->arg0); 95 kill_guest(lg, "Bad hypercall %li\n", args->arg0);
95 } 96 }
@@ -157,7 +158,6 @@ static void do_async_hcalls(struct lguest *lg)
157 * Guest makes a hypercall, we end up here to set things up: */ 158 * Guest makes a hypercall, we end up here to set things up: */
158static void initialize(struct lguest *lg) 159static void initialize(struct lguest *lg)
159{ 160{
160
161 /* You can't do anything until you're initialized. The Guest knows the 161 /* You can't do anything until you're initialized. The Guest knows the
162 * rules, so we're unforgiving here. */ 162 * rules, so we're unforgiving here. */
163 if (lg->hcall->arg0 != LHCALL_LGUEST_INIT) { 163 if (lg->hcall->arg0 != LHCALL_LGUEST_INIT) {
@@ -174,7 +174,8 @@ static void initialize(struct lguest *lg)
174 || get_user(lg->noirq_end, &lg->lguest_data->noirq_end)) 174 || get_user(lg->noirq_end, &lg->lguest_data->noirq_end))
175 kill_guest(lg, "bad guest page %p", lg->lguest_data); 175 kill_guest(lg, "bad guest page %p", lg->lguest_data);
176 176
177 /* We write the current time into the Guest's data page once now. */ 177 /* We write the current time into the Guest's data page once so it can
178 * set its clock. */
178 write_timestamp(lg); 179 write_timestamp(lg);
179 180
180 /* page_tables.c will also do some setup. */ 181 /* page_tables.c will also do some setup. */
@@ -182,8 +183,8 @@ static void initialize(struct lguest *lg)
182 183
183 /* This is the one case where the above accesses might have been the 184 /* This is the one case where the above accesses might have been the
184 * first write to a Guest page. This may have caused a copy-on-write 185 * first write to a Guest page. This may have caused a copy-on-write
185 * fault, but the Guest might be referring to the old (read-only) 186 * fault, but the old page might be (read-only) in the Guest
186 * page. */ 187 * pagetable. */
187 guest_pagetable_clear_all(lg); 188 guest_pagetable_clear_all(lg);
188} 189}
189 190
@@ -220,7 +221,7 @@ void do_hypercalls(struct lguest *lg)
220 * Normally it doesn't matter: the Guest will run again and 221 * Normally it doesn't matter: the Guest will run again and
221 * update the trap number before we come back here. 222 * update the trap number before we come back here.
222 * 223 *
223 * However, if we are signalled or the Guest sends DMA to the 224 * However, if we are signalled or the Guest sends I/O to the
224 * Launcher, the run_guest() loop will exit without running the 225 * Launcher, the run_guest() loop will exit without running the
225 * Guest. When it comes back it would try to re-run the 226 * Guest. When it comes back it would try to re-run the
226 * hypercall. */ 227 * hypercall. */
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c
index 82966982cb38..2b66f79c208b 100644
--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -92,8 +92,8 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
92 92
93 /* Remember that we never let the Guest actually disable interrupts, so 93 /* Remember that we never let the Guest actually disable interrupts, so
94 * the "Interrupt Flag" bit is always set. We copy that bit from the 94 * the "Interrupt Flag" bit is always set. We copy that bit from the
95 * Guest's "irq_enabled" field into the eflags word: the Guest copies 95 * Guest's "irq_enabled" field into the eflags word: we saw the Guest
96 * it back in "lguest_iret". */ 96 * copy it back in "lguest_iret". */
97 eflags = lg->regs->eflags; 97 eflags = lg->regs->eflags;
98 if (get_user(irq_enable, &lg->lguest_data->irq_enabled) == 0 98 if (get_user(irq_enable, &lg->lguest_data->irq_enabled) == 0
99 && !(irq_enable & X86_EFLAGS_IF)) 99 && !(irq_enable & X86_EFLAGS_IF))
@@ -124,7 +124,7 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
124 kill_guest(lg, "Disabling interrupts"); 124 kill_guest(lg, "Disabling interrupts");
125} 125}
126 126
127/*H:200 127/*H:205
128 * Virtual Interrupts. 128 * Virtual Interrupts.
129 * 129 *
130 * maybe_do_interrupt() gets called before every entry to the Guest, to see if 130 * maybe_do_interrupt() gets called before every entry to the Guest, to see if
@@ -256,19 +256,21 @@ int deliver_trap(struct lguest *lg, unsigned int num)
256 * bogus one in): if we fail here, the Guest will be killed. */ 256 * bogus one in): if we fail here, the Guest will be killed. */
257 if (!idt_present(lg->arch.idt[num].a, lg->arch.idt[num].b)) 257 if (!idt_present(lg->arch.idt[num].a, lg->arch.idt[num].b))
258 return 0; 258 return 0;
259 set_guest_interrupt(lg, lg->arch.idt[num].a, lg->arch.idt[num].b, has_err(num)); 259 set_guest_interrupt(lg, lg->arch.idt[num].a, lg->arch.idt[num].b,
260 has_err(num));
260 return 1; 261 return 1;
261} 262}
262 263
263/*H:250 Here's the hard part: returning to the Host every time a trap happens 264/*H:250 Here's the hard part: returning to the Host every time a trap happens
264 * and then calling deliver_trap() and re-entering the Guest is slow. 265 * and then calling deliver_trap() and re-entering the Guest is slow.
265 * Particularly because Guest userspace system calls are traps (trap 128). 266 * Particularly because Guest userspace system calls are traps (usually trap
267 * 128).
266 * 268 *
267 * So we'd like to set up the IDT to tell the CPU to deliver traps directly 269 * So we'd like to set up the IDT to tell the CPU to deliver traps directly
268 * into the Guest. This is possible, but the complexities cause the size of 270 * into the Guest. This is possible, but the complexities cause the size of
269 * this file to double! However, 150 lines of code is worth writing for taking 271 * this file to double! However, 150 lines of code is worth writing for taking
270 * system calls down from 1750ns to 270ns. Plus, if lguest didn't do it, all 272 * system calls down from 1750ns to 270ns. Plus, if lguest didn't do it, all
271 * the other hypervisors would tease it. 273 * the other hypervisors would beat it up at lunchtime.
272 * 274 *
273 * This routine indicates if a particular trap number could be delivered 275 * This routine indicates if a particular trap number could be delivered
274 * directly. */ 276 * directly. */
@@ -331,7 +333,7 @@ void pin_stack_pages(struct lguest *lg)
331 * change stacks on each context switch. */ 333 * change stacks on each context switch. */
332void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages) 334void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages)
333{ 335{
334 /* You are not allowd have a stack segment with privilege level 0: bad 336 /* You are not allowed to have a stack segment with privilege level 0: bad
335 * Guest! */ 337 * Guest! */
336 if ((seg & 0x3) != GUEST_PL) 338 if ((seg & 0x3) != GUEST_PL)
337 kill_guest(lg, "bad stack segment %i", seg); 339 kill_guest(lg, "bad stack segment %i", seg);
@@ -350,7 +352,7 @@ void guest_set_stack(struct lguest *lg, u32 seg, u32 esp, unsigned int pages)
350 * part of the Host: page table handling. */ 352 * part of the Host: page table handling. */
351 353
352/*H:235 This is the routine which actually checks the Guest's IDT entry and 354/*H:235 This is the routine which actually checks the Guest's IDT entry and
353 * transfers it into our entry in "struct lguest": */ 355 * transfers it into the entry in "struct lguest": */
354static void set_trap(struct lguest *lg, struct desc_struct *trap, 356static void set_trap(struct lguest *lg, struct desc_struct *trap,
355 unsigned int num, u32 lo, u32 hi) 357 unsigned int num, u32 lo, u32 hi)
356{ 358{
@@ -456,6 +458,18 @@ void copy_traps(const struct lguest *lg, struct desc_struct *idt,
456 } 458 }
457} 459}
458 460
461/*H:200
462 * The Guest Clock.
463 *
464 * There are two sources of virtual interrupts. We saw one in lguest_user.c:
465 * the Launcher sending interrupts for virtual devices. The other is the Guest
466 * timer interrupt.
467 *
468 * The Guest uses the LHCALL_SET_CLOCKEVENT hypercall to tell us how long until
469 * the next timer interrupt (in nanoseconds). We use the high-resolution timer
470 * infrastructure to set a callback at that time.
471 *
472 * 0 means "turn off the clock". */
459void guest_set_clockevent(struct lguest *lg, unsigned long delta) 473void guest_set_clockevent(struct lguest *lg, unsigned long delta)
460{ 474{
461 ktime_t expires; 475 ktime_t expires;
@@ -466,20 +480,27 @@ void guest_set_clockevent(struct lguest *lg, unsigned long delta)
466 return; 480 return;
467 } 481 }
468 482
483 /* We use wallclock time here, so the Guest might not be running for
484 * all the time between now and the timer interrupt it asked for. This
485 * is almost always the right thing to do. */
469 expires = ktime_add_ns(ktime_get_real(), delta); 486 expires = ktime_add_ns(ktime_get_real(), delta);
470 hrtimer_start(&lg->hrt, expires, HRTIMER_MODE_ABS); 487 hrtimer_start(&lg->hrt, expires, HRTIMER_MODE_ABS);
471} 488}
472 489
490/* This is the function called when the Guest's timer expires. */
473static enum hrtimer_restart clockdev_fn(struct hrtimer *timer) 491static enum hrtimer_restart clockdev_fn(struct hrtimer *timer)
474{ 492{
475 struct lguest *lg = container_of(timer, struct lguest, hrt); 493 struct lguest *lg = container_of(timer, struct lguest, hrt);
476 494
495 /* Remember the first interrupt is the timer interrupt. */
477 set_bit(0, lg->irqs_pending); 496 set_bit(0, lg->irqs_pending);
497 /* If the Guest is actually stopped, we need to wake it up. */
478 if (lg->halted) 498 if (lg->halted)
479 wake_up_process(lg->tsk); 499 wake_up_process(lg->tsk);
480 return HRTIMER_NORESTART; 500 return HRTIMER_NORESTART;
481} 501}
482 502
503/* This sets up the timer for this Guest. */
483void init_clockdev(struct lguest *lg) 504void init_clockdev(struct lguest *lg)
484{ 505{
485 hrtimer_init(&lg->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS); 506 hrtimer_init(&lg->hrt, CLOCK_REALTIME, HRTIMER_MODE_ABS);
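The Guest half of this arrangement is a clockevent device whose set_next_event hook simply hands the requested delta to the Host with the hypercall named above. A hedged sketch of that hook; the minimum-delta value is illustrative, and hcall() is the Guest's hypercall wrapper from asm-x86/lguest_hcall.h:

#include <linux/clockchips.h>
#include <linux/errno.h>

/* Kernel-side sketch: ask the Host for a timer interrupt in 'delta' ns. */
static int set_next_event_sketch(unsigned long delta,
                                 struct clock_event_device *evt)
{
    /* Very small deltas aren't worth a round trip to the Host. */
    if (delta < 1024)   /* illustrative minimum */
        return -ETIME;

    hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0);
    return 0;
}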
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 0c74ac42cf01..86924891b5eb 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -100,7 +100,7 @@ int lguest_address_ok(const struct lguest *lg,
100void __lgread(struct lguest *, void *, unsigned long, unsigned); 100void __lgread(struct lguest *, void *, unsigned long, unsigned);
101void __lgwrite(struct lguest *, unsigned long, const void *, unsigned); 101void __lgwrite(struct lguest *, unsigned long, const void *, unsigned);
102 102
103/*L:306 Using memory-copy operations like that is usually inconvient, so we 103/*H:035 Using memory-copy operations like that is usually inconvenient, so we
104 * have the following helper macros which read and write a specific type (often 104 * have the following helper macros which read and write a specific type (often
105 * an unsigned long). 105 * an unsigned long).
106 * 106 *
@@ -188,7 +188,7 @@ void write_timestamp(struct lguest *lg);
188 * Let's step aside for the moment, to study one important routine that's used 188 * Let's step aside for the moment, to study one important routine that's used
189 * widely in the Host code. 189 * widely in the Host code.
190 * 190 *
191 * There are many cases where the Guest does something invalid, like pass crap 191 * There are many cases where the Guest can do something invalid, like pass crap
192 * to a hypercall. Since only the Guest kernel can make hypercalls, it's quite 192 * to a hypercall. Since only the Guest kernel can make hypercalls, it's quite
193 * acceptable to simply terminate the Guest and give the Launcher a nicely 193 * acceptable to simply terminate the Guest and give the Launcher a nicely
194 * formatted reason. It's also simpler for the Guest itself, which doesn't 194 * formatted reason. It's also simpler for the Guest itself, which doesn't
diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c
index 71c64837b437..8904f72f97c6 100644
--- a/drivers/lguest/lguest_device.c
+++ b/drivers/lguest/lguest_device.c
@@ -53,7 +53,8 @@ struct lguest_device {
53 * Device configurations 53 * Device configurations
54 * 54 *
55 * The configuration information for a device consists of a series of fields. 55 * The configuration information for a device consists of a series of fields.
56 * The device will look for these fields during setup. 56 * We don't really care what they are: the Launcher set them up, and the driver
57 * will look at them during setup.
57 * 58 *
58 * For us these fields come immediately after that device's descriptor in the 59 * For us these fields come immediately after that device's descriptor in the
59 * lguest_devices page. 60 * lguest_devices page.
@@ -122,8 +123,8 @@ static void lg_set_status(struct virtio_device *vdev, u8 status)
122 * The other piece of infrastructure virtio needs is a "virtqueue": a way of 123 * The other piece of infrastructure virtio needs is a "virtqueue": a way of
123 * the Guest device registering buffers for the other side to read from or 124 * the Guest device registering buffers for the other side to read from or
124 * write into (ie. send and receive buffers). Each device can have multiple 125 * write into (ie. send and receive buffers). Each device can have multiple
125 * virtqueues: for example the console has one queue for sending and one for 126 * virtqueues: for example the console driver uses one queue for sending and
126 * receiving. 127 * another for receiving.
127 * 128 *
128 * Fortunately for us, a very fast shared-memory-plus-descriptors virtqueue 129 * Fortunately for us, a very fast shared-memory-plus-descriptors virtqueue
129 * already exists in virtio_ring.c. We just need to connect it up. 130 * already exists in virtio_ring.c. We just need to connect it up.
@@ -158,7 +159,7 @@ static void lg_notify(struct virtqueue *vq)
158 * 159 *
159 * This is kind of an ugly duckling. It'd be nicer to have a standard 160 * This is kind of an ugly duckling. It'd be nicer to have a standard
160 * representation of a virtqueue in the configuration space, but it seems that 161 * representation of a virtqueue in the configuration space, but it seems that
161 * everyone wants to do it differently. The KVM guys want the Guest to 162 * everyone wants to do it differently. The KVM coders want the Guest to
162 * allocate its own pages and tell the Host where they are, but for lguest it's 163 * allocate its own pages and tell the Host where they are, but for lguest it's
163 * simpler for the Host to simply tell us where the pages are. 164 * simpler for the Host to simply tell us where the pages are.
164 * 165 *
@@ -284,6 +285,8 @@ static void add_lguest_device(struct lguest_device_desc *d)
284{ 285{
285 struct lguest_device *ldev; 286 struct lguest_device *ldev;
286 287
288 /* Start with zeroed memory; Linux's device layer seems to count on
289 * it. */
287 ldev = kzalloc(sizeof(*ldev), GFP_KERNEL); 290 ldev = kzalloc(sizeof(*ldev), GFP_KERNEL);
288 if (!ldev) { 291 if (!ldev) {
289 printk(KERN_EMERG "Cannot allocate lguest dev %u\n", 292 printk(KERN_EMERG "Cannot allocate lguest dev %u\n",
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index ee405b38383d..9d716fa42cad 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -8,20 +8,22 @@
8#include <linux/fs.h> 8#include <linux/fs.h>
9#include "lg.h" 9#include "lg.h"
10 10
11/*L:315 To force the Guest to stop running and return to the Launcher, the 11/*L:055 When something happens, the Waker process needs a way to stop the
 12 * Waker writes LHREQ_BREAK and the value "1" to /dev/lguest. The 12 * kernel running the Guest and return to the Launcher. So the Waker writes
13 * Launcher then writes LHREQ_BREAK and "0" to release the Waker. */ 13 * LHREQ_BREAK and the value "1" to /dev/lguest to do this. Once the Launcher
14 * has done whatever needs attention, it writes LHREQ_BREAK and "0" to release
15 * the Waker. */
14static int break_guest_out(struct lguest *lg, const unsigned long __user *input) 16static int break_guest_out(struct lguest *lg, const unsigned long __user *input)
15{ 17{
16 unsigned long on; 18 unsigned long on;
17 19
18 /* Fetch whether they're turning break on or off.. */ 20 /* Fetch whether they're turning break on or off. */
19 if (get_user(on, input) != 0) 21 if (get_user(on, input) != 0)
20 return -EFAULT; 22 return -EFAULT;
21 23
22 if (on) { 24 if (on) {
23 lg->break_out = 1; 25 lg->break_out = 1;
24 /* Pop it out (may be running on different CPU) */ 26 /* Pop it out of the Guest (may be running on different CPU) */
25 wake_up_process(lg->tsk); 27 wake_up_process(lg->tsk);
26 /* Wait for them to reset it */ 28 /* Wait for them to reset it */
27 return wait_event_interruptible(lg->break_wq, !lg->break_out); 29 return wait_event_interruptible(lg->break_wq, !lg->break_out);
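To make that protocol concrete, here is a minimal userspace sketch of asserting and releasing the break. It assumes only what the comment above describes: a write to the /dev/lguest file descriptor starting with LHREQ_BREAK (from linux/lguest_launcher.h), followed by 1 to pop the Guest out or 0 to let it run again. The function name and error handling are illustrative, not the Launcher's actual code.

#include <unistd.h>
#include <linux/lguest_launcher.h>	/* LHREQ_BREAK */

/* on = 1: kick the Guest out and hold it; on = 0: release it again.
 * "lguest_fd" is the open /dev/lguest file descriptor. */
static int set_break(int lguest_fd, unsigned long on)
{
	unsigned long args[] = { LHREQ_BREAK, on };

	if (write(lguest_fd, args, sizeof(args)) < 0)
		return -1;
	return 0;
}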
@@ -58,7 +60,7 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
58 if (!lg) 60 if (!lg)
59 return -EINVAL; 61 return -EINVAL;
60 62
61 /* If you're not the task which owns the guest, go away. */ 63 /* If you're not the task which owns the Guest, go away. */
62 if (current != lg->tsk) 64 if (current != lg->tsk)
63 return -EPERM; 65 return -EPERM;
64 66
@@ -92,8 +94,8 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
92 * base: The start of the Guest-physical memory inside the Launcher memory. 94 * base: The start of the Guest-physical memory inside the Launcher memory.
93 * 95 *
94 * pfnlimit: The highest (Guest-physical) page number the Guest should be 96 * pfnlimit: The highest (Guest-physical) page number the Guest should be
95 * allowed to access. The Launcher has to live in Guest memory, so it sets 97 * allowed to access. The Guest memory lives inside the Launcher, so it sets
96 * this to ensure the Guest can't reach it. 98 * this to ensure the Guest can only reach its own memory.
97 * 99 *
98 * pgdir: The (Guest-physical) address of the top of the initial Guest 100 * pgdir: The (Guest-physical) address of the top of the initial Guest
99 * pagetables (which are set up by the Launcher). 101 * pagetables (which are set up by the Launcher).
@@ -189,7 +191,7 @@ unlock:
189} 191}
190 192
191/*L:010 The first operation the Launcher does must be a write. All writes 193/*L:010 The first operation the Launcher does must be a write. All writes
192 * start with a 32 bit number: for the first write this must be 194 * start with an unsigned long number: for the first write this must be
193 * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use 195 * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use
194 * writes of other values to send interrupts. */ 196 * writes of other values to send interrupts. */
195static ssize_t write(struct file *file, const char __user *in, 197static ssize_t write(struct file *file, const char __user *in,
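As an illustration of that convention, the very first write from a Launcher-like program could look like the sketch below: the request code goes first, then the arguments initialize() expects (memory base, page limit, top-level pgdir and, in this era's interface, the Guest's entry point). The variable names and the entry-point argument are this sketch's assumptions, not a quote of the Launcher's code.

#include <unistd.h>
#include <linux/lguest_launcher.h>	/* LHREQ_INITIALIZE */

static int initialize_guest(int lguest_fd, unsigned long base,
			    unsigned long pfnlimit, unsigned long pgdir,
			    unsigned long start)
{
	/* The request code leads; the remaining unsigned longs are the
	 * arguments described above (plus the assumed entry point). */
	unsigned long args[] = { LHREQ_INITIALIZE, base, pfnlimit,
				 pgdir, start };

	return write(lguest_fd, args, sizeof(args)) < 0 ? -1 : 0;
}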
@@ -275,8 +277,7 @@ static int close(struct inode *inode, struct file *file)
275 * The Launcher is the Host userspace program which sets up, runs and services 277 * The Launcher is the Host userspace program which sets up, runs and services
276 * the Guest. In fact, many comments in the Drivers which refer to "the Host" 278 * the Guest. In fact, many comments in the Drivers which refer to "the Host"
277 * doing things are inaccurate: the Launcher does all the device handling for 279 * doing things are inaccurate: the Launcher does all the device handling for
278 * the Guest. The Guest can't tell what's done by the Launcher and what by 280 * the Guest, but the Guest can't know that.
279 * the Host.
280 * 281 *
281 * Just to confuse you: to the Host kernel, the Launcher *is* the Guest and we 282 * Just to confuse you: to the Host kernel, the Launcher *is* the Guest and we
282 * shall see more of that later. 283 * shall see more of that later.
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index 2a45f0691c9b..fffabb327157 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -26,7 +26,8 @@
26 * 26 *
27 * We use two-level page tables for the Guest. If you're not entirely 27 * We use two-level page tables for the Guest. If you're not entirely
28 * comfortable with virtual addresses, physical addresses and page tables then 28 * comfortable with virtual addresses, physical addresses and page tables then
29 * I recommend you review lguest.c's "Page Table Handling" (with diagrams!). 29 * I recommend you review arch/x86/lguest/boot.c's "Page Table Handling" (with
30 * diagrams!).
30 * 31 *
31 * The Guest keeps page tables, but we maintain the actual ones here: these are 32 * The Guest keeps page tables, but we maintain the actual ones here: these are
32 * called "shadow" page tables. Which is a very Guest-centric name: these are 33 * called "shadow" page tables. Which is a very Guest-centric name: these are
@@ -36,11 +37,11 @@
36 * 37 *
37 * Anyway, this is the most complicated part of the Host code. There are seven 38 * Anyway, this is the most complicated part of the Host code. There are seven
38 * parts to this: 39 * parts to this:
39 * (i) Setting up a page table entry for the Guest when it faults, 40 * (i) Looking up a page table entry when the Guest faults,
40 * (ii) Setting up the page table entry for the Guest stack, 41 * (ii) Making sure the Guest stack is mapped,
41 * (iii) Setting up a page table entry when the Guest tells us it has changed, 42 * (iii) Setting up a page table entry when the Guest tells us one has changed,
42 * (iv) Switching page tables, 43 * (iv) Switching page tables,
43 * (v) Flushing (thowing away) page tables, 44 * (v) Flushing (throwing away) page tables,
44 * (vi) Mapping the Switcher when the Guest is about to run, 45 * (vi) Mapping the Switcher when the Guest is about to run,
45 * (vii) Setting up the page tables initially. 46 * (vii) Setting up the page tables initially.
46 :*/ 47 :*/
@@ -57,16 +58,15 @@
57static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); 58static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
58#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) 59#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu)
59 60
60/*H:320 With our shadow and Guest types established, we need to deal with 61/*H:320 The page table code is curly enough to need helper functions to keep it
61 * them: the page table code is curly enough to need helper functions to keep 62 * clear and clean.
62 * it clear and clean.
63 * 63 *
64 * There are two functions which return pointers to the shadow (aka "real") 64 * There are two functions which return pointers to the shadow (aka "real")
65 * page tables. 65 * page tables.
66 * 66 *
67 * spgd_addr() takes the virtual address and returns a pointer to the top-level 67 * spgd_addr() takes the virtual address and returns a pointer to the top-level
68 * page directory entry for that address. Since we keep track of several page 68 * page directory entry (PGD) for that address. Since we keep track of several
69 * tables, the "i" argument tells us which one we're interested in (it's 69 * page tables, the "i" argument tells us which one we're interested in (it's
70 * usually the current one). */ 70 * usually the current one). */
71static pgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr) 71static pgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr)
72{ 72{
@@ -81,9 +81,9 @@ static pgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr)
81 return &lg->pgdirs[i].pgdir[index]; 81 return &lg->pgdirs[i].pgdir[index];
82} 82}
83 83
84/* This routine then takes the PGD entry given above, which contains the 84/* This routine then takes the page directory entry returned above, which
85 * address of the PTE page. It then returns a pointer to the PTE entry for the 85 * contains the address of the page table entry (PTE) page. It then returns a
86 * given address. */ 86 * pointer to the PTE entry for the given address. */
87static pte_t *spte_addr(struct lguest *lg, pgd_t spgd, unsigned long vaddr) 87static pte_t *spte_addr(struct lguest *lg, pgd_t spgd, unsigned long vaddr)
88{ 88{
89 pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT); 89 pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
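For readers rusty on the i386 two-level scheme these helpers rely on, here is a standalone sketch of how a 32-bit virtual address splits into the two indices. This is the standard architectural split, shown with plain arithmetic rather than the kernel's pgd_index()/pte_index() macros.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t vaddr = 0xC0123456;

	/* Bits 31..22 pick the PGD entry, bits 21..12 pick the PTE within
	 * that entry's PTE page, and bits 11..0 are the offset inside the
	 * 4096-byte page. */
	printf("pgd index: %u\n", (unsigned)((vaddr >> 22) & 0x3FF));
	printf("pte index: %u\n", (unsigned)((vaddr >> 12) & 0x3FF));
	printf("offset   : %u\n", (unsigned)(vaddr & 0xFFF));
	return 0;
}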
@@ -191,7 +191,7 @@ static void check_gpgd(struct lguest *lg, pgd_t gpgd)
191} 191}
192 192
193/*H:330 193/*H:330
194 * (i) Setting up a page table entry for the Guest when it faults 194 * (i) Looking up a page table entry when the Guest faults.
195 * 195 *
196 * We saw this call in run_guest(): when we see a page fault in the Guest, we 196 * We saw this call in run_guest(): when we see a page fault in the Guest, we
197 * come here. That's because we only set up the shadow page tables lazily as 197 * come here. That's because we only set up the shadow page tables lazily as
@@ -199,7 +199,7 @@ static void check_gpgd(struct lguest *lg, pgd_t gpgd)
199 * and return to the Guest without it knowing. 199 * and return to the Guest without it knowing.
200 * 200 *
201 * If we fixed up the fault (ie. we mapped the address), this routine returns 201 * If we fixed up the fault (ie. we mapped the address), this routine returns
202 * true. */ 202 * true. Otherwise, it was a real fault and we need to tell the Guest. */
203int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) 203int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
204{ 204{
205 pgd_t gpgd; 205 pgd_t gpgd;
@@ -246,16 +246,16 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
246 if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW)) 246 if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW))
247 return 0; 247 return 0;
248 248
249 /* User access to a kernel page? (bit 3 == user access) */ 249 /* User access to a kernel-only page? (bit 3 == user access) */
250 if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER)) 250 if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER))
251 return 0; 251 return 0;
252 252
253 /* Check that the Guest PTE flags are OK, and the page number is below 253 /* Check that the Guest PTE flags are OK, and the page number is below
254 * the pfn_limit (ie. not mapping the Launcher binary). */ 254 * the pfn_limit (ie. not mapping the Launcher binary). */
255 check_gpte(lg, gpte); 255 check_gpte(lg, gpte);
256
256 /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ 257 /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
257 gpte = pte_mkyoung(gpte); 258 gpte = pte_mkyoung(gpte);
258
259 if (errcode & 2) 259 if (errcode & 2)
260 gpte = pte_mkdirty(gpte); 260 gpte = pte_mkdirty(gpte);
261 261
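Those two tests read the standard x86 page-fault error code the processor pushed. A small standalone decoder, for illustration (the bit meanings are architectural, not lguest-specific):

#include <stdio.h>

static void decode_errcode(unsigned long errcode)
{
	/* bit 0: protection violation (1) vs. not-present page (0)
	 * bit 1: write access (1) vs. read access (0)
	 * bit 2: userspace access (1) vs. kernel access (0) */
	printf("%s %s, page %s\n",
	       (errcode & 4) ? "user" : "kernel",
	       (errcode & 2) ? "write" : "read",
	       (errcode & 1) ? "present (protection)" : "not present");
}

int main(void)
{
	decode_errcode(2);	/* kernel write to an unmapped page */
	decode_errcode(7);	/* user write to a present, protected page */
	return 0;
}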
@@ -272,23 +272,28 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode)
272 else 272 else
273 /* If this is a read, don't set the "writable" bit in the page 273 /* If this is a read, don't set the "writable" bit in the page
274 * table entry, even if the Guest says it's writable. That way 274 * table entry, even if the Guest says it's writable. That way
275 * we come back here when a write does actually ocur, so we can 275 * we will come back here when a write does actually occur, so
276 * update the Guest's _PAGE_DIRTY flag. */ 276 * we can update the Guest's _PAGE_DIRTY flag. */
277 *spte = gpte_to_spte(lg, pte_wrprotect(gpte), 0); 277 *spte = gpte_to_spte(lg, pte_wrprotect(gpte), 0);
278 278
279 /* Finally, we write the Guest PTE entry back: we've set the 279 /* Finally, we write the Guest PTE entry back: we've set the
280 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ 280 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */
281 lgwrite(lg, gpte_ptr, pte_t, gpte); 281 lgwrite(lg, gpte_ptr, pte_t, gpte);
282 282
283 /* We succeeded in mapping the page! */ 283 /* The fault is fixed, the page table is populated, the mapping
284 * manipulated, the result returned and the code complete. A small
285 * delay and a trace of alliteration are the only indications the Guest
286 * has that a page fault occurred at all. */
284 return 1; 287 return 1;
285} 288}
286 289
287/*H:360 (ii) Setting up the page table entry for the Guest stack. 290/*H:360
291 * (ii) Making sure the Guest stack is mapped.
288 * 292 *
289 * Remember pin_stack_pages() which makes sure the stack is mapped? It could 293 * Remember that direct traps into the Guest need a mapped Guest kernel stack.
290 * simply call demand_page(), but as we've seen that logic is quite long, and 294 * pin_stack_pages() calls us here: we could simply call demand_page(), but as
291 * usually the stack pages are already mapped anyway, so it's not required. 295 * we've seen that logic is quite long, and usually the stack pages are already
296 * mapped, so it's overkill.
292 * 297 *
293 * This is a quick version which answers the question: is this virtual address 298 * This is a quick version which answers the question: is this virtual address
294 * mapped by the shadow page tables, and is it writable? */ 299 * mapped by the shadow page tables, and is it writable? */
@@ -297,7 +302,7 @@ static int page_writable(struct lguest *lg, unsigned long vaddr)
297 pgd_t *spgd; 302 pgd_t *spgd;
298 unsigned long flags; 303 unsigned long flags;
299 304
300 /* Look at the top level entry: is it present? */ 305 /* Look at the current top level entry: is it present? */
301 spgd = spgd_addr(lg, lg->pgdidx, vaddr); 306 spgd = spgd_addr(lg, lg->pgdidx, vaddr);
302 if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) 307 if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
303 return 0; 308 return 0;
@@ -333,15 +338,14 @@ static void release_pgd(struct lguest *lg, pgd_t *spgd)
333 release_pte(ptepage[i]); 338 release_pte(ptepage[i]);
334 /* Now we can free the page of PTEs */ 339 /* Now we can free the page of PTEs */
335 free_page((long)ptepage); 340 free_page((long)ptepage);
336 /* And zero out the PGD entry we we never release it twice. */ 341 /* And zero out the PGD entry so we never release it twice. */
337 *spgd = __pgd(0); 342 *spgd = __pgd(0);
338 } 343 }
339} 344}
340 345
341/*H:440 (v) Flushing (thowing away) page tables, 346/*H:445 We saw flush_user_mappings() twice: once from the flush_user_mappings()
342 * 347 * hypercall and once in new_pgdir() when we re-used a top-level pgdir page.
343 * We saw flush_user_mappings() called when we re-used a top-level pgdir page. 348 * It simply releases every PTE page from 0 up to the Guest's kernel address. */
344 * It simply releases every PTE page from 0 up to the kernel address. */
345static void flush_user_mappings(struct lguest *lg, int idx) 349static void flush_user_mappings(struct lguest *lg, int idx)
346{ 350{
347 unsigned int i; 351 unsigned int i;
@@ -350,8 +354,10 @@ static void flush_user_mappings(struct lguest *lg, int idx)
350 release_pgd(lg, lg->pgdirs[idx].pgdir + i); 354 release_pgd(lg, lg->pgdirs[idx].pgdir + i);
351} 355}
352 356
353/* The Guest also has a hypercall to do this manually: it's used when a large 357/*H:440 (v) Flushing (throwing away) page tables,
354 * number of mappings have been changed. */ 358 *
359 * The Guest has a hypercall to throw away the page tables: it's used when a
360 * large number of mappings have been changed. */
355void guest_pagetable_flush_user(struct lguest *lg) 361void guest_pagetable_flush_user(struct lguest *lg)
356{ 362{
357 /* Drop the userspace part of the current page table. */ 363 /* Drop the userspace part of the current page table. */
@@ -423,8 +429,9 @@ static unsigned int new_pgdir(struct lguest *lg,
423 429
424/*H:430 (iv) Switching page tables 430/*H:430 (iv) Switching page tables
425 * 431 *
426 * This is what happens when the Guest changes page tables (ie. changes the 432 * Now we've seen all the page table setting and manipulation, let's see
427 * top-level pgdir). This happens on almost every context switch. */ 433 * what happens when the Guest changes page tables (ie. changes the top-level
434 * pgdir). This occurs on almost every context switch. */
428void guest_new_pagetable(struct lguest *lg, unsigned long pgtable) 435void guest_new_pagetable(struct lguest *lg, unsigned long pgtable)
429{ 436{
430 int newpgdir, repin = 0; 437 int newpgdir, repin = 0;
@@ -443,7 +450,8 @@ void guest_new_pagetable(struct lguest *lg, unsigned long pgtable)
443} 450}
444 451
445/*H:470 Finally, a routine which throws away everything: all PGD entries in all 452/*H:470 Finally, a routine which throws away everything: all PGD entries in all
446 * the shadow page tables. This is used when we destroy the Guest. */ 453 * the shadow page tables, including the Guest's kernel mappings. This is used
454 * when we destroy the Guest. */
447static void release_all_pagetables(struct lguest *lg) 455static void release_all_pagetables(struct lguest *lg)
448{ 456{
449 unsigned int i, j; 457 unsigned int i, j;
@@ -458,13 +466,22 @@ static void release_all_pagetables(struct lguest *lg)
458 466
459/* We also throw away everything when a Guest tells us it's changed a kernel 467/* We also throw away everything when a Guest tells us it's changed a kernel
460 * mapping. Since kernel mappings are in every page table, it's easiest to 468 * mapping. Since kernel mappings are in every page table, it's easiest to
461 * throw them all away. This is amazingly slow, but thankfully rare. */ 469 * throw them all away. This traps the Guest in amber for a while as
470 * everything faults back in, but it's rare. */
462void guest_pagetable_clear_all(struct lguest *lg) 471void guest_pagetable_clear_all(struct lguest *lg)
463{ 472{
464 release_all_pagetables(lg); 473 release_all_pagetables(lg);
465 /* We need the Guest kernel stack mapped again. */ 474 /* We need the Guest kernel stack mapped again. */
466 pin_stack_pages(lg); 475 pin_stack_pages(lg);
467} 476}
477/*:*/
478/*M:009 Since we throw away all mappings when a kernel mapping changes, our
479 * performance sucks for Guests using highmem. In fact, a Guest with
480 * PAGE_OFFSET 0xc0000000 (the default) and more than about 700MB of RAM is
481 * usually slower than a Guest with less memory.
482 *
483 * This, of course, cannot be fixed. It would take some kind of... well, I
484 * don't know, but the term "puissant code-fu" comes to mind. :*/
468 485
469/*H:420 This is the routine which actually sets the page table entry for the 486/*H:420 This is the routine which actually sets the page table entry for the
470 * "idx"'th shadow page table. 487 * "idx"'th shadow page table.
@@ -483,7 +500,7 @@ void guest_pagetable_clear_all(struct lguest *lg)
483static void do_set_pte(struct lguest *lg, int idx, 500static void do_set_pte(struct lguest *lg, int idx,
484 unsigned long vaddr, pte_t gpte) 501 unsigned long vaddr, pte_t gpte)
485{ 502{
486 /* Look up the matching shadow page directot entry. */ 503 /* Look up the matching shadow page directory entry. */
487 pgd_t *spgd = spgd_addr(lg, idx, vaddr); 504 pgd_t *spgd = spgd_addr(lg, idx, vaddr);
488 505
489 /* If the top level isn't present, there's no entry to update. */ 506 /* If the top level isn't present, there's no entry to update. */
@@ -500,7 +517,8 @@ static void do_set_pte(struct lguest *lg, int idx,
500 *spte = gpte_to_spte(lg, gpte, 517 *spte = gpte_to_spte(lg, gpte,
501 pte_flags(gpte) & _PAGE_DIRTY); 518 pte_flags(gpte) & _PAGE_DIRTY);
502 } else 519 } else
503 /* Otherwise we can demand_page() it in later. */ 520 /* Otherwise kill it and we can demand_page() it in
521 * later. */
504 *spte = __pte(0); 522 *spte = __pte(0);
505 } 523 }
506} 524}
@@ -535,7 +553,7 @@ void guest_set_pte(struct lguest *lg,
535} 553}
536 554
537/*H:400 555/*H:400
538 * (iii) Setting up a page table entry when the Guest tells us it has changed. 556 * (iii) Setting up a page table entry when the Guest tells us one has changed.
539 * 557 *
540 * Just like we did in interrupts_and_traps.c, it makes sense for us to deal 558 * Just like we did in interrupts_and_traps.c, it makes sense for us to deal
541 * with the other side of page tables while we're here: what happens when the 559 * with the other side of page tables while we're here: what happens when the
@@ -612,9 +630,10 @@ void free_guest_pagetable(struct lguest *lg)
612 630
613/*H:480 (vi) Mapping the Switcher when the Guest is about to run. 631/*H:480 (vi) Mapping the Switcher when the Guest is about to run.
614 * 632 *
615 * The Switcher and the two pages for this CPU need to be available to the 633 * The Switcher and the two pages for this CPU need to be visible in the
616 * Guest (and not the pages for other CPUs). We have the appropriate PTE pages 634 * Guest (and not the pages for other CPUs). We have the appropriate PTE pages
617 * for each CPU already set up, we just need to hook them in. */ 635 * for each CPU already set up, we just need to hook them in now we know which
636 * Guest is about to run on this CPU. */
618void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages) 637void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages)
619{ 638{
620 pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); 639 pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages);
@@ -677,6 +696,18 @@ static __init void populate_switcher_pte_page(unsigned int cpu,
677 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)); 696 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED));
678} 697}
679 698
699/* We've made it through the page table code. Perhaps our tired brains are
700 * still processing the details, or perhaps we're simply glad it's over.
701 *
702 * If nothing else, note that all this complexity in juggling shadow page
703 * tables in sync with the Guest's page tables is for one reason: for most
704 * Guests this page table dance determines how bad performance will be. This
705 * is why Xen uses exotic direct Guest pagetable manipulation, and why both
706 * Intel and AMD have implemented shadow page table support directly into
707 * hardware.
708 *
709 * There is just one file remaining in the Host. */
710
680/*H:510 At boot or module load time, init_pagetables() allocates and populates 711/*H:510 At boot or module load time, init_pagetables() allocates and populates
681 * the Switcher PTE page for each CPU. */ 712 * the Switcher PTE page for each CPU. */
682__init int init_pagetables(struct page **switcher_page, unsigned int pages) 713__init int init_pagetables(struct page **switcher_page, unsigned int pages)
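In outline, that per-CPU setup amounts to something like the sketch below: one zeroed page per possible CPU, handed to populate_switcher_pte_page(). The argument order of that helper and the bare -ENOMEM return are assumptions here, and cleanup on failure is elided.

/* A sketch only, assuming kernel context and the helpers named above. */
static __init int init_pagetables_sketch(struct page **switcher_page,
					 unsigned int pages)
{
	unsigned int i;

	for_each_possible_cpu(i) {
		switcher_pte_page(i) = (pte_t *)get_zeroed_page(GFP_KERNEL);
		if (!switcher_pte_page(i))
			return -ENOMEM;
		populate_switcher_pte_page(i, switcher_page, pages);
	}
	return 0;
}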
diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c
index c2434ec99f7b..9e189cbec7dd 100644
--- a/drivers/lguest/segments.c
+++ b/drivers/lguest/segments.c
@@ -12,8 +12,6 @@
12#include "lg.h" 12#include "lg.h"
13 13
14/*H:600 14/*H:600
15 * We've almost completed the Host; there's just one file to go!
16 *
17 * Segments & The Global Descriptor Table 15 * Segments & The Global Descriptor Table
18 * 16 *
19 * (That title sounds like a bad Nerdcore group. Not to suggest that there are 17 * (That title sounds like a bad Nerdcore group. Not to suggest that there are
@@ -55,7 +53,7 @@ static int ignored_gdt(unsigned int num)
55 || num == GDT_ENTRY_DOUBLEFAULT_TSS); 53 || num == GDT_ENTRY_DOUBLEFAULT_TSS);
56} 54}
57 55
58/*H:610 Once the GDT has been changed, we fix the new entries up a little. We 56/*H:630 Once the Guest gave us new GDT entries, we fix them up a little. We
59 * don't care if they're invalid: the worst that can happen is a General 57 * don't care if they're invalid: the worst that can happen is a General
60 * Protection Fault in the Switcher when it restores a Guest segment register 58 * Protection Fault in the Switcher when it restores a Guest segment register
61 * which tries to use that entry. Then we kill the Guest for causing such a 59 * which tries to use that entry. Then we kill the Guest for causing such a
@@ -84,25 +82,33 @@ static void fixup_gdt_table(struct lguest *lg, unsigned start, unsigned end)
84 } 82 }
85} 83}
86 84
87/* This routine is called at boot or modprobe time for each CPU to set up the 85/*H:610 Like the IDT, we never simply use the GDT the Guest gives us. We keep
88 * "constant" GDT entries for Guests running on that CPU. */ 86 * a GDT for each CPU, and copy across the Guest's entries each time we want to
87 * run the Guest on that CPU.
88 *
89 * This routine is called at boot or modprobe time for each CPU to set up the
90 * constant GDT entries: the ones which are the same no matter what Guest we're
91 * running. */
89void setup_default_gdt_entries(struct lguest_ro_state *state) 92void setup_default_gdt_entries(struct lguest_ro_state *state)
90{ 93{
91 struct desc_struct *gdt = state->guest_gdt; 94 struct desc_struct *gdt = state->guest_gdt;
92 unsigned long tss = (unsigned long)&state->guest_tss; 95 unsigned long tss = (unsigned long)&state->guest_tss;
93 96
94 /* The hypervisor segments are full 0-4G segments, privilege level 0 */ 97 /* The Switcher segments are full 0-4G segments, privilege level 0 */
95 gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; 98 gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
96 gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; 99 gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
97 100
98 /* The TSS segment refers to the TSS entry for this CPU, so we cannot 101 /* The TSS segment refers to the TSS entry for this particular CPU.
99 * copy it from the Guest. Forgive the magic flags */ 102 * Forgive the magic flags: the 0x8900 means the entry is Present, it's
103 * privilege level 0 Available 386 TSS system segment, and the 0x67
104 * means Saturn is eclipsed by Mercury in the twelfth house. */
100 gdt[GDT_ENTRY_TSS].a = 0x00000067 | (tss << 16); 105 gdt[GDT_ENTRY_TSS].a = 0x00000067 | (tss << 16);
101 gdt[GDT_ENTRY_TSS].b = 0x00008900 | (tss & 0xFF000000) 106 gdt[GDT_ENTRY_TSS].b = 0x00008900 | (tss & 0xFF000000)
102 | ((tss >> 16) & 0x000000FF); 107 | ((tss >> 16) & 0x000000FF);
103} 108}
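Decoded, those magic numbers are just the standard i386 descriptor layout: 0x67 is the 104-byte TSS limit minus one, and 0x89 in the type byte means Present, DPL 0, 32-bit available TSS. A standalone sketch building the same two words from named pieces (an illustration of the layout, not driver code):

#include <stdint.h>

static void build_tss_entry(uint32_t tss, uint32_t *a, uint32_t *b)
{
	uint32_t limit = 0x67, access = 0x89;

	*a = (limit & 0xFFFF) | (tss << 16);		/* limit 15:0, base 15:0 */
	*b = ((tss >> 16) & 0xFF)			/* base 23:16 */
	     | (access << 8)				/* P=1, DPL=0, type 9 */
	     | (((limit >> 16) & 0xF) << 16)		/* limit 19:16 (zero here) */
	     | (tss & 0xFF000000);			/* base 31:24 */
}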
104 109
105/* This routine is called before the Guest is run for the first time. */ 110/* This routine sets up the initial Guest GDT for booting. All entries start
111 * as 0 (unusable). */
106void setup_guest_gdt(struct lguest *lg) 112void setup_guest_gdt(struct lguest *lg)
107{ 113{
108 /* Start with full 0-4G segments... */ 114 /* Start with full 0-4G segments... */
@@ -114,13 +120,8 @@ void setup_guest_gdt(struct lguest *lg)
114 lg->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13); 120 lg->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13);
115} 121}
116 122
117/* Like the IDT, we never simply use the GDT the Guest gives us. We set up the 123/*H:650 An optimization of copy_gdt(), for just the three "thread-local storage"
118 * GDTs for each CPU, then we copy across the entries each time we want to run 124 * entries. */
119 * a different Guest on that CPU. */
120
121/* A partial GDT load, for the three "thread-local storage" entries. Otherwise
122 * it's just like load_guest_gdt(). So much, in fact, it would probably be
123 * neater to have a single hypercall to cover both. */
124void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt) 125void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt)
125{ 126{
126 unsigned int i; 127 unsigned int i;
@@ -129,7 +130,9 @@ void copy_gdt_tls(const struct lguest *lg, struct desc_struct *gdt)
129 gdt[i] = lg->arch.gdt[i]; 130 gdt[i] = lg->arch.gdt[i];
130} 131}
131 132
132/* This is the full version */ 133/*H:640 When the Guest is run on a different CPU, or the GDT entries have
134 * changed, copy_gdt() is called to copy the Guest's GDT entries across to this
135 * CPU's GDT. */
133void copy_gdt(const struct lguest *lg, struct desc_struct *gdt) 136void copy_gdt(const struct lguest *lg, struct desc_struct *gdt)
134{ 137{
135 unsigned int i; 138 unsigned int i;
@@ -141,7 +144,8 @@ void copy_gdt(const struct lguest *lg, struct desc_struct *gdt)
141 gdt[i] = lg->arch.gdt[i]; 144 gdt[i] = lg->arch.gdt[i];
142} 145}
143 146
144/* This is where the Guest asks us to load a new GDT (LHCALL_LOAD_GDT). */ 147/*H:620 This is where the Guest asks us to load a new GDT (LHCALL_LOAD_GDT).
148 * We copy it from the Guest and tweak the entries. */
145void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num) 149void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num)
146{ 150{
147 /* We assume the Guest has the same number of GDT entries as the 151 /* We assume the Guest has the same number of GDT entries as the
@@ -157,16 +161,22 @@ void load_guest_gdt(struct lguest *lg, unsigned long table, u32 num)
157 lg->changed |= CHANGED_GDT; 161 lg->changed |= CHANGED_GDT;
158} 162}
159 163
164/* This is the fast-track version for just changing the three TLS entries.
165 * Remember that this happens on every context switch, so it's worth
166 * optimizing. But wouldn't it be neater to have a single hypercall to cover
167 * both cases? */
160void guest_load_tls(struct lguest *lg, unsigned long gtls) 168void guest_load_tls(struct lguest *lg, unsigned long gtls)
161{ 169{
162 struct desc_struct *tls = &lg->arch.gdt[GDT_ENTRY_TLS_MIN]; 170 struct desc_struct *tls = &lg->arch.gdt[GDT_ENTRY_TLS_MIN];
163 171
164 __lgread(lg, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES); 172 __lgread(lg, tls, gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES);
165 fixup_gdt_table(lg, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1); 173 fixup_gdt_table(lg, GDT_ENTRY_TLS_MIN, GDT_ENTRY_TLS_MAX+1);
174 /* Note that just the TLS entries have changed. */
166 lg->changed |= CHANGED_GDT_TLS; 175 lg->changed |= CHANGED_GDT_TLS;
167} 176}
177/*:*/
168 178
169/* 179/*H:660
170 * With this, we have finished the Host. 180 * With this, we have finished the Host.
171 * 181 *
172 * Five of the seven parts of our task are complete. You have made it through 182 * Five of the seven parts of our task are complete. You have made it through
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index 09d9207420dc..482aec2a9631 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -63,7 +63,7 @@ static struct lguest_pages *lguest_pages(unsigned int cpu)
63static DEFINE_PER_CPU(struct lguest *, last_guest); 63static DEFINE_PER_CPU(struct lguest *, last_guest);
64 64
65/*S:010 65/*S:010
66 * We are getting close to the Switcher. 66 * We approach the Switcher.
67 * 67 *
68 * Remember that each CPU has two pages which are visible to the Guest when it 68 * Remember that each CPU has two pages which are visible to the Guest when it
69 * runs on that CPU. This has to contain the state for that Guest: we copy the 69 * runs on that CPU. This has to contain the state for that Guest: we copy the
@@ -134,7 +134,7 @@ static void run_guest_once(struct lguest *lg, struct lguest_pages *pages)
134 * 134 *
135 * The lcall also pushes the old code segment (KERNEL_CS) onto the 135 * The lcall also pushes the old code segment (KERNEL_CS) onto the
136 * stack, then the address of this call. This stack layout happens to 136 * stack, then the address of this call. This stack layout happens to
137 * exactly match the stack of an interrupt... */ 137 * exactly match the stack layout created by an interrupt... */
138 asm volatile("pushf; lcall *lguest_entry" 138 asm volatile("pushf; lcall *lguest_entry"
139 /* This is how we tell GCC that %eax ("a") and %ebx ("b") 139 /* This is how we tell GCC that %eax ("a") and %ebx ("b")
140 * are changed by this routine. The "=" means output. */ 140 * are changed by this routine. The "=" means output. */
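Spelled out, the three words now sitting on the Host stack are, reading up from %esp (an illustrative layout, not a type the driver defines):

struct fake_interrupt_frame {
	unsigned long eip;	/* pushed by the lcall: the return address */
	unsigned long cs;	/* pushed by the lcall: KERNEL_CS */
	unsigned long eflags;	/* pushed by the earlier pushf */
};

That is exactly what the CPU pushes for an in-kernel interrupt, which is why deliver_to_host in the Switcher can later jump straight into a Host interrupt handler and let its "iret" land back here.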
@@ -151,40 +151,46 @@ static void run_guest_once(struct lguest *lg, struct lguest_pages *pages)
151} 151}
152/*:*/ 152/*:*/
153 153
154/*M:002 There are hooks in the scheduler which we can register to tell when we
155 * get kicked off the CPU (preempt_notifier_register()). This would allow us
156 * to lazily disable SYSENTER which would regain some performance, and should
157 * also simplify copy_in_guest_info(). Note that we'd still need to restore
158 * things when we exit to Launcher userspace, but that's fairly easy.
159 *
160 * The hooks were designed for KVM, but we can also put them to good use. :*/
161
154/*H:040 This is the i386-specific code to setup and run the Guest. Interrupts 162/*H:040 This is the i386-specific code to setup and run the Guest. Interrupts
155 * are disabled: we own the CPU. */ 163 * are disabled: we own the CPU. */
156void lguest_arch_run_guest(struct lguest *lg) 164void lguest_arch_run_guest(struct lguest *lg)
157{ 165{
158 /* Remember the awfully-named TS bit? If the Guest has asked 166 /* Remember the awfully-named TS bit? If the Guest has asked to set it
159 * to set it we set it now, so we can trap and pass that trap 167 * we set it now, so we can trap and pass that trap to the Guest if it
160 * to the Guest if it uses the FPU. */ 168 * uses the FPU. */
161 if (lg->ts) 169 if (lg->ts)
162 lguest_set_ts(); 170 lguest_set_ts();
163 171
164 /* SYSENTER is an optimized way of doing system calls. We 172 /* SYSENTER is an optimized way of doing system calls. We can't allow
165 * can't allow it because it always jumps to privilege level 0. 173 * it because it always jumps to privilege level 0. A normal Guest
166 * A normal Guest won't try it because we don't advertise it in 174 * won't try it because we don't advertise it in CPUID, but a malicious
167 * CPUID, but a malicious Guest (or malicious Guest userspace 175 * Guest (or malicious Guest userspace program) could, so we tell the
168 * program) could, so we tell the CPU to disable it before 176 * CPU to disable it before running the Guest. */
169 * running the Guest. */
170 if (boot_cpu_has(X86_FEATURE_SEP)) 177 if (boot_cpu_has(X86_FEATURE_SEP))
171 wrmsr(MSR_IA32_SYSENTER_CS, 0, 0); 178 wrmsr(MSR_IA32_SYSENTER_CS, 0, 0);
172 179
173 /* Now we actually run the Guest. It will pop back out when 180 /* Now we actually run the Guest. It will return when something
174 * something interesting happens, and we can examine its 181 * interesting happens, and we can examine its registers to see what it
175 * registers to see what it was doing. */ 182 * was doing. */
176 run_guest_once(lg, lguest_pages(raw_smp_processor_id())); 183 run_guest_once(lg, lguest_pages(raw_smp_processor_id()));
177 184
178 /* The "regs" pointer contains two extra entries which are not 185 /* Note that the "regs" pointer contains two extra entries which are
179 * really registers: a trap number which says what interrupt or 186 * not really registers: a trap number which says what interrupt or
180 * trap made the switcher code come back, and an error code 187 * trap made the switcher code come back, and an error code which some
181 * which some traps set. */ 188 * traps set. */
182 189
183 /* If the Guest page faulted, then the cr2 register will tell 190 /* If the Guest page faulted, then the cr2 register will tell us the
184 * us the bad virtual address. We have to grab this now, 191 * bad virtual address. We have to grab this now, because once we
185 * because once we re-enable interrupts an interrupt could 192 * re-enable interrupts an interrupt could fault and thus overwrite
186 * fault and thus overwrite cr2, or we could even move off to a 193 * cr2, or we could even move off to a different CPU. */
187 * different CPU. */
188 if (lg->regs->trapnum == 14) 194 if (lg->regs->trapnum == 14)
189 lg->arch.last_pagefault = read_cr2(); 195 lg->arch.last_pagefault = read_cr2();
190 /* Similarly, if we took a trap because the Guest used the FPU, 196 /* Similarly, if we took a trap because the Guest used the FPU,
@@ -197,14 +203,15 @@ void lguest_arch_run_guest(struct lguest *lg)
197 wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); 203 wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
198} 204}
199 205
200/*H:130 Our Guest is usually so well behaved; it never tries to do things it 206/*H:130 Now we've examined the hypercall code; our Guest can make requests.
201 * isn't allowed to. Unfortunately, Linux's paravirtual infrastructure isn't 207 * Our Guest is usually so well behaved; it never tries to do things it isn't
202 * quite complete, because it doesn't contain replacements for the Intel I/O 208 * allowed to, and uses hypercalls instead. Unfortunately, Linux's paravirtual
203 * instructions. As a result, the Guest sometimes fumbles across one during 209 * infrastructure isn't quite complete, because it doesn't contain replacements
204 * the boot process as it probes for various things which are usually attached 210 * for the Intel I/O instructions. As a result, the Guest sometimes fumbles
205 * to a PC. 211 * across one during the boot process as it probes for various things which are
212 * usually attached to a PC.
206 * 213 *
207 * When the Guest uses one of these instructions, we get trap #13 (General 214 * When the Guest uses one of these instructions, we get a trap (General
208 * Protection Fault) and come here. We see if it's one of those troublesome 215 * Protection Fault) and come here. We see if it's one of those troublesome
209 * instructions and skip over it. We return true if we did. */ 216 * instructions and skip over it. We return true if we did. */
210static int emulate_insn(struct lguest *lg) 217static int emulate_insn(struct lguest *lg)
@@ -275,43 +282,43 @@ static int emulate_insn(struct lguest *lg)
275void lguest_arch_handle_trap(struct lguest *lg) 282void lguest_arch_handle_trap(struct lguest *lg)
276{ 283{
277 switch (lg->regs->trapnum) { 284 switch (lg->regs->trapnum) {
278 case 13: /* We've intercepted a GPF. */ 285 case 13: /* We've intercepted a General Protection Fault. */
279 /* Check if this was one of those annoying IN or OUT 286 /* Check if this was one of those annoying IN or OUT
280 * instructions which we need to emulate. If so, we 287 * instructions which we need to emulate. If so, we just go
281 * just go back into the Guest after we've done it. */ 288 * back into the Guest after we've done it. */
282 if (lg->regs->errcode == 0) { 289 if (lg->regs->errcode == 0) {
283 if (emulate_insn(lg)) 290 if (emulate_insn(lg))
284 return; 291 return;
285 } 292 }
286 break; 293 break;
287 case 14: /* We've intercepted a page fault. */ 294 case 14: /* We've intercepted a Page Fault. */
288 /* The Guest accessed a virtual address that wasn't 295 /* The Guest accessed a virtual address that wasn't mapped.
289 * mapped. This happens a lot: we don't actually set 296 * This happens a lot: we don't actually set up most of the
290 * up most of the page tables for the Guest at all when 297 * page tables for the Guest at all when we start: as it runs
291 * we start: as it runs it asks for more and more, and 298 * it asks for more and more, and we set them up as
292 * we set them up as required. In this case, we don't 299 * required. In this case, we don't even tell the Guest that
293 * even tell the Guest that the fault happened. 300 * the fault happened.
294 * 301 *
295 * The errcode tells whether this was a read or a 302 * The errcode tells whether this was a read or a write, and
296 * write, and whether kernel or userspace code. */ 303 * whether kernel or userspace code. */
297 if (demand_page(lg, lg->arch.last_pagefault, lg->regs->errcode)) 304 if (demand_page(lg, lg->arch.last_pagefault, lg->regs->errcode))
298 return; 305 return;
299 306
300 /* OK, it's really not there (or not OK): the Guest 307 /* OK, it's really not there (or not OK): the Guest needs to
301 * needs to know. We write out the cr2 value so it 308 * know. We write out the cr2 value so it knows where the
302 * knows where the fault occurred. 309 * fault occurred.
303 * 310 *
304 * Note that if the Guest were really messed up, this 311 * Note that if the Guest were really messed up, this could
305 * could happen before it's done the INITIALIZE 312 * happen before it's done the LHCALL_LGUEST_INIT hypercall, so
306 * hypercall, so lg->lguest_data will be NULL */ 313 * lg->lguest_data could be NULL */
307 if (lg->lguest_data && 314 if (lg->lguest_data &&
308 put_user(lg->arch.last_pagefault, &lg->lguest_data->cr2)) 315 put_user(lg->arch.last_pagefault, &lg->lguest_data->cr2))
309 kill_guest(lg, "Writing cr2"); 316 kill_guest(lg, "Writing cr2");
310 break; 317 break;
311 case 7: /* We've intercepted a Device Not Available fault. */ 318 case 7: /* We've intercepted a Device Not Available fault. */
312 /* If the Guest doesn't want to know, we already 319 /* If the Guest doesn't want to know, we already restored the
313 * restored the Floating Point Unit, so we just 320 * Floating Point Unit, so we just continue without telling
314 * continue without telling it. */ 321 * it. */
315 if (!lg->ts) 322 if (!lg->ts)
316 return; 323 return;
317 break; 324 break;
@@ -536,9 +543,6 @@ int lguest_arch_init_hypercalls(struct lguest *lg)
536 543
537 return 0; 544 return 0;
538} 545}
539/* Now we've examined the hypercall code; our Guest can make requests. There
540 * is one other way we can do things for the Guest, as we see in
541 * emulate_insn(). :*/
542 546
543/*L:030 lguest_arch_setup_regs() 547/*L:030 lguest_arch_setup_regs()
544 * 548 *
@@ -570,8 +574,8 @@ void lguest_arch_setup_regs(struct lguest *lg, unsigned long start)
570 574
571 /* %esi points to our boot information, at physical address 0, so don't 575 /* %esi points to our boot information, at physical address 0, so don't
572 * touch it. */ 576 * touch it. */
577
573 /* There are a couple of GDT entries the Guest expects when first 578 /* There are a couple of GDT entries the Guest expects when first
574 * booting. */ 579 * booting. */
575
576 setup_guest_gdt(lg); 580 setup_guest_gdt(lg);
577} 581}
diff --git a/drivers/lguest/x86/switcher_32.S b/drivers/lguest/x86/switcher_32.S
index 1010b90b11fc..0af8baaa0d4a 100644
--- a/drivers/lguest/x86/switcher_32.S
+++ b/drivers/lguest/x86/switcher_32.S
@@ -6,6 +6,37 @@
6 * are feeling invigorated and refreshed then the next, more challenging stage 6 * are feeling invigorated and refreshed then the next, more challenging stage
7 * can be found in "make Guest". :*/ 7 * can be found in "make Guest". :*/
8 8
9/*M:012 Lguest is meant to be simple: my rule of thumb is that 1% more LOC must
10 * gain at least 1% more performance. Since neither LOC nor performance can be
11 * measured beforehand, it generally means implementing a feature then deciding
12 * if it's worth it. And once it's implemented, who can say no?
13 *
14 * This is why I haven't implemented this idea myself. I want to, but I
15 * haven't. You could, though.
16 *
17 * The main place where lguest performance sucks is Guest page faulting. When
18 * a Guest userspace process hits an unmapped page we switch back to the Host,
19 * walk the page tables, find it's not mapped, switch back to the Guest page
20 * fault handler, which calls a hypercall to set the page table entry, then
21 * finally returns to userspace. That's two round-trips.
22 *
23 * If we had a small walker in the Switcher, we could quickly check the Guest
24 * page table and if the page isn't mapped, immediately reflect the fault back
25 * into the Guest. This means the Switcher would have to know the top of the
26 * Guest page table and the page fault handler address.
27 *
28 * For simplicity, the Guest should only handle the case where the privilege
29 * level of the fault is 3 and probably only not-present or write faults. It
30 * should also detect recursive faults, and hand the original fault to the
31 * Host (which is actually really easy).
32 *
33 * Two questions remain. Would the performance gain outweigh the complexity?
34 * And who would write the verse documenting it? :*/
35
36/*M:011 Lguest64 handles NMI. This gave me NMI envy (until I looked at their
37 * code). It's worth doing though, since it would let us use oprofile in the
38 * Host when a Guest is running. :*/
39
9/*S:100 40/*S:100
10 * Welcome to the Switcher itself! 41 * Welcome to the Switcher itself!
11 * 42 *
@@ -88,7 +119,7 @@ ENTRY(switch_to_guest)
88 119
89 // All saved and there's now five steps before us: 120 // All saved and there's now five steps before us:
90 // Stack, GDT, IDT, TSS 121 // Stack, GDT, IDT, TSS
91 // And last of all the page tables are flipped. 122 // Then last of all the page tables are flipped.
92 123
93 // Yet beware that our stack pointer must be 124 // Yet beware that our stack pointer must be
94 // Always valid lest an NMI hits 125 // Always valid lest an NMI hits
@@ -103,25 +134,25 @@ ENTRY(switch_to_guest)
103 lgdt LGUEST_PAGES_guest_gdt_desc(%eax) 134 lgdt LGUEST_PAGES_guest_gdt_desc(%eax)
104 135
105 // The Guest's IDT we did partially 136 // The Guest's IDT we did partially
106 // Move to the "struct lguest_pages" as well. 137 // Copy to "struct lguest_pages" as well.
107 lidt LGUEST_PAGES_guest_idt_desc(%eax) 138 lidt LGUEST_PAGES_guest_idt_desc(%eax)
108 139
109 // The TSS entry which controls traps 140 // The TSS entry which controls traps
110 // Must be loaded up with "ltr" now: 141 // Must be loaded up with "ltr" now:
142 // The GDT entry that TSS uses
143 // Changes type when we load it: damn Intel!
111 // For after we switch over our page tables 144 // For after we switch over our page tables
112 // It (as the rest) will be writable no more. 145 // That entry will be read-only: we'd crash.
113 // (The GDT entry TSS needs
114 // Changes type when we load it: damn Intel!)
115 movl $(GDT_ENTRY_TSS*8), %edx 146 movl $(GDT_ENTRY_TSS*8), %edx
116 ltr %dx 147 ltr %dx
117 148
118 // Look back now, before we take this last step! 149 // Look back now, before we take this last step!
119 // The Host's TSS entry was also marked used; 150 // The Host's TSS entry was also marked used;
120 // Let's clear it again, ere we return. 151 // Let's clear it again for our return.
121 // The GDT descriptor of the Host 152 // The GDT descriptor of the Host
122 // Points to the table after two "size" bytes 153 // Points to the table after two "size" bytes
123 movl (LGUEST_PAGES_host_gdt_desc+2)(%eax), %edx 154 movl (LGUEST_PAGES_host_gdt_desc+2)(%eax), %edx
124 // Clear the type field of "used" (byte 5, bit 2) 155 // Clear "used" from type field (byte 5, bit 2)
125 andb $0xFD, (GDT_ENTRY_TSS*8 + 5)(%edx) 156 andb $0xFD, (GDT_ENTRY_TSS*8 + 5)(%edx)
126 157
127 // Once our page table's switched, the Guest is live! 158 // Once our page table's switched, the Guest is live!
@@ -131,7 +162,7 @@ ENTRY(switch_to_guest)
131 162
132 // The page table change did one tricky thing: 163 // The page table change did one tricky thing:
133 // The Guest's register page has been mapped 164 // The Guest's register page has been mapped
134 // Writable onto our %esp (stack) -- 165 // Writable under our %esp (stack) --
135 // We can simply pop off all Guest regs. 166 // We can simply pop off all Guest regs.
136 popl %eax 167 popl %eax
137 popl %ebx 168 popl %ebx
@@ -152,16 +183,15 @@ ENTRY(switch_to_guest)
152 addl $8, %esp 183 addl $8, %esp
153 184
154 // The last five stack slots hold return address 185 // The last five stack slots hold return address
155 // And everything needed to change privilege 186 // And everything needed to switch privilege
156 // Into the Guest privilege level of 1, 187 // From Switcher's level 0 to Guest's 1,
157 // And the stack where the Guest had last left it. 188 // And the stack where the Guest had last left it.
158 // Interrupts are turned back on: we are Guest. 189 // Interrupts are turned back on: we are Guest.
159 iret 190 iret
160 191
161// There are two paths where we switch to the Host 192// We tread two paths to switch back to the Host
193// Yet both must save Guest state and restore Host
162// So we put the routine in a macro. 194// So we put the routine in a macro.
163// We are on our way home, back to the Host
164// Interrupted out of the Guest, we come here.
165#define SWITCH_TO_HOST \ 195#define SWITCH_TO_HOST \
166 /* We save the Guest state: all registers first \ 196 /* We save the Guest state: all registers first \
167 * Laid out just as "struct lguest_regs" defines */ \ 197 * Laid out just as "struct lguest_regs" defines */ \
@@ -194,7 +224,7 @@ ENTRY(switch_to_guest)
194 movl %esp, %eax; \ 224 movl %esp, %eax; \
195 andl $(~(1 << PAGE_SHIFT - 1)), %eax; \ 225 andl $(~(1 << PAGE_SHIFT - 1)), %eax; \
196 /* Save our trap number: the switch will obscure it \ 226 /* Save our trap number: the switch will obscure it \
197 * (The Guest regs are not mapped here in the Host) \ 227 * (In the Host the Guest regs are not mapped here) \
198 * %ebx holds it safe for deliver_to_host */ \ 228 * %ebx holds it safe for deliver_to_host */ \
199 movl LGUEST_PAGES_regs_trapnum(%eax), %ebx; \ 229 movl LGUEST_PAGES_regs_trapnum(%eax), %ebx; \
200 /* The Host GDT, IDT and stack! \ 230 /* The Host GDT, IDT and stack! \
@@ -210,9 +240,9 @@ ENTRY(switch_to_guest)
210 /* Switch to Host's GDT, IDT. */ \ 240 /* Switch to Host's GDT, IDT. */ \
211 lgdt LGUEST_PAGES_host_gdt_desc(%eax); \ 241 lgdt LGUEST_PAGES_host_gdt_desc(%eax); \
212 lidt LGUEST_PAGES_host_idt_desc(%eax); \ 242 lidt LGUEST_PAGES_host_idt_desc(%eax); \
213 /* Restore the Host's stack where it's saved regs lie */ \ 243 /* Restore the Host's stack where its saved regs lie */ \
214 movl LGUEST_PAGES_host_sp(%eax), %esp; \ 244 movl LGUEST_PAGES_host_sp(%eax), %esp; \
215 /* Last the TSS: our Host is complete */ \ 245 /* Last the TSS: our Host is returned */ \
216 movl $(GDT_ENTRY_TSS*8), %edx; \ 246 movl $(GDT_ENTRY_TSS*8), %edx; \
217 ltr %dx; \ 247 ltr %dx; \
218 /* Restore now the regs saved right at the first. */ \ 248 /* Restore now the regs saved right at the first. */ \
@@ -222,14 +252,15 @@ ENTRY(switch_to_guest)
222 popl %ds; \ 252 popl %ds; \
223 popl %es 253 popl %es
224 254
225// Here's where we come when the Guest has just trapped: 255// The first path is trod when the Guest has trapped:
226// (Which trap we'll see has been pushed on the stack). 256// (Which trap it was has been pushed on the stack).
227// We need only switch back, and the Host will decode 257// We need only switch back, and the Host will decode
228// Why we came home, and what needs to be done. 258// Why we came home, and what needs to be done.
229return_to_host: 259return_to_host:
230 SWITCH_TO_HOST 260 SWITCH_TO_HOST
231 iret 261 iret
232 262
263// We are led to the second path like so:
233// An interrupt, with some cause external 264// An interrupt, with some cause external
234// Has ajerked us rudely from the Guest's code 265// Has ajerked us rudely from the Guest's code
235// Again we must return home to the Host 266// Again we must return home to the Host
@@ -238,7 +269,7 @@ deliver_to_host:
238 // But now we must go home via that place 269 // But now we must go home via that place
239 // Where that interrupt was supposed to go 270 // Where that interrupt was supposed to go
240 // Had we not been ensconced, running the Guest. 271 // Had we not been ensconced, running the Guest.
241 // Here we see the cleverness of our stack: 272 // Here we see the trickiness of run_guest_once():
242 // The Host stack is formed like an interrupt 273 // The Host stack is formed like an interrupt
243 // With EIP, CS and EFLAGS layered. 274 // With EIP, CS and EFLAGS layered.
244 // Interrupt handlers end with "iret" 275 // Interrupt handlers end with "iret"
@@ -263,7 +294,7 @@ deliver_to_host:
263 xorw %ax, %ax 294 xorw %ax, %ax
264 orl %eax, %edx 295 orl %eax, %edx
265 // Now the address of the handler's in %edx 296 // Now the address of the handler's in %edx
266 // We call it now: its "iret" takes us home. 297 // We call it now: its "iret" drops us home.
267 jmp *%edx 298 jmp *%edx
268 299
269// Every interrupt can come to us here 300// Every interrupt can come to us here
diff --git a/include/asm-x86/lguest_hcall.h b/include/asm-x86/lguest_hcall.h
index f948491eb56a..9c5092b6aa9f 100644
--- a/include/asm-x86/lguest_hcall.h
+++ b/include/asm-x86/lguest_hcall.h
@@ -18,12 +18,17 @@
18#define LHCALL_LOAD_TLS 16 18#define LHCALL_LOAD_TLS 16
19#define LHCALL_NOTIFY 17 19#define LHCALL_NOTIFY 17
20 20
21#define LGUEST_TRAP_ENTRY 0x1F
22
23#ifndef __ASSEMBLY__
24#include <asm/hw_irq.h>
25
21/*G:031 First, how does our Guest contact the Host to ask for privileged 26/*G:031 First, how does our Guest contact the Host to ask for privileged
22 * operations? There are two ways: the direct way is to make a "hypercall", 27 * operations? There are two ways: the direct way is to make a "hypercall",
23 * to make requests of the Host Itself. 28 * to make requests of the Host Itself.
24 * 29 *
25 * Our hypercall mechanism uses the highest unused trap code (traps 32 and 30 * Our hypercall mechanism uses the highest unused trap code (traps 32 and
26 * above are used by real hardware interrupts). Seventeen hypercalls are 31 * above are used by real hardware interrupts). Fifteen hypercalls are
27 * available: the hypercall number is put in the %eax register, and the 32 * available: the hypercall number is put in the %eax register, and the
28 * arguments (when required) are placed in %edx, %ebx and %ecx. If a return 33 * arguments (when required) are placed in %edx, %ebx and %ecx. If a return
29 * value makes sense, it's returned in %eax. 34 * value makes sense, it's returned in %eax.
@@ -31,20 +36,15 @@
31 * Grossly invalid calls result in Sudden Death at the hands of the vengeful 36 * Grossly invalid calls result in Sudden Death at the hands of the vengeful
32 * Host, rather than returning failure. This reflects Winston Churchill's 37 * Host, rather than returning failure. This reflects Winston Churchill's
33 * definition of a gentleman: "someone who is only rude intentionally". */ 38 * definition of a gentleman: "someone who is only rude intentionally". */
34#define LGUEST_TRAP_ENTRY 0x1F
35
36#ifndef __ASSEMBLY__
37#include <asm/hw_irq.h>
38
39static inline unsigned long 39static inline unsigned long
40hcall(unsigned long call, 40hcall(unsigned long call,
41 unsigned long arg1, unsigned long arg2, unsigned long arg3) 41 unsigned long arg1, unsigned long arg2, unsigned long arg3)
42{ 42{
43 /* "int" is the Intel instruction to trigger a trap. */ 43 /* "int" is the Intel instruction to trigger a trap. */
44 asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY) 44 asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
45 /* The call is in %eax (aka "a"), and can be replaced */ 45 /* The call in %eax (aka "a") might be overwritten */
46 : "=a"(call) 46 : "=a"(call)
47 /* The other arguments are in %eax, %edx, %ebx & %ecx */ 47 /* The arguments are in %eax, %edx, %ebx & %ecx */
48 : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3) 48 : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3)
49 /* "memory" means this might write somewhere in memory. 49 /* "memory" means this might write somewhere in memory.
50 * This isn't true for all calls, but it's safe to tell 50 * This isn't true for all calls, but it's safe to tell
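For instance, a Guest that has just set up its three TLS descriptors could ask the Host to load them with something like the sketch below. That LHCALL_LOAD_TLS takes the guest-physical address of those descriptors is this sketch's reading of guest_load_tls() in drivers/lguest/segments.c; the surrounding Guest code is assumed, not shown.

/* Guest-side sketch: assumes this header is included and "gtls_phys" is
 * the guest-physical address of GDT_ENTRY_TLS_ENTRIES descriptors. */
static void load_tls_example(unsigned long gtls_phys)
{
	hcall(LHCALL_LOAD_TLS, gtls_phys, 0, 0);
}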
diff --git a/include/linux/lguest.h b/include/linux/lguest.h
index 8beb29134626..175e63f4a8c0 100644
--- a/include/linux/lguest.h
+++ b/include/linux/lguest.h
@@ -12,8 +12,8 @@
12#define LG_CLOCK_MAX_DELTA ULONG_MAX 12#define LG_CLOCK_MAX_DELTA ULONG_MAX
13 13
14/*G:032 The second method of communicating with the Host is via "struct 14/*G:032 The second method of communicating with the Host is via "struct
15 * lguest_data". The Guest's very first hypercall is to tell the Host where 15 * lguest_data". Once the Guest's initialization hypercall tells the Host where
16 * this is, and then the Guest and Host both publish information in it. :*/ 16 * this is, the Guest and Host both publish information in it. :*/
17struct lguest_data 17struct lguest_data
18{ 18{
19 /* 512 == enabled (same as eflags in normal hardware). The Guest 19 /* 512 == enabled (same as eflags in normal hardware). The Guest
diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h
index c41fd483af34..697104da91f1 100644
--- a/include/linux/lguest_launcher.h
+++ b/include/linux/lguest_launcher.h
@@ -10,7 +10,11 @@
10 * real devices (think of the damage it could do!) we provide virtual devices. 10 * real devices (think of the damage it could do!) we provide virtual devices.
11 * We could emulate a PCI bus with various devices on it, but that is a fairly 11 * We could emulate a PCI bus with various devices on it, but that is a fairly
12 * complex burden for the Host and suboptimal for the Guest, so we have our own 12 * complex burden for the Host and suboptimal for the Guest, so we have our own
13 * "lguest" bus and simple drivers. 13 * simple lguest bus and we use "virtio" drivers. These drivers need a set of
14 * routines from us which will actually do the virtual I/O, but they handle all
15 * the net/block/console stuff themselves. This means that if we want to add
16 * a new device, we simply need to write a new virtio driver and create support
17 * for it in the Launcher: this code won't need to change.
14 * 18 *
15 * Devices are described by a simplified ID, a status byte, and some "config" 19 * Devices are described by a simplified ID, a status byte, and some "config"
16 * bytes which describe this device's configuration. This is placed by the 20 * bytes which describe this device's configuration. This is placed by the