aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation')
-rw-r--r--Documentation/lguest/lguest.c184
1 files changed, 139 insertions, 45 deletions
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index aa66a52b73e9..45163651b519 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -49,7 +49,7 @@
49#include "linux/virtio_ring.h" 49#include "linux/virtio_ring.h"
50#include "asm/bootparam.h" 50#include "asm/bootparam.h"
51/*L:110 51/*L:110
52 * We can ignore the 39 include files we need for this program, but I do want 52 * We can ignore the 42 include files we need for this program, but I do want
53 * to draw attention to the use of kernel-style types. 53 * to draw attention to the use of kernel-style types.
54 * 54 *
55 * As Linus said, "C is a Spartan language, and so should your naming be." I 55 * As Linus said, "C is a Spartan language, and so should your naming be." I
@@ -305,6 +305,11 @@ static void *map_zeroed_pages(unsigned int num)
305 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0); 305 PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0);
306 if (addr == MAP_FAILED) 306 if (addr == MAP_FAILED)
307 err(1, "Mmaping %u pages of /dev/zero", num); 307 err(1, "Mmaping %u pages of /dev/zero", num);
308
309 /*
310 * One neat mmap feature is that you can close the fd, and it
311 * stays mapped.
312 */
308 close(fd); 313 close(fd);
309 314
310 return addr; 315 return addr;
@@ -557,7 +562,7 @@ static void tell_kernel(unsigned long start)
557} 562}
558/*:*/ 563/*:*/
559 564
560/* 565/*L:200
561 * Device Handling. 566 * Device Handling.
562 * 567 *
563 * When the Guest gives us a buffer, it sends an array of addresses and sizes. 568 * When the Guest gives us a buffer, it sends an array of addresses and sizes.
@@ -608,7 +613,10 @@ static unsigned next_desc(struct vring_desc *desc,
608 return next; 613 return next;
609} 614}
610 615
611/* This actually sends the interrupt for this virtqueue */ 616/*
617 * This actually sends the interrupt for this virtqueue, if we've used a
618 * buffer.
619 */
612static void trigger_irq(struct virtqueue *vq) 620static void trigger_irq(struct virtqueue *vq)
613{ 621{
614 unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; 622 unsigned long buf[] = { LHREQ_IRQ, vq->config.irq };
@@ -629,12 +637,12 @@ static void trigger_irq(struct virtqueue *vq)
629} 637}
630 638
631/* 639/*
632 * This looks in the virtqueue and for the first available buffer, and converts 640 * This looks in the virtqueue for the first available buffer, and converts
633 * it to an iovec for convenient access. Since descriptors consist of some 641 * it to an iovec for convenient access. Since descriptors consist of some
634 * number of output then some number of input descriptors, it's actually two 642 * number of output then some number of input descriptors, it's actually two
635 * iovecs, but we pack them into one and note how many of each there were. 643 * iovecs, but we pack them into one and note how many of each there were.
636 * 644 *
637 * This function returns the descriptor number found. 645 * This function waits if necessary, and returns the descriptor number found.
638 */ 646 */
639static unsigned wait_for_vq_desc(struct virtqueue *vq, 647static unsigned wait_for_vq_desc(struct virtqueue *vq,
640 struct iovec iov[], 648 struct iovec iov[],
@@ -644,10 +652,14 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq,
644 struct vring_desc *desc; 652 struct vring_desc *desc;
645 u16 last_avail = lg_last_avail(vq); 653 u16 last_avail = lg_last_avail(vq);
646 654
655 /* There's nothing available? */
647 while (last_avail == vq->vring.avail->idx) { 656 while (last_avail == vq->vring.avail->idx) {
648 u64 event; 657 u64 event;
649 658
650 /* OK, tell Guest about progress up to now. */ 659 /*
660 * Since we're about to sleep, now is a good time to tell the
661 * Guest about what we've used up to now.
662 */
651 trigger_irq(vq); 663 trigger_irq(vq);
652 664
653 /* OK, now we need to know about added descriptors. */ 665 /* OK, now we need to know about added descriptors. */
@@ -734,8 +746,9 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq,
734} 746}
735 747
736/* 748/*
737 * After we've used one of their buffers, we tell them about it. We'll then 749 * After we've used one of their buffers, we tell the Guest about it. Sometime
738 * want to send them an interrupt, using trigger_irq(). 750 * later we'll want to send them an interrupt using trigger_irq(); note that
751 * wait_for_vq_desc() does that for us if it has to wait.
739 */ 752 */
740static void add_used(struct virtqueue *vq, unsigned int head, int len) 753static void add_used(struct virtqueue *vq, unsigned int head, int len)
741{ 754{
@@ -782,12 +795,12 @@ static void console_input(struct virtqueue *vq)
782 struct console_abort *abort = vq->dev->priv; 795 struct console_abort *abort = vq->dev->priv;
783 struct iovec iov[vq->vring.num]; 796 struct iovec iov[vq->vring.num];
784 797
785 /* Make sure there's a descriptor waiting. */ 798 /* Make sure there's a descriptor available. */
786 head = wait_for_vq_desc(vq, iov, &out_num, &in_num); 799 head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
787 if (out_num) 800 if (out_num)
788 errx(1, "Output buffers in console in queue?"); 801 errx(1, "Output buffers in console in queue?");
789 802
790 /* Read it in. */ 803 /* Read into it. This is where we usually wait. */
791 len = readv(STDIN_FILENO, iov, in_num); 804 len = readv(STDIN_FILENO, iov, in_num);
792 if (len <= 0) { 805 if (len <= 0) {
793 /* Ran out of input? */ 806 /* Ran out of input? */
@@ -800,6 +813,7 @@ static void console_input(struct virtqueue *vq)
800 pause(); 813 pause();
801 } 814 }
802 815
816 /* Tell the Guest we used a buffer. */
803 add_used_and_trigger(vq, head, len); 817 add_used_and_trigger(vq, head, len);
804 818
805 /* 819 /*
@@ -834,15 +848,23 @@ static void console_output(struct virtqueue *vq)
834 unsigned int head, out, in; 848 unsigned int head, out, in;
835 struct iovec iov[vq->vring.num]; 849 struct iovec iov[vq->vring.num];
836 850
851 /* We usually wait in here, for the Guest to give us something. */
837 head = wait_for_vq_desc(vq, iov, &out, &in); 852 head = wait_for_vq_desc(vq, iov, &out, &in);
838 if (in) 853 if (in)
839 errx(1, "Input buffers in console output queue?"); 854 errx(1, "Input buffers in console output queue?");
855
856 /* writev can return a partial write, so we loop here. */
840 while (!iov_empty(iov, out)) { 857 while (!iov_empty(iov, out)) {
841 int len = writev(STDOUT_FILENO, iov, out); 858 int len = writev(STDOUT_FILENO, iov, out);
842 if (len <= 0) 859 if (len <= 0)
843 err(1, "Write to stdout gave %i", len); 860 err(1, "Write to stdout gave %i", len);
844 iov_consume(iov, out, len); 861 iov_consume(iov, out, len);
845 } 862 }
863
864 /*
865 * We're finished with that buffer: if we're going to sleep,
866 * wait_for_vq_desc() will prod the Guest with an interrupt.
867 */
846 add_used(vq, head, 0); 868 add_used(vq, head, 0);
847} 869}
848 870
@@ -862,15 +884,30 @@ static void net_output(struct virtqueue *vq)
862 unsigned int head, out, in; 884 unsigned int head, out, in;
863 struct iovec iov[vq->vring.num]; 885 struct iovec iov[vq->vring.num];
864 886
887 /* We usually wait in here for the Guest to give us a packet. */
865 head = wait_for_vq_desc(vq, iov, &out, &in); 888 head = wait_for_vq_desc(vq, iov, &out, &in);
866 if (in) 889 if (in)
867 errx(1, "Input buffers in net output queue?"); 890 errx(1, "Input buffers in net output queue?");
891 /*
892 * Send the whole thing through to /dev/net/tun. It expects the exact
893 * same format: what a coincidence!
894 */
868 if (writev(net_info->tunfd, iov, out) < 0) 895 if (writev(net_info->tunfd, iov, out) < 0)
869 errx(1, "Write to tun failed?"); 896 errx(1, "Write to tun failed?");
897
898 /*
899 * Done with that one; wait_for_vq_desc() will send the interrupt if
900 * all packets are processed.
901 */
870 add_used(vq, head, 0); 902 add_used(vq, head, 0);
871} 903}
872 904
873/* Will reading from this file descriptor block? */ 905/*
906 * Handling network input is a bit trickier, because I've tried to optimize it.
907 *
908 * First we have a helper routine which tells us if reading from this file
909 * descriptor (ie. the /dev/net/tun device) will block:
910 */
874static bool will_block(int fd) 911static bool will_block(int fd)
875{ 912{
876 fd_set fdset; 913 fd_set fdset;
@@ -880,7 +917,11 @@ static bool will_block(int fd)
880 return select(fd+1, &fdset, NULL, NULL, &zero) != 1; 917 return select(fd+1, &fdset, NULL, NULL, &zero) != 1;
881} 918}
882 919
883/* This handles packets coming in from the tun device to our Guest. */ 920/*
921 * This handles packets coming in from the tun device to our Guest. Like all
922 * service routines, it gets called again as soon as it returns, so you don't
923 * see a while(1) loop here.
924 */
884static void net_input(struct virtqueue *vq) 925static void net_input(struct virtqueue *vq)
885{ 926{
886 int len; 927 int len;
@@ -888,21 +929,38 @@ static void net_input(struct virtqueue *vq)
888 struct iovec iov[vq->vring.num]; 929 struct iovec iov[vq->vring.num];
889 struct net_info *net_info = vq->dev->priv; 930 struct net_info *net_info = vq->dev->priv;
890 931
932 /*
933 * Get a descriptor to write an incoming packet into. This will also
934 * send an interrupt if they're out of descriptors.
935 */
891 head = wait_for_vq_desc(vq, iov, &out, &in); 936 head = wait_for_vq_desc(vq, iov, &out, &in);
892 if (out) 937 if (out)
893 errx(1, "Output buffers in net input queue?"); 938 errx(1, "Output buffers in net input queue?");
894 939
895 /* Deliver interrupt now, since we're about to sleep. */ 940 /*
941 * If it looks like we'll block reading from the tun device, send them
942 * an interrupt.
943 */
896 if (vq->pending_used && will_block(net_info->tunfd)) 944 if (vq->pending_used && will_block(net_info->tunfd))
897 trigger_irq(vq); 945 trigger_irq(vq);
898 946
947 /*
948 * Read in the packet. This is where we normally wait (when there's no
949 * incoming network traffic).
950 */
899 len = readv(net_info->tunfd, iov, in); 951 len = readv(net_info->tunfd, iov, in);
900 if (len <= 0) 952 if (len <= 0)
901 err(1, "Failed to read from tun."); 953 err(1, "Failed to read from tun.");
954
955 /*
956 * Mark that packet buffer as used, but don't interrupt here. We want
957 * to wait until we've done as much work as we can.
958 */
902 add_used(vq, head, len); 959 add_used(vq, head, len);
903} 960}
961/*:*/
904 962
905/* This is the helper to create threads. */ 963/* This is the helper to create threads: run the service routine in a loop. */
906static int do_thread(void *_vq) 964static int do_thread(void *_vq)
907{ 965{
908 struct virtqueue *vq = _vq; 966 struct virtqueue *vq = _vq;
@@ -950,11 +1008,14 @@ static void reset_device(struct device *dev)
950 signal(SIGCHLD, (void *)kill_launcher); 1008 signal(SIGCHLD, (void *)kill_launcher);
951} 1009}
952 1010
1011/*L:216
1012 * This actually creates the thread which services the virtqueue for a device.
1013 */
953static void create_thread(struct virtqueue *vq) 1014static void create_thread(struct virtqueue *vq)
954{ 1015{
955 /* 1016 /*
956 * Create stack for thread and run it. Since the stack grows downwards, 1017 * Create stack for thread. Since the stack grows downwards, we point
957 * we point the stack pointer to the end of this region. 1018 * the stack pointer to the end of this region.
958 */ 1019 */
959 char *stack = malloc(32768); 1020 char *stack = malloc(32768);
960 unsigned long args[] = { LHREQ_EVENTFD, 1021 unsigned long args[] = { LHREQ_EVENTFD,
@@ -966,17 +1027,22 @@ static void create_thread(struct virtqueue *vq)
966 err(1, "Creating eventfd"); 1027 err(1, "Creating eventfd");
967 args[2] = vq->eventfd; 1028 args[2] = vq->eventfd;
968 1029
969 /* Attach an eventfd to this virtqueue: it will go off 1030 /*
970 * when the Guest does an LHCALL_NOTIFY for this vq. */ 1031 * Attach an eventfd to this virtqueue: it will go off when the Guest
1032 * does an LHCALL_NOTIFY for this vq.
1033 */
971 if (write(lguest_fd, &args, sizeof(args)) != 0) 1034 if (write(lguest_fd, &args, sizeof(args)) != 0)
972 err(1, "Attaching eventfd"); 1035 err(1, "Attaching eventfd");
973 1036
974 /* CLONE_VM: because it has to access the Guest memory, and 1037 /*
975 * SIGCHLD so we get a signal if it dies. */ 1038 * CLONE_VM: because it has to access the Guest memory, and SIGCHLD so
1039 * we get a signal if it dies.
1040 */
976 vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq); 1041 vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq);
977 if (vq->thread == (pid_t)-1) 1042 if (vq->thread == (pid_t)-1)
978 err(1, "Creating clone"); 1043 err(1, "Creating clone");
979 /* We close our local copy, now the child has it. */ 1044
1045 /* We close our local copy now the child has it. */
980 close(vq->eventfd); 1046 close(vq->eventfd);
981} 1047}
982 1048
@@ -1028,7 +1094,10 @@ static void update_device_status(struct device *dev)
1028 } 1094 }
1029} 1095}
1030 1096
1031/* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */ 1097/*L:215
1098 * This is the generic routine we call when the Guest uses LHCALL_NOTIFY. In
1099 * particular, it's used to notify us of device status changes during boot.
1100 */
1032static void handle_output(unsigned long addr) 1101static void handle_output(unsigned long addr)
1033{ 1102{
1034 struct device *i; 1103 struct device *i;
@@ -1037,18 +1106,32 @@ static void handle_output(unsigned long addr)
1037 for (i = devices.dev; i; i = i->next) { 1106 for (i = devices.dev; i; i = i->next) {
1038 struct virtqueue *vq; 1107 struct virtqueue *vq;
1039 1108
1040 /* Notifications to device descriptors update device status. */ 1109 /*
1110 * Notifications to device descriptors mean they updated the
1111 * device status.
1112 */
1041 if (from_guest_phys(addr) == i->desc) { 1113 if (from_guest_phys(addr) == i->desc) {
1042 update_device_status(i); 1114 update_device_status(i);
1043 return; 1115 return;
1044 } 1116 }
1045 1117
1046 /* Devices *can* be used before status is set to DRIVER_OK. */ 1118 /*
1119 * Devices *can* be used before status is set to DRIVER_OK.
1120 * The original plan was that they would never do this: they
1121 * would always finish setting up their status bits before
1122 * actually touching the virtqueues. In practice, we allowed
1123 * them to, and they do (eg. the disk probes for partition
1124 * tables as part of initialization).
1125 *
1126 * If we see this, we start the device: once it's running, we
1127 * expect the device to catch all the notifications.
1128 */
1047 for (vq = i->vq; vq; vq = vq->next) { 1129 for (vq = i->vq; vq; vq = vq->next) {
1048 if (addr != vq->config.pfn*getpagesize()) 1130 if (addr != vq->config.pfn*getpagesize())
1049 continue; 1131 continue;
1050 if (i->running) 1132 if (i->running)
1051 errx(1, "Notification on running %s", i->name); 1133 errx(1, "Notification on running %s", i->name);
1134 /* This just calls create_thread() for each virtqueue */
1052 start_device(i); 1135 start_device(i);
1053 return; 1136 return;
1054 } 1137 }
@@ -1132,6 +1215,11 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
1132 vq->next = NULL; 1215 vq->next = NULL;
1133 vq->last_avail_idx = 0; 1216 vq->last_avail_idx = 0;
1134 vq->dev = dev; 1217 vq->dev = dev;
1218
1219 /*
1220 * This is the routine the service thread will run, and its Process ID
1221 * once it's running.
1222 */
1135 vq->service = service; 1223 vq->service = service;
1136 vq->thread = (pid_t)-1; 1224 vq->thread = (pid_t)-1;
1137 1225
@@ -1202,7 +1290,8 @@ static void set_config(struct device *dev, unsigned len, const void *conf)
1202 1290
1203/* 1291/*
1204 * This routine does all the creation and setup of a new device, including 1292 * This routine does all the creation and setup of a new device, including
1205 * calling new_dev_desc() to allocate the descriptor and device memory. 1293 * calling new_dev_desc() to allocate the descriptor and device memory. We
1294 * don't actually start the service threads until later.
1206 * 1295 *
1207 * See what I mean about userspace being boring? 1296 * See what I mean about userspace being boring?
1208 */ 1297 */
@@ -1478,19 +1567,7 @@ static void setup_tun_net(char *arg)
1478 verbose("device %u: tun %s: %s\n", 1567 verbose("device %u: tun %s: %s\n",
1479 devices.device_num, tapif, arg); 1568 devices.device_num, tapif, arg);
1480} 1569}
1481 1570/*:*/
1482/*
1483 * Our block (disk) device should be really simple: the Guest asks for a block
1484 * number and we read or write that position in the file. Unfortunately, that
1485 * was amazingly slow: the Guest waits until the read is finished before
1486 * running anything else, even if it could have been doing useful work.
1487 *
1488 * We could use async I/O, except it's reputed to suck so hard that characters
1489 * actually go missing from your code when you try to use it.
1490 *
1491 * So this was one reason why lguest now does all virtqueue servicing in
1492 * separate threads: it's more efficient and more like a real device.
1493 */
1494 1571
1495/* This hangs off device->priv. */ 1572/* This hangs off device->priv. */
1496struct vblk_info 1573struct vblk_info
@@ -1512,8 +1589,16 @@ struct vblk_info
1512/*L:210 1589/*L:210
1513 * The Disk 1590 * The Disk
1514 * 1591 *
1515 * Remember that the block device is handled by a separate I/O thread. We head 1592 * The disk only has one virtqueue, so it only has one thread. It is really
1516 * straight into the core of that thread here: 1593 * simple: the Guest asks for a block number and we read or write that position
1594 * in the file.
1595 *
1596 * Before we serviced each virtqueue in a separate thread, that was unacceptably
1597 * slow: the Guest waits until the read is finished before running anything
1598 * else, even if it could have been doing useful work.
1599 *
1600 * We could have used async I/O, except it's reputed to suck so hard that
1601 * characters actually go missing from your code when you try to use it.
1517 */ 1602 */
1518static void blk_request(struct virtqueue *vq) 1603static void blk_request(struct virtqueue *vq)
1519{ 1604{
@@ -1525,7 +1610,10 @@ static void blk_request(struct virtqueue *vq)
1525 struct iovec iov[vq->vring.num]; 1610 struct iovec iov[vq->vring.num];
1526 off64_t off; 1611 off64_t off;
1527 1612
1528 /* Get the next request. */ 1613 /*
1614 * Get the next request, where we normally wait. It triggers the
1615 * interrupt to acknowledge previously serviced requests (if any).
1616 */
1529 head = wait_for_vq_desc(vq, iov, &out_num, &in_num); 1617 head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
1530 1618
1531 /* 1619 /*
@@ -1539,6 +1627,10 @@ static void blk_request(struct virtqueue *vq)
1539 1627
1540 out = convert(&iov[0], struct virtio_blk_outhdr); 1628 out = convert(&iov[0], struct virtio_blk_outhdr);
1541 in = convert(&iov[out_num+in_num-1], u8); 1629 in = convert(&iov[out_num+in_num-1], u8);
1630 /*
1631 * For historical reasons, block operations are expressed in 512 byte
1632 * "sectors".
1633 */
1542 off = out->sector * 512; 1634 off = out->sector * 512;
1543 1635
1544 /* 1636 /*
@@ -1614,6 +1706,7 @@ static void blk_request(struct virtqueue *vq)
1614 if (out->type & VIRTIO_BLK_T_BARRIER) 1706 if (out->type & VIRTIO_BLK_T_BARRIER)
1615 fdatasync(vblk->fd); 1707 fdatasync(vblk->fd);
1616 1708
1709 /* Finished that request. */
1617 add_used(vq, head, wlen); 1710 add_used(vq, head, wlen);
1618} 1711}
1619 1712
@@ -1682,9 +1775,8 @@ static void rng_input(struct virtqueue *vq)
1682 errx(1, "Output buffers in rng?"); 1775 errx(1, "Output buffers in rng?");
1683 1776
1684 /* 1777 /*
1685 * This is why we convert to iovecs: the readv() call uses them, and so 1778 * Just like the console write, we loop to cover the whole iovec.
1686 * it reads straight into the Guest's buffer. We loop to make sure we 1779 * In this case, short reads actually happen quite a bit.
1687 * fill it.
1688 */ 1780 */
1689 while (!iov_empty(iov, in_num)) { 1781 while (!iov_empty(iov, in_num)) {
1690 len = readv(rng_info->rfd, iov, in_num); 1782 len = readv(rng_info->rfd, iov, in_num);
@@ -1818,7 +1910,9 @@ int main(int argc, char *argv[])
1818 devices.lastdev = NULL; 1910 devices.lastdev = NULL;
1819 devices.next_irq = 1; 1911 devices.next_irq = 1;
1820 1912
1913 /* We're CPU 0. In fact, that's the only CPU possible right now. */
1821 cpu_id = 0; 1914 cpu_id = 0;
1915
1822 /* 1916 /*
1823 * We need to know how much memory so we can set up the device 1917 * We need to know how much memory so we can set up the device
1824 * descriptor and memory pages for the devices as we parse the command 1918 * descriptor and memory pages for the devices as we parse the command
@@ -1926,7 +2020,7 @@ int main(int argc, char *argv[])
1926 */ 2020 */
1927 tell_kernel(start); 2021 tell_kernel(start);
1928 2022
1929 /* Ensure that we terminate if a child dies. */ 2023 /* Ensure that we terminate if a device-servicing child dies. */
1930 signal(SIGCHLD, kill_launcher); 2024 signal(SIGCHLD, kill_launcher);
1931 2025
1932 /* If we exit via err(), this kills all the threads, restores tty. */ 2026 /* If we exit via err(), this kills all the threads, restores tty. */