about | summary | refs | log | tree | commit | diff | stats
path: root/Documentation
diff options
context:
space:
mode:
author Rusty Russell <rusty@rustcorp.com.au> 2009-06-13 00:27:10 -0400
committer Rusty Russell <rusty@rustcorp.com.au> 2009-06-12 08:57:10 -0400
commit 659a0e6633567246edcb7bd400c7e2bece9237d9 (patch)
tree 2ece97564e432b837389e166d9b61773d34b1063 /Documentation
parent df60aeef4f4fe0645d9a195a7689005520422de5 (diff)
lguest: have example Launcher service all devices in separate threads
Currently lguest has three threads: the main Launcher thread, a Waker
thread, and a thread for the block device (because synchronous block
was simply too painful to bear).

The Waker selects() on all the input file descriptors (eg. stdin, net
devices, pipe to the block thread) and when one becomes readable it calls
into the kernel to kick the Launcher thread out into userspace, which
repeats the poll, services the device(s), and then tells the kernel to
release the Waker before re-entering the kernel to run the Guest.

Also, to make a slightly-decent network transmit routine, the Launcher
would suppress further network interrupts while it set a timer: that
signal handler would write to a pipe, which would rouse the Waker
which would prod the Launcher out of the kernel to check the network
device again.

Now we can convert all our virtqueues to separate threads: each one has
a separate eventfd for when the Guest pokes the device, and can trigger
interrupts in the Guest directly.

The linecount shows how much this simplifies, but to really bring it
home, here's an strace analysis of single Guest->Host ping before:

* Guest sends packet, notifies xmit vq, return control to Launcher
* Launcher clears notification flag on xmit ring
* Launcher writes packet to TUN device
    writev(4, [{"\0\0\0\0\0\0\0\0\0\0", 10}, {"\366\r\224`\2058\272m\224vf\274\10\0E\0\0T\0\0@\0@\1\265"..., 98}], 2) = 108
* Launcher sets up interrupt for Guest (xmit ring is empty)
    write(10, "\2\0\0\0\3\0\0\0", 8) = 0
* Launcher sets up timer for interrupt mitigation
    setitimer(ITIMER_REAL, {it_interval={0, 0}, it_value={0, 505}}, NULL) = 0
* Launcher re-runs guest
    pread64(10, 0xbfa5f4d4, 4, 0) ...
* Waker notices reply packet in tun device (it was in select)
    select(12, [0 3 4 6 11], NULL, NULL, NULL) = 1 (in [4])
* Waker kicks Launcher out of guest:
    pwrite64(10, "\3\0\0\0\1\0\0\0", 8, 0) = 0
* Launcher returns from running guest:
    ... = -1 EAGAIN (Resource temporarily unavailable)
* Launcher looks at input fds:
    select(7, [0 3 4 6], NULL, NULL, {0, 0}) = 1 (in [4], left {0, 0})
* Launcher reads pong from tun device:
    readv(4, [{"\0\0\0\0\0\0\0\0\0\0", 10}, {"\272m\224vf\274\366\r\224`\2058\10\0E\0\0T\364\26\0\0@"..., 1518}], 2) = 108
* Launcher injects guest notification:
    write(10, "\2\0\0\0\2\0\0\0", 8) = 0
* Launcher rechecks fds:
    select(7, [0 3 4 6], NULL, NULL, {0, 0}) = 0 (Timeout)
* Launcher clears Waker:
    pwrite64(10, "\3\0\0\0\0\0\0\0", 8, 0) = 0
* Launcher reruns Guest:
    pread64(10, 0xbfa5f4d4, 4, 0) = ? ERESTARTSYS (To be restarted)
* Signal comes in, uses pipe to wake up Launcher:
    --- SIGALRM (Alarm clock) @ 0 (0) ---
    write(8, "\0", 1) = 1
    sigreturn() = ? (mask now [])
* Waker sees write on pipe:
    select(12, [0 3 4 6 11], NULL, NULL, NULL) = 1 (in [6])
* Waker kicks Launcher out of Guest:
    pwrite64(10, "\3\0\0\0\1\0\0\0", 8, 0) = 0
* Launcher exits from kernel:
    pread64(10, 0xbfa5f4d4, 4, 0) = -1 EAGAIN (Resource temporarily unavailable)
* Launcher looks to see what fd woke it:
    select(7, [0 3 4 6], NULL, NULL, {0, 0}) = 1 (in [6], left {0, 0})
* Launcher reads timeout fd, sets notification flag on xmit ring
    read(6, "\0", 32) = 1
* Launcher rechecks fds:
    select(7, [0 3 4 6], NULL, NULL, {0, 0}) = 0 (Timeout)
* Launcher clears Waker:
    pwrite64(10, "\3\0\0\0\0\0\0\0", 8, 0) = 0
* Launcher resumes Guest:
    pread64(10, "\0p\0\4", 4, 0) ....

strace analysis of single Guest->Host ping after:

* Guest sends packet, notifies xmit vq, creates event on eventfd.
* Network xmit thread wakes from read on eventfd:
    read(7, "\1\0\0\0\0\0\0\0", 8) = 8
* Network xmit thread writes packet to TUN device
    writev(4, [{"\0\0\0\0\0\0\0\0\0\0", 10}, {"J\217\232FI\37j\27\375\276\0\304\10\0E\0\0T\0\0@\0@\1\265"..., 98}], 2) = 108
* Network recv thread wakes up from read on tunfd:
    readv(4, [{"\0\0\0\0\0\0\0\0\0\0", 10}, {"j\27\375\276\0\304J\217\232FI\37\10\0E\0\0TiO\0\0@\1\214"..., 1518}], 2) = 108
* Network recv thread sets up interrupt for the Guest
    write(6, "\2\0\0\0\2\0\0\0", 8) = 0
* Network recv thread goes back to reading tunfd
    13:39:42.460285 readv(4, <unfinished ...>
* Network xmit thread sets up interrupt for Guest (xmit ring is empty)
    write(6, "\2\0\0\0\3\0\0\0", 8) = 0
* Network xmit thread goes back to reading from eventfd
    read(7, <unfinished ...>

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Diffstat (limited to 'Documentation')
-rw-r--r-- Documentation/lguest/lguest.c 833
1 files changed, 259 insertions, 574 deletions
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index 02fa524cf4ad..5470b8ed2149 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -16,6 +16,7 @@
16#include <sys/types.h> 16#include <sys/types.h>
17#include <sys/stat.h> 17#include <sys/stat.h>
18#include <sys/wait.h> 18#include <sys/wait.h>
19#include <sys/eventfd.h>
19#include <fcntl.h> 20#include <fcntl.h>
20#include <stdbool.h> 21#include <stdbool.h>
21#include <errno.h> 22#include <errno.h>
@@ -59,7 +60,6 @@ typedef uint8_t u8;
59/*:*/ 60/*:*/
60 61
61#define PAGE_PRESENT 0x7 /* Present, RW, Execute */ 62#define PAGE_PRESENT 0x7 /* Present, RW, Execute */
62#define NET_PEERNUM 1
63#define BRIDGE_PFX "bridge:" 63#define BRIDGE_PFX "bridge:"
64#ifndef SIOCBRADDIF 64#ifndef SIOCBRADDIF
65#define SIOCBRADDIF 0x89a2 /* add interface to bridge */ 65#define SIOCBRADDIF 0x89a2 /* add interface to bridge */
@@ -76,18 +76,10 @@ static bool verbose;
76 do { if (verbose) printf(args); } while(0) 76 do { if (verbose) printf(args); } while(0)
77/*:*/ 77/*:*/
78 78
79/* File descriptors for the Waker. */
80struct {
81 int pipe[2];
82} waker_fds;
83
84/* The pointer to the start of guest memory. */ 79/* The pointer to the start of guest memory. */
85static void *guest_base; 80static void *guest_base;
86/* The maximum guest physical address allowed, and maximum possible. */ 81/* The maximum guest physical address allowed, and maximum possible. */
87static unsigned long guest_limit, guest_max; 82static unsigned long guest_limit, guest_max;
88/* The pipe for signal hander to write to. */
89static int timeoutpipe[2];
90static unsigned int timeout_usec = 500;
91/* The /dev/lguest file descriptor. */ 83/* The /dev/lguest file descriptor. */
92static int lguest_fd; 84static int lguest_fd;
93 85
@@ -97,11 +89,6 @@ static unsigned int __thread cpu_id;
97/* This is our list of devices. */ 89/* This is our list of devices. */
98struct device_list 90struct device_list
99{ 91{
100 /* Summary information about the devices in our list: ready to pass to
101 * select() to ask which need servicing.*/
102 fd_set infds;
103 int max_infd;
104
105 /* Counter to assign interrupt numbers. */ 92 /* Counter to assign interrupt numbers. */
106 unsigned int next_irq; 93 unsigned int next_irq;
107 94
@@ -137,16 +124,11 @@ struct device
137 /* The name of this device, for --verbose. */ 124 /* The name of this device, for --verbose. */
138 const char *name; 125 const char *name;
139 126
140 /* If handle_input is set, it wants to be called when this file
141 * descriptor is ready. */
142 int fd;
143 bool (*handle_input)(struct device *me);
144
145 /* Any queues attached to this device */ 127 /* Any queues attached to this device */
146 struct virtqueue *vq; 128 struct virtqueue *vq;
147 129
148 /* Handle status being finalized (ie. feature bits stable). */ 130 /* Is it operational */
149 void (*ready)(struct device *me); 131 bool running;
150 132
151 /* Device-specific data. */ 133 /* Device-specific data. */
152 void *priv; 134 void *priv;
@@ -169,16 +151,20 @@ struct virtqueue
169 /* Last available index we saw. */ 151 /* Last available index we saw. */
170 u16 last_avail_idx; 152 u16 last_avail_idx;
171 153
172 /* The routine to call when the Guest pings us, or timeout. */ 154 /* Eventfd where Guest notifications arrive. */
173 void (*handle_output)(struct virtqueue *me, bool timeout); 155 int eventfd;
174 156
175 /* Is this blocked awaiting a timer? */ 157 /* Function for the thread which is servicing this virtqueue. */
176 bool blocked; 158 void (*service)(struct virtqueue *vq);
159 pid_t thread;
177}; 160};
178 161
179/* Remember the arguments to the program so we can "reboot" */ 162/* Remember the arguments to the program so we can "reboot" */
180static char **main_args; 163static char **main_args;
181 164
165/* The original tty settings to restore on exit. */
166static struct termios orig_term;
167
182/* We have to be careful with barriers: our devices are all run in separate 168/* We have to be careful with barriers: our devices are all run in separate
183 * threads and so we need to make sure that changes visible to the Guest happen 169 * threads and so we need to make sure that changes visible to the Guest happen
184 * in precise order. */ 170 * in precise order. */
@@ -521,78 +507,6 @@ static void tell_kernel(unsigned long start)
521} 507}
522/*:*/ 508/*:*/
523 509
524static void add_device_fd(int fd)
525{
526 FD_SET(fd, &devices.infds);
527 if (fd > devices.max_infd)
528 devices.max_infd = fd;
529}
530
531/*L:200
532 * The Waker.
533 *
534 * With console, block and network devices, we can have lots of input which we
535 * need to process. We could try to tell the kernel what file descriptors to
536 * watch, but handing a file descriptor mask through to the kernel is fairly
537 * icky.
538 *
539 * Instead, we clone off a thread which watches the file descriptors and writes
540 * the LHREQ_BREAK command to the /dev/lguest file descriptor to tell the Host
541 * stop running the Guest. This causes the Launcher to return from the
542 * /dev/lguest read with -EAGAIN, where it will write to /dev/lguest to reset
543 * the LHREQ_BREAK and wake us up again.
544 *
545 * This, of course, is merely a different *kind* of icky.
546 *
547 * Given my well-known antipathy to threads, I'd prefer to use processes. But
548 * it's easier to share Guest memory with threads, and trivial to share the
549 * devices.infds as the Launcher changes it.
550 */
551static int waker(void *unused)
552{
553 /* Close the write end of the pipe: only the Launcher has it open. */
554 close(waker_fds.pipe[1]);
555
556 for (;;) {
557 fd_set rfds = devices.infds;
558 unsigned long args[] = { LHREQ_BREAK, 1 };
559 unsigned int maxfd = devices.max_infd;
560
561 /* We also listen to the pipe from the Launcher. */
562 FD_SET(waker_fds.pipe[0], &rfds);
563 if (waker_fds.pipe[0] > maxfd)
564 maxfd = waker_fds.pipe[0];
565
566 /* Wait until input is ready from one of the devices. */
567 select(maxfd+1, &rfds, NULL, NULL, NULL);
568
569 /* Message from Launcher? */
570 if (FD_ISSET(waker_fds.pipe[0], &rfds)) {
571 char c;
572 /* If this fails, then assume Launcher has exited.
573 * Don't do anything on exit: we're just a thread! */
574 if (read(waker_fds.pipe[0], &c, 1) != 1)
575 _exit(0);
576 continue;
577 }
578
579 /* Send LHREQ_BREAK command to snap the Launcher out of it. */
580 pwrite(lguest_fd, args, sizeof(args), cpu_id);
581 }
582 return 0;
583}
584
585/* This routine just sets up a pipe to the Waker process. */
586static void setup_waker(void)
587{
588 /* This pipe is closed when Launcher dies, telling Waker. */
589 if (pipe(waker_fds.pipe) != 0)
590 err(1, "Creating pipe for Waker");
591
592 if (clone(waker, malloc(4096) + 4096, CLONE_VM | SIGCHLD, NULL) == -1)
593 err(1, "Creating Waker");
594}
595
596/* 510/*
597 * Device Handling. 511 * Device Handling.
598 * 512 *
@@ -642,25 +556,27 @@ static unsigned next_desc(struct virtqueue *vq, unsigned int i)
642 * number of output then some number of input descriptors, it's actually two 556 * number of output then some number of input descriptors, it's actually two
643 * iovecs, but we pack them into one and note how many of each there were. 557 * iovecs, but we pack them into one and note how many of each there were.
644 * 558 *
645 * This function returns the descriptor number found, or vq->vring.num (which 559 * This function returns the descriptor number found. */
646 * is never a valid descriptor number) if none was found. */ 560static unsigned wait_for_vq_desc(struct virtqueue *vq,
647static unsigned get_vq_desc(struct virtqueue *vq, 561 struct iovec iov[],
648 struct iovec iov[], 562 unsigned int *out_num, unsigned int *in_num)
649 unsigned int *out_num, unsigned int *in_num)
650{ 563{
651 unsigned int i, head; 564 unsigned int i, head;
652 u16 last_avail; 565 u16 last_avail = lg_last_avail(vq);
566
567 while (last_avail == vq->vring.avail->idx) {
568 u64 event;
569
570 /* Nothing new? Wait for eventfd to tell us they refilled. */
571 if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event))
572 errx(1, "Event read failed?");
573 }
653 574
654 /* Check it isn't doing very strange things with descriptor numbers. */ 575 /* Check it isn't doing very strange things with descriptor numbers. */
655 last_avail = lg_last_avail(vq);
656 if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num) 576 if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num)
657 errx(1, "Guest moved used index from %u to %u", 577 errx(1, "Guest moved used index from %u to %u",
658 last_avail, vq->vring.avail->idx); 578 last_avail, vq->vring.avail->idx);
659 579
660 /* If there's nothing new since last we looked, return invalid. */
661 if (vq->vring.avail->idx == last_avail)
662 return vq->vring.num;
663
664 /* Grab the next descriptor number they're advertising, and increment 580 /* Grab the next descriptor number they're advertising, and increment
665 * the index we've seen. */ 581 * the index we've seen. */
666 head = vq->vring.avail->ring[last_avail % vq->vring.num]; 582 head = vq->vring.avail->ring[last_avail % vq->vring.num];
@@ -740,15 +656,7 @@ static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len)
740/* 656/*
741 * The Console 657 * The Console
742 * 658 *
743 * Here is the input terminal setting we save, and the routine to restore them 659 * We associate some data with the console for our exit hack. */
744 * on exit so the user gets their terminal back. */
745static struct termios orig_term;
746static void restore_term(void)
747{
748 tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
749}
750
751/* We associate some data with the console for our exit hack. */
752struct console_abort 660struct console_abort
753{ 661{
754 /* How many times have they hit ^C? */ 662 /* How many times have they hit ^C? */
@@ -758,245 +666,235 @@ struct console_abort
758}; 666};
759 667
760/* This is the routine which handles console input (ie. stdin). */ 668/* This is the routine which handles console input (ie. stdin). */
761static bool handle_console_input(struct device *dev) 669static void console_input(struct virtqueue *vq)
762{ 670{
763 int len; 671 int len;
764 unsigned int head, in_num, out_num; 672 unsigned int head, in_num, out_num;
765 struct iovec iov[dev->vq->vring.num]; 673 struct console_abort *abort = vq->dev->priv;
766 struct console_abort *abort = dev->priv; 674 struct iovec iov[vq->vring.num];
767
768 /* First we need a console buffer from the Guests's input virtqueue. */
769 head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
770
771 /* If they're not ready for input, stop listening to this file
772 * descriptor. We'll start again once they add an input buffer. */
773 if (head == dev->vq->vring.num)
774 return false;
775 675
676 /* Make sure there's a descriptor waiting. */
677 head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
776 if (out_num) 678 if (out_num)
777 errx(1, "Output buffers in console in queue?"); 679 errx(1, "Output buffers in console in queue?");
778 680
779 /* This is why we convert to iovecs: the readv() call uses them, and so 681 /* Read it in. */
780 * it reads straight into the Guest's buffer. */ 682 len = readv(STDIN_FILENO, iov, in_num);
781 len = readv(dev->fd, iov, in_num);
782 if (len <= 0) { 683 if (len <= 0) {
783 /* This implies that the console is closed, is /dev/null, or 684 /* Ran out of input? */
784 * something went terribly wrong. */
785 warnx("Failed to get console input, ignoring console."); 685 warnx("Failed to get console input, ignoring console.");
786 /* Put the input terminal back. */ 686 /* For simplicity, dying threads kill the whole Launcher. So
787 restore_term(); 687 * just nap here. */
788 /* Remove callback from input vq, so it doesn't restart us. */ 688 for (;;)
789 dev->vq->handle_output = NULL; 689 pause();
790 /* Stop listening to this fd: don't call us again. */
791 return false;
792 } 690 }
793 691
794 /* Tell the Guest about the new input. */ 692 add_used_and_trigger(vq, head, len);
795 add_used_and_trigger(dev->vq, head, len);
796 693
797 /* Three ^C within one second? Exit. 694 /* Three ^C within one second? Exit.
798 * 695 *
799 * This is such a hack, but works surprisingly well. Each ^C has to be 696 * This is such a hack, but works surprisingly well. Each ^C has to
800 * in a buffer by itself, so they can't be too fast. But we check that 697 * be in a buffer by itself, so they can't be too fast. But we check
801 * we get three within about a second, so they can't be too slow. */ 698 * that we get three within about a second, so they can't be too
802 if (len == 1 && ((char *)iov[0].iov_base)[0] == 3) { 699 * slow. */
803 if (!abort->count++) 700 if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) {
804 gettimeofday(&abort->start, NULL);
805 else if (abort->count == 3) {
806 struct timeval now;
807 gettimeofday(&now, NULL);
808 if (now.tv_sec <= abort->start.tv_sec+1) {
809 unsigned long args[] = { LHREQ_BREAK, 0 };
810 /* Close the fd so Waker will know it has to
811 * exit. */
812 close(waker_fds.pipe[1]);
813 /* Just in case Waker is blocked in BREAK, send
814 * unbreak now. */
815 write(lguest_fd, args, sizeof(args));
816 exit(2);
817 }
818 abort->count = 0;
819 }
820 } else
821 /* Any other key resets the abort counter. */
822 abort->count = 0; 701 abort->count = 0;
702 return;
703 }
823 704
824 /* Everything went OK! */ 705 abort->count++;
825 return true; 706 if (abort->count == 1)
707 gettimeofday(&abort->start, NULL);
708 else if (abort->count == 3) {
709 struct timeval now;
710 gettimeofday(&now, NULL);
711 /* Kill all Launcher processes with SIGINT, like normal ^C */
712 if (now.tv_sec <= abort->start.tv_sec+1)
713 kill(0, SIGINT);
714 abort->count = 0;
715 }
826} 716}
827 717
828/* Handling output for console is simple: we just get all the output buffers 718/* This is the routine which handles console output (ie. stdout). */
829 * and write them to stdout. */ 719static void console_output(struct virtqueue *vq)
830static void handle_console_output(struct virtqueue *vq, bool timeout)
831{ 720{
832 unsigned int head, out, in; 721 unsigned int head, out, in;
833 struct iovec iov[vq->vring.num]; 722 struct iovec iov[vq->vring.num];
834 723
835 /* Keep getting output buffers from the Guest until we run out. */ 724 head = wait_for_vq_desc(vq, iov, &out, &in);
836 while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) { 725 if (in)
837 if (in) 726 errx(1, "Input buffers in console output queue?");
838 errx(1, "Input buffers in output queue?"); 727 while (!iov_empty(iov, out)) {
839 while (!iov_empty(iov, out)) { 728 int len = writev(STDOUT_FILENO, iov, out);
840 int len = writev(STDOUT_FILENO, iov, out); 729 if (len <= 0)
841 if (len <= 0) 730 err(1, "Write to stdout gave %i", len);
842 err(1, "Write to stdout gave %i", len); 731 iov_consume(iov, out, len);
843 iov_consume(iov, out, len);
844 }
845 add_used_and_trigger(vq, head, 0);
846 } 732 }
847} 733 add_used_and_trigger(vq, head, 0);
848
849/* This is called when we no longer want to hear about Guest changes to a
850 * virtqueue. This is more efficient in high-traffic cases, but it means we
851 * have to set a timer to check if any more changes have occurred. */
852static void block_vq(struct virtqueue *vq)
853{
854 struct itimerval itm;
855
856 vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
857 vq->blocked = true;
858
859 itm.it_interval.tv_sec = 0;
860 itm.it_interval.tv_usec = 0;
861 itm.it_value.tv_sec = 0;
862 itm.it_value.tv_usec = timeout_usec;
863
864 setitimer(ITIMER_REAL, &itm, NULL);
865} 734}
866 735
867/* 736/*
868 * The Network 737 * The Network
869 * 738 *
870 * Handling output for network is also simple: we get all the output buffers 739 * Handling output for network is also simple: we get all the output buffers
871 * and write them (ignoring the first element) to this device's file descriptor 740 * and write them to /dev/net/tun.
872 * (/dev/net/tun).
873 */ 741 */
874static void handle_net_output(struct virtqueue *vq, bool timeout) 742struct net_info {
743 int tunfd;
744};
745
746static void net_output(struct virtqueue *vq)
875{ 747{
876 unsigned int head, out, in, num = 0; 748 struct net_info *net_info = vq->dev->priv;
749 unsigned int head, out, in;
877 struct iovec iov[vq->vring.num]; 750 struct iovec iov[vq->vring.num];
878 static int last_timeout_num;
879
880 /* Keep getting output buffers from the Guest until we run out. */
881 while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) {
882 if (in)
883 errx(1, "Input buffers in output queue?");
884 if (writev(vq->dev->fd, iov, out) < 0)
885 err(1, "Writing network packet to tun");
886 add_used_and_trigger(vq, head, 0);
887 num++;
888 }
889 751
890 /* Block further kicks and set up a timer if we saw anything. */ 752 head = wait_for_vq_desc(vq, iov, &out, &in);
891 if (!timeout && num) 753 if (in)
892 block_vq(vq); 754 errx(1, "Input buffers in net output queue?");
893 755 if (writev(net_info->tunfd, iov, out) < 0)
894 /* We never quite know how long should we wait before we check the 756 errx(1, "Write to tun failed?");
895 * queue again for more packets. We start at 500 microseconds, and if 757 add_used_and_trigger(vq, head, 0);
896 * we get fewer packets than last time, we assume we made the timeout
897 * too small and increase it by 10 microseconds. Otherwise, we drop it
898 * by one microsecond every time. It seems to work well enough. */
899 if (timeout) {
900 if (num < last_timeout_num)
901 timeout_usec += 10;
902 else if (timeout_usec > 1)
903 timeout_usec--;
904 last_timeout_num = num;
905 }
906} 758}
907 759
908/* This is where we handle a packet coming in from the tun device to our 760/* This is where we handle packets coming in from the tun device to our
909 * Guest. */ 761 * Guest. */
910static bool handle_tun_input(struct device *dev) 762static void net_input(struct virtqueue *vq)
911{ 763{
912 unsigned int head, in_num, out_num;
913 int len; 764 int len;
914 struct iovec iov[dev->vq->vring.num]; 765 unsigned int head, out, in;
915 766 struct iovec iov[vq->vring.num];
916 /* First we need a network buffer from the Guests's recv virtqueue. */ 767 struct net_info *net_info = vq->dev->priv;
917 head = get_vq_desc(dev->vq, iov, &out_num, &in_num); 768
918 if (head == dev->vq->vring.num) { 769 head = wait_for_vq_desc(vq, iov, &out, &in);
919 /* Now, it's expected that if we try to send a packet too 770 if (out)
920 * early, the Guest won't be ready yet. Wait until the device 771 errx(1, "Output buffers in net input queue?");
921 * status says it's ready. */ 772 len = readv(net_info->tunfd, iov, in);
922 /* FIXME: Actually want DRIVER_ACTIVE here. */
923
924 /* Now tell it we want to know if new things appear. */
925 dev->vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
926 wmb();
927
928 /* We'll turn this back on if input buffers are registered. */
929 return false;
930 } else if (out_num)
931 errx(1, "Output buffers in network recv queue?");
932
933 /* Read the packet from the device directly into the Guest's buffer. */
934 len = readv(dev->fd, iov, in_num);
935 if (len <= 0) 773 if (len <= 0)
936 err(1, "reading network"); 774 err(1, "Failed to read from tun.");
775 add_used_and_trigger(vq, head, len);
776}
937 777
938 /* Tell the Guest about the new packet. */ 778/* This is the helper to create threads. */
939 add_used_and_trigger(dev->vq, head, len); 779static int do_thread(void *_vq)
780{
781 struct virtqueue *vq = _vq;
940 782
941 verbose("tun input packet len %i [%02x %02x] (%s)\n", len, 783 for (;;)
942 ((u8 *)iov[1].iov_base)[0], ((u8 *)iov[1].iov_base)[1], 784 vq->service(vq);
943 head != dev->vq->vring.num ? "sent" : "discarded"); 785 return 0;
786}
944 787
945 /* All good. */ 788/* When a child dies, we kill our entire process group with SIGTERM. This
946 return true; 789 * also has the side effect that the shell restores the console for us! */
790static void kill_launcher(int signal)
791{
792 kill(0, SIGTERM);
947} 793}
948 794
949/*L:215 This is the callback attached to the network and console input 795static void reset_device(struct device *dev)
950 * virtqueues: it ensures we try again, in case we stopped console or net
951 * delivery because Guest didn't have any buffers. */
952static void enable_fd(struct virtqueue *vq, bool timeout)
953{ 796{
954 add_device_fd(vq->dev->fd); 797 struct virtqueue *vq;
955 /* Snap the Waker out of its select loop. */ 798
956 write(waker_fds.pipe[1], "", 1); 799 verbose("Resetting device %s\n", dev->name);
800
801 /* Clear any features they've acked. */
802 memset(get_feature_bits(dev) + dev->feature_len, 0, dev->feature_len);
803
804 /* We're going to be explicitly killing threads, so ignore them. */
805 signal(SIGCHLD, SIG_IGN);
806
807 /* Zero out the virtqueues, get rid of their threads */
808 for (vq = dev->vq; vq; vq = vq->next) {
809 if (vq->thread != (pid_t)-1) {
810 kill(vq->thread, SIGTERM);
811 waitpid(vq->thread, NULL, 0);
812 vq->thread = (pid_t)-1;
813 }
814 memset(vq->vring.desc, 0,
815 vring_size(vq->config.num, LGUEST_VRING_ALIGN));
816 lg_last_avail(vq) = 0;
817 }
818 dev->running = false;
819
820 /* Now we care if threads die. */
821 signal(SIGCHLD, (void *)kill_launcher);
957} 822}
958 823
959static void net_enable_fd(struct virtqueue *vq, bool timeout) 824static void create_thread(struct virtqueue *vq)
960{ 825{
961 /* We don't need to know again when Guest refills receive buffer. */ 826 /* Create stack for thread and run it. Since stack grows
962 vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; 827 * upwards, we point the stack pointer to the end of this
963 enable_fd(vq, timeout); 828 * region. */
829 char *stack = malloc(32768);
830 unsigned long args[] = { LHREQ_EVENTFD,
831 vq->config.pfn*getpagesize(), 0 };
832
833 /* Create a zero-initialized eventfd. */
834 vq->eventfd = eventfd(0, 0);
835 if (vq->eventfd < 0)
836 err(1, "Creating eventfd");
837 args[2] = vq->eventfd;
838
839 /* Attach an eventfd to this virtqueue: it will go off
840 * when the Guest does an LHCALL_NOTIFY for this vq. */
841 if (write(lguest_fd, &args, sizeof(args)) != 0)
842 err(1, "Attaching eventfd");
843
844 /* CLONE_VM: because it has to access the Guest memory, and
845 * SIGCHLD so we get a signal if it dies. */
846 vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq);
847 if (vq->thread == (pid_t)-1)
848 err(1, "Creating clone");
849 /* We close our local copy, now the child has it. */
850 close(vq->eventfd);
964} 851}
965 852
966/* When the Guest tells us they updated the status field, we handle it. */ 853static void start_device(struct device *dev)
967static void update_device_status(struct device *dev)
968{ 854{
855 unsigned int i;
969 struct virtqueue *vq; 856 struct virtqueue *vq;
970 857
971 /* This is a reset. */ 858 verbose("Device %s OK: offered", dev->name);
972 if (dev->desc->status == 0) { 859 for (i = 0; i < dev->feature_len; i++)
973 verbose("Resetting device %s\n", dev->name); 860 verbose(" %02x", get_feature_bits(dev)[i]);
861 verbose(", accepted");
862 for (i = 0; i < dev->feature_len; i++)
863 verbose(" %02x", get_feature_bits(dev)
864 [dev->feature_len+i]);
865
866 for (vq = dev->vq; vq; vq = vq->next) {
867 if (vq->service)
868 create_thread(vq);
869 }
870 dev->running = true;
871}
872
873static void cleanup_devices(void)
874{
875 struct device *dev;
876
877 for (dev = devices.dev; dev; dev = dev->next)
878 reset_device(dev);
974 879
975 /* Clear any features they've acked. */ 880 /* If we saved off the original terminal settings, restore them now. */
976 memset(get_feature_bits(dev) + dev->feature_len, 0, 881 if (orig_term.c_lflag & (ISIG|ICANON|ECHO))
977 dev->feature_len); 882 tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
883}
978 884
979 /* Zero out the virtqueues. */ 885/* When the Guest tells us they updated the status field, we handle it. */
980 for (vq = dev->vq; vq; vq = vq->next) { 886static void update_device_status(struct device *dev)
981 memset(vq->vring.desc, 0, 887{
982 vring_size(vq->config.num, LGUEST_VRING_ALIGN)); 888 /* A zero status is a reset, otherwise it's a set of flags. */
983 lg_last_avail(vq) = 0; 889 if (dev->desc->status == 0)
984 } 890 reset_device(dev);
985 } else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) { 891 else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) {
986 warnx("Device %s configuration FAILED", dev->name); 892 warnx("Device %s configuration FAILED", dev->name);
893 if (dev->running)
894 reset_device(dev);
987 } else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) { 895 } else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) {
988 unsigned int i; 896 if (!dev->running)
989 897 start_device(dev);
990 verbose("Device %s OK: offered", dev->name);
991 for (i = 0; i < dev->feature_len; i++)
992 verbose(" %02x", get_feature_bits(dev)[i]);
993 verbose(", accepted");
994 for (i = 0; i < dev->feature_len; i++)
995 verbose(" %02x", get_feature_bits(dev)
996 [dev->feature_len+i]);
997
998 if (dev->ready)
999 dev->ready(dev);
1000 } 898 }
1001} 899}
1002 900
@@ -1004,32 +902,24 @@ static void update_device_status(struct device *dev)
1004static void handle_output(unsigned long addr) 902static void handle_output(unsigned long addr)
1005{ 903{
1006 struct device *i; 904 struct device *i;
1007 struct virtqueue *vq;
1008 905
1009 /* Check each device and virtqueue. */ 906 /* Check each device. */
1010 for (i = devices.dev; i; i = i->next) { 907 for (i = devices.dev; i; i = i->next) {
908 struct virtqueue *vq;
909
1011 /* Notifications to device descriptors update device status. */ 910 /* Notifications to device descriptors update device status. */
1012 if (from_guest_phys(addr) == i->desc) { 911 if (from_guest_phys(addr) == i->desc) {
1013 update_device_status(i); 912 update_device_status(i);
1014 return; 913 return;
1015 } 914 }
1016 915
1017 /* Notifications to virtqueues mean output has occurred. */ 916 /* Devices *can* be used before status is set to DRIVER_OK. */
1018 for (vq = i->vq; vq; vq = vq->next) { 917 for (vq = i->vq; vq; vq = vq->next) {
1019 if (vq->config.pfn != addr/getpagesize()) 918 if (addr != vq->config.pfn*getpagesize())
1020 continue; 919 continue;
1021 920 if (i->running)
1022 /* Guest should acknowledge (and set features!) before 921 errx(1, "Notification on running %s", i->name);
1023 * using the device. */ 922 start_device(i);
1024 if (i->desc->status == 0) {
1025 warnx("%s gave early output", i->name);
1026 return;
1027 }
1028
1029 if (strcmp(vq->dev->name, "console") != 0)
1030 verbose("Output to %s\n", vq->dev->name);
1031 if (vq->handle_output)
1032 vq->handle_output(vq, false);
1033 return; 923 return;
1034 } 924 }
1035 } 925 }
@@ -1043,71 +933,6 @@ static void handle_output(unsigned long addr)
1043 strnlen(from_guest_phys(addr), guest_limit - addr)); 933 strnlen(from_guest_phys(addr), guest_limit - addr));
1044} 934}
1045 935
1046static void handle_timeout(void)
1047{
1048 char buf[32];
1049 struct device *i;
1050 struct virtqueue *vq;
1051
1052 /* Clear the pipe */
1053 read(timeoutpipe[0], buf, sizeof(buf));
1054
1055 /* Check each device and virtqueue: flush blocked ones. */
1056 for (i = devices.dev; i; i = i->next) {
1057 for (vq = i->vq; vq; vq = vq->next) {
1058 if (!vq->blocked)
1059 continue;
1060
1061 vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
1062 vq->blocked = false;
1063 if (vq->handle_output)
1064 vq->handle_output(vq, true);
1065 }
1066 }
1067}
1068
1069/* This is called when the Waker wakes us up: check for incoming file
1070 * descriptors. */
1071static void handle_input(void)
1072{
1073 /* select() wants a zeroed timeval to mean "don't wait". */
1074 struct timeval poll = { .tv_sec = 0, .tv_usec = 0 };
1075
1076 for (;;) {
1077 struct device *i;
1078 fd_set fds = devices.infds;
1079 int num;
1080
1081 num = select(devices.max_infd+1, &fds, NULL, NULL, &poll);
1082 /* Could get interrupted */
1083 if (num < 0)
1084 continue;
1085 /* If nothing is ready, we're done. */
1086 if (num == 0)
1087 break;
1088
1089 /* Otherwise, call the device(s) which have readable file
1090 * descriptors and a method of handling them. */
1091 for (i = devices.dev; i; i = i->next) {
1092 if (i->handle_input && FD_ISSET(i->fd, &fds)) {
1093 if (i->handle_input(i))
1094 continue;
1095
1096 /* If handle_input() returns false, it means we
1097 * should no longer service it. Networking and
1098 * console do this when there's no input
1099 * buffers to deliver into. Console also uses
1100 * it when it discovers that stdin is closed. */
1101 FD_CLR(i->fd, &devices.infds);
1102 }
1103 }
1104
1105 /* Is this the timeout fd? */
1106 if (FD_ISSET(timeoutpipe[0], &fds))
1107 handle_timeout();
1108 }
1109}
1110
1111/*L:190 936/*L:190
1112 * Device Setup 937 * Device Setup
1113 * 938 *
@@ -1153,7 +978,7 @@ static struct lguest_device_desc *new_dev_desc(u16 type)
1153/* Each device descriptor is followed by the description of its virtqueues. We 978/* Each device descriptor is followed by the description of its virtqueues. We
1154 * specify how many descriptors the virtqueue is to have. */ 979 * specify how many descriptors the virtqueue is to have. */
1155static void add_virtqueue(struct device *dev, unsigned int num_descs, 980static void add_virtqueue(struct device *dev, unsigned int num_descs,
1156 void (*handle_output)(struct virtqueue *, bool)) 981 void (*service)(struct virtqueue *))
1157{ 982{
1158 unsigned int pages; 983 unsigned int pages;
1159 struct virtqueue **i, *vq = malloc(sizeof(*vq)); 984 struct virtqueue **i, *vq = malloc(sizeof(*vq));
@@ -1168,7 +993,8 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
1168 vq->next = NULL; 993 vq->next = NULL;
1169 vq->last_avail_idx = 0; 994 vq->last_avail_idx = 0;
1170 vq->dev = dev; 995 vq->dev = dev;
1171 vq->blocked = false; 996 vq->service = service;
997 vq->thread = (pid_t)-1;
1172 998
1173 /* Initialize the configuration. */ 999 /* Initialize the configuration. */
1174 vq->config.num = num_descs; 1000 vq->config.num = num_descs;
@@ -1193,15 +1019,6 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
1193 * second. */ 1019 * second. */
1194 for (i = &dev->vq; *i; i = &(*i)->next); 1020 for (i = &dev->vq; *i; i = &(*i)->next);
1195 *i = vq; 1021 *i = vq;
1196
1197 /* Set the routine to call when the Guest does something to this
1198 * virtqueue. */
1199 vq->handle_output = handle_output;
1200
1201 /* As an optimization, set the advisory "Don't Notify Me" flag if we
1202 * don't have a handler */
1203 if (!handle_output)
1204 vq->vring.used->flags = VRING_USED_F_NO_NOTIFY;
1205} 1022}
1206 1023
1207/* The first half of the feature bitmask is for us to advertise features. The 1024/* The first half of the feature bitmask is for us to advertise features. The
@@ -1237,24 +1054,17 @@ static void set_config(struct device *dev, unsigned len, const void *conf)
1237 * calling new_dev_desc() to allocate the descriptor and device memory. 1054 * calling new_dev_desc() to allocate the descriptor and device memory.
1238 * 1055 *
1239 * See what I mean about userspace being boring? */ 1056 * See what I mean about userspace being boring? */
1240static struct device *new_device(const char *name, u16 type, int fd, 1057static struct device *new_device(const char *name, u16 type)
1241 bool (*handle_input)(struct device *))
1242{ 1058{
1243 struct device *dev = malloc(sizeof(*dev)); 1059 struct device *dev = malloc(sizeof(*dev));
1244 1060
1245 /* Now we populate the fields one at a time. */ 1061 /* Now we populate the fields one at a time. */
1246 dev->fd = fd;
1247 /* If we have an input handler for this file descriptor, then we add it
1248 * to the device_list's fdset and maxfd. */
1249 if (handle_input)
1250 add_device_fd(dev->fd);
1251 dev->desc = new_dev_desc(type); 1062 dev->desc = new_dev_desc(type);
1252 dev->handle_input = handle_input;
1253 dev->name = name; 1063 dev->name = name;
1254 dev->vq = NULL; 1064 dev->vq = NULL;
1255 dev->ready = NULL;
1256 dev->feature_len = 0; 1065 dev->feature_len = 0;
1257 dev->num_vq = 0; 1066 dev->num_vq = 0;
1067 dev->running = false;
1258 1068
1259 /* Append to device list. Prepending to a single-linked list is 1069 /* Append to device list. Prepending to a single-linked list is
1260 * easier, but the user expects the devices to be arranged on the bus 1070 * easier, but the user expects the devices to be arranged on the bus
@@ -1282,13 +1092,10 @@ static void setup_console(void)
1282 * raw input stream to the Guest. */ 1092 * raw input stream to the Guest. */
1283 term.c_lflag &= ~(ISIG|ICANON|ECHO); 1093 term.c_lflag &= ~(ISIG|ICANON|ECHO);
1284 tcsetattr(STDIN_FILENO, TCSANOW, &term); 1094 tcsetattr(STDIN_FILENO, TCSANOW, &term);
1285 /* If we exit gracefully, the original settings will be
1286 * restored so the user can see what they're typing. */
1287 atexit(restore_term);
1288 } 1095 }
1289 1096
1290 dev = new_device("console", VIRTIO_ID_CONSOLE, 1097 dev = new_device("console", VIRTIO_ID_CONSOLE);
1291 STDIN_FILENO, handle_console_input); 1098
1292 /* We store the console state in dev->priv, and initialize it. */ 1099 /* We store the console state in dev->priv, and initialize it. */
1293 dev->priv = malloc(sizeof(struct console_abort)); 1100 dev->priv = malloc(sizeof(struct console_abort));
1294 ((struct console_abort *)dev->priv)->count = 0; 1101 ((struct console_abort *)dev->priv)->count = 0;
@@ -1297,31 +1104,13 @@ static void setup_console(void)
1297 * they put something the input queue, we make sure we're listening to 1104 * they put something the input queue, we make sure we're listening to
1298 * stdin. When they put something in the output queue, we write it to 1105 * stdin. When they put something in the output queue, we write it to
1299 * stdout. */ 1106 * stdout. */
1300 add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd); 1107 add_virtqueue(dev, VIRTQUEUE_NUM, console_input);
1301 add_virtqueue(dev, VIRTQUEUE_NUM, handle_console_output); 1108 add_virtqueue(dev, VIRTQUEUE_NUM, console_output);
1302 1109
1303 verbose("device %u: console\n", devices.device_num++); 1110 verbose("device %u: console\n", ++devices.device_num);
1304} 1111}
1305/*:*/ 1112/*:*/
1306 1113
1307static void timeout_alarm(int sig)
1308{
1309 write(timeoutpipe[1], "", 1);
1310}
1311
1312static void setup_timeout(void)
1313{
1314 if (pipe(timeoutpipe) != 0)
1315 err(1, "Creating timeout pipe");
1316
1317 if (fcntl(timeoutpipe[1], F_SETFL,
1318 fcntl(timeoutpipe[1], F_GETFL) | O_NONBLOCK) != 0)
1319 err(1, "Making timeout pipe nonblocking");
1320
1321 add_device_fd(timeoutpipe[0]);
1322 signal(SIGALRM, timeout_alarm);
1323}
1324
1325/*M:010 Inter-guest networking is an interesting area. Simplest is to have a 1114/*M:010 Inter-guest networking is an interesting area. Simplest is to have a
1326 * --sharenet=<name> option which opens or creates a named pipe. This can be 1115 * --sharenet=<name> option which opens or creates a named pipe. This can be
1327 * used to send packets to another guest in a 1:1 manner. 1116 * used to send packets to another guest in a 1:1 manner.
@@ -1443,21 +1232,23 @@ static int get_tun_device(char tapif[IFNAMSIZ])
1443static void setup_tun_net(char *arg) 1232static void setup_tun_net(char *arg)
1444{ 1233{
1445 struct device *dev; 1234 struct device *dev;
1446 int netfd, ipfd; 1235 struct net_info *net_info = malloc(sizeof(*net_info));
1236 int ipfd;
1447 u32 ip = INADDR_ANY; 1237 u32 ip = INADDR_ANY;
1448 bool bridging = false; 1238 bool bridging = false;
1449 char tapif[IFNAMSIZ], *p; 1239 char tapif[IFNAMSIZ], *p;
1450 struct virtio_net_config conf; 1240 struct virtio_net_config conf;
1451 1241
1452 netfd = get_tun_device(tapif); 1242 net_info->tunfd = get_tun_device(tapif);
1453 1243
1454 /* First we create a new network device. */ 1244 /* First we create a new network device. */
1455 dev = new_device("net", VIRTIO_ID_NET, netfd, handle_tun_input); 1245 dev = new_device("net", VIRTIO_ID_NET);
1246 dev->priv = net_info;
1456 1247
1457 /* Network devices need a receive and a send queue, just like 1248 /* Network devices need a receive and a send queue, just like
1458 * console. */ 1249 * console. */
1459 add_virtqueue(dev, VIRTQUEUE_NUM, net_enable_fd); 1250 add_virtqueue(dev, VIRTQUEUE_NUM, net_input);
1460 add_virtqueue(dev, VIRTQUEUE_NUM, handle_net_output); 1251 add_virtqueue(dev, VIRTQUEUE_NUM, net_output);
1461 1252
1462 /* We need a socket to perform the magic network ioctls to bring up the 1253 /* We need a socket to perform the magic network ioctls to bring up the
1463 * tap interface, connect to the bridge etc. Any socket will do! */ 1254 * tap interface, connect to the bridge etc. Any socket will do! */
@@ -1546,20 +1337,18 @@ struct vblk_info
1546 * Remember that the block device is handled by a separate I/O thread. We head 1337 * Remember that the block device is handled by a separate I/O thread. We head
1547 * straight into the core of that thread here: 1338 * straight into the core of that thread here:
1548 */ 1339 */
1549static bool service_io(struct device *dev) 1340static void blk_request(struct virtqueue *vq)
1550{ 1341{
1551 struct vblk_info *vblk = dev->priv; 1342 struct vblk_info *vblk = vq->dev->priv;
1552 unsigned int head, out_num, in_num, wlen; 1343 unsigned int head, out_num, in_num, wlen;
1553 int ret; 1344 int ret;
1554 u8 *in; 1345 u8 *in;
1555 struct virtio_blk_outhdr *out; 1346 struct virtio_blk_outhdr *out;
1556 struct iovec iov[dev->vq->vring.num]; 1347 struct iovec iov[vq->vring.num];
1557 off64_t off; 1348 off64_t off;
1558 1349
1559 /* See if there's a request waiting. If not, nothing to do. */ 1350 /* Get the next request. */
1560 head = get_vq_desc(dev->vq, iov, &out_num, &in_num); 1351 head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
1561 if (head == dev->vq->vring.num)
1562 return false;
1563 1352
1564 /* Every block request should contain at least one output buffer 1353 /* Every block request should contain at least one output buffer
1565 * (detailing the location on disk and the type of request) and one 1354 * (detailing the location on disk and the type of request) and one
@@ -1633,83 +1422,21 @@ static bool service_io(struct device *dev)
1633 if (out->type & VIRTIO_BLK_T_BARRIER) 1422 if (out->type & VIRTIO_BLK_T_BARRIER)
1634 fdatasync(vblk->fd); 1423 fdatasync(vblk->fd);
1635 1424
1636 /* We can't trigger an IRQ, because we're not the Launcher. It does 1425 add_used_and_trigger(vq, head, wlen);
1637 * that when we tell it we're done. */
1638 add_used(dev->vq, head, wlen);
1639 return true;
1640}
1641
1642/* This is the thread which actually services the I/O. */
1643static int io_thread(void *_dev)
1644{
1645 struct device *dev = _dev;
1646 struct vblk_info *vblk = dev->priv;
1647 char c;
1648
1649 /* Close other side of workpipe so we get 0 read when main dies. */
1650 close(vblk->workpipe[1]);
1651 /* Close the other side of the done_fd pipe. */
1652 close(dev->fd);
1653
1654 /* When this read fails, it means Launcher died, so we follow. */
1655 while (read(vblk->workpipe[0], &c, 1) == 1) {
1656 /* We acknowledge each request immediately to reduce latency,
1657 * rather than waiting until we've done them all. I haven't
1658 * measured to see if it makes any difference.
1659 *
1660 * That would be an interesting test, wouldn't it? You could
1661 * also try having more than one I/O thread. */
1662 while (service_io(dev))
1663 write(vblk->done_fd, &c, 1);
1664 }
1665 return 0;
1666}
1667
1668/* Now we've seen the I/O thread, we return to the Launcher to see what happens
1669 * when that thread tells us it's completed some I/O. */
1670static bool handle_io_finish(struct device *dev)
1671{
1672 char c;
1673
1674 /* If the I/O thread died, presumably it printed the error, so we
1675 * simply exit. */
1676 if (read(dev->fd, &c, 1) != 1)
1677 exit(1);
1678
1679 /* It did some work, so trigger the irq. */
1680 trigger_irq(dev->vq);
1681 return true;
1682}
1683
1684/* When the Guest submits some I/O, we just need to wake the I/O thread. */
1685static void handle_virtblk_output(struct virtqueue *vq, bool timeout)
1686{
1687 struct vblk_info *vblk = vq->dev->priv;
1688 char c = 0;
1689
1690 /* Wake up I/O thread and tell it to go to work! */
1691 if (write(vblk->workpipe[1], &c, 1) != 1)
1692 /* Presumably it indicated why it died. */
1693 exit(1);
1694} 1426}
1695 1427
1696/*L:198 This actually sets up a virtual block device. */ 1428/*L:198 This actually sets up a virtual block device. */
1697static void setup_block_file(const char *filename) 1429static void setup_block_file(const char *filename)
1698{ 1430{
1699 int p[2];
1700 struct device *dev; 1431 struct device *dev;
1701 struct vblk_info *vblk; 1432 struct vblk_info *vblk;
1702 void *stack;
1703 struct virtio_blk_config conf; 1433 struct virtio_blk_config conf;
1704 1434
1705 /* This is the pipe the I/O thread will use to tell us I/O is done. */
1706 pipe(p);
1707
1708 /* The device responds to return from I/O thread. */ 1435 /* The device responds to return from I/O thread. */
1709 dev = new_device("block", VIRTIO_ID_BLOCK, p[0], handle_io_finish); 1436 dev = new_device("block", VIRTIO_ID_BLOCK);
1710 1437
1711 /* The device has one virtqueue, where the Guest places requests. */ 1438 /* The device has one virtqueue, where the Guest places requests. */
1712 add_virtqueue(dev, VIRTQUEUE_NUM, handle_virtblk_output); 1439 add_virtqueue(dev, VIRTQUEUE_NUM, blk_request);
1713 1440
1714 /* Allocate the room for our own bookkeeping */ 1441 /* Allocate the room for our own bookkeeping */
1715 vblk = dev->priv = malloc(sizeof(*vblk)); 1442 vblk = dev->priv = malloc(sizeof(*vblk));
@@ -1731,49 +1458,29 @@ static void setup_block_file(const char *filename)
1731 1458
1732 set_config(dev, sizeof(conf), &conf); 1459 set_config(dev, sizeof(conf), &conf);
1733 1460
1734 /* The I/O thread writes to this end of the pipe when done. */
1735 vblk->done_fd = p[1];
1736
1737 /* This is the second pipe, which is how we tell the I/O thread about
1738 * more work. */
1739 pipe(vblk->workpipe);
1740
1741 /* Create stack for thread and run it. Since stack grows upwards, we
1742 * point the stack pointer to the end of this region. */
1743 stack = malloc(32768);
1744 /* SIGCHLD - We dont "wait" for our cloned thread, so prevent it from
1745 * becoming a zombie. */
1746 if (clone(io_thread, stack + 32768, CLONE_VM | SIGCHLD, dev) == -1)
1747 err(1, "Creating clone");
1748
1749 /* We don't need to keep the I/O thread's end of the pipes open. */
1750 close(vblk->done_fd);
1751 close(vblk->workpipe[0]);
1752
1753 verbose("device %u: virtblock %llu sectors\n", 1461 verbose("device %u: virtblock %llu sectors\n",
1754 devices.device_num, le64_to_cpu(conf.capacity)); 1462 ++devices.device_num, le64_to_cpu(conf.capacity));
1755} 1463}
1756 1464
1465struct rng_info {
1466 int rfd;
1467};
1468
1757/* Our random number generator device reads from /dev/random into the Guest's 1469/* Our random number generator device reads from /dev/random into the Guest's
1758 * input buffers. The usual case is that the Guest doesn't want random numbers 1470 * input buffers. The usual case is that the Guest doesn't want random numbers
1759 * and so has no buffers although /dev/random is still readable, whereas 1471 * and so has no buffers although /dev/random is still readable, whereas
1760 * console is the reverse. 1472 * console is the reverse.
1761 * 1473 *
1762 * The same logic applies, however. */ 1474 * The same logic applies, however. */
1763static bool handle_rng_input(struct device *dev) 1475static void rng_input(struct virtqueue *vq)
1764{ 1476{
1765 int len; 1477 int len;
1766 unsigned int head, in_num, out_num, totlen = 0; 1478 unsigned int head, in_num, out_num, totlen = 0;
1767 struct iovec iov[dev->vq->vring.num]; 1479 struct rng_info *rng_info = vq->dev->priv;
1480 struct iovec iov[vq->vring.num];
1768 1481
1769 /* First we need a buffer from the Guests's virtqueue. */ 1482 /* First we need a buffer from the Guests's virtqueue. */
1770 head = get_vq_desc(dev->vq, iov, &out_num, &in_num); 1483 head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
1771
1772 /* If they're not ready for input, stop listening to this file
1773 * descriptor. We'll start again once they add an input buffer. */
1774 if (head == dev->vq->vring.num)
1775 return false;
1776
1777 if (out_num) 1484 if (out_num)
1778 errx(1, "Output buffers in rng?"); 1485 errx(1, "Output buffers in rng?");
1779 1486
@@ -1781,7 +1488,7 @@ static bool handle_rng_input(struct device *dev)
1781 * it reads straight into the Guest's buffer. We loop to make sure we 1488 * it reads straight into the Guest's buffer. We loop to make sure we
1782 * fill it. */ 1489 * fill it. */
1783 while (!iov_empty(iov, in_num)) { 1490 while (!iov_empty(iov, in_num)) {
1784 len = readv(dev->fd, iov, in_num); 1491 len = readv(rng_info->rfd, iov, in_num);
1785 if (len <= 0) 1492 if (len <= 0)
1786 err(1, "Read from /dev/random gave %i", len); 1493 err(1, "Read from /dev/random gave %i", len);
1787 iov_consume(iov, in_num, len); 1494 iov_consume(iov, in_num, len);
@@ -1789,25 +1496,23 @@ static bool handle_rng_input(struct device *dev)
1789 } 1496 }
1790 1497
1791 /* Tell the Guest about the new input. */ 1498 /* Tell the Guest about the new input. */
1792 add_used_and_trigger(dev->vq, head, totlen); 1499 add_used_and_trigger(vq, head, totlen);
1793
1794 /* Everything went OK! */
1795 return true;
1796} 1500}
1797 1501
1798/* And this creates a "hardware" random number device for the Guest. */ 1502/* And this creates a "hardware" random number device for the Guest. */
1799static void setup_rng(void) 1503static void setup_rng(void)
1800{ 1504{
1801 struct device *dev; 1505 struct device *dev;
1802 int fd; 1506 struct rng_info *rng_info = malloc(sizeof(*rng_info));
1803 1507
1804 fd = open_or_die("/dev/random", O_RDONLY); 1508 rng_info->rfd = open_or_die("/dev/random", O_RDONLY);
1805 1509
1806 /* The device responds to return from I/O thread. */ 1510 /* The device responds to return from I/O thread. */
1807 dev = new_device("rng", VIRTIO_ID_RNG, fd, handle_rng_input); 1511 dev = new_device("rng", VIRTIO_ID_RNG);
1512 dev->priv = rng_info;
1808 1513
1809 /* The device has one virtqueue, where the Guest places inbufs. */ 1514 /* The device has one virtqueue, where the Guest places inbufs. */
1810 add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd); 1515 add_virtqueue(dev, VIRTQUEUE_NUM, rng_input);
1811 1516
1812 verbose("device %u: rng\n", devices.device_num++); 1517 verbose("device %u: rng\n", devices.device_num++);
1813} 1518}
@@ -1823,7 +1528,9 @@ static void __attribute__((noreturn)) restart_guest(void)
1823 for (i = 3; i < FD_SETSIZE; i++) 1528 for (i = 3; i < FD_SETSIZE; i++)
1824 close(i); 1529 close(i);
1825 1530
1826 /* The exec automatically gets rid of the I/O and Waker threads. */ 1531 /* Reset all the devices (kills all threads). */
1532 cleanup_devices();
1533
1827 execv(main_args[0], main_args); 1534 execv(main_args[0], main_args);
1828 err(1, "Could not exec %s", main_args[0]); 1535 err(1, "Could not exec %s", main_args[0]);
1829} 1536}
@@ -1833,7 +1540,6 @@ static void __attribute__((noreturn)) restart_guest(void)
1833static void __attribute__((noreturn)) run_guest(void) 1540static void __attribute__((noreturn)) run_guest(void)
1834{ 1541{
1835 for (;;) { 1542 for (;;) {
1836 unsigned long args[] = { LHREQ_BREAK, 0 };
1837 unsigned long notify_addr; 1543 unsigned long notify_addr;
1838 int readval; 1544 int readval;
1839 1545
@@ -1845,7 +1551,6 @@ static void __attribute__((noreturn)) run_guest(void)
1845 if (readval == sizeof(notify_addr)) { 1551 if (readval == sizeof(notify_addr)) {
1846 verbose("Notify on address %#lx\n", notify_addr); 1552 verbose("Notify on address %#lx\n", notify_addr);
1847 handle_output(notify_addr); 1553 handle_output(notify_addr);
1848 continue;
1849 /* ENOENT means the Guest died. Reading tells us why. */ 1554 /* ENOENT means the Guest died. Reading tells us why. */
1850 } else if (errno == ENOENT) { 1555 } else if (errno == ENOENT) {
1851 char reason[1024] = { 0 }; 1556 char reason[1024] = { 0 };
@@ -1854,19 +1559,9 @@ static void __attribute__((noreturn)) run_guest(void)
1854 /* ERESTART means that we need to reboot the guest */ 1559 /* ERESTART means that we need to reboot the guest */
1855 } else if (errno == ERESTART) { 1560 } else if (errno == ERESTART) {
1856 restart_guest(); 1561 restart_guest();
1857 /* EAGAIN means a signal (timeout). 1562 /* Anything else means a bug or incompatible change. */
1858 * Anything else means a bug or incompatible change. */ 1563 } else
1859 } else if (errno != EAGAIN)
1860 err(1, "Running guest failed"); 1564 err(1, "Running guest failed");
1861
1862 /* Only service input on thread for CPU 0. */
1863 if (cpu_id != 0)
1864 continue;
1865
1866 /* Service input, then unset the BREAK to release the Waker. */
1867 handle_input();
1868 if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
1869 err(1, "Resetting break");
1870 } 1565 }
1871} 1566}
1872/*L:240 1567/*L:240
@@ -1909,18 +1604,10 @@ int main(int argc, char *argv[])
1909 1604
1910 /* Save the args: we "reboot" by execing ourselves again. */ 1605 /* Save the args: we "reboot" by execing ourselves again. */
1911 main_args = argv; 1606 main_args = argv;
1912 /* We don't "wait" for the children, so prevent them from becoming
1913 * zombies. */
1914 signal(SIGCHLD, SIG_IGN);
1915 1607
1916 /* First we initialize the device list. Since console and network 1608 /* First we initialize the device list. We keep a pointer to the last
1917 * device receive input from a file descriptor, we keep an fdset 1609 * device, and the next interrupt number to use for devices (1:
1918 * (infds) and the maximum fd number (max_infd) with the head of the 1610 * remember that 0 is used by the timer). */
1919 * list. We also keep a pointer to the last device. Finally, we keep
1920 * the next interrupt number to use for devices (1: remember that 0 is
1921 * used by the timer). */
1922 FD_ZERO(&devices.infds);
1923 devices.max_infd = -1;
1924 devices.lastdev = NULL; 1611 devices.lastdev = NULL;
1925 devices.next_irq = 1; 1612 devices.next_irq = 1;
1926 1613
@@ -1978,9 +1665,6 @@ int main(int argc, char *argv[])
1978 /* We always have a console device */ 1665 /* We always have a console device */
1979 setup_console(); 1666 setup_console();
1980 1667
1981 /* We can timeout waiting for Guest network transmit. */
1982 setup_timeout();
1983
1984 /* Now we load the kernel */ 1668 /* Now we load the kernel */
1985 start = load_kernel(open_or_die(argv[optind+1], O_RDONLY)); 1669 start = load_kernel(open_or_die(argv[optind+1], O_RDONLY));
1986 1670
@@ -2021,10 +1705,11 @@ int main(int argc, char *argv[])
2021 * /dev/lguest file descriptor. */ 1705 * /dev/lguest file descriptor. */
2022 tell_kernel(start); 1706 tell_kernel(start);
2023 1707
2024 /* We clone off a thread, which wakes the Launcher whenever one of the 1708 /* Ensure that we terminate if a child dies. */
2025 * input file descriptors needs attention. We call this the Waker, and 1709 signal(SIGCHLD, kill_launcher);
2026 * we'll cover it in a moment. */ 1710
2027 setup_waker(); 1711 /* If we exit via err(), this kills all the threads, restores tty. */
1712 atexit(cleanup_devices);
2028 1713
2029 /* Finally, run the Guest. This doesn't return. */ 1714 /* Finally, run the Guest. This doesn't return. */
2030 run_guest(); 1715 run_guest();