diff options
-rw-r--r-- | Documentation/lguest/lguest.c | 833 |
1 files changed, 259 insertions, 574 deletions
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index 02fa524cf4ad..5470b8ed2149 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <sys/types.h> | 16 | #include <sys/types.h> |
17 | #include <sys/stat.h> | 17 | #include <sys/stat.h> |
18 | #include <sys/wait.h> | 18 | #include <sys/wait.h> |
19 | #include <sys/eventfd.h> | ||
19 | #include <fcntl.h> | 20 | #include <fcntl.h> |
20 | #include <stdbool.h> | 21 | #include <stdbool.h> |
21 | #include <errno.h> | 22 | #include <errno.h> |
@@ -59,7 +60,6 @@ typedef uint8_t u8; | |||
59 | /*:*/ | 60 | /*:*/ |
60 | 61 | ||
61 | #define PAGE_PRESENT 0x7 /* Present, RW, Execute */ | 62 | #define PAGE_PRESENT 0x7 /* Present, RW, Execute */ |
62 | #define NET_PEERNUM 1 | ||
63 | #define BRIDGE_PFX "bridge:" | 63 | #define BRIDGE_PFX "bridge:" |
64 | #ifndef SIOCBRADDIF | 64 | #ifndef SIOCBRADDIF |
65 | #define SIOCBRADDIF 0x89a2 /* add interface to bridge */ | 65 | #define SIOCBRADDIF 0x89a2 /* add interface to bridge */ |
@@ -76,18 +76,10 @@ static bool verbose; | |||
76 | do { if (verbose) printf(args); } while(0) | 76 | do { if (verbose) printf(args); } while(0) |
77 | /*:*/ | 77 | /*:*/ |
78 | 78 | ||
79 | /* File descriptors for the Waker. */ | ||
80 | struct { | ||
81 | int pipe[2]; | ||
82 | } waker_fds; | ||
83 | |||
84 | /* The pointer to the start of guest memory. */ | 79 | /* The pointer to the start of guest memory. */ |
85 | static void *guest_base; | 80 | static void *guest_base; |
86 | /* The maximum guest physical address allowed, and maximum possible. */ | 81 | /* The maximum guest physical address allowed, and maximum possible. */ |
87 | static unsigned long guest_limit, guest_max; | 82 | static unsigned long guest_limit, guest_max; |
88 | /* The pipe for signal handler to write to. */ | ||
89 | static int timeoutpipe[2]; | ||
90 | static unsigned int timeout_usec = 500; | ||
91 | /* The /dev/lguest file descriptor. */ | 83 | /* The /dev/lguest file descriptor. */ |
92 | static int lguest_fd; | 84 | static int lguest_fd; |
93 | 85 | ||
@@ -97,11 +89,6 @@ static unsigned int __thread cpu_id; | |||
97 | /* This is our list of devices. */ | 89 | /* This is our list of devices. */ |
98 | struct device_list | 90 | struct device_list |
99 | { | 91 | { |
100 | /* Summary information about the devices in our list: ready to pass to | ||
101 | * select() to ask which need servicing.*/ | ||
102 | fd_set infds; | ||
103 | int max_infd; | ||
104 | |||
105 | /* Counter to assign interrupt numbers. */ | 92 | /* Counter to assign interrupt numbers. */ |
106 | unsigned int next_irq; | 93 | unsigned int next_irq; |
107 | 94 | ||
@@ -137,16 +124,11 @@ struct device | |||
137 | /* The name of this device, for --verbose. */ | 124 | /* The name of this device, for --verbose. */ |
138 | const char *name; | 125 | const char *name; |
139 | 126 | ||
140 | /* If handle_input is set, it wants to be called when this file | ||
141 | * descriptor is ready. */ | ||
142 | int fd; | ||
143 | bool (*handle_input)(struct device *me); | ||
144 | |||
145 | /* Any queues attached to this device */ | 127 | /* Any queues attached to this device */ |
146 | struct virtqueue *vq; | 128 | struct virtqueue *vq; |
147 | 129 | ||
148 | /* Handle status being finalized (ie. feature bits stable). */ | 130 | /* Is it operational */ |
149 | void (*ready)(struct device *me); | 131 | bool running; |
150 | 132 | ||
151 | /* Device-specific data. */ | 133 | /* Device-specific data. */ |
152 | void *priv; | 134 | void *priv; |
@@ -169,16 +151,20 @@ struct virtqueue | |||
169 | /* Last available index we saw. */ | 151 | /* Last available index we saw. */ |
170 | u16 last_avail_idx; | 152 | u16 last_avail_idx; |
171 | 153 | ||
172 | /* The routine to call when the Guest pings us, or timeout. */ | 154 | /* Eventfd where Guest notifications arrive. */ |
173 | void (*handle_output)(struct virtqueue *me, bool timeout); | 155 | int eventfd; |
174 | 156 | ||
175 | /* Is this blocked awaiting a timer? */ | 157 | /* Function for the thread which is servicing this virtqueue. */ |
176 | bool blocked; | 158 | void (*service)(struct virtqueue *vq); |
159 | pid_t thread; | ||
177 | }; | 160 | }; |
178 | 161 | ||
179 | /* Remember the arguments to the program so we can "reboot" */ | 162 | /* Remember the arguments to the program so we can "reboot" */ |
180 | static char **main_args; | 163 | static char **main_args; |
181 | 164 | ||
165 | /* The original tty settings to restore on exit. */ | ||
166 | static struct termios orig_term; | ||
167 | |||
182 | /* We have to be careful with barriers: our devices are all run in separate | 168 | /* We have to be careful with barriers: our devices are all run in separate |
183 | * threads and so we need to make sure that changes visible to the Guest happen | 169 | * threads and so we need to make sure that changes visible to the Guest happen |
184 | * in precise order. */ | 170 | * in precise order. */ |
@@ -521,78 +507,6 @@ static void tell_kernel(unsigned long start) | |||
521 | } | 507 | } |
522 | /*:*/ | 508 | /*:*/ |
523 | 509 | ||
524 | static void add_device_fd(int fd) | ||
525 | { | ||
526 | FD_SET(fd, &devices.infds); | ||
527 | if (fd > devices.max_infd) | ||
528 | devices.max_infd = fd; | ||
529 | } | ||
530 | |||
531 | /*L:200 | ||
532 | * The Waker. | ||
533 | * | ||
534 | * With console, block and network devices, we can have lots of input which we | ||
535 | * need to process. We could try to tell the kernel what file descriptors to | ||
536 | * watch, but handing a file descriptor mask through to the kernel is fairly | ||
537 | * icky. | ||
538 | * | ||
539 | * Instead, we clone off a thread which watches the file descriptors and writes | ||
540 | * the LHREQ_BREAK command to the /dev/lguest file descriptor to tell the Host | ||
541 | * stop running the Guest. This causes the Launcher to return from the | ||
542 | * /dev/lguest read with -EAGAIN, where it will write to /dev/lguest to reset | ||
543 | * the LHREQ_BREAK and wake us up again. | ||
544 | * | ||
545 | * This, of course, is merely a different *kind* of icky. | ||
546 | * | ||
547 | * Given my well-known antipathy to threads, I'd prefer to use processes. But | ||
548 | * it's easier to share Guest memory with threads, and trivial to share the | ||
549 | * devices.infds as the Launcher changes it. | ||
550 | */ | ||
551 | static int waker(void *unused) | ||
552 | { | ||
553 | /* Close the write end of the pipe: only the Launcher has it open. */ | ||
554 | close(waker_fds.pipe[1]); | ||
555 | |||
556 | for (;;) { | ||
557 | fd_set rfds = devices.infds; | ||
558 | unsigned long args[] = { LHREQ_BREAK, 1 }; | ||
559 | unsigned int maxfd = devices.max_infd; | ||
560 | |||
561 | /* We also listen to the pipe from the Launcher. */ | ||
562 | FD_SET(waker_fds.pipe[0], &rfds); | ||
563 | if (waker_fds.pipe[0] > maxfd) | ||
564 | maxfd = waker_fds.pipe[0]; | ||
565 | |||
566 | /* Wait until input is ready from one of the devices. */ | ||
567 | select(maxfd+1, &rfds, NULL, NULL, NULL); | ||
568 | |||
569 | /* Message from Launcher? */ | ||
570 | if (FD_ISSET(waker_fds.pipe[0], &rfds)) { | ||
571 | char c; | ||
572 | /* If this fails, then assume Launcher has exited. | ||
573 | * Don't do anything on exit: we're just a thread! */ | ||
574 | if (read(waker_fds.pipe[0], &c, 1) != 1) | ||
575 | _exit(0); | ||
576 | continue; | ||
577 | } | ||
578 | |||
579 | /* Send LHREQ_BREAK command to snap the Launcher out of it. */ | ||
580 | pwrite(lguest_fd, args, sizeof(args), cpu_id); | ||
581 | } | ||
582 | return 0; | ||
583 | } | ||
584 | |||
585 | /* This routine just sets up a pipe to the Waker process. */ | ||
586 | static void setup_waker(void) | ||
587 | { | ||
588 | /* This pipe is closed when Launcher dies, telling Waker. */ | ||
589 | if (pipe(waker_fds.pipe) != 0) | ||
590 | err(1, "Creating pipe for Waker"); | ||
591 | |||
592 | if (clone(waker, malloc(4096) + 4096, CLONE_VM | SIGCHLD, NULL) == -1) | ||
593 | err(1, "Creating Waker"); | ||
594 | } | ||
595 | |||
596 | /* | 510 | /* |
597 | * Device Handling. | 511 | * Device Handling. |
598 | * | 512 | * |
@@ -642,25 +556,27 @@ static unsigned next_desc(struct virtqueue *vq, unsigned int i) | |||
642 | * number of output then some number of input descriptors, it's actually two | 556 | * number of output then some number of input descriptors, it's actually two |
643 | * iovecs, but we pack them into one and note how many of each there were. | 557 | * iovecs, but we pack them into one and note how many of each there were. |
644 | * | 558 | * |
645 | * This function returns the descriptor number found, or vq->vring.num (which | 559 | * This function returns the descriptor number found. */ |
646 | * is never a valid descriptor number) if none was found. */ | 560 | static unsigned wait_for_vq_desc(struct virtqueue *vq, |
647 | static unsigned get_vq_desc(struct virtqueue *vq, | 561 | struct iovec iov[], |
648 | struct iovec iov[], | 562 | unsigned int *out_num, unsigned int *in_num) |
649 | unsigned int *out_num, unsigned int *in_num) | ||
650 | { | 563 | { |
651 | unsigned int i, head; | 564 | unsigned int i, head; |
652 | u16 last_avail; | 565 | u16 last_avail = lg_last_avail(vq); |
566 | |||
567 | while (last_avail == vq->vring.avail->idx) { | ||
568 | u64 event; | ||
569 | |||
570 | /* Nothing new? Wait for eventfd to tell us they refilled. */ | ||
571 | if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event)) | ||
572 | errx(1, "Event read failed?"); | ||
573 | } | ||
653 | 574 | ||
654 | /* Check it isn't doing very strange things with descriptor numbers. */ | 575 | /* Check it isn't doing very strange things with descriptor numbers. */ |
655 | last_avail = lg_last_avail(vq); | ||
656 | if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num) | 576 | if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num) |
657 | errx(1, "Guest moved used index from %u to %u", | 577 | errx(1, "Guest moved used index from %u to %u", |
658 | last_avail, vq->vring.avail->idx); | 578 | last_avail, vq->vring.avail->idx); |
659 | 579 | ||
660 | /* If there's nothing new since last we looked, return invalid. */ | ||
661 | if (vq->vring.avail->idx == last_avail) | ||
662 | return vq->vring.num; | ||
663 | |||
664 | /* Grab the next descriptor number they're advertising, and increment | 580 | /* Grab the next descriptor number they're advertising, and increment |
665 | * the index we've seen. */ | 581 | * the index we've seen. */ |
666 | head = vq->vring.avail->ring[last_avail % vq->vring.num]; | 582 | head = vq->vring.avail->ring[last_avail % vq->vring.num]; |
@@ -740,15 +656,7 @@ static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len) | |||
740 | /* | 656 | /* |
741 | * The Console | 657 | * The Console |
742 | * | 658 | * |
743 | * Here is the input terminal setting we save, and the routine to restore them | 659 | * We associate some data with the console for our exit hack. */ |
744 | * on exit so the user gets their terminal back. */ | ||
745 | static struct termios orig_term; | ||
746 | static void restore_term(void) | ||
747 | { | ||
748 | tcsetattr(STDIN_FILENO, TCSANOW, &orig_term); | ||
749 | } | ||
750 | |||
751 | /* We associate some data with the console for our exit hack. */ | ||
752 | struct console_abort | 660 | struct console_abort |
753 | { | 661 | { |
754 | /* How many times have they hit ^C? */ | 662 | /* How many times have they hit ^C? */ |
@@ -758,245 +666,235 @@ struct console_abort | |||
758 | }; | 666 | }; |
759 | 667 | ||
760 | /* This is the routine which handles console input (ie. stdin). */ | 668 | /* This is the routine which handles console input (ie. stdin). */ |
761 | static bool handle_console_input(struct device *dev) | 669 | static void console_input(struct virtqueue *vq) |
762 | { | 670 | { |
763 | int len; | 671 | int len; |
764 | unsigned int head, in_num, out_num; | 672 | unsigned int head, in_num, out_num; |
765 | struct iovec iov[dev->vq->vring.num]; | 673 | struct console_abort *abort = vq->dev->priv; |
766 | struct console_abort *abort = dev->priv; | 674 | struct iovec iov[vq->vring.num]; |
767 | |||
768 | /* First we need a console buffer from the Guest's input virtqueue. */ | ||
769 | head = get_vq_desc(dev->vq, iov, &out_num, &in_num); | ||
770 | |||
771 | /* If they're not ready for input, stop listening to this file | ||
772 | * descriptor. We'll start again once they add an input buffer. */ | ||
773 | if (head == dev->vq->vring.num) | ||
774 | return false; | ||
775 | 675 | ||
676 | /* Make sure there's a descriptor waiting. */ | ||
677 | head = wait_for_vq_desc(vq, iov, &out_num, &in_num); | ||
776 | if (out_num) | 678 | if (out_num) |
777 | errx(1, "Output buffers in console in queue?"); | 679 | errx(1, "Output buffers in console in queue?"); |
778 | 680 | ||
779 | /* This is why we convert to iovecs: the readv() call uses them, and so | 681 | /* Read it in. */ |
780 | * it reads straight into the Guest's buffer. */ | 682 | len = readv(STDIN_FILENO, iov, in_num); |
781 | len = readv(dev->fd, iov, in_num); | ||
782 | if (len <= 0) { | 683 | if (len <= 0) { |
783 | /* This implies that the console is closed, is /dev/null, or | 684 | /* Ran out of input? */ |
784 | * something went terribly wrong. */ | ||
785 | warnx("Failed to get console input, ignoring console."); | 685 | warnx("Failed to get console input, ignoring console."); |
786 | /* Put the input terminal back. */ | 686 | /* For simplicity, dying threads kill the whole Launcher. So |
787 | restore_term(); | 687 | * just nap here. */ |
788 | /* Remove callback from input vq, so it doesn't restart us. */ | 688 | for (;;) |
789 | dev->vq->handle_output = NULL; | 689 | pause(); |
790 | /* Stop listening to this fd: don't call us again. */ | ||
791 | return false; | ||
792 | } | 690 | } |
793 | 691 | ||
794 | /* Tell the Guest about the new input. */ | 692 | add_used_and_trigger(vq, head, len); |
795 | add_used_and_trigger(dev->vq, head, len); | ||
796 | 693 | ||
797 | /* Three ^C within one second? Exit. | 694 | /* Three ^C within one second? Exit. |
798 | * | 695 | * |
799 | * This is such a hack, but works surprisingly well. Each ^C has to be | 696 | * This is such a hack, but works surprisingly well. Each ^C has to |
800 | * in a buffer by itself, so they can't be too fast. But we check that | 697 | * be in a buffer by itself, so they can't be too fast. But we check |
801 | * we get three within about a second, so they can't be too slow. */ | 698 | * that we get three within about a second, so they can't be too |
802 | if (len == 1 && ((char *)iov[0].iov_base)[0] == 3) { | 699 | * slow. */ |
803 | if (!abort->count++) | 700 | if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) { |
804 | gettimeofday(&abort->start, NULL); | ||
805 | else if (abort->count == 3) { | ||
806 | struct timeval now; | ||
807 | gettimeofday(&now, NULL); | ||
808 | if (now.tv_sec <= abort->start.tv_sec+1) { | ||
809 | unsigned long args[] = { LHREQ_BREAK, 0 }; | ||
810 | /* Close the fd so Waker will know it has to | ||
811 | * exit. */ | ||
812 | close(waker_fds.pipe[1]); | ||
813 | /* Just in case Waker is blocked in BREAK, send | ||
814 | * unbreak now. */ | ||
815 | write(lguest_fd, args, sizeof(args)); | ||
816 | exit(2); | ||
817 | } | ||
818 | abort->count = 0; | ||
819 | } | ||
820 | } else | ||
821 | /* Any other key resets the abort counter. */ | ||
822 | abort->count = 0; | 701 | abort->count = 0; |
702 | return; | ||
703 | } | ||
823 | 704 | ||
824 | /* Everything went OK! */ | 705 | abort->count++; |
825 | return true; | 706 | if (abort->count == 1) |
707 | gettimeofday(&abort->start, NULL); | ||
708 | else if (abort->count == 3) { | ||
709 | struct timeval now; | ||
710 | gettimeofday(&now, NULL); | ||
711 | /* Kill all Launcher processes with SIGINT, like normal ^C */ | ||
712 | if (now.tv_sec <= abort->start.tv_sec+1) | ||
713 | kill(0, SIGINT); | ||
714 | abort->count = 0; | ||
715 | } | ||
826 | } | 716 | } |
827 | 717 | ||
828 | /* Handling output for console is simple: we just get all the output buffers | 718 | /* This is the routine which handles console output (ie. stdout). */ |
829 | * and write them to stdout. */ | 719 | static void console_output(struct virtqueue *vq) |
830 | static void handle_console_output(struct virtqueue *vq, bool timeout) | ||
831 | { | 720 | { |
832 | unsigned int head, out, in; | 721 | unsigned int head, out, in; |
833 | struct iovec iov[vq->vring.num]; | 722 | struct iovec iov[vq->vring.num]; |
834 | 723 | ||
835 | /* Keep getting output buffers from the Guest until we run out. */ | 724 | head = wait_for_vq_desc(vq, iov, &out, &in); |
836 | while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) { | 725 | if (in) |
837 | if (in) | 726 | errx(1, "Input buffers in console output queue?"); |
838 | errx(1, "Input buffers in output queue?"); | 727 | while (!iov_empty(iov, out)) { |
839 | while (!iov_empty(iov, out)) { | 728 | int len = writev(STDOUT_FILENO, iov, out); |
840 | int len = writev(STDOUT_FILENO, iov, out); | 729 | if (len <= 0) |
841 | if (len <= 0) | 730 | err(1, "Write to stdout gave %i", len); |
842 | err(1, "Write to stdout gave %i", len); | 731 | iov_consume(iov, out, len); |
843 | iov_consume(iov, out, len); | ||
844 | } | ||
845 | add_used_and_trigger(vq, head, 0); | ||
846 | } | 732 | } |
847 | } | 733 | add_used_and_trigger(vq, head, 0); |
848 | |||
849 | /* This is called when we no longer want to hear about Guest changes to a | ||
850 | * virtqueue. This is more efficient in high-traffic cases, but it means we | ||
851 | * have to set a timer to check if any more changes have occurred. */ | ||
852 | static void block_vq(struct virtqueue *vq) | ||
853 | { | ||
854 | struct itimerval itm; | ||
855 | |||
856 | vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; | ||
857 | vq->blocked = true; | ||
858 | |||
859 | itm.it_interval.tv_sec = 0; | ||
860 | itm.it_interval.tv_usec = 0; | ||
861 | itm.it_value.tv_sec = 0; | ||
862 | itm.it_value.tv_usec = timeout_usec; | ||
863 | |||
864 | setitimer(ITIMER_REAL, &itm, NULL); | ||
865 | } | 734 | } |
866 | 735 | ||
867 | /* | 736 | /* |
868 | * The Network | 737 | * The Network |
869 | * | 738 | * |
870 | * Handling output for network is also simple: we get all the output buffers | 739 | * Handling output for network is also simple: we get all the output buffers |
871 | * and write them (ignoring the first element) to this device's file descriptor | 740 | * and write them to /dev/net/tun. |
872 | * (/dev/net/tun). | ||
873 | */ | 741 | */ |
874 | static void handle_net_output(struct virtqueue *vq, bool timeout) | 742 | struct net_info { |
743 | int tunfd; | ||
744 | }; | ||
745 | |||
746 | static void net_output(struct virtqueue *vq) | ||
875 | { | 747 | { |
876 | unsigned int head, out, in, num = 0; | 748 | struct net_info *net_info = vq->dev->priv; |
749 | unsigned int head, out, in; | ||
877 | struct iovec iov[vq->vring.num]; | 750 | struct iovec iov[vq->vring.num]; |
878 | static int last_timeout_num; | ||
879 | |||
880 | /* Keep getting output buffers from the Guest until we run out. */ | ||
881 | while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) { | ||
882 | if (in) | ||
883 | errx(1, "Input buffers in output queue?"); | ||
884 | if (writev(vq->dev->fd, iov, out) < 0) | ||
885 | err(1, "Writing network packet to tun"); | ||
886 | add_used_and_trigger(vq, head, 0); | ||
887 | num++; | ||
888 | } | ||
889 | 751 | ||
890 | /* Block further kicks and set up a timer if we saw anything. */ | 752 | head = wait_for_vq_desc(vq, iov, &out, &in); |
891 | if (!timeout && num) | 753 | if (in) |
892 | block_vq(vq); | 754 | errx(1, "Input buffers in net output queue?"); |
893 | 755 | if (writev(net_info->tunfd, iov, out) < 0) | |
894 | /* We never quite know how long should we wait before we check the | 756 | errx(1, "Write to tun failed?"); |
895 | * queue again for more packets. We start at 500 microseconds, and if | 757 | add_used_and_trigger(vq, head, 0); |
896 | * we get fewer packets than last time, we assume we made the timeout | ||
897 | * too small and increase it by 10 microseconds. Otherwise, we drop it | ||
898 | * by one microsecond every time. It seems to work well enough. */ | ||
899 | if (timeout) { | ||
900 | if (num < last_timeout_num) | ||
901 | timeout_usec += 10; | ||
902 | else if (timeout_usec > 1) | ||
903 | timeout_usec--; | ||
904 | last_timeout_num = num; | ||
905 | } | ||
906 | } | 758 | } |
907 | 759 | ||
908 | /* This is where we handle a packet coming in from the tun device to our | 760 | /* This is where we handle packets coming in from the tun device to our |
909 | * Guest. */ | 761 | * Guest. */ |
910 | static bool handle_tun_input(struct device *dev) | 762 | static void net_input(struct virtqueue *vq) |
911 | { | 763 | { |
912 | unsigned int head, in_num, out_num; | ||
913 | int len; | 764 | int len; |
914 | struct iovec iov[dev->vq->vring.num]; | 765 | unsigned int head, out, in; |
915 | 766 | struct iovec iov[vq->vring.num]; | |
916 | /* First we need a network buffer from the Guest's recv virtqueue. */ | 767 | struct net_info *net_info = vq->dev->priv; |
917 | head = get_vq_desc(dev->vq, iov, &out_num, &in_num); | 768 | |
918 | if (head == dev->vq->vring.num) { | 769 | head = wait_for_vq_desc(vq, iov, &out, &in); |
919 | /* Now, it's expected that if we try to send a packet too | 770 | if (out) |
920 | * early, the Guest won't be ready yet. Wait until the device | 771 | errx(1, "Output buffers in net input queue?"); |
921 | * status says it's ready. */ | 772 | len = readv(net_info->tunfd, iov, in); |
922 | /* FIXME: Actually want DRIVER_ACTIVE here. */ | ||
923 | |||
924 | /* Now tell it we want to know if new things appear. */ | ||
925 | dev->vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; | ||
926 | wmb(); | ||
927 | |||
928 | /* We'll turn this back on if input buffers are registered. */ | ||
929 | return false; | ||
930 | } else if (out_num) | ||
931 | errx(1, "Output buffers in network recv queue?"); | ||
932 | |||
933 | /* Read the packet from the device directly into the Guest's buffer. */ | ||
934 | len = readv(dev->fd, iov, in_num); | ||
935 | if (len <= 0) | 773 | if (len <= 0) |
936 | err(1, "reading network"); | 774 | err(1, "Failed to read from tun."); |
775 | add_used_and_trigger(vq, head, len); | ||
776 | } | ||
937 | 777 | ||
938 | /* Tell the Guest about the new packet. */ | 778 | /* This is the helper to create threads. */ |
939 | add_used_and_trigger(dev->vq, head, len); | 779 | static int do_thread(void *_vq) |
780 | { | ||
781 | struct virtqueue *vq = _vq; | ||
940 | 782 | ||
941 | verbose("tun input packet len %i [%02x %02x] (%s)\n", len, | 783 | for (;;) |
942 | ((u8 *)iov[1].iov_base)[0], ((u8 *)iov[1].iov_base)[1], | 784 | vq->service(vq); |
943 | head != dev->vq->vring.num ? "sent" : "discarded"); | 785 | return 0; |
786 | } | ||
944 | 787 | ||
945 | /* All good. */ | 788 | /* When a child dies, we kill our entire process group with SIGTERM. This |
946 | return true; | 789 | * also has the side effect that the shell restores the console for us! */ |
790 | static void kill_launcher(int signal) | ||
791 | { | ||
792 | kill(0, SIGTERM); | ||
947 | } | 793 | } |
948 | 794 | ||
949 | /*L:215 This is the callback attached to the network and console input | 795 | static void reset_device(struct device *dev) |
950 | * virtqueues: it ensures we try again, in case we stopped console or net | ||
951 | * delivery because Guest didn't have any buffers. */ | ||
952 | static void enable_fd(struct virtqueue *vq, bool timeout) | ||
953 | { | 796 | { |
954 | add_device_fd(vq->dev->fd); | 797 | struct virtqueue *vq; |
955 | /* Snap the Waker out of its select loop. */ | 798 | |
956 | write(waker_fds.pipe[1], "", 1); | 799 | verbose("Resetting device %s\n", dev->name); |
800 | |||
801 | /* Clear any features they've acked. */ | ||
802 | memset(get_feature_bits(dev) + dev->feature_len, 0, dev->feature_len); | ||
803 | |||
804 | /* We're going to be explicitly killing threads, so ignore them. */ | ||
805 | signal(SIGCHLD, SIG_IGN); | ||
806 | |||
807 | /* Zero out the virtqueues, get rid of their threads */ | ||
808 | for (vq = dev->vq; vq; vq = vq->next) { | ||
809 | if (vq->thread != (pid_t)-1) { | ||
810 | kill(vq->thread, SIGTERM); | ||
811 | waitpid(vq->thread, NULL, 0); | ||
812 | vq->thread = (pid_t)-1; | ||
813 | } | ||
814 | memset(vq->vring.desc, 0, | ||
815 | vring_size(vq->config.num, LGUEST_VRING_ALIGN)); | ||
816 | lg_last_avail(vq) = 0; | ||
817 | } | ||
818 | dev->running = false; | ||
819 | |||
820 | /* Now we care if threads die. */ | ||
821 | signal(SIGCHLD, (void *)kill_launcher); | ||
957 | } | 822 | } |
958 | 823 | ||
959 | static void net_enable_fd(struct virtqueue *vq, bool timeout) | 824 | static void create_thread(struct virtqueue *vq) |
960 | { | 825 | { |
961 | /* We don't need to know again when Guest refills receive buffer. */ | 826 | /* Create stack for thread and run it. Since stack grows |
962 | vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; | 827 | * downwards, we point the stack pointer to the end of this |
963 | enable_fd(vq, timeout); | 828 | * region. */ |
829 | char *stack = malloc(32768); | ||
830 | unsigned long args[] = { LHREQ_EVENTFD, | ||
831 | vq->config.pfn*getpagesize(), 0 }; | ||
832 | |||
833 | /* Create a zero-initialized eventfd. */ | ||
834 | vq->eventfd = eventfd(0, 0); | ||
835 | if (vq->eventfd < 0) | ||
836 | err(1, "Creating eventfd"); | ||
837 | args[2] = vq->eventfd; | ||
838 | |||
839 | /* Attach an eventfd to this virtqueue: it will go off | ||
840 | * when the Guest does an LHCALL_NOTIFY for this vq. */ | ||
841 | if (write(lguest_fd, &args, sizeof(args)) != 0) | ||
842 | err(1, "Attaching eventfd"); | ||
843 | |||
844 | /* CLONE_VM: because it has to access the Guest memory, and | ||
845 | * SIGCHLD so we get a signal if it dies. */ | ||
846 | vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq); | ||
847 | if (vq->thread == (pid_t)-1) | ||
848 | err(1, "Creating clone"); | ||
849 | /* We close our local copy, now the child has it. */ | ||
850 | close(vq->eventfd); | ||
964 | } | 851 | } |
965 | 852 | ||
966 | /* When the Guest tells us they updated the status field, we handle it. */ | 853 | static void start_device(struct device *dev) |
967 | static void update_device_status(struct device *dev) | ||
968 | { | 854 | { |
855 | unsigned int i; | ||
969 | struct virtqueue *vq; | 856 | struct virtqueue *vq; |
970 | 857 | ||
971 | /* This is a reset. */ | 858 | verbose("Device %s OK: offered", dev->name); |
972 | if (dev->desc->status == 0) { | 859 | for (i = 0; i < dev->feature_len; i++) |
973 | verbose("Resetting device %s\n", dev->name); | 860 | verbose(" %02x", get_feature_bits(dev)[i]); |
861 | verbose(", accepted"); | ||
862 | for (i = 0; i < dev->feature_len; i++) | ||
863 | verbose(" %02x", get_feature_bits(dev) | ||
864 | [dev->feature_len+i]); | ||
865 | |||
866 | for (vq = dev->vq; vq; vq = vq->next) { | ||
867 | if (vq->service) | ||
868 | create_thread(vq); | ||
869 | } | ||
870 | dev->running = true; | ||
871 | } | ||
872 | |||
873 | static void cleanup_devices(void) | ||
874 | { | ||
875 | struct device *dev; | ||
876 | |||
877 | for (dev = devices.dev; dev; dev = dev->next) | ||
878 | reset_device(dev); | ||
974 | 879 | ||
975 | /* Clear any features they've acked. */ | 880 | /* If we saved off the original terminal settings, restore them now. */ |
976 | memset(get_feature_bits(dev) + dev->feature_len, 0, | 881 | if (orig_term.c_lflag & (ISIG|ICANON|ECHO)) |
977 | dev->feature_len); | 882 | tcsetattr(STDIN_FILENO, TCSANOW, &orig_term); |
883 | } | ||
978 | 884 | ||
979 | /* Zero out the virtqueues. */ | 885 | /* When the Guest tells us they updated the status field, we handle it. */ |
980 | for (vq = dev->vq; vq; vq = vq->next) { | 886 | static void update_device_status(struct device *dev) |
981 | memset(vq->vring.desc, 0, | 887 | { |
982 | vring_size(vq->config.num, LGUEST_VRING_ALIGN)); | 888 | /* A zero status is a reset, otherwise it's a set of flags. */ |
983 | lg_last_avail(vq) = 0; | 889 | if (dev->desc->status == 0) |
984 | } | 890 | reset_device(dev); |
985 | } else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) { | 891 | else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) { |
986 | warnx("Device %s configuration FAILED", dev->name); | 892 | warnx("Device %s configuration FAILED", dev->name); |
893 | if (dev->running) | ||
894 | reset_device(dev); | ||
987 | } else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) { | 895 | } else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) { |
988 | unsigned int i; | 896 | if (!dev->running) |
989 | 897 | start_device(dev); | |
990 | verbose("Device %s OK: offered", dev->name); | ||
991 | for (i = 0; i < dev->feature_len; i++) | ||
992 | verbose(" %02x", get_feature_bits(dev)[i]); | ||
993 | verbose(", accepted"); | ||
994 | for (i = 0; i < dev->feature_len; i++) | ||
995 | verbose(" %02x", get_feature_bits(dev) | ||
996 | [dev->feature_len+i]); | ||
997 | |||
998 | if (dev->ready) | ||
999 | dev->ready(dev); | ||
1000 | } | 898 | } |
1001 | } | 899 | } |
1002 | 900 | ||
@@ -1004,32 +902,24 @@ static void update_device_status(struct device *dev) | |||
1004 | static void handle_output(unsigned long addr) | 902 | static void handle_output(unsigned long addr) |
1005 | { | 903 | { |
1006 | struct device *i; | 904 | struct device *i; |
1007 | struct virtqueue *vq; | ||
1008 | 905 | ||
1009 | /* Check each device and virtqueue. */ | 906 | /* Check each device. */ |
1010 | for (i = devices.dev; i; i = i->next) { | 907 | for (i = devices.dev; i; i = i->next) { |
908 | struct virtqueue *vq; | ||
909 | |||
1011 | /* Notifications to device descriptors update device status. */ | 910 | /* Notifications to device descriptors update device status. */ |
1012 | if (from_guest_phys(addr) == i->desc) { | 911 | if (from_guest_phys(addr) == i->desc) { |
1013 | update_device_status(i); | 912 | update_device_status(i); |
1014 | return; | 913 | return; |
1015 | } | 914 | } |
1016 | 915 | ||
1017 | /* Notifications to virtqueues mean output has occurred. */ | 916 | /* Devices *can* be used before status is set to DRIVER_OK. */ |
1018 | for (vq = i->vq; vq; vq = vq->next) { | 917 | for (vq = i->vq; vq; vq = vq->next) { |
1019 | if (vq->config.pfn != addr/getpagesize()) | 918 | if (addr != vq->config.pfn*getpagesize()) |
1020 | continue; | 919 | continue; |
1021 | 920 | if (i->running) | |
1022 | /* Guest should acknowledge (and set features!) before | 921 | errx(1, "Notification on running %s", i->name); |
1023 | * using the device. */ | 922 | start_device(i); |
1024 | if (i->desc->status == 0) { | ||
1025 | warnx("%s gave early output", i->name); | ||
1026 | return; | ||
1027 | } | ||
1028 | |||
1029 | if (strcmp(vq->dev->name, "console") != 0) | ||
1030 | verbose("Output to %s\n", vq->dev->name); | ||
1031 | if (vq->handle_output) | ||
1032 | vq->handle_output(vq, false); | ||
1033 | return; | 923 | return; |
1034 | } | 924 | } |
1035 | } | 925 | } |
@@ -1043,71 +933,6 @@ static void handle_output(unsigned long addr) | |||
1043 | strnlen(from_guest_phys(addr), guest_limit - addr)); | 933 | strnlen(from_guest_phys(addr), guest_limit - addr)); |
1044 | } | 934 | } |
1045 | 935 | ||
1046 | static void handle_timeout(void) | ||
1047 | { | ||
1048 | char buf[32]; | ||
1049 | struct device *i; | ||
1050 | struct virtqueue *vq; | ||
1051 | |||
1052 | /* Clear the pipe */ | ||
1053 | read(timeoutpipe[0], buf, sizeof(buf)); | ||
1054 | |||
1055 | /* Check each device and virtqueue: flush blocked ones. */ | ||
1056 | for (i = devices.dev; i; i = i->next) { | ||
1057 | for (vq = i->vq; vq; vq = vq->next) { | ||
1058 | if (!vq->blocked) | ||
1059 | continue; | ||
1060 | |||
1061 | vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; | ||
1062 | vq->blocked = false; | ||
1063 | if (vq->handle_output) | ||
1064 | vq->handle_output(vq, true); | ||
1065 | } | ||
1066 | } | ||
1067 | } | ||
1068 | |||
1069 | /* This is called when the Waker wakes us up: check for incoming file | ||
1070 | * descriptors. */ | ||
1071 | static void handle_input(void) | ||
1072 | { | ||
1073 | /* select() wants a zeroed timeval to mean "don't wait". */ | ||
1074 | struct timeval poll = { .tv_sec = 0, .tv_usec = 0 }; | ||
1075 | |||
1076 | for (;;) { | ||
1077 | struct device *i; | ||
1078 | fd_set fds = devices.infds; | ||
1079 | int num; | ||
1080 | |||
1081 | num = select(devices.max_infd+1, &fds, NULL, NULL, &poll); | ||
1082 | /* Could get interrupted */ | ||
1083 | if (num < 0) | ||
1084 | continue; | ||
1085 | /* If nothing is ready, we're done. */ | ||
1086 | if (num == 0) | ||
1087 | break; | ||
1088 | |||
1089 | /* Otherwise, call the device(s) which have readable file | ||
1090 | * descriptors and a method of handling them. */ | ||
1091 | for (i = devices.dev; i; i = i->next) { | ||
1092 | if (i->handle_input && FD_ISSET(i->fd, &fds)) { | ||
1093 | if (i->handle_input(i)) | ||
1094 | continue; | ||
1095 | |||
1096 | /* If handle_input() returns false, it means we | ||
1097 | * should no longer service it. Networking and | ||
1098 | * console do this when there's no input | ||
1099 | * buffers to deliver into. Console also uses | ||
1100 | * it when it discovers that stdin is closed. */ | ||
1101 | FD_CLR(i->fd, &devices.infds); | ||
1102 | } | ||
1103 | } | ||
1104 | |||
1105 | /* Is this the timeout fd? */ | ||
1106 | if (FD_ISSET(timeoutpipe[0], &fds)) | ||
1107 | handle_timeout(); | ||
1108 | } | ||
1109 | } | ||
1110 | |||
1111 | /*L:190 | 936 | /*L:190 |
1112 | * Device Setup | 937 | * Device Setup |
1113 | * | 938 | * |
@@ -1153,7 +978,7 @@ static struct lguest_device_desc *new_dev_desc(u16 type) | |||
1153 | /* Each device descriptor is followed by the description of its virtqueues. We | 978 | /* Each device descriptor is followed by the description of its virtqueues. We |
1154 | * specify how many descriptors the virtqueue is to have. */ | 979 | * specify how many descriptors the virtqueue is to have. */ |
1155 | static void add_virtqueue(struct device *dev, unsigned int num_descs, | 980 | static void add_virtqueue(struct device *dev, unsigned int num_descs, |
1156 | void (*handle_output)(struct virtqueue *, bool)) | 981 | void (*service)(struct virtqueue *)) |
1157 | { | 982 | { |
1158 | unsigned int pages; | 983 | unsigned int pages; |
1159 | struct virtqueue **i, *vq = malloc(sizeof(*vq)); | 984 | struct virtqueue **i, *vq = malloc(sizeof(*vq)); |
@@ -1168,7 +993,8 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, | |||
1168 | vq->next = NULL; | 993 | vq->next = NULL; |
1169 | vq->last_avail_idx = 0; | 994 | vq->last_avail_idx = 0; |
1170 | vq->dev = dev; | 995 | vq->dev = dev; |
1171 | vq->blocked = false; | 996 | vq->service = service; |
997 | vq->thread = (pid_t)-1; | ||
1172 | 998 | ||
1173 | /* Initialize the configuration. */ | 999 | /* Initialize the configuration. */ |
1174 | vq->config.num = num_descs; | 1000 | vq->config.num = num_descs; |
@@ -1193,15 +1019,6 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, | |||
1193 | * second. */ | 1019 | * second. */ |
1194 | for (i = &dev->vq; *i; i = &(*i)->next); | 1020 | for (i = &dev->vq; *i; i = &(*i)->next); |
1195 | *i = vq; | 1021 | *i = vq; |
1196 | |||
1197 | /* Set the routine to call when the Guest does something to this | ||
1198 | * virtqueue. */ | ||
1199 | vq->handle_output = handle_output; | ||
1200 | |||
1201 | /* As an optimization, set the advisory "Don't Notify Me" flag if we | ||
1202 | * don't have a handler */ | ||
1203 | if (!handle_output) | ||
1204 | vq->vring.used->flags = VRING_USED_F_NO_NOTIFY; | ||
1205 | } | 1022 | } |
1206 | 1023 | ||
1207 | /* The first half of the feature bitmask is for us to advertise features. The | 1024 | /* The first half of the feature bitmask is for us to advertise features. The |
@@ -1237,24 +1054,17 @@ static void set_config(struct device *dev, unsigned len, const void *conf) | |||
1237 | * calling new_dev_desc() to allocate the descriptor and device memory. | 1054 | * calling new_dev_desc() to allocate the descriptor and device memory. |
1238 | * | 1055 | * |
1239 | * See what I mean about userspace being boring? */ | 1056 | * See what I mean about userspace being boring? */ |
1240 | static struct device *new_device(const char *name, u16 type, int fd, | 1057 | static struct device *new_device(const char *name, u16 type) |
1241 | bool (*handle_input)(struct device *)) | ||
1242 | { | 1058 | { |
1243 | struct device *dev = malloc(sizeof(*dev)); | 1059 | struct device *dev = malloc(sizeof(*dev)); |
1244 | 1060 | ||
1245 | /* Now we populate the fields one at a time. */ | 1061 | /* Now we populate the fields one at a time. */ |
1246 | dev->fd = fd; | ||
1247 | /* If we have an input handler for this file descriptor, then we add it | ||
1248 | * to the device_list's fdset and maxfd. */ | ||
1249 | if (handle_input) | ||
1250 | add_device_fd(dev->fd); | ||
1251 | dev->desc = new_dev_desc(type); | 1062 | dev->desc = new_dev_desc(type); |
1252 | dev->handle_input = handle_input; | ||
1253 | dev->name = name; | 1063 | dev->name = name; |
1254 | dev->vq = NULL; | 1064 | dev->vq = NULL; |
1255 | dev->ready = NULL; | ||
1256 | dev->feature_len = 0; | 1065 | dev->feature_len = 0; |
1257 | dev->num_vq = 0; | 1066 | dev->num_vq = 0; |
1067 | dev->running = false; | ||
1258 | 1068 | ||
1259 | /* Append to device list. Prepending to a single-linked list is | 1069 | /* Append to device list. Prepending to a single-linked list is |
1260 | * easier, but the user expects the devices to be arranged on the bus | 1070 | * easier, but the user expects the devices to be arranged on the bus |
@@ -1282,13 +1092,10 @@ static void setup_console(void) | |||
1282 | * raw input stream to the Guest. */ | 1092 | * raw input stream to the Guest. */ |
1283 | term.c_lflag &= ~(ISIG|ICANON|ECHO); | 1093 | term.c_lflag &= ~(ISIG|ICANON|ECHO); |
1284 | tcsetattr(STDIN_FILENO, TCSANOW, &term); | 1094 | tcsetattr(STDIN_FILENO, TCSANOW, &term); |
1285 | /* If we exit gracefully, the original settings will be | ||
1286 | * restored so the user can see what they're typing. */ | ||
1287 | atexit(restore_term); | ||
1288 | } | 1095 | } |
1289 | 1096 | ||
1290 | dev = new_device("console", VIRTIO_ID_CONSOLE, | 1097 | dev = new_device("console", VIRTIO_ID_CONSOLE); |
1291 | STDIN_FILENO, handle_console_input); | 1098 | |
1292 | /* We store the console state in dev->priv, and initialize it. */ | 1099 | /* We store the console state in dev->priv, and initialize it. */ |
1293 | dev->priv = malloc(sizeof(struct console_abort)); | 1100 | dev->priv = malloc(sizeof(struct console_abort)); |
1294 | ((struct console_abort *)dev->priv)->count = 0; | 1101 | ((struct console_abort *)dev->priv)->count = 0; |
@@ -1297,31 +1104,13 @@ static void setup_console(void) | |||
1297 | * they put something the input queue, we make sure we're listening to | 1104 | * they put something the input queue, we make sure we're listening to |
1298 | * stdin. When they put something in the output queue, we write it to | 1105 | * stdin. When they put something in the output queue, we write it to |
1299 | * stdout. */ | 1106 | * stdout. */ |
1300 | add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd); | 1107 | add_virtqueue(dev, VIRTQUEUE_NUM, console_input); |
1301 | add_virtqueue(dev, VIRTQUEUE_NUM, handle_console_output); | 1108 | add_virtqueue(dev, VIRTQUEUE_NUM, console_output); |
1302 | 1109 | ||
1303 | verbose("device %u: console\n", devices.device_num++); | 1110 | verbose("device %u: console\n", ++devices.device_num); |
1304 | } | 1111 | } |
1305 | /*:*/ | 1112 | /*:*/ |
1306 | 1113 | ||
1307 | static void timeout_alarm(int sig) | ||
1308 | { | ||
1309 | write(timeoutpipe[1], "", 1); | ||
1310 | } | ||
1311 | |||
1312 | static void setup_timeout(void) | ||
1313 | { | ||
1314 | if (pipe(timeoutpipe) != 0) | ||
1315 | err(1, "Creating timeout pipe"); | ||
1316 | |||
1317 | if (fcntl(timeoutpipe[1], F_SETFL, | ||
1318 | fcntl(timeoutpipe[1], F_GETFL) | O_NONBLOCK) != 0) | ||
1319 | err(1, "Making timeout pipe nonblocking"); | ||
1320 | |||
1321 | add_device_fd(timeoutpipe[0]); | ||
1322 | signal(SIGALRM, timeout_alarm); | ||
1323 | } | ||
1324 | |||
1325 | /*M:010 Inter-guest networking is an interesting area. Simplest is to have a | 1114 | /*M:010 Inter-guest networking is an interesting area. Simplest is to have a |
1326 | * --sharenet=<name> option which opens or creates a named pipe. This can be | 1115 | * --sharenet=<name> option which opens or creates a named pipe. This can be |
1327 | * used to send packets to another guest in a 1:1 manner. | 1116 | * used to send packets to another guest in a 1:1 manner. |
@@ -1443,21 +1232,23 @@ static int get_tun_device(char tapif[IFNAMSIZ]) | |||
1443 | static void setup_tun_net(char *arg) | 1232 | static void setup_tun_net(char *arg) |
1444 | { | 1233 | { |
1445 | struct device *dev; | 1234 | struct device *dev; |
1446 | int netfd, ipfd; | 1235 | struct net_info *net_info = malloc(sizeof(*net_info)); |
1236 | int ipfd; | ||
1447 | u32 ip = INADDR_ANY; | 1237 | u32 ip = INADDR_ANY; |
1448 | bool bridging = false; | 1238 | bool bridging = false; |
1449 | char tapif[IFNAMSIZ], *p; | 1239 | char tapif[IFNAMSIZ], *p; |
1450 | struct virtio_net_config conf; | 1240 | struct virtio_net_config conf; |
1451 | 1241 | ||
1452 | netfd = get_tun_device(tapif); | 1242 | net_info->tunfd = get_tun_device(tapif); |
1453 | 1243 | ||
1454 | /* First we create a new network device. */ | 1244 | /* First we create a new network device. */ |
1455 | dev = new_device("net", VIRTIO_ID_NET, netfd, handle_tun_input); | 1245 | dev = new_device("net", VIRTIO_ID_NET); |
1246 | dev->priv = net_info; | ||
1456 | 1247 | ||
1457 | /* Network devices need a receive and a send queue, just like | 1248 | /* Network devices need a receive and a send queue, just like |
1458 | * console. */ | 1249 | * console. */ |
1459 | add_virtqueue(dev, VIRTQUEUE_NUM, net_enable_fd); | 1250 | add_virtqueue(dev, VIRTQUEUE_NUM, net_input); |
1460 | add_virtqueue(dev, VIRTQUEUE_NUM, handle_net_output); | 1251 | add_virtqueue(dev, VIRTQUEUE_NUM, net_output); |
1461 | 1252 | ||
1462 | /* We need a socket to perform the magic network ioctls to bring up the | 1253 | /* We need a socket to perform the magic network ioctls to bring up the |
1463 | * tap interface, connect to the bridge etc. Any socket will do! */ | 1254 | * tap interface, connect to the bridge etc. Any socket will do! */ |
@@ -1546,20 +1337,18 @@ struct vblk_info | |||
1546 | * Remember that the block device is handled by a separate I/O thread. We head | 1337 | * Remember that the block device is handled by a separate I/O thread. We head |
1547 | * straight into the core of that thread here: | 1338 | * straight into the core of that thread here: |
1548 | */ | 1339 | */ |
1549 | static bool service_io(struct device *dev) | 1340 | static void blk_request(struct virtqueue *vq) |
1550 | { | 1341 | { |
1551 | struct vblk_info *vblk = dev->priv; | 1342 | struct vblk_info *vblk = vq->dev->priv; |
1552 | unsigned int head, out_num, in_num, wlen; | 1343 | unsigned int head, out_num, in_num, wlen; |
1553 | int ret; | 1344 | int ret; |
1554 | u8 *in; | 1345 | u8 *in; |
1555 | struct virtio_blk_outhdr *out; | 1346 | struct virtio_blk_outhdr *out; |
1556 | struct iovec iov[dev->vq->vring.num]; | 1347 | struct iovec iov[vq->vring.num]; |
1557 | off64_t off; | 1348 | off64_t off; |
1558 | 1349 | ||
1559 | /* See if there's a request waiting. If not, nothing to do. */ | 1350 | /* Get the next request. */ |
1560 | head = get_vq_desc(dev->vq, iov, &out_num, &in_num); | 1351 | head = wait_for_vq_desc(vq, iov, &out_num, &in_num); |
1561 | if (head == dev->vq->vring.num) | ||
1562 | return false; | ||
1563 | 1352 | ||
1564 | /* Every block request should contain at least one output buffer | 1353 | /* Every block request should contain at least one output buffer |
1565 | * (detailing the location on disk and the type of request) and one | 1354 | * (detailing the location on disk and the type of request) and one |
@@ -1633,83 +1422,21 @@ static bool service_io(struct device *dev) | |||
1633 | if (out->type & VIRTIO_BLK_T_BARRIER) | 1422 | if (out->type & VIRTIO_BLK_T_BARRIER) |
1634 | fdatasync(vblk->fd); | 1423 | fdatasync(vblk->fd); |
1635 | 1424 | ||
1636 | /* We can't trigger an IRQ, because we're not the Launcher. It does | 1425 | add_used_and_trigger(vq, head, wlen); |
1637 | * that when we tell it we're done. */ | ||
1638 | add_used(dev->vq, head, wlen); | ||
1639 | return true; | ||
1640 | } | ||
1641 | |||
1642 | /* This is the thread which actually services the I/O. */ | ||
1643 | static int io_thread(void *_dev) | ||
1644 | { | ||
1645 | struct device *dev = _dev; | ||
1646 | struct vblk_info *vblk = dev->priv; | ||
1647 | char c; | ||
1648 | |||
1649 | /* Close other side of workpipe so we get 0 read when main dies. */ | ||
1650 | close(vblk->workpipe[1]); | ||
1651 | /* Close the other side of the done_fd pipe. */ | ||
1652 | close(dev->fd); | ||
1653 | |||
1654 | /* When this read fails, it means Launcher died, so we follow. */ | ||
1655 | while (read(vblk->workpipe[0], &c, 1) == 1) { | ||
1656 | /* We acknowledge each request immediately to reduce latency, | ||
1657 | * rather than waiting until we've done them all. I haven't | ||
1658 | * measured to see if it makes any difference. | ||
1659 | * | ||
1660 | * That would be an interesting test, wouldn't it? You could | ||
1661 | * also try having more than one I/O thread. */ | ||
1662 | while (service_io(dev)) | ||
1663 | write(vblk->done_fd, &c, 1); | ||
1664 | } | ||
1665 | return 0; | ||
1666 | } | ||
1667 | |||
1668 | /* Now we've seen the I/O thread, we return to the Launcher to see what happens | ||
1669 | * when that thread tells us it's completed some I/O. */ | ||
1670 | static bool handle_io_finish(struct device *dev) | ||
1671 | { | ||
1672 | char c; | ||
1673 | |||
1674 | /* If the I/O thread died, presumably it printed the error, so we | ||
1675 | * simply exit. */ | ||
1676 | if (read(dev->fd, &c, 1) != 1) | ||
1677 | exit(1); | ||
1678 | |||
1679 | /* It did some work, so trigger the irq. */ | ||
1680 | trigger_irq(dev->vq); | ||
1681 | return true; | ||
1682 | } | ||
1683 | |||
1684 | /* When the Guest submits some I/O, we just need to wake the I/O thread. */ | ||
1685 | static void handle_virtblk_output(struct virtqueue *vq, bool timeout) | ||
1686 | { | ||
1687 | struct vblk_info *vblk = vq->dev->priv; | ||
1688 | char c = 0; | ||
1689 | |||
1690 | /* Wake up I/O thread and tell it to go to work! */ | ||
1691 | if (write(vblk->workpipe[1], &c, 1) != 1) | ||
1692 | /* Presumably it indicated why it died. */ | ||
1693 | exit(1); | ||
1694 | } | 1426 | } |
1695 | 1427 | ||
1696 | /*L:198 This actually sets up a virtual block device. */ | 1428 | /*L:198 This actually sets up a virtual block device. */ |
1697 | static void setup_block_file(const char *filename) | 1429 | static void setup_block_file(const char *filename) |
1698 | { | 1430 | { |
1699 | int p[2]; | ||
1700 | struct device *dev; | 1431 | struct device *dev; |
1701 | struct vblk_info *vblk; | 1432 | struct vblk_info *vblk; |
1702 | void *stack; | ||
1703 | struct virtio_blk_config conf; | 1433 | struct virtio_blk_config conf; |
1704 | 1434 | ||
1705 | /* This is the pipe the I/O thread will use to tell us I/O is done. */ | ||
1706 | pipe(p); | ||
1707 | |||
1708 | /* The device responds to return from I/O thread. */ | 1435 | /* The device responds to return from I/O thread. */ |
1709 | dev = new_device("block", VIRTIO_ID_BLOCK, p[0], handle_io_finish); | 1436 | dev = new_device("block", VIRTIO_ID_BLOCK); |
1710 | 1437 | ||
1711 | /* The device has one virtqueue, where the Guest places requests. */ | 1438 | /* The device has one virtqueue, where the Guest places requests. */ |
1712 | add_virtqueue(dev, VIRTQUEUE_NUM, handle_virtblk_output); | 1439 | add_virtqueue(dev, VIRTQUEUE_NUM, blk_request); |
1713 | 1440 | ||
1714 | /* Allocate the room for our own bookkeeping */ | 1441 | /* Allocate the room for our own bookkeeping */ |
1715 | vblk = dev->priv = malloc(sizeof(*vblk)); | 1442 | vblk = dev->priv = malloc(sizeof(*vblk)); |
@@ -1731,49 +1458,29 @@ static void setup_block_file(const char *filename) | |||
1731 | 1458 | ||
1732 | set_config(dev, sizeof(conf), &conf); | 1459 | set_config(dev, sizeof(conf), &conf); |
1733 | 1460 | ||
1734 | /* The I/O thread writes to this end of the pipe when done. */ | ||
1735 | vblk->done_fd = p[1]; | ||
1736 | |||
1737 | /* This is the second pipe, which is how we tell the I/O thread about | ||
1738 | * more work. */ | ||
1739 | pipe(vblk->workpipe); | ||
1740 | |||
1741 | /* Create stack for thread and run it. Since stack grows upwards, we | ||
1742 | * point the stack pointer to the end of this region. */ | ||
1743 | stack = malloc(32768); | ||
1744 | /* SIGCHLD - We dont "wait" for our cloned thread, so prevent it from | ||
1745 | * becoming a zombie. */ | ||
1746 | if (clone(io_thread, stack + 32768, CLONE_VM | SIGCHLD, dev) == -1) | ||
1747 | err(1, "Creating clone"); | ||
1748 | |||
1749 | /* We don't need to keep the I/O thread's end of the pipes open. */ | ||
1750 | close(vblk->done_fd); | ||
1751 | close(vblk->workpipe[0]); | ||
1752 | |||
1753 | verbose("device %u: virtblock %llu sectors\n", | 1461 | verbose("device %u: virtblock %llu sectors\n", |
1754 | devices.device_num, le64_to_cpu(conf.capacity)); | 1462 | ++devices.device_num, le64_to_cpu(conf.capacity)); |
1755 | } | 1463 | } |
1756 | 1464 | ||
1465 | struct rng_info { | ||
1466 | int rfd; | ||
1467 | }; | ||
1468 | |||
1757 | /* Our random number generator device reads from /dev/random into the Guest's | 1469 | /* Our random number generator device reads from /dev/random into the Guest's |
1758 | * input buffers. The usual case is that the Guest doesn't want random numbers | 1470 | * input buffers. The usual case is that the Guest doesn't want random numbers |
1759 | * and so has no buffers although /dev/random is still readable, whereas | 1471 | * and so has no buffers although /dev/random is still readable, whereas |
1760 | * console is the reverse. | 1472 | * console is the reverse. |
1761 | * | 1473 | * |
1762 | * The same logic applies, however. */ | 1474 | * The same logic applies, however. */ |
1763 | static bool handle_rng_input(struct device *dev) | 1475 | static void rng_input(struct virtqueue *vq) |
1764 | { | 1476 | { |
1765 | int len; | 1477 | int len; |
1766 | unsigned int head, in_num, out_num, totlen = 0; | 1478 | unsigned int head, in_num, out_num, totlen = 0; |
1767 | struct iovec iov[dev->vq->vring.num]; | 1479 | struct rng_info *rng_info = vq->dev->priv; |
1480 | struct iovec iov[vq->vring.num]; | ||
1768 | 1481 | ||
1769 | /* First we need a buffer from the Guests's virtqueue. */ | 1482 | /* First we need a buffer from the Guests's virtqueue. */ |
1770 | head = get_vq_desc(dev->vq, iov, &out_num, &in_num); | 1483 | head = wait_for_vq_desc(vq, iov, &out_num, &in_num); |
1771 | |||
1772 | /* If they're not ready for input, stop listening to this file | ||
1773 | * descriptor. We'll start again once they add an input buffer. */ | ||
1774 | if (head == dev->vq->vring.num) | ||
1775 | return false; | ||
1776 | |||
1777 | if (out_num) | 1484 | if (out_num) |
1778 | errx(1, "Output buffers in rng?"); | 1485 | errx(1, "Output buffers in rng?"); |
1779 | 1486 | ||
@@ -1781,7 +1488,7 @@ static bool handle_rng_input(struct device *dev) | |||
1781 | * it reads straight into the Guest's buffer. We loop to make sure we | 1488 | * it reads straight into the Guest's buffer. We loop to make sure we |
1782 | * fill it. */ | 1489 | * fill it. */ |
1783 | while (!iov_empty(iov, in_num)) { | 1490 | while (!iov_empty(iov, in_num)) { |
1784 | len = readv(dev->fd, iov, in_num); | 1491 | len = readv(rng_info->rfd, iov, in_num); |
1785 | if (len <= 0) | 1492 | if (len <= 0) |
1786 | err(1, "Read from /dev/random gave %i", len); | 1493 | err(1, "Read from /dev/random gave %i", len); |
1787 | iov_consume(iov, in_num, len); | 1494 | iov_consume(iov, in_num, len); |
@@ -1789,25 +1496,23 @@ static bool handle_rng_input(struct device *dev) | |||
1789 | } | 1496 | } |
1790 | 1497 | ||
1791 | /* Tell the Guest about the new input. */ | 1498 | /* Tell the Guest about the new input. */ |
1792 | add_used_and_trigger(dev->vq, head, totlen); | 1499 | add_used_and_trigger(vq, head, totlen); |
1793 | |||
1794 | /* Everything went OK! */ | ||
1795 | return true; | ||
1796 | } | 1500 | } |
1797 | 1501 | ||
1798 | /* And this creates a "hardware" random number device for the Guest. */ | 1502 | /* And this creates a "hardware" random number device for the Guest. */ |
1799 | static void setup_rng(void) | 1503 | static void setup_rng(void) |
1800 | { | 1504 | { |
1801 | struct device *dev; | 1505 | struct device *dev; |
1802 | int fd; | 1506 | struct rng_info *rng_info = malloc(sizeof(*rng_info)); |
1803 | 1507 | ||
1804 | fd = open_or_die("/dev/random", O_RDONLY); | 1508 | rng_info->rfd = open_or_die("/dev/random", O_RDONLY); |
1805 | 1509 | ||
1806 | /* The device responds to return from I/O thread. */ | 1510 | /* The device responds to return from I/O thread. */ |
1807 | dev = new_device("rng", VIRTIO_ID_RNG, fd, handle_rng_input); | 1511 | dev = new_device("rng", VIRTIO_ID_RNG); |
1512 | dev->priv = rng_info; | ||
1808 | 1513 | ||
1809 | /* The device has one virtqueue, where the Guest places inbufs. */ | 1514 | /* The device has one virtqueue, where the Guest places inbufs. */ |
1810 | add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd); | 1515 | add_virtqueue(dev, VIRTQUEUE_NUM, rng_input); |
1811 | 1516 | ||
1812 | verbose("device %u: rng\n", devices.device_num++); | 1517 | verbose("device %u: rng\n", devices.device_num++); |
1813 | } | 1518 | } |
@@ -1823,7 +1528,9 @@ static void __attribute__((noreturn)) restart_guest(void) | |||
1823 | for (i = 3; i < FD_SETSIZE; i++) | 1528 | for (i = 3; i < FD_SETSIZE; i++) |
1824 | close(i); | 1529 | close(i); |
1825 | 1530 | ||
1826 | /* The exec automatically gets rid of the I/O and Waker threads. */ | 1531 | /* Reset all the devices (kills all threads). */ |
1532 | cleanup_devices(); | ||
1533 | |||
1827 | execv(main_args[0], main_args); | 1534 | execv(main_args[0], main_args); |
1828 | err(1, "Could not exec %s", main_args[0]); | 1535 | err(1, "Could not exec %s", main_args[0]); |
1829 | } | 1536 | } |
@@ -1833,7 +1540,6 @@ static void __attribute__((noreturn)) restart_guest(void) | |||
1833 | static void __attribute__((noreturn)) run_guest(void) | 1540 | static void __attribute__((noreturn)) run_guest(void) |
1834 | { | 1541 | { |
1835 | for (;;) { | 1542 | for (;;) { |
1836 | unsigned long args[] = { LHREQ_BREAK, 0 }; | ||
1837 | unsigned long notify_addr; | 1543 | unsigned long notify_addr; |
1838 | int readval; | 1544 | int readval; |
1839 | 1545 | ||
@@ -1845,7 +1551,6 @@ static void __attribute__((noreturn)) run_guest(void) | |||
1845 | if (readval == sizeof(notify_addr)) { | 1551 | if (readval == sizeof(notify_addr)) { |
1846 | verbose("Notify on address %#lx\n", notify_addr); | 1552 | verbose("Notify on address %#lx\n", notify_addr); |
1847 | handle_output(notify_addr); | 1553 | handle_output(notify_addr); |
1848 | continue; | ||
1849 | /* ENOENT means the Guest died. Reading tells us why. */ | 1554 | /* ENOENT means the Guest died. Reading tells us why. */ |
1850 | } else if (errno == ENOENT) { | 1555 | } else if (errno == ENOENT) { |
1851 | char reason[1024] = { 0 }; | 1556 | char reason[1024] = { 0 }; |
@@ -1854,19 +1559,9 @@ static void __attribute__((noreturn)) run_guest(void) | |||
1854 | /* ERESTART means that we need to reboot the guest */ | 1559 | /* ERESTART means that we need to reboot the guest */ |
1855 | } else if (errno == ERESTART) { | 1560 | } else if (errno == ERESTART) { |
1856 | restart_guest(); | 1561 | restart_guest(); |
1857 | /* EAGAIN means a signal (timeout). | 1562 | /* Anything else means a bug or incompatible change. */ |
1858 | * Anything else means a bug or incompatible change. */ | 1563 | } else |
1859 | } else if (errno != EAGAIN) | ||
1860 | err(1, "Running guest failed"); | 1564 | err(1, "Running guest failed"); |
1861 | |||
1862 | /* Only service input on thread for CPU 0. */ | ||
1863 | if (cpu_id != 0) | ||
1864 | continue; | ||
1865 | |||
1866 | /* Service input, then unset the BREAK to release the Waker. */ | ||
1867 | handle_input(); | ||
1868 | if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0) | ||
1869 | err(1, "Resetting break"); | ||
1870 | } | 1565 | } |
1871 | } | 1566 | } |
1872 | /*L:240 | 1567 | /*L:240 |
@@ -1909,18 +1604,10 @@ int main(int argc, char *argv[]) | |||
1909 | 1604 | ||
1910 | /* Save the args: we "reboot" by execing ourselves again. */ | 1605 | /* Save the args: we "reboot" by execing ourselves again. */ |
1911 | main_args = argv; | 1606 | main_args = argv; |
1912 | /* We don't "wait" for the children, so prevent them from becoming | ||
1913 | * zombies. */ | ||
1914 | signal(SIGCHLD, SIG_IGN); | ||
1915 | 1607 | ||
1916 | /* First we initialize the device list. Since console and network | 1608 | /* First we initialize the device list. We keep a pointer to the last |
1917 | * device receive input from a file descriptor, we keep an fdset | 1609 | * device, and the next interrupt number to use for devices (1: |
1918 | * (infds) and the maximum fd number (max_infd) with the head of the | 1610 | * remember that 0 is used by the timer). */ |
1919 | * list. We also keep a pointer to the last device. Finally, we keep | ||
1920 | * the next interrupt number to use for devices (1: remember that 0 is | ||
1921 | * used by the timer). */ | ||
1922 | FD_ZERO(&devices.infds); | ||
1923 | devices.max_infd = -1; | ||
1924 | devices.lastdev = NULL; | 1611 | devices.lastdev = NULL; |
1925 | devices.next_irq = 1; | 1612 | devices.next_irq = 1; |
1926 | 1613 | ||
@@ -1978,9 +1665,6 @@ int main(int argc, char *argv[]) | |||
1978 | /* We always have a console device */ | 1665 | /* We always have a console device */ |
1979 | setup_console(); | 1666 | setup_console(); |
1980 | 1667 | ||
1981 | /* We can timeout waiting for Guest network transmit. */ | ||
1982 | setup_timeout(); | ||
1983 | |||
1984 | /* Now we load the kernel */ | 1668 | /* Now we load the kernel */ |
1985 | start = load_kernel(open_or_die(argv[optind+1], O_RDONLY)); | 1669 | start = load_kernel(open_or_die(argv[optind+1], O_RDONLY)); |
1986 | 1670 | ||
@@ -2021,10 +1705,11 @@ int main(int argc, char *argv[]) | |||
2021 | * /dev/lguest file descriptor. */ | 1705 | * /dev/lguest file descriptor. */ |
2022 | tell_kernel(start); | 1706 | tell_kernel(start); |
2023 | 1707 | ||
2024 | /* We clone off a thread, which wakes the Launcher whenever one of the | 1708 | /* Ensure that we terminate if a child dies. */ |
2025 | * input file descriptors needs attention. We call this the Waker, and | 1709 | signal(SIGCHLD, kill_launcher); |
2026 | * we'll cover it in a moment. */ | 1710 | |
2027 | setup_waker(); | 1711 | /* If we exit via err(), this kills all the threads, restores tty. */ |
1712 | atexit(cleanup_devices); | ||
2028 | 1713 | ||
2029 | /* Finally, run the Guest. This doesn't return. */ | 1714 | /* Finally, run the Guest. This doesn't return. */ |
2030 | run_guest(); | 1715 | run_guest(); |