diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2008-07-28 21:16:26 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2008-07-28 21:16:26 -0400 |
commit | 7874d35173d549c1a2b2f77c4b1f94379fa65698 (patch) | |
tree | 995aa7212619dbdebb43b124cae2378562dd3065 | |
parent | 5dfb66ba8c4a96eb732942c9f78629e4db1a51d4 (diff) | |
parent | 8c79873da0d2bedf4ad6b868c54e426bb0a2fe38 (diff) |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-for-linus
* git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-for-linus:
lguest: turn Waker into a thread, not a process
lguest: Enlarge virtio rings
lguest: Use GSO/IFF_VNET_HDR extensions on tun/tap
lguest: Remove 'network: no dma buffer!' warning
lguest: Adaptive timeout
lguest: Tell Guest net not to notify us on every packet xmit
lguest: net block unneeded receive queue update notifications
lguest: wrap last_avail accesses.
lguest: use cpu capability accessors
lguest: virtio-rng support
lguest: Support assigning a MAC address
lguest: Don't leak /dev/zero fd
lguest: fix verbose printing of device features.
lguest: fix switcher_page leak on unload
lguest: Guest int3 fix
lguest: set max_pfn_mapped, growl loudly at Yinghai Lu
-rw-r--r-- | Documentation/lguest/lguest.c | 519 | ||||
-rw-r--r-- | arch/x86/lguest/boot.c | 3 | ||||
-rw-r--r-- | drivers/lguest/core.c | 1 | ||||
-rw-r--r-- | drivers/lguest/interrupts_and_traps.c | 24 | ||||
-rw-r--r-- | drivers/lguest/x86/core.c | 4 |
5 files changed, 409 insertions, 142 deletions
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index 82fafe0429fe..b88b0ea54e90 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c | |||
@@ -36,11 +36,13 @@ | |||
36 | #include <sched.h> | 36 | #include <sched.h> |
37 | #include <limits.h> | 37 | #include <limits.h> |
38 | #include <stddef.h> | 38 | #include <stddef.h> |
39 | #include <signal.h> | ||
39 | #include "linux/lguest_launcher.h" | 40 | #include "linux/lguest_launcher.h" |
40 | #include "linux/virtio_config.h" | 41 | #include "linux/virtio_config.h" |
41 | #include "linux/virtio_net.h" | 42 | #include "linux/virtio_net.h" |
42 | #include "linux/virtio_blk.h" | 43 | #include "linux/virtio_blk.h" |
43 | #include "linux/virtio_console.h" | 44 | #include "linux/virtio_console.h" |
45 | #include "linux/virtio_rng.h" | ||
44 | #include "linux/virtio_ring.h" | 46 | #include "linux/virtio_ring.h" |
45 | #include "asm-x86/bootparam.h" | 47 | #include "asm-x86/bootparam.h" |
46 | /*L:110 We can ignore the 39 include files we need for this program, but I do | 48 | /*L:110 We can ignore the 39 include files we need for this program, but I do |
@@ -64,8 +66,8 @@ typedef uint8_t u8; | |||
64 | #endif | 66 | #endif |
65 | /* We can have up to 256 pages for devices. */ | 67 | /* We can have up to 256 pages for devices. */ |
66 | #define DEVICE_PAGES 256 | 68 | #define DEVICE_PAGES 256 |
67 | /* This will occupy 2 pages: it must be a power of 2. */ | 69 | /* This will occupy 3 pages: it must be a power of 2. */ |
68 | #define VIRTQUEUE_NUM 128 | 70 | #define VIRTQUEUE_NUM 256 |
69 | 71 | ||
70 | /*L:120 verbose is both a global flag and a macro. The C preprocessor allows | 72 | /*L:120 verbose is both a global flag and a macro. The C preprocessor allows |
71 | * this, and although I wouldn't recommend it, it works quite nicely here. */ | 73 | * this, and although I wouldn't recommend it, it works quite nicely here. */ |
@@ -74,12 +76,19 @@ static bool verbose; | |||
74 | do { if (verbose) printf(args); } while(0) | 76 | do { if (verbose) printf(args); } while(0) |
75 | /*:*/ | 77 | /*:*/ |
76 | 78 | ||
77 | /* The pipe to send commands to the waker process */ | 79 | /* File descriptors for the Waker. */ |
78 | static int waker_fd; | 80 | struct { |
81 | int pipe[2]; | ||
82 | int lguest_fd; | ||
83 | } waker_fds; | ||
84 | |||
79 | /* The pointer to the start of guest memory. */ | 85 | /* The pointer to the start of guest memory. */ |
80 | static void *guest_base; | 86 | static void *guest_base; |
81 | /* The maximum guest physical address allowed, and maximum possible. */ | 87 | /* The maximum guest physical address allowed, and maximum possible. */ |
82 | static unsigned long guest_limit, guest_max; | 88 | static unsigned long guest_limit, guest_max; |
89 | /* The pipe for signal handler to write to. */ | ||
90 | static int timeoutpipe[2]; | ||
91 | static unsigned int timeout_usec = 500; | ||
83 | 92 | ||
84 | /* a per-cpu variable indicating whose vcpu is currently running */ | 93 | /* a per-cpu variable indicating whose vcpu is currently running */ |
85 | static unsigned int __thread cpu_id; | 94 | static unsigned int __thread cpu_id; |
@@ -155,11 +164,14 @@ struct virtqueue | |||
155 | /* Last available index we saw. */ | 164 | /* Last available index we saw. */ |
156 | u16 last_avail_idx; | 165 | u16 last_avail_idx; |
157 | 166 | ||
158 | /* The routine to call when the Guest pings us. */ | 167 | /* The routine to call when the Guest pings us, or timeout. */ |
159 | void (*handle_output)(int fd, struct virtqueue *me); | 168 | void (*handle_output)(int fd, struct virtqueue *me, bool timeout); |
160 | 169 | ||
161 | /* Outstanding buffers */ | 170 | /* Outstanding buffers */ |
162 | unsigned int inflight; | 171 | unsigned int inflight; |
172 | |||
173 | /* Is this blocked awaiting a timer? */ | ||
174 | bool blocked; | ||
163 | }; | 175 | }; |
164 | 176 | ||
165 | /* Remember the arguments to the program so we can "reboot" */ | 177 | /* Remember the arguments to the program so we can "reboot" */ |
@@ -190,6 +202,9 @@ static void *_convert(struct iovec *iov, size_t size, size_t align, | |||
190 | return iov->iov_base; | 202 | return iov->iov_base; |
191 | } | 203 | } |
192 | 204 | ||
205 | /* Wrapper for the last available index. Makes it easier to change. */ | ||
206 | #define lg_last_avail(vq) ((vq)->last_avail_idx) | ||
207 | |||
193 | /* The virtio configuration space is defined to be little-endian. x86 is | 208 | /* The virtio configuration space is defined to be little-endian. x86 is |
194 | * little-endian too, but it's nice to be explicit so we have these helpers. */ | 209 | * little-endian too, but it's nice to be explicit so we have these helpers. */ |
195 | #define cpu_to_le16(v16) (v16) | 210 | #define cpu_to_le16(v16) (v16) |
@@ -199,6 +214,33 @@ static void *_convert(struct iovec *iov, size_t size, size_t align, | |||
199 | #define le32_to_cpu(v32) (v32) | 214 | #define le32_to_cpu(v32) (v32) |
200 | #define le64_to_cpu(v64) (v64) | 215 | #define le64_to_cpu(v64) (v64) |
201 | 216 | ||
217 | /* Is this iovec empty? */ | ||
218 | static bool iov_empty(const struct iovec iov[], unsigned int num_iov) | ||
219 | { | ||
220 | unsigned int i; | ||
221 | |||
222 | for (i = 0; i < num_iov; i++) | ||
223 | if (iov[i].iov_len) | ||
224 | return false; | ||
225 | return true; | ||
226 | } | ||
227 | |||
228 | /* Take len bytes from the front of this iovec. */ | ||
229 | static void iov_consume(struct iovec iov[], unsigned num_iov, unsigned len) | ||
230 | { | ||
231 | unsigned int i; | ||
232 | |||
233 | for (i = 0; i < num_iov; i++) { | ||
234 | unsigned int used; | ||
235 | |||
236 | used = iov[i].iov_len < len ? iov[i].iov_len : len; | ||
237 | iov[i].iov_base += used; | ||
238 | iov[i].iov_len -= used; | ||
239 | len -= used; | ||
240 | } | ||
241 | assert(len == 0); | ||
242 | } | ||
243 | |||
202 | /* The device virtqueue descriptors are followed by feature bitmasks. */ | 244 | /* The device virtqueue descriptors are followed by feature bitmasks. */ |
203 | static u8 *get_feature_bits(struct device *dev) | 245 | static u8 *get_feature_bits(struct device *dev) |
204 | { | 246 | { |
@@ -254,6 +296,7 @@ static void *map_zeroed_pages(unsigned int num) | |||
254 | PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0); | 296 | PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0); |
255 | if (addr == MAP_FAILED) | 297 | if (addr == MAP_FAILED) |
256 | err(1, "Mmaping %u pages of /dev/zero", num); | 298 | err(1, "Mmaping %u pages of /dev/zero", num); |
299 | close(fd); | ||
257 | 300 | ||
258 | return addr; | 301 | return addr; |
259 | } | 302 | } |
@@ -540,69 +583,64 @@ static void add_device_fd(int fd) | |||
540 | * watch, but handing a file descriptor mask through to the kernel is fairly | 583 | * watch, but handing a file descriptor mask through to the kernel is fairly |
541 | * icky. | 584 | * icky. |
542 | * | 585 | * |
543 | * Instead, we fork off a process which watches the file descriptors and writes | 586 | * Instead, we clone off a thread which watches the file descriptors and writes |
544 | * the LHREQ_BREAK command to the /dev/lguest file descriptor to tell the Host | 587 | * the LHREQ_BREAK command to the /dev/lguest file descriptor to tell the Host |
545 | * stop running the Guest. This causes the Launcher to return from the | 588 | * stop running the Guest. This causes the Launcher to return from the |
546 | * /dev/lguest read with -EAGAIN, where it will write to /dev/lguest to reset | 589 | * /dev/lguest read with -EAGAIN, where it will write to /dev/lguest to reset |
547 | * the LHREQ_BREAK and wake us up again. | 590 | * the LHREQ_BREAK and wake us up again. |
548 | * | 591 | * |
549 | * This, of course, is merely a different *kind* of icky. | 592 | * This, of course, is merely a different *kind* of icky. |
593 | * | ||
594 | * Given my well-known antipathy to threads, I'd prefer to use processes. But | ||
595 | * it's easier to share Guest memory with threads, and trivial to share the | ||
596 | * devices.infds as the Launcher changes it. | ||
550 | */ | 597 | */ |
551 | static void wake_parent(int pipefd, int lguest_fd) | 598 | static int waker(void *unused) |
552 | { | 599 | { |
553 | /* Add the pipe from the Launcher to the fdset in the device_list, so | 600 | /* Close the write end of the pipe: only the Launcher has it open. */ |
554 | * we watch it, too. */ | 601 | close(waker_fds.pipe[1]); |
555 | add_device_fd(pipefd); | ||
556 | 602 | ||
557 | for (;;) { | 603 | for (;;) { |
558 | fd_set rfds = devices.infds; | 604 | fd_set rfds = devices.infds; |
559 | unsigned long args[] = { LHREQ_BREAK, 1 }; | 605 | unsigned long args[] = { LHREQ_BREAK, 1 }; |
606 | unsigned int maxfd = devices.max_infd; | ||
607 | |||
608 | /* We also listen to the pipe from the Launcher. */ | ||
609 | FD_SET(waker_fds.pipe[0], &rfds); | ||
610 | if (waker_fds.pipe[0] > maxfd) | ||
611 | maxfd = waker_fds.pipe[0]; | ||
560 | 612 | ||
561 | /* Wait until input is ready from one of the devices. */ | 613 | /* Wait until input is ready from one of the devices. */ |
562 | select(devices.max_infd+1, &rfds, NULL, NULL, NULL); | 614 | select(maxfd+1, &rfds, NULL, NULL, NULL); |
563 | /* Is it a message from the Launcher? */ | 615 | |
564 | if (FD_ISSET(pipefd, &rfds)) { | 616 | /* Message from Launcher? */ |
565 | int fd; | 617 | if (FD_ISSET(waker_fds.pipe[0], &rfds)) { |
566 | /* If read() returns 0, it means the Launcher has | 618 | char c; |
567 | * exited. We silently follow. */ | 619 | /* If this fails, then assume Launcher has exited. |
568 | if (read(pipefd, &fd, sizeof(fd)) == 0) | 620 | * Don't do anything on exit: we're just a thread! */ |
569 | exit(0); | 621 | if (read(waker_fds.pipe[0], &c, 1) != 1) |
570 | /* Otherwise it's telling us to change what file | 622 | _exit(0); |
571 | * descriptors we're to listen to. Positive means | 623 | continue; |
572 | * listen to a new one, negative means stop | 624 | } |
573 | * listening. */ | 625 | |
574 | if (fd >= 0) | 626 | /* Send LHREQ_BREAK command to snap the Launcher out of it. */ |
575 | FD_SET(fd, &devices.infds); | 627 | pwrite(waker_fds.lguest_fd, args, sizeof(args), cpu_id); |
576 | else | ||
577 | FD_CLR(-fd - 1, &devices.infds); | ||
578 | } else /* Send LHREQ_BREAK command. */ | ||
579 | pwrite(lguest_fd, args, sizeof(args), cpu_id); | ||
580 | } | 628 | } |
629 | return 0; | ||
581 | } | 630 | } |
582 | 631 | ||
583 | /* This routine just sets up a pipe to the Waker process. */ | 632 | /* This routine just sets up a pipe to the Waker process. */ |
584 | static int setup_waker(int lguest_fd) | 633 | static void setup_waker(int lguest_fd) |
585 | { | 634 | { |
586 | int pipefd[2], child; | 635 | /* This pipe is closed when Launcher dies, telling Waker. */ |
587 | 636 | if (pipe(waker_fds.pipe) != 0) | |
588 | /* We create a pipe to talk to the Waker, and also so it knows when the | 637 | err(1, "Creating pipe for Waker"); |
589 | * Launcher dies (and closes pipe). */ | ||
590 | pipe(pipefd); | ||
591 | child = fork(); | ||
592 | if (child == -1) | ||
593 | err(1, "forking"); | ||
594 | |||
595 | if (child == 0) { | ||
596 | /* We are the Waker: close the "writing" end of our copy of the | ||
597 | * pipe and start waiting for input. */ | ||
598 | close(pipefd[1]); | ||
599 | wake_parent(pipefd[0], lguest_fd); | ||
600 | } | ||
601 | /* Close the reading end of our copy of the pipe. */ | ||
602 | close(pipefd[0]); | ||
603 | 638 | ||
604 | /* Here is the fd used to talk to the waker. */ | 639 | /* Waker also needs to know the lguest fd */ |
605 | return pipefd[1]; | 640 | waker_fds.lguest_fd = lguest_fd; |
641 | |||
642 | if (clone(waker, malloc(4096) + 4096, CLONE_VM | SIGCHLD, NULL) == -1) | ||
643 | err(1, "Creating Waker"); | ||
606 | } | 644 | } |
607 | 645 | ||
608 | /* | 646 | /* |
@@ -661,19 +699,22 @@ static unsigned get_vq_desc(struct virtqueue *vq, | |||
661 | unsigned int *out_num, unsigned int *in_num) | 699 | unsigned int *out_num, unsigned int *in_num) |
662 | { | 700 | { |
663 | unsigned int i, head; | 701 | unsigned int i, head; |
702 | u16 last_avail; | ||
664 | 703 | ||
665 | /* Check it isn't doing very strange things with descriptor numbers. */ | 704 | /* Check it isn't doing very strange things with descriptor numbers. */ |
666 | if ((u16)(vq->vring.avail->idx - vq->last_avail_idx) > vq->vring.num) | 705 | last_avail = lg_last_avail(vq); |
706 | if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num) | ||
667 | errx(1, "Guest moved used index from %u to %u", | 707 | errx(1, "Guest moved used index from %u to %u", |
668 | vq->last_avail_idx, vq->vring.avail->idx); | 708 | last_avail, vq->vring.avail->idx); |
669 | 709 | ||
670 | /* If there's nothing new since last we looked, return invalid. */ | 710 | /* If there's nothing new since last we looked, return invalid. */ |
671 | if (vq->vring.avail->idx == vq->last_avail_idx) | 711 | if (vq->vring.avail->idx == last_avail) |
672 | return vq->vring.num; | 712 | return vq->vring.num; |
673 | 713 | ||
674 | /* Grab the next descriptor number they're advertising, and increment | 714 | /* Grab the next descriptor number they're advertising, and increment |
675 | * the index we've seen. */ | 715 | * the index we've seen. */ |
676 | head = vq->vring.avail->ring[vq->last_avail_idx++ % vq->vring.num]; | 716 | head = vq->vring.avail->ring[last_avail % vq->vring.num]; |
717 | lg_last_avail(vq)++; | ||
677 | 718 | ||
678 | /* If their number is silly, that's a fatal mistake. */ | 719 | /* If their number is silly, that's a fatal mistake. */ |
679 | if (head >= vq->vring.num) | 720 | if (head >= vq->vring.num) |
@@ -821,8 +862,8 @@ static bool handle_console_input(int fd, struct device *dev) | |||
821 | unsigned long args[] = { LHREQ_BREAK, 0 }; | 862 | unsigned long args[] = { LHREQ_BREAK, 0 }; |
822 | /* Close the fd so Waker will know it has to | 863 | /* Close the fd so Waker will know it has to |
823 | * exit. */ | 864 | * exit. */ |
824 | close(waker_fd); | 865 | close(waker_fds.pipe[1]); |
825 | /* Just in case waker is blocked in BREAK, send | 866 | /* Just in case Waker is blocked in BREAK, send |
826 | * unbreak now. */ | 867 | * unbreak now. */ |
827 | write(fd, args, sizeof(args)); | 868 | write(fd, args, sizeof(args)); |
828 | exit(2); | 869 | exit(2); |
@@ -839,7 +880,7 @@ static bool handle_console_input(int fd, struct device *dev) | |||
839 | 880 | ||
840 | /* Handling output for console is simple: we just get all the output buffers | 881 | /* Handling output for console is simple: we just get all the output buffers |
841 | * and write them to stdout. */ | 882 | * and write them to stdout. */ |
842 | static void handle_console_output(int fd, struct virtqueue *vq) | 883 | static void handle_console_output(int fd, struct virtqueue *vq, bool timeout) |
843 | { | 884 | { |
844 | unsigned int head, out, in; | 885 | unsigned int head, out, in; |
845 | int len; | 886 | int len; |
@@ -854,6 +895,21 @@ static void handle_console_output(int fd, struct virtqueue *vq) | |||
854 | } | 895 | } |
855 | } | 896 | } |
856 | 897 | ||
898 | static void block_vq(struct virtqueue *vq) | ||
899 | { | ||
900 | struct itimerval itm; | ||
901 | |||
902 | vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; | ||
903 | vq->blocked = true; | ||
904 | |||
905 | itm.it_interval.tv_sec = 0; | ||
906 | itm.it_interval.tv_usec = 0; | ||
907 | itm.it_value.tv_sec = 0; | ||
908 | itm.it_value.tv_usec = timeout_usec; | ||
909 | |||
910 | setitimer(ITIMER_REAL, &itm, NULL); | ||
911 | } | ||
912 | |||
857 | /* | 913 | /* |
858 | * The Network | 914 | * The Network |
859 | * | 915 | * |
@@ -861,22 +917,34 @@ static void handle_console_output(int fd, struct virtqueue *vq) | |||
861 | * and write them (ignoring the first element) to this device's file descriptor | 917 | * and write them (ignoring the first element) to this device's file descriptor |
862 | * (/dev/net/tun). | 918 | * (/dev/net/tun). |
863 | */ | 919 | */ |
864 | static void handle_net_output(int fd, struct virtqueue *vq) | 920 | static void handle_net_output(int fd, struct virtqueue *vq, bool timeout) |
865 | { | 921 | { |
866 | unsigned int head, out, in; | 922 | unsigned int head, out, in, num = 0; |
867 | int len; | 923 | int len; |
868 | struct iovec iov[vq->vring.num]; | 924 | struct iovec iov[vq->vring.num]; |
925 | static int last_timeout_num; | ||
869 | 926 | ||
870 | /* Keep getting output buffers from the Guest until we run out. */ | 927 | /* Keep getting output buffers from the Guest until we run out. */ |
871 | while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) { | 928 | while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) { |
872 | if (in) | 929 | if (in) |
873 | errx(1, "Input buffers in output queue?"); | 930 | errx(1, "Input buffers in output queue?"); |
874 | /* Check header, but otherwise ignore it (we told the Guest we | 931 | len = writev(vq->dev->fd, iov, out); |
875 | * supported no features, so it shouldn't have anything | 932 | if (len < 0) |
876 | * interesting). */ | 933 | err(1, "Writing network packet to tun"); |
877 | (void)convert(&iov[0], struct virtio_net_hdr); | ||
878 | len = writev(vq->dev->fd, iov+1, out-1); | ||
879 | add_used_and_trigger(fd, vq, head, len); | 934 | add_used_and_trigger(fd, vq, head, len); |
935 | num++; | ||
936 | } | ||
937 | |||
938 | /* Block further kicks and set up a timer if we saw anything. */ | ||
939 | if (!timeout && num) | ||
940 | block_vq(vq); | ||
941 | |||
942 | if (timeout) { | ||
943 | if (num < last_timeout_num) | ||
944 | timeout_usec += 10; | ||
945 | else if (timeout_usec > 1) | ||
946 | timeout_usec--; | ||
947 | last_timeout_num = num; | ||
880 | } | 948 | } |
881 | } | 949 | } |
882 | 950 | ||
@@ -887,7 +955,6 @@ static bool handle_tun_input(int fd, struct device *dev) | |||
887 | unsigned int head, in_num, out_num; | 955 | unsigned int head, in_num, out_num; |
888 | int len; | 956 | int len; |
889 | struct iovec iov[dev->vq->vring.num]; | 957 | struct iovec iov[dev->vq->vring.num]; |
890 | struct virtio_net_hdr *hdr; | ||
891 | 958 | ||
892 | /* First we need a network buffer from the Guest's recv virtqueue. */ | 959 | /* First we need a network buffer from the Guest's recv virtqueue. */ |
893 | head = get_vq_desc(dev->vq, iov, &out_num, &in_num); | 960 | head = get_vq_desc(dev->vq, iov, &out_num, &in_num); |
@@ -896,25 +963,23 @@ static bool handle_tun_input(int fd, struct device *dev) | |||
896 | * early, the Guest won't be ready yet. Wait until the device | 963 | * early, the Guest won't be ready yet. Wait until the device |
897 | * status says it's ready. */ | 964 | * status says it's ready. */ |
898 | /* FIXME: Actually want DRIVER_ACTIVE here. */ | 965 | /* FIXME: Actually want DRIVER_ACTIVE here. */ |
899 | if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) | 966 | |
900 | warn("network: no dma buffer!"); | 967 | /* Now tell it we want to know if new things appear. */ |
968 | dev->vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; | ||
969 | wmb(); | ||
970 | |||
901 | /* We'll turn this back on if input buffers are registered. */ | 971 | /* We'll turn this back on if input buffers are registered. */ |
902 | return false; | 972 | return false; |
903 | } else if (out_num) | 973 | } else if (out_num) |
904 | errx(1, "Output buffers in network recv queue?"); | 974 | errx(1, "Output buffers in network recv queue?"); |
905 | 975 | ||
906 | /* First element is the header: we set it to 0 (no features). */ | ||
907 | hdr = convert(&iov[0], struct virtio_net_hdr); | ||
908 | hdr->flags = 0; | ||
909 | hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE; | ||
910 | |||
911 | /* Read the packet from the device directly into the Guest's buffer. */ | 976 | /* Read the packet from the device directly into the Guest's buffer. */ |
912 | len = readv(dev->fd, iov+1, in_num-1); | 977 | len = readv(dev->fd, iov, in_num); |
913 | if (len <= 0) | 978 | if (len <= 0) |
914 | err(1, "reading network"); | 979 | err(1, "reading network"); |
915 | 980 | ||
916 | /* Tell the Guest about the new packet. */ | 981 | /* Tell the Guest about the new packet. */ |
917 | add_used_and_trigger(fd, dev->vq, head, sizeof(*hdr) + len); | 982 | add_used_and_trigger(fd, dev->vq, head, len); |
918 | 983 | ||
919 | verbose("tun input packet len %i [%02x %02x] (%s)\n", len, | 984 | verbose("tun input packet len %i [%02x %02x] (%s)\n", len, |
920 | ((u8 *)iov[1].iov_base)[0], ((u8 *)iov[1].iov_base)[1], | 985 | ((u8 *)iov[1].iov_base)[0], ((u8 *)iov[1].iov_base)[1], |
@@ -927,11 +992,18 @@ static bool handle_tun_input(int fd, struct device *dev) | |||
927 | /*L:215 This is the callback attached to the network and console input | 992 | /*L:215 This is the callback attached to the network and console input |
928 | * virtqueues: it ensures we try again, in case we stopped console or net | 993 | * virtqueues: it ensures we try again, in case we stopped console or net |
929 | * delivery because Guest didn't have any buffers. */ | 994 | * delivery because Guest didn't have any buffers. */ |
930 | static void enable_fd(int fd, struct virtqueue *vq) | 995 | static void enable_fd(int fd, struct virtqueue *vq, bool timeout) |
931 | { | 996 | { |
932 | add_device_fd(vq->dev->fd); | 997 | add_device_fd(vq->dev->fd); |
933 | /* Tell waker to listen to it again */ | 998 | /* Snap the Waker out of its select loop. */ |
934 | write(waker_fd, &vq->dev->fd, sizeof(vq->dev->fd)); | 999 | write(waker_fds.pipe[1], "", 1); |
1000 | } | ||
1001 | |||
1002 | static void net_enable_fd(int fd, struct virtqueue *vq, bool timeout) | ||
1003 | { | ||
1004 | /* We don't need to know again when Guest refills receive buffer. */ | ||
1005 | vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; | ||
1006 | enable_fd(fd, vq, timeout); | ||
935 | } | 1007 | } |
936 | 1008 | ||
937 | /* When the Guest tells us they updated the status field, we handle it. */ | 1009 | /* When the Guest tells us they updated the status field, we handle it. */ |
@@ -951,7 +1023,7 @@ static void update_device_status(struct device *dev) | |||
951 | for (vq = dev->vq; vq; vq = vq->next) { | 1023 | for (vq = dev->vq; vq; vq = vq->next) { |
952 | memset(vq->vring.desc, 0, | 1024 | memset(vq->vring.desc, 0, |
953 | vring_size(vq->config.num, getpagesize())); | 1025 | vring_size(vq->config.num, getpagesize())); |
954 | vq->last_avail_idx = 0; | 1026 | lg_last_avail(vq) = 0; |
955 | } | 1027 | } |
956 | } else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) { | 1028 | } else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) { |
957 | warnx("Device %s configuration FAILED", dev->name); | 1029 | warnx("Device %s configuration FAILED", dev->name); |
@@ -960,10 +1032,10 @@ static void update_device_status(struct device *dev) | |||
960 | 1032 | ||
961 | verbose("Device %s OK: offered", dev->name); | 1033 | verbose("Device %s OK: offered", dev->name); |
962 | for (i = 0; i < dev->desc->feature_len; i++) | 1034 | for (i = 0; i < dev->desc->feature_len; i++) |
963 | verbose(" %08x", get_feature_bits(dev)[i]); | 1035 | verbose(" %02x", get_feature_bits(dev)[i]); |
964 | verbose(", accepted"); | 1036 | verbose(", accepted"); |
965 | for (i = 0; i < dev->desc->feature_len; i++) | 1037 | for (i = 0; i < dev->desc->feature_len; i++) |
966 | verbose(" %08x", get_feature_bits(dev) | 1038 | verbose(" %02x", get_feature_bits(dev) |
967 | [dev->desc->feature_len+i]); | 1039 | [dev->desc->feature_len+i]); |
968 | 1040 | ||
969 | if (dev->ready) | 1041 | if (dev->ready) |
@@ -1000,7 +1072,7 @@ static void handle_output(int fd, unsigned long addr) | |||
1000 | if (strcmp(vq->dev->name, "console") != 0) | 1072 | if (strcmp(vq->dev->name, "console") != 0) |
1001 | verbose("Output to %s\n", vq->dev->name); | 1073 | verbose("Output to %s\n", vq->dev->name); |
1002 | if (vq->handle_output) | 1074 | if (vq->handle_output) |
1003 | vq->handle_output(fd, vq); | 1075 | vq->handle_output(fd, vq, false); |
1004 | return; | 1076 | return; |
1005 | } | 1077 | } |
1006 | } | 1078 | } |
@@ -1014,6 +1086,29 @@ static void handle_output(int fd, unsigned long addr) | |||
1014 | strnlen(from_guest_phys(addr), guest_limit - addr)); | 1086 | strnlen(from_guest_phys(addr), guest_limit - addr)); |
1015 | } | 1087 | } |
1016 | 1088 | ||
1089 | static void handle_timeout(int fd) | ||
1090 | { | ||
1091 | char buf[32]; | ||
1092 | struct device *i; | ||
1093 | struct virtqueue *vq; | ||
1094 | |||
1095 | /* Clear the pipe */ | ||
1096 | read(timeoutpipe[0], buf, sizeof(buf)); | ||
1097 | |||
1098 | /* Check each device and virtqueue: flush blocked ones. */ | ||
1099 | for (i = devices.dev; i; i = i->next) { | ||
1100 | for (vq = i->vq; vq; vq = vq->next) { | ||
1101 | if (!vq->blocked) | ||
1102 | continue; | ||
1103 | |||
1104 | vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; | ||
1105 | vq->blocked = false; | ||
1106 | if (vq->handle_output) | ||
1107 | vq->handle_output(fd, vq, true); | ||
1108 | } | ||
1109 | } | ||
1110 | } | ||
1111 | |||
1017 | /* This is called when the Waker wakes us up: check for incoming file | 1112 | /* This is called when the Waker wakes us up: check for incoming file |
1018 | * descriptors. */ | 1113 | * descriptors. */ |
1019 | static void handle_input(int fd) | 1114 | static void handle_input(int fd) |
@@ -1024,16 +1119,20 @@ static void handle_input(int fd) | |||
1024 | for (;;) { | 1119 | for (;;) { |
1025 | struct device *i; | 1120 | struct device *i; |
1026 | fd_set fds = devices.infds; | 1121 | fd_set fds = devices.infds; |
1122 | int num; | ||
1027 | 1123 | ||
1124 | num = select(devices.max_infd+1, &fds, NULL, NULL, &poll); | ||
1125 | /* Could get interrupted */ | ||
1126 | if (num < 0) | ||
1127 | continue; | ||
1028 | /* If nothing is ready, we're done. */ | 1128 | /* If nothing is ready, we're done. */ |
1029 | if (select(devices.max_infd+1, &fds, NULL, NULL, &poll) == 0) | 1129 | if (num == 0) |
1030 | break; | 1130 | break; |
1031 | 1131 | ||
1032 | /* Otherwise, call the device(s) which have readable file | 1132 | /* Otherwise, call the device(s) which have readable file |
1033 | * descriptors and a method of handling them. */ | 1133 | * descriptors and a method of handling them. */ |
1034 | for (i = devices.dev; i; i = i->next) { | 1134 | for (i = devices.dev; i; i = i->next) { |
1035 | if (i->handle_input && FD_ISSET(i->fd, &fds)) { | 1135 | if (i->handle_input && FD_ISSET(i->fd, &fds)) { |
1036 | int dev_fd; | ||
1037 | if (i->handle_input(fd, i)) | 1136 | if (i->handle_input(fd, i)) |
1038 | continue; | 1137 | continue; |
1039 | 1138 | ||
@@ -1043,13 +1142,12 @@ static void handle_input(int fd) | |||
1043 | * buffers to deliver into. Console also uses | 1142 | * buffers to deliver into. Console also uses |
1044 | * it when it discovers that stdin is closed. */ | 1143 | * it when it discovers that stdin is closed. */ |
1045 | FD_CLR(i->fd, &devices.infds); | 1144 | FD_CLR(i->fd, &devices.infds); |
1046 | /* Tell waker to ignore it too, by sending a | ||
1047 | * negative fd number (-1, since 0 is a valid | ||
1048 | * FD number). */ | ||
1049 | dev_fd = -i->fd - 1; | ||
1050 | write(waker_fd, &dev_fd, sizeof(dev_fd)); | ||
1051 | } | 1145 | } |
1052 | } | 1146 | } |
1147 | |||
1148 | /* Is this the timeout fd? */ | ||
1149 | if (FD_ISSET(timeoutpipe[0], &fds)) | ||
1150 | handle_timeout(fd); | ||
1053 | } | 1151 | } |
1054 | } | 1152 | } |
1055 | 1153 | ||
@@ -1098,7 +1196,7 @@ static struct lguest_device_desc *new_dev_desc(u16 type) | |||
1098 | /* Each device descriptor is followed by the description of its virtqueues. We | 1196 | /* Each device descriptor is followed by the description of its virtqueues. We |
1099 | * specify how many descriptors the virtqueue is to have. */ | 1197 | * specify how many descriptors the virtqueue is to have. */ |
1100 | static void add_virtqueue(struct device *dev, unsigned int num_descs, | 1198 | static void add_virtqueue(struct device *dev, unsigned int num_descs, |
1101 | void (*handle_output)(int fd, struct virtqueue *me)) | 1199 | void (*handle_output)(int, struct virtqueue *, bool)) |
1102 | { | 1200 | { |
1103 | unsigned int pages; | 1201 | unsigned int pages; |
1104 | struct virtqueue **i, *vq = malloc(sizeof(*vq)); | 1202 | struct virtqueue **i, *vq = malloc(sizeof(*vq)); |
@@ -1114,6 +1212,7 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, | |||
1114 | vq->last_avail_idx = 0; | 1212 | vq->last_avail_idx = 0; |
1115 | vq->dev = dev; | 1213 | vq->dev = dev; |
1116 | vq->inflight = 0; | 1214 | vq->inflight = 0; |
1215 | vq->blocked = false; | ||
1117 | 1216 | ||
1118 | /* Initialize the configuration. */ | 1217 | /* Initialize the configuration. */ |
1119 | vq->config.num = num_descs; | 1218 | vq->config.num = num_descs; |
@@ -1246,6 +1345,24 @@ static void setup_console(void) | |||
1246 | } | 1345 | } |
1247 | /*:*/ | 1346 | /*:*/ |
1248 | 1347 | ||
1348 | static void timeout_alarm(int sig) | ||
1349 | { | ||
1350 | write(timeoutpipe[1], "", 1); | ||
1351 | } | ||
1352 | |||
1353 | static void setup_timeout(void) | ||
1354 | { | ||
1355 | if (pipe(timeoutpipe) != 0) | ||
1356 | err(1, "Creating timeout pipe"); | ||
1357 | |||
1358 | if (fcntl(timeoutpipe[1], F_SETFL, | ||
1359 | fcntl(timeoutpipe[1], F_GETFL) | O_NONBLOCK) != 0) | ||
1360 | err(1, "Making timeout pipe nonblocking"); | ||
1361 | |||
1362 | add_device_fd(timeoutpipe[0]); | ||
1363 | signal(SIGALRM, timeout_alarm); | ||
1364 | } | ||
1365 | |||
1249 | /*M:010 Inter-guest networking is an interesting area. Simplest is to have a | 1366 | /*M:010 Inter-guest networking is an interesting area. Simplest is to have a |
1250 | * --sharenet=<name> option which opens or creates a named pipe. This can be | 1367 | * --sharenet=<name> option which opens or creates a named pipe. This can be |
1251 | * used to send packets to another guest in a 1:1 manner. | 1368 | * used to send packets to another guest in a 1:1 manner. |
@@ -1264,10 +1381,25 @@ static void setup_console(void) | |||
1264 | 1381 | ||
1265 | static u32 str2ip(const char *ipaddr) | 1382 | static u32 str2ip(const char *ipaddr) |
1266 | { | 1383 | { |
1267 | unsigned int byte[4]; | 1384 | unsigned int b[4]; |
1268 | 1385 | ||
1269 | sscanf(ipaddr, "%u.%u.%u.%u", &byte[0], &byte[1], &byte[2], &byte[3]); | 1386 | if (sscanf(ipaddr, "%u.%u.%u.%u", &b[0], &b[1], &b[2], &b[3]) != 4) |
1270 | return (byte[0] << 24) | (byte[1] << 16) | (byte[2] << 8) | byte[3]; | 1387 | errx(1, "Failed to parse IP address '%s'", ipaddr); |
1388 | return (b[0] << 24) | (b[1] << 16) | (b[2] << 8) | b[3]; | ||
1389 | } | ||
1390 | |||
1391 | static void str2mac(const char *macaddr, unsigned char mac[6]) | ||
1392 | { | ||
1393 | unsigned int m[6]; | ||
1394 | if (sscanf(macaddr, "%02x:%02x:%02x:%02x:%02x:%02x", | ||
1395 | &m[0], &m[1], &m[2], &m[3], &m[4], &m[5]) != 6) | ||
1396 | errx(1, "Failed to parse mac address '%s'", macaddr); | ||
1397 | mac[0] = m[0]; | ||
1398 | mac[1] = m[1]; | ||
1399 | mac[2] = m[2]; | ||
1400 | mac[3] = m[3]; | ||
1401 | mac[4] = m[4]; | ||
1402 | mac[5] = m[5]; | ||
1271 | } | 1403 | } |
1272 | 1404 | ||
1273 | /* This code is "adapted" from libbridge: it attaches the Host end of the | 1405 | /* This code is "adapted" from libbridge: it attaches the Host end of the |
@@ -1288,6 +1420,7 @@ static void add_to_bridge(int fd, const char *if_name, const char *br_name) | |||
1288 | errx(1, "interface %s does not exist!", if_name); | 1420 | errx(1, "interface %s does not exist!", if_name); |
1289 | 1421 | ||
1290 | strncpy(ifr.ifr_name, br_name, IFNAMSIZ); | 1422 | strncpy(ifr.ifr_name, br_name, IFNAMSIZ); |
1423 | ifr.ifr_name[IFNAMSIZ-1] = '\0'; | ||
1291 | ifr.ifr_ifindex = ifidx; | 1424 | ifr.ifr_ifindex = ifidx; |
1292 | if (ioctl(fd, SIOCBRADDIF, &ifr) < 0) | 1425 | if (ioctl(fd, SIOCBRADDIF, &ifr) < 0) |
1293 | err(1, "can't add %s to bridge %s", if_name, br_name); | 1426 | err(1, "can't add %s to bridge %s", if_name, br_name); |
@@ -1296,64 +1429,90 @@ static void add_to_bridge(int fd, const char *if_name, const char *br_name) | |||
1296 | /* This sets up the Host end of the network device with an IP address, brings | 1429 | /* This sets up the Host end of the network device with an IP address, brings |
1297 | * it up so packets will flow, then copies the MAC address into the hwaddr | 1430 | * it up so packets will flow, then copies the MAC address into the hwaddr |
1298 | * pointer. */ | 1431 | * pointer. */ |
1299 | static void configure_device(int fd, const char *devname, u32 ipaddr, | 1432 | static void configure_device(int fd, const char *tapif, u32 ipaddr) |
1300 | unsigned char hwaddr[6]) | ||
1301 | { | 1433 | { |
1302 | struct ifreq ifr; | 1434 | struct ifreq ifr; |
1303 | struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr; | 1435 | struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr; |
1304 | 1436 | ||
1305 | /* Don't read these incantations. Just cut & paste them like I did! */ | ||
1306 | memset(&ifr, 0, sizeof(ifr)); | 1437 | memset(&ifr, 0, sizeof(ifr)); |
1307 | strcpy(ifr.ifr_name, devname); | 1438 | strcpy(ifr.ifr_name, tapif); |
1439 | |||
1440 | /* Don't read these incantations. Just cut & paste them like I did! */ | ||
1308 | sin->sin_family = AF_INET; | 1441 | sin->sin_family = AF_INET; |
1309 | sin->sin_addr.s_addr = htonl(ipaddr); | 1442 | sin->sin_addr.s_addr = htonl(ipaddr); |
1310 | if (ioctl(fd, SIOCSIFADDR, &ifr) != 0) | 1443 | if (ioctl(fd, SIOCSIFADDR, &ifr) != 0) |
1311 | err(1, "Setting %s interface address", devname); | 1444 | err(1, "Setting %s interface address", tapif); |
1312 | ifr.ifr_flags = IFF_UP; | 1445 | ifr.ifr_flags = IFF_UP; |
1313 | if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0) | 1446 | if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0) |
1314 | err(1, "Bringing interface %s up", devname); | 1447 | err(1, "Bringing interface %s up", tapif); |
1448 | } | ||
1449 | |||
1450 | static void get_mac(int fd, const char *tapif, unsigned char hwaddr[6]) | ||
1451 | { | ||
1452 | struct ifreq ifr; | ||
1453 | |||
1454 | memset(&ifr, 0, sizeof(ifr)); | ||
1455 | strcpy(ifr.ifr_name, tapif); | ||
1315 | 1456 | ||
1316 | /* SIOC stands for Socket I/O Control. G means Get (vs S for Set | 1457 | /* SIOC stands for Socket I/O Control. G means Get (vs S for Set |
1317 | * above). IF means Interface, and HWADDR is hardware address. | 1458 | * above). IF means Interface, and HWADDR is hardware address. |
1318 | * Simple! */ | 1459 | * Simple! */ |
1319 | if (ioctl(fd, SIOCGIFHWADDR, &ifr) != 0) | 1460 | if (ioctl(fd, SIOCGIFHWADDR, &ifr) != 0) |
1320 | err(1, "getting hw address for %s", devname); | 1461 | err(1, "getting hw address for %s", tapif); |
1321 | memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, 6); | 1462 | memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, 6); |
1322 | } | 1463 | } |
1323 | 1464 | ||
1324 | /*L:195 Our network is a Host<->Guest network. This can either use bridging or | 1465 | static int get_tun_device(char tapif[IFNAMSIZ]) |
1325 | * routing, but the principle is the same: it uses the "tun" device to inject | ||
1326 | * packets into the Host as if they came in from a normal network card. We | ||
1327 | * just shunt packets between the Guest and the tun device. */ | ||
1328 | static void setup_tun_net(const char *arg) | ||
1329 | { | 1466 | { |
1330 | struct device *dev; | ||
1331 | struct ifreq ifr; | 1467 | struct ifreq ifr; |
1332 | int netfd, ipfd; | 1468 | int netfd; |
1333 | u32 ip; | 1469 | |
1334 | const char *br_name = NULL; | 1470 | /* Start with this zeroed. Messy but sure. */ |
1335 | struct virtio_net_config conf; | 1471 | memset(&ifr, 0, sizeof(ifr)); |
1336 | 1472 | ||
1337 | /* We open the /dev/net/tun device and tell it we want a tap device. A | 1473 | /* We open the /dev/net/tun device and tell it we want a tap device. A |
1338 | * tap device is like a tun device, only somehow different. To tell | 1474 | * tap device is like a tun device, only somehow different. To tell |
1339 | * the truth, I completely blundered my way through this code, but it | 1475 | * the truth, I completely blundered my way through this code, but it |
1340 | * works now! */ | 1476 | * works now! */ |
1341 | netfd = open_or_die("/dev/net/tun", O_RDWR); | 1477 | netfd = open_or_die("/dev/net/tun", O_RDWR); |
1342 | memset(&ifr, 0, sizeof(ifr)); | 1478 | ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR; |
1343 | ifr.ifr_flags = IFF_TAP | IFF_NO_PI; | ||
1344 | strcpy(ifr.ifr_name, "tap%d"); | 1479 | strcpy(ifr.ifr_name, "tap%d"); |
1345 | if (ioctl(netfd, TUNSETIFF, &ifr) != 0) | 1480 | if (ioctl(netfd, TUNSETIFF, &ifr) != 0) |
1346 | err(1, "configuring /dev/net/tun"); | 1481 | err(1, "configuring /dev/net/tun"); |
1482 | |||
1483 | if (ioctl(netfd, TUNSETOFFLOAD, | ||
1484 | TUN_F_CSUM|TUN_F_TSO4|TUN_F_TSO6|TUN_F_TSO_ECN) != 0) | ||
1485 | err(1, "Could not set features for tun device"); | ||
1486 | |||
1347 | /* We don't need checksums calculated for packets coming in this | 1487 | /* We don't need checksums calculated for packets coming in this |
1348 | * device: trust us! */ | 1488 | * device: trust us! */ |
1349 | ioctl(netfd, TUNSETNOCSUM, 1); | 1489 | ioctl(netfd, TUNSETNOCSUM, 1); |
1350 | 1490 | ||
1491 | memcpy(tapif, ifr.ifr_name, IFNAMSIZ); | ||
1492 | return netfd; | ||
1493 | } | ||
1494 | |||
1495 | /*L:195 Our network is a Host<->Guest network. This can either use bridging or | ||
1496 | * routing, but the principle is the same: it uses the "tun" device to inject | ||
1497 | * packets into the Host as if they came in from a normal network card. We | ||
1498 | * just shunt packets between the Guest and the tun device. */ | ||
1499 | static void setup_tun_net(char *arg) | ||
1500 | { | ||
1501 | struct device *dev; | ||
1502 | int netfd, ipfd; | ||
1503 | u32 ip = INADDR_ANY; | ||
1504 | bool bridging = false; | ||
1505 | char tapif[IFNAMSIZ], *p; | ||
1506 | struct virtio_net_config conf; | ||
1507 | |||
1508 | netfd = get_tun_device(tapif); | ||
1509 | |||
1351 | /* First we create a new network device. */ | 1510 | /* First we create a new network device. */ |
1352 | dev = new_device("net", VIRTIO_ID_NET, netfd, handle_tun_input); | 1511 | dev = new_device("net", VIRTIO_ID_NET, netfd, handle_tun_input); |
1353 | 1512 | ||
1354 | /* Network devices need a receive and a send queue, just like | 1513 | /* Network devices need a receive and a send queue, just like |
1355 | * console. */ | 1514 | * console. */ |
1356 | add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd); | 1515 | add_virtqueue(dev, VIRTQUEUE_NUM, net_enable_fd); |
1357 | add_virtqueue(dev, VIRTQUEUE_NUM, handle_net_output); | 1516 | add_virtqueue(dev, VIRTQUEUE_NUM, handle_net_output); |
1358 | 1517 | ||
1359 | /* We need a socket to perform the magic network ioctls to bring up the | 1518 | /* We need a socket to perform the magic network ioctls to bring up the |
@@ -1364,28 +1523,56 @@ static void setup_tun_net(const char *arg) | |||
1364 | 1523 | ||
1365 | /* If the command line was --tunnet=bridge:<name> do bridging. */ | 1524 | /* If the command line was --tunnet=bridge:<name> do bridging. */ |
1366 | if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) { | 1525 | if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) { |
1367 | ip = INADDR_ANY; | 1526 | arg += strlen(BRIDGE_PFX); |
1368 | br_name = arg + strlen(BRIDGE_PFX); | 1527 | bridging = true; |
1369 | add_to_bridge(ipfd, ifr.ifr_name, br_name); | 1528 | } |
1370 | } else /* It is an IP address to set up the device with */ | 1529 | |
1530 | /* A mac address may follow the bridge name or IP address */ | ||
1531 | p = strchr(arg, ':'); | ||
1532 | if (p) { | ||
1533 | str2mac(p+1, conf.mac); | ||
1534 | *p = '\0'; | ||
1535 | } else { | ||
1536 | p = arg + strlen(arg); | ||
1537 | /* None supplied; query the randomly assigned mac. */ | ||
1538 | get_mac(ipfd, tapif, conf.mac); | ||
1539 | } | ||
1540 | |||
1541 | /* arg is now either an IP address or a bridge name */ | ||
1542 | if (bridging) | ||
1543 | add_to_bridge(ipfd, tapif, arg); | ||
1544 | else | ||
1371 | ip = str2ip(arg); | 1545 | ip = str2ip(arg); |
1372 | 1546 | ||
1373 | /* Set up the tun device, and get the mac address for the interface. */ | 1547 | /* Set up the tun device. */ |
1374 | configure_device(ipfd, ifr.ifr_name, ip, conf.mac); | 1548 | configure_device(ipfd, tapif, ip); |
1375 | 1549 | ||
1376 | /* Tell Guest what MAC address to use. */ | 1550 | /* Tell Guest what MAC address to use. */ |
1377 | add_feature(dev, VIRTIO_NET_F_MAC); | 1551 | add_feature(dev, VIRTIO_NET_F_MAC); |
1378 | add_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY); | 1552 | add_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY); |
1553 | /* Expect Guest to handle everything except UFO */ | ||
1554 | add_feature(dev, VIRTIO_NET_F_CSUM); | ||
1555 | add_feature(dev, VIRTIO_NET_F_GUEST_CSUM); | ||
1556 | add_feature(dev, VIRTIO_NET_F_MAC); | ||
1557 | add_feature(dev, VIRTIO_NET_F_GUEST_TSO4); | ||
1558 | add_feature(dev, VIRTIO_NET_F_GUEST_TSO6); | ||
1559 | add_feature(dev, VIRTIO_NET_F_GUEST_ECN); | ||
1560 | add_feature(dev, VIRTIO_NET_F_HOST_TSO4); | ||
1561 | add_feature(dev, VIRTIO_NET_F_HOST_TSO6); | ||
1562 | add_feature(dev, VIRTIO_NET_F_HOST_ECN); | ||
1379 | set_config(dev, sizeof(conf), &conf); | 1563 | set_config(dev, sizeof(conf), &conf); |
1380 | 1564 | ||
1381 | /* We don't need the socket any more; setup is done. */ | 1565 | /* We don't need the socket any more; setup is done. */ |
1382 | close(ipfd); | 1566 | close(ipfd); |
1383 | 1567 | ||
1384 | verbose("device %u: tun net %u.%u.%u.%u\n", | 1568 | devices.device_num++; |
1385 | devices.device_num++, | 1569 | |
1386 | (u8)(ip>>24),(u8)(ip>>16),(u8)(ip>>8),(u8)ip); | 1570 | if (bridging) |
1387 | if (br_name) | 1571 | verbose("device %u: tun %s attached to bridge: %s\n", |
1388 | verbose("attached to bridge: %s\n", br_name); | 1572 | devices.device_num, tapif, arg); |
1573 | else | ||
1574 | verbose("device %u: tun %s: %s\n", | ||
1575 | devices.device_num, tapif, arg); | ||
1389 | } | 1576 | } |
1390 | 1577 | ||
1391 | /* Our block (disk) device should be really simple: the Guest asks for a block | 1578 | /* Our block (disk) device should be really simple: the Guest asks for a block |
@@ -1550,7 +1737,7 @@ static bool handle_io_finish(int fd, struct device *dev) | |||
1550 | } | 1737 | } |
1551 | 1738 | ||
1552 | /* When the Guest submits some I/O, we just need to wake the I/O thread. */ | 1739 | /* When the Guest submits some I/O, we just need to wake the I/O thread. */ |
1553 | static void handle_virtblk_output(int fd, struct virtqueue *vq) | 1740 | static void handle_virtblk_output(int fd, struct virtqueue *vq, bool timeout) |
1554 | { | 1741 | { |
1555 | struct vblk_info *vblk = vq->dev->priv; | 1742 | struct vblk_info *vblk = vq->dev->priv; |
1556 | char c = 0; | 1743 | char c = 0; |
@@ -1621,6 +1808,64 @@ static void setup_block_file(const char *filename) | |||
1621 | verbose("device %u: virtblock %llu sectors\n", | 1808 | verbose("device %u: virtblock %llu sectors\n", |
1622 | devices.device_num, le64_to_cpu(conf.capacity)); | 1809 | devices.device_num, le64_to_cpu(conf.capacity)); |
1623 | } | 1810 | } |
1811 | |||
1812 | /* Our random number generator device reads from /dev/random into the Guest's | ||
1813 | * input buffers. The usual case is that the Guest doesn't want random numbers | ||
1814 | * and so has no buffers although /dev/random is still readable, whereas | ||
1815 | * console is the reverse. | ||
1816 | * | ||
1817 | * The same logic applies, however. */ | ||
1818 | static bool handle_rng_input(int fd, struct device *dev) | ||
1819 | { | ||
1820 | int len; | ||
1821 | unsigned int head, in_num, out_num, totlen = 0; | ||
1822 | struct iovec iov[dev->vq->vring.num]; | ||
1823 | |||
1824 | /* First we need a buffer from the Guest's virtqueue. */ | ||
1825 | head = get_vq_desc(dev->vq, iov, &out_num, &in_num); | ||
1826 | |||
1827 | /* If they're not ready for input, stop listening to this file | ||
1828 | * descriptor. We'll start again once they add an input buffer. */ | ||
1829 | if (head == dev->vq->vring.num) | ||
1830 | return false; | ||
1831 | |||
1832 | if (out_num) | ||
1833 | errx(1, "Output buffers in rng?"); | ||
1834 | |||
1835 | /* This is why we convert to iovecs: the readv() call uses them, and so | ||
1836 | * it reads straight into the Guest's buffer. We loop to make sure we | ||
1837 | * fill it. */ | ||
1838 | while (!iov_empty(iov, in_num)) { | ||
1839 | len = readv(dev->fd, iov, in_num); | ||
1840 | if (len <= 0) | ||
1841 | err(1, "Read from /dev/random gave %i", len); | ||
1842 | iov_consume(iov, in_num, len); | ||
1843 | totlen += len; | ||
1844 | } | ||
1845 | |||
1846 | /* Tell the Guest about the new input. */ | ||
1847 | add_used_and_trigger(fd, dev->vq, head, totlen); | ||
1848 | |||
1849 | /* Everything went OK! */ | ||
1850 | return true; | ||
1851 | } | ||
1852 | |||
1853 | /* And this creates a "hardware" random number device for the Guest. */ | ||
1854 | static void setup_rng(void) | ||
1855 | { | ||
1856 | struct device *dev; | ||
1857 | int fd; | ||
1858 | |||
1859 | fd = open_or_die("/dev/random", O_RDONLY); | ||
1860 | |||
1861 | /* The device responds to return from I/O thread. */ | ||
1862 | dev = new_device("rng", VIRTIO_ID_RNG, fd, handle_rng_input); | ||
1863 | |||
1864 | /* The device has one virtqueue, where the Guest places inbufs. */ | ||
1865 | add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd); | ||
1866 | |||
1867 | verbose("device %u: rng\n", devices.device_num++); | ||
1868 | } | ||
1624 | /* That's the end of device setup. */ | 1869 | /* That's the end of device setup. */ |
1625 | 1870 | ||
1626 | /*L:230 Reboot is pretty easy: clean up and exec() the Launcher afresh. */ | 1871 | /*L:230 Reboot is pretty easy: clean up and exec() the Launcher afresh. */ |
@@ -1628,11 +1873,12 @@ static void __attribute__((noreturn)) restart_guest(void) | |||
1628 | { | 1873 | { |
1629 | unsigned int i; | 1874 | unsigned int i; |
1630 | 1875 | ||
1631 | /* Closing pipes causes the Waker thread and io_threads to die, and | 1876 | /* Since we don't track all open fds, we simply close everything beyond |
1632 | * closing /dev/lguest cleans up the Guest. Since we don't track all | 1877 | * stderr. */ |
1633 | * open fds, we simply close everything beyond stderr. */ | ||
1634 | for (i = 3; i < FD_SETSIZE; i++) | 1878 | for (i = 3; i < FD_SETSIZE; i++) |
1635 | close(i); | 1879 | close(i); |
1880 | |||
1881 | /* The exec automatically gets rid of the I/O and Waker threads. */ | ||
1636 | execv(main_args[0], main_args); | 1882 | execv(main_args[0], main_args); |
1637 | err(1, "Could not exec %s", main_args[0]); | 1883 | err(1, "Could not exec %s", main_args[0]); |
1638 | } | 1884 | } |
@@ -1663,7 +1909,7 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd) | |||
1663 | /* ERESTART means that we need to reboot the guest */ | 1909 | /* ERESTART means that we need to reboot the guest */ |
1664 | } else if (errno == ERESTART) { | 1910 | } else if (errno == ERESTART) { |
1665 | restart_guest(); | 1911 | restart_guest(); |
1666 | /* EAGAIN means the Waker wanted us to look at some input. | 1912 | /* EAGAIN means a signal (timeout). |
1667 | * Anything else means a bug or incompatible change. */ | 1913 | * Anything else means a bug or incompatible change. */ |
1668 | } else if (errno != EAGAIN) | 1914 | } else if (errno != EAGAIN) |
1669 | err(1, "Running guest failed"); | 1915 | err(1, "Running guest failed"); |
@@ -1691,13 +1937,14 @@ static struct option opts[] = { | |||
1691 | { "verbose", 0, NULL, 'v' }, | 1937 | { "verbose", 0, NULL, 'v' }, |
1692 | { "tunnet", 1, NULL, 't' }, | 1938 | { "tunnet", 1, NULL, 't' }, |
1693 | { "block", 1, NULL, 'b' }, | 1939 | { "block", 1, NULL, 'b' }, |
1940 | { "rng", 0, NULL, 'r' }, | ||
1694 | { "initrd", 1, NULL, 'i' }, | 1941 | { "initrd", 1, NULL, 'i' }, |
1695 | { NULL }, | 1942 | { NULL }, |
1696 | }; | 1943 | }; |
1697 | static void usage(void) | 1944 | static void usage(void) |
1698 | { | 1945 | { |
1699 | errx(1, "Usage: lguest [--verbose] " | 1946 | errx(1, "Usage: lguest [--verbose] " |
1700 | "[--tunnet=(<ipaddr>|bridge:<bridgename>)\n" | 1947 | "[--tunnet=(<ipaddr>:<macaddr>|bridge:<bridgename>:<macaddr>)\n" |
1701 | "|--block=<filename>|--initrd=<filename>]...\n" | 1948 | "|--block=<filename>|--initrd=<filename>]...\n" |
1702 | "<mem-in-mb> vmlinux [args...]"); | 1949 | "<mem-in-mb> vmlinux [args...]"); |
1703 | } | 1950 | } |
@@ -1765,6 +2012,9 @@ int main(int argc, char *argv[]) | |||
1765 | case 'b': | 2012 | case 'b': |
1766 | setup_block_file(optarg); | 2013 | setup_block_file(optarg); |
1767 | break; | 2014 | break; |
2015 | case 'r': | ||
2016 | setup_rng(); | ||
2017 | break; | ||
1768 | case 'i': | 2018 | case 'i': |
1769 | initrd_name = optarg; | 2019 | initrd_name = optarg; |
1770 | break; | 2020 | break; |
@@ -1783,6 +2033,9 @@ int main(int argc, char *argv[]) | |||
1783 | /* We always have a console device */ | 2033 | /* We always have a console device */ |
1784 | setup_console(); | 2034 | setup_console(); |
1785 | 2035 | ||
2036 | /* We can timeout waiting for Guest network transmit. */ | ||
2037 | setup_timeout(); | ||
2038 | |||
1786 | /* Now we load the kernel */ | 2039 | /* Now we load the kernel */ |
1787 | start = load_kernel(open_or_die(argv[optind+1], O_RDONLY)); | 2040 | start = load_kernel(open_or_die(argv[optind+1], O_RDONLY)); |
1788 | 2041 | ||
@@ -1826,10 +2079,10 @@ int main(int argc, char *argv[]) | |||
1826 | * /dev/lguest file descriptor. */ | 2079 | * /dev/lguest file descriptor. */ |
1827 | lguest_fd = tell_kernel(pgdir, start); | 2080 | lguest_fd = tell_kernel(pgdir, start); |
1828 | 2081 | ||
1829 | /* We fork off a child process, which wakes the Launcher whenever one | 2082 | /* We clone off a thread, which wakes the Launcher whenever one of the |
1830 | * of the input file descriptors needs attention. We call this the | 2083 | * input file descriptors needs attention. We call this the Waker, and |
1831 | * Waker, and we'll cover it in a moment. */ | 2084 | * we'll cover it in a moment. */ |
1832 | waker_fd = setup_waker(lguest_fd); | 2085 | setup_waker(lguest_fd); |
1833 | 2086 | ||
1834 | /* Finally, run the Guest. This doesn't return. */ | 2087 | /* Finally, run the Guest. This doesn't return. */ |
1835 | run_guest(lguest_fd); | 2088 | run_guest(lguest_fd); |
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 0313a5eec412..d9249a882aa5 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c | |||
@@ -1014,6 +1014,9 @@ __init void lguest_init(void) | |||
1014 | init_pg_tables_start = __pa(pg0); | 1014 | init_pg_tables_start = __pa(pg0); |
1015 | init_pg_tables_end = __pa(pg0); | 1015 | init_pg_tables_end = __pa(pg0); |
1016 | 1016 | ||
1017 | /* As described in head_32.S, we map the first 128M of memory. */ | ||
1018 | max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; | ||
1019 | |||
1017 | /* Load the %fs segment register (the per-cpu segment register) with | 1020 | /* Load the %fs segment register (the per-cpu segment register) with |
1018 | * the normal data segment to get through booting. */ | 1021 | * the normal data segment to get through booting. */ |
1019 | asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory"); | 1022 | asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory"); |
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index 5eea4356d703..90663e01a56e 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c | |||
@@ -135,6 +135,7 @@ static void unmap_switcher(void) | |||
135 | /* Now we just need to free the pages we copied the switcher into */ | 135 | /* Now we just need to free the pages we copied the switcher into */ |
136 | for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) | 136 | for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) |
137 | __free_pages(switcher_page[i], 0); | 137 | __free_pages(switcher_page[i], 0); |
138 | kfree(switcher_page); | ||
138 | } | 139 | } |
139 | 140 | ||
140 | /*H:032 | 141 | /*H:032 |
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c index 0414ddf87587..a1039068f95c 100644 --- a/drivers/lguest/interrupts_and_traps.c +++ b/drivers/lguest/interrupts_and_traps.c | |||
@@ -406,7 +406,8 @@ void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi) | |||
406 | * deliver_trap() to bounce it back into the Guest. */ | 406 | * deliver_trap() to bounce it back into the Guest. */ |
407 | static void default_idt_entry(struct desc_struct *idt, | 407 | static void default_idt_entry(struct desc_struct *idt, |
408 | int trap, | 408 | int trap, |
409 | const unsigned long handler) | 409 | const unsigned long handler, |
410 | const struct desc_struct *base) | ||
410 | { | 411 | { |
411 | /* A present interrupt gate. */ | 412 | /* A present interrupt gate. */ |
412 | u32 flags = 0x8e00; | 413 | u32 flags = 0x8e00; |
@@ -415,6 +416,10 @@ static void default_idt_entry(struct desc_struct *idt, | |||
415 | * the Guest to use the "int" instruction to trigger it. */ | 416 | * the Guest to use the "int" instruction to trigger it. */ |
416 | if (trap == LGUEST_TRAP_ENTRY) | 417 | if (trap == LGUEST_TRAP_ENTRY) |
417 | flags |= (GUEST_PL << 13); | 418 | flags |= (GUEST_PL << 13); |
419 | else if (base) | ||
420 | /* Copy priv. level from what Guest asked for. This allows | ||
421 | * debug (int 3) traps from Guest userspace, for example. */ | ||
422 | flags |= (base->b & 0x6000); | ||
418 | 423 | ||
419 | /* Now pack it into the IDT entry in its weird format. */ | 424 | /* Now pack it into the IDT entry in its weird format. */ |
420 | idt->a = (LGUEST_CS<<16) | (handler&0x0000FFFF); | 425 | idt->a = (LGUEST_CS<<16) | (handler&0x0000FFFF); |
@@ -428,7 +433,7 @@ void setup_default_idt_entries(struct lguest_ro_state *state, | |||
428 | unsigned int i; | 433 | unsigned int i; |
429 | 434 | ||
430 | for (i = 0; i < ARRAY_SIZE(state->guest_idt); i++) | 435 | for (i = 0; i < ARRAY_SIZE(state->guest_idt); i++) |
431 | default_idt_entry(&state->guest_idt[i], i, def[i]); | 436 | default_idt_entry(&state->guest_idt[i], i, def[i], NULL); |
432 | } | 437 | } |
433 | 438 | ||
434 | /*H:240 We don't use the IDT entries in the "struct lguest" directly, instead | 439 | /*H:240 We don't use the IDT entries in the "struct lguest" directly, instead |
@@ -442,6 +447,8 @@ void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, | |||
442 | /* We can simply copy the direct traps, otherwise we use the default | 447 | /* We can simply copy the direct traps, otherwise we use the default |
443 | * ones in the Switcher: they will return to the Host. */ | 448 | * ones in the Switcher: they will return to the Host. */ |
444 | for (i = 0; i < ARRAY_SIZE(cpu->arch.idt); i++) { | 449 | for (i = 0; i < ARRAY_SIZE(cpu->arch.idt); i++) { |
450 | const struct desc_struct *gidt = &cpu->arch.idt[i]; | ||
451 | |||
445 | /* If no Guest can ever override this trap, leave it alone. */ | 452 | /* If no Guest can ever override this trap, leave it alone. */ |
446 | if (!direct_trap(i)) | 453 | if (!direct_trap(i)) |
447 | continue; | 454 | continue; |
@@ -449,12 +456,15 @@ void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, | |||
449 | /* Only trap gates (type 15) can go direct to the Guest. | 456 | /* Only trap gates (type 15) can go direct to the Guest. |
450 | * Interrupt gates (type 14) disable interrupts as they are | 457 | * Interrupt gates (type 14) disable interrupts as they are |
451 | * entered, which we never let the Guest do. Not present | 458 | * entered, which we never let the Guest do. Not present |
452 | * entries (type 0x0) also can't go direct, of course. */ | 459 | * entries (type 0x0) also can't go direct, of course. |
453 | if (idt_type(cpu->arch.idt[i].a, cpu->arch.idt[i].b) == 0xF) | 460 | * |
454 | idt[i] = cpu->arch.idt[i]; | 461 | * If it can't go direct, we still need to copy the priv. level: |
462 | * they might want to give userspace access to a software | ||
463 | * interrupt. */ | ||
464 | if (idt_type(gidt->a, gidt->b) == 0xF) | ||
465 | idt[i] = *gidt; | ||
455 | else | 466 | else |
456 | /* Reset it to the default. */ | 467 | default_idt_entry(&idt[i], i, def[i], gidt); |
457 | default_idt_entry(&idt[i], i, def[i]); | ||
458 | } | 468 | } |
459 | } | 469 | } |
460 | 470 | ||
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 95dfda52b4f9..bf7942327bda 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c | |||
@@ -480,7 +480,7 @@ void __init lguest_arch_host_init(void) | |||
480 | * bit on its CPU, depending on the argument (0 == unset). */ | 480 | * bit on its CPU, depending on the argument (0 == unset). */ |
481 | on_each_cpu(adjust_pge, (void *)0, 1); | 481 | on_each_cpu(adjust_pge, (void *)0, 1); |
482 | /* Turn off the feature in the global feature set. */ | 482 | /* Turn off the feature in the global feature set. */ |
483 | clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); | 483 | clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE); |
484 | } | 484 | } |
485 | put_online_cpus(); | 485 | put_online_cpus(); |
486 | }; | 486 | }; |
@@ -491,7 +491,7 @@ void __exit lguest_arch_host_fini(void) | |||
491 | /* If we had PGE before we started, turn it back on now. */ | 491 | /* If we had PGE before we started, turn it back on now. */ |
492 | get_online_cpus(); | 492 | get_online_cpus(); |
493 | if (cpu_had_pge) { | 493 | if (cpu_had_pge) { |
494 | set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); | 494 | set_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE); |
495 | /* adjust_pge's argument "1" means set PGE. */ | 495 | /* adjust_pge's argument "1" means set PGE. */ |
496 | on_each_cpu(adjust_pge, (void *)1, 1); | 496 | on_each_cpu(adjust_pge, (void *)1, 1); |
497 | } | 497 | } |