diff options
author | Rusty Russell <rusty@rustcorp.com.au> | 2008-07-29 10:58:35 -0400 |
---|---|---|
committer | Rusty Russell <rusty@rustcorp.com.au> | 2008-07-28 19:58:36 -0400 |
commit | a161883a29bf6100efe7b5346bec274e5023c29c (patch) | |
tree | 5d5b65172a64789eada0e3b824564a793033d0ad /Documentation/lguest | |
parent | 5dae785a82c1a8c05b5b4f9709bd9ce658dcf1b6 (diff) |
lguest: Tell Guest net not to notify us on every packet xmit
virtio_ring has the ability to suppress notifications. This prevents
a guest exit for every packet, but we need to set a timer on packet
receipt to re-check if there were any remaining packets.
Here are the times for 1G TCP Guest->Host with different timeout
settings (it matters because the TCP window doesn't grow big enough to
fill the entire buffer):
Timeout value    Seconds    Xmit/Recv/Timeout
None (before)    25.3784    xmit 7750233 recv 1
2500 usec        62.5119    xmit 207020 recv 2 timeout 207020
1000 usec        34.5379    xmit 207003 recv 2 timeout 207003
750 usec         29.2305    xmit 207002 recv 1 timeout 207002
500 usec         19.1887    xmit 561141 recv 1 timeout 559657
250 usec         20.0465    xmit 214128 recv 2 timeout 214110
100 usec         19.2583    xmit 561621 recv 1 timeout 560153
(Note that these values are sensitive to the GSO patches which come
later, and probably other traffic-related variables, so take with a
large grain of salt).
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Diffstat (limited to 'Documentation/lguest')
-rw-r--r-- | Documentation/lguest/lguest.c | 106 |
1 files changed, 93 insertions, 13 deletions
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index 46f4c5b09e9e..018472cee151 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c | |||
@@ -36,6 +36,7 @@ | |||
36 | #include <sched.h> | 36 | #include <sched.h> |
37 | #include <limits.h> | 37 | #include <limits.h> |
38 | #include <stddef.h> | 38 | #include <stddef.h> |
39 | #include <signal.h> | ||
39 | #include "linux/lguest_launcher.h" | 40 | #include "linux/lguest_launcher.h" |
40 | #include "linux/virtio_config.h" | 41 | #include "linux/virtio_config.h" |
41 | #include "linux/virtio_net.h" | 42 | #include "linux/virtio_net.h" |
@@ -81,6 +82,8 @@ static int waker_fd; | |||
81 | static void *guest_base; | 82 | static void *guest_base; |
82 | /* The maximum guest physical address allowed, and maximum possible. */ | 83 | /* The maximum guest physical address allowed, and maximum possible. */ |
83 | static unsigned long guest_limit, guest_max; | 84 | static unsigned long guest_limit, guest_max; |
85 | /* The pipe for signal handler to write to. */ | ||
86 | static int timeoutpipe[2]; | ||
84 | 87 | ||
85 | /* a per-cpu variable indicating whose vcpu is currently running */ | 88 | /* a per-cpu variable indicating whose vcpu is currently running */ |
86 | static unsigned int __thread cpu_id; | 89 | static unsigned int __thread cpu_id; |
@@ -156,11 +159,14 @@ struct virtqueue | |||
156 | /* Last available index we saw. */ | 159 | /* Last available index we saw. */ |
157 | u16 last_avail_idx; | 160 | u16 last_avail_idx; |
158 | 161 | ||
159 | /* The routine to call when the Guest pings us. */ | 162 | /* The routine to call when the Guest pings us, or timeout. */ |
160 | void (*handle_output)(int fd, struct virtqueue *me); | 163 | void (*handle_output)(int fd, struct virtqueue *me, bool timeout); |
161 | 164 | ||
162 | /* Outstanding buffers */ | 165 | /* Outstanding buffers */ |
163 | unsigned int inflight; | 166 | unsigned int inflight; |
167 | |||
168 | /* Is this blocked awaiting a timer? */ | ||
169 | bool blocked; | ||
164 | }; | 170 | }; |
165 | 171 | ||
166 | /* Remember the arguments to the program so we can "reboot" */ | 172 | /* Remember the arguments to the program so we can "reboot" */ |
@@ -874,7 +880,7 @@ static bool handle_console_input(int fd, struct device *dev) | |||
874 | 880 | ||
875 | /* Handling output for console is simple: we just get all the output buffers | 881 | /* Handling output for console is simple: we just get all the output buffers |
876 | * and write them to stdout. */ | 882 | * and write them to stdout. */ |
877 | static void handle_console_output(int fd, struct virtqueue *vq) | 883 | static void handle_console_output(int fd, struct virtqueue *vq, bool timeout) |
878 | { | 884 | { |
879 | unsigned int head, out, in; | 885 | unsigned int head, out, in; |
880 | int len; | 886 | int len; |
@@ -889,6 +895,21 @@ static void handle_console_output(int fd, struct virtqueue *vq) | |||
889 | } | 895 | } |
890 | } | 896 | } |
891 | 897 | ||
898 | static void block_vq(struct virtqueue *vq) | ||
899 | { | ||
900 | struct itimerval itm; | ||
901 | |||
902 | vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; | ||
903 | vq->blocked = true; | ||
904 | |||
905 | itm.it_interval.tv_sec = 0; | ||
906 | itm.it_interval.tv_usec = 0; | ||
907 | itm.it_value.tv_sec = 0; | ||
908 | itm.it_value.tv_usec = 500; | ||
909 | |||
910 | setitimer(ITIMER_REAL, &itm, NULL); | ||
911 | } | ||
912 | |||
892 | /* | 913 | /* |
893 | * The Network | 914 | * The Network |
894 | * | 915 | * |
@@ -896,9 +917,9 @@ static void handle_console_output(int fd, struct virtqueue *vq) | |||
896 | * and write them (ignoring the first element) to this device's file descriptor | 917 | * and write them (ignoring the first element) to this device's file descriptor |
897 | * (/dev/net/tun). | 918 | * (/dev/net/tun). |
898 | */ | 919 | */ |
899 | static void handle_net_output(int fd, struct virtqueue *vq) | 920 | static void handle_net_output(int fd, struct virtqueue *vq, bool timeout) |
900 | { | 921 | { |
901 | unsigned int head, out, in; | 922 | unsigned int head, out, in, num = 0; |
902 | int len; | 923 | int len; |
903 | struct iovec iov[vq->vring.num]; | 924 | struct iovec iov[vq->vring.num]; |
904 | 925 | ||
@@ -912,7 +933,12 @@ static void handle_net_output(int fd, struct virtqueue *vq) | |||
912 | (void)convert(&iov[0], struct virtio_net_hdr); | 933 | (void)convert(&iov[0], struct virtio_net_hdr); |
913 | len = writev(vq->dev->fd, iov+1, out-1); | 934 | len = writev(vq->dev->fd, iov+1, out-1); |
914 | add_used_and_trigger(fd, vq, head, len); | 935 | add_used_and_trigger(fd, vq, head, len); |
936 | num++; | ||
915 | } | 937 | } |
938 | |||
939 | /* Block further kicks and set up a timer if we saw anything. */ | ||
940 | if (!timeout && num) | ||
941 | block_vq(vq); | ||
916 | } | 942 | } |
917 | 943 | ||
918 | /* This is where we handle a packet coming in from the tun device to our | 944 | /* This is where we handle a packet coming in from the tun device to our |
@@ -967,18 +993,18 @@ static bool handle_tun_input(int fd, struct device *dev) | |||
967 | /*L:215 This is the callback attached to the network and console input | 993 | /*L:215 This is the callback attached to the network and console input |
968 | * virtqueues: it ensures we try again, in case we stopped console or net | 994 | * virtqueues: it ensures we try again, in case we stopped console or net |
969 | * delivery because Guest didn't have any buffers. */ | 995 | * delivery because Guest didn't have any buffers. */ |
970 | static void enable_fd(int fd, struct virtqueue *vq) | 996 | static void enable_fd(int fd, struct virtqueue *vq, bool timeout) |
971 | { | 997 | { |
972 | add_device_fd(vq->dev->fd); | 998 | add_device_fd(vq->dev->fd); |
973 | /* Tell waker to listen to it again */ | 999 | /* Tell waker to listen to it again */ |
974 | write(waker_fd, &vq->dev->fd, sizeof(vq->dev->fd)); | 1000 | write(waker_fd, &vq->dev->fd, sizeof(vq->dev->fd)); |
975 | } | 1001 | } |
976 | 1002 | ||
977 | static void net_enable_fd(int fd, struct virtqueue *vq) | 1003 | static void net_enable_fd(int fd, struct virtqueue *vq, bool timeout) |
978 | { | 1004 | { |
979 | /* We don't need to know again when Guest refills receive buffer. */ | 1005 | /* We don't need to know again when Guest refills receive buffer. */ |
980 | vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; | 1006 | vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; |
981 | enable_fd(fd, vq); | 1007 | enable_fd(fd, vq, timeout); |
982 | } | 1008 | } |
983 | 1009 | ||
984 | /* When the Guest tells us they updated the status field, we handle it. */ | 1010 | /* When the Guest tells us they updated the status field, we handle it. */ |
@@ -1047,7 +1073,7 @@ static void handle_output(int fd, unsigned long addr) | |||
1047 | if (strcmp(vq->dev->name, "console") != 0) | 1073 | if (strcmp(vq->dev->name, "console") != 0) |
1048 | verbose("Output to %s\n", vq->dev->name); | 1074 | verbose("Output to %s\n", vq->dev->name); |
1049 | if (vq->handle_output) | 1075 | if (vq->handle_output) |
1050 | vq->handle_output(fd, vq); | 1076 | vq->handle_output(fd, vq, false); |
1051 | return; | 1077 | return; |
1052 | } | 1078 | } |
1053 | } | 1079 | } |
@@ -1061,6 +1087,29 @@ static void handle_output(int fd, unsigned long addr) | |||
1061 | strnlen(from_guest_phys(addr), guest_limit - addr)); | 1087 | strnlen(from_guest_phys(addr), guest_limit - addr)); |
1062 | } | 1088 | } |
1063 | 1089 | ||
1090 | static void handle_timeout(int fd) | ||
1091 | { | ||
1092 | char buf[32]; | ||
1093 | struct device *i; | ||
1094 | struct virtqueue *vq; | ||
1095 | |||
1096 | /* Clear the pipe */ | ||
1097 | read(timeoutpipe[0], buf, sizeof(buf)); | ||
1098 | |||
1099 | /* Check each device and virtqueue: flush blocked ones. */ | ||
1100 | for (i = devices.dev; i; i = i->next) { | ||
1101 | for (vq = i->vq; vq; vq = vq->next) { | ||
1102 | if (!vq->blocked) | ||
1103 | continue; | ||
1104 | |||
1105 | vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; | ||
1106 | vq->blocked = false; | ||
1107 | if (vq->handle_output) | ||
1108 | vq->handle_output(fd, vq, true); | ||
1109 | } | ||
1110 | } | ||
1111 | } | ||
1112 | |||
1064 | /* This is called when the Waker wakes us up: check for incoming file | 1113 | /* This is called when the Waker wakes us up: check for incoming file |
1065 | * descriptors. */ | 1114 | * descriptors. */ |
1066 | static void handle_input(int fd) | 1115 | static void handle_input(int fd) |
@@ -1071,9 +1120,14 @@ static void handle_input(int fd) | |||
1071 | for (;;) { | 1120 | for (;;) { |
1072 | struct device *i; | 1121 | struct device *i; |
1073 | fd_set fds = devices.infds; | 1122 | fd_set fds = devices.infds; |
1123 | int num; | ||
1074 | 1124 | ||
1125 | num = select(devices.max_infd+1, &fds, NULL, NULL, &poll); | ||
1126 | /* Could get interrupted */ | ||
1127 | if (num < 0) | ||
1128 | continue; | ||
1075 | /* If nothing is ready, we're done. */ | 1129 | /* If nothing is ready, we're done. */ |
1076 | if (select(devices.max_infd+1, &fds, NULL, NULL, &poll) == 0) | 1130 | if (num == 0) |
1077 | break; | 1131 | break; |
1078 | 1132 | ||
1079 | /* Otherwise, call the device(s) which have readable file | 1133 | /* Otherwise, call the device(s) which have readable file |
@@ -1097,6 +1151,10 @@ static void handle_input(int fd) | |||
1097 | write(waker_fd, &dev_fd, sizeof(dev_fd)); | 1151 | write(waker_fd, &dev_fd, sizeof(dev_fd)); |
1098 | } | 1152 | } |
1099 | } | 1153 | } |
1154 | |||
1155 | /* Is this the timeout fd? */ | ||
1156 | if (FD_ISSET(timeoutpipe[0], &fds)) | ||
1157 | handle_timeout(fd); | ||
1100 | } | 1158 | } |
1101 | } | 1159 | } |
1102 | 1160 | ||
@@ -1145,7 +1203,7 @@ static struct lguest_device_desc *new_dev_desc(u16 type) | |||
1145 | /* Each device descriptor is followed by the description of its virtqueues. We | 1203 | /* Each device descriptor is followed by the description of its virtqueues. We |
1146 | * specify how many descriptors the virtqueue is to have. */ | 1204 | * specify how many descriptors the virtqueue is to have. */ |
1147 | static void add_virtqueue(struct device *dev, unsigned int num_descs, | 1205 | static void add_virtqueue(struct device *dev, unsigned int num_descs, |
1148 | void (*handle_output)(int fd, struct virtqueue *me)) | 1206 | void (*handle_output)(int, struct virtqueue *, bool)) |
1149 | { | 1207 | { |
1150 | unsigned int pages; | 1208 | unsigned int pages; |
1151 | struct virtqueue **i, *vq = malloc(sizeof(*vq)); | 1209 | struct virtqueue **i, *vq = malloc(sizeof(*vq)); |
@@ -1161,6 +1219,7 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, | |||
1161 | vq->last_avail_idx = 0; | 1219 | vq->last_avail_idx = 0; |
1162 | vq->dev = dev; | 1220 | vq->dev = dev; |
1163 | vq->inflight = 0; | 1221 | vq->inflight = 0; |
1222 | vq->blocked = false; | ||
1164 | 1223 | ||
1165 | /* Initialize the configuration. */ | 1224 | /* Initialize the configuration. */ |
1166 | vq->config.num = num_descs; | 1225 | vq->config.num = num_descs; |
@@ -1293,6 +1352,24 @@ static void setup_console(void) | |||
1293 | } | 1352 | } |
1294 | /*:*/ | 1353 | /*:*/ |
1295 | 1354 | ||
1355 | static void timeout_alarm(int sig) | ||
1356 | { | ||
1357 | write(timeoutpipe[1], "", 1); | ||
1358 | } | ||
1359 | |||
1360 | static void setup_timeout(void) | ||
1361 | { | ||
1362 | if (pipe(timeoutpipe) != 0) | ||
1363 | err(1, "Creating timeout pipe"); | ||
1364 | |||
1365 | if (fcntl(timeoutpipe[1], F_SETFL, | ||
1366 | fcntl(timeoutpipe[1], F_GETFL) | O_NONBLOCK) != 0) | ||
1367 | err(1, "Making timeout pipe nonblocking"); | ||
1368 | |||
1369 | add_device_fd(timeoutpipe[0]); | ||
1370 | signal(SIGALRM, timeout_alarm); | ||
1371 | } | ||
1372 | |||
1296 | /*M:010 Inter-guest networking is an interesting area. Simplest is to have a | 1373 | /*M:010 Inter-guest networking is an interesting area. Simplest is to have a |
1297 | * --sharenet=<name> option which opens or creates a named pipe. This can be | 1374 | * --sharenet=<name> option which opens or creates a named pipe. This can be |
1298 | * used to send packets to another guest in a 1:1 manner. | 1375 | * used to send packets to another guest in a 1:1 manner. |
@@ -1653,7 +1730,7 @@ static bool handle_io_finish(int fd, struct device *dev) | |||
1653 | } | 1730 | } |
1654 | 1731 | ||
1655 | /* When the Guest submits some I/O, we just need to wake the I/O thread. */ | 1732 | /* When the Guest submits some I/O, we just need to wake the I/O thread. */ |
1656 | static void handle_virtblk_output(int fd, struct virtqueue *vq) | 1733 | static void handle_virtblk_output(int fd, struct virtqueue *vq, bool timeout) |
1657 | { | 1734 | { |
1658 | struct vblk_info *vblk = vq->dev->priv; | 1735 | struct vblk_info *vblk = vq->dev->priv; |
1659 | char c = 0; | 1736 | char c = 0; |
@@ -1824,7 +1901,7 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd) | |||
1824 | /* ERESTART means that we need to reboot the guest */ | 1901 | /* ERESTART means that we need to reboot the guest */ |
1825 | } else if (errno == ERESTART) { | 1902 | } else if (errno == ERESTART) { |
1826 | restart_guest(); | 1903 | restart_guest(); |
1827 | /* EAGAIN means the Waker wanted us to look at some input. | 1904 | /* EAGAIN means a signal (timeout). |
1828 | * Anything else means a bug or incompatible change. */ | 1905 | * Anything else means a bug or incompatible change. */ |
1829 | } else if (errno != EAGAIN) | 1906 | } else if (errno != EAGAIN) |
1830 | err(1, "Running guest failed"); | 1907 | err(1, "Running guest failed"); |
@@ -1948,6 +2025,9 @@ int main(int argc, char *argv[]) | |||
1948 | /* We always have a console device */ | 2025 | /* We always have a console device */ |
1949 | setup_console(); | 2026 | setup_console(); |
1950 | 2027 | ||
2028 | /* We can timeout waiting for Guest network transmit. */ | ||
2029 | setup_timeout(); | ||
2030 | |||
1951 | /* Now we load the kernel */ | 2031 | /* Now we load the kernel */ |
1952 | start = load_kernel(open_or_die(argv[optind+1], O_RDONLY)); | 2032 | start = load_kernel(open_or_die(argv[optind+1], O_RDONLY)); |
1953 | 2033 | ||