diff options
Diffstat (limited to 'Documentation/lguest/lguest.c')
-rw-r--r-- | Documentation/lguest/lguest.c | 280 |
1 files changed, 197 insertions, 83 deletions
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index 9b0e322118b5..0f23d67f958f 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c | |||
@@ -34,6 +34,8 @@ | |||
34 | #include <zlib.h> | 34 | #include <zlib.h> |
35 | #include <assert.h> | 35 | #include <assert.h> |
36 | #include <sched.h> | 36 | #include <sched.h> |
37 | #include <limits.h> | ||
38 | #include <stddef.h> | ||
37 | #include "linux/lguest_launcher.h" | 39 | #include "linux/lguest_launcher.h" |
38 | #include "linux/virtio_config.h" | 40 | #include "linux/virtio_config.h" |
39 | #include "linux/virtio_net.h" | 41 | #include "linux/virtio_net.h" |
@@ -79,6 +81,9 @@ static void *guest_base; | |||
79 | /* The maximum guest physical address allowed, and maximum possible. */ | 81 | /* The maximum guest physical address allowed, and maximum possible. */ |
80 | static unsigned long guest_limit, guest_max; | 82 | static unsigned long guest_limit, guest_max; |
81 | 83 | ||
84 | /* a per-cpu variable indicating which vcpu is currently running */ | ||
85 | static unsigned int __thread cpu_id; | ||
86 | |||
82 | /* This is our list of devices. */ | 87 | /* This is our list of devices. */ |
83 | struct device_list | 88 | struct device_list |
84 | { | 89 | { |
@@ -96,13 +101,11 @@ struct device_list | |||
96 | /* The descriptor page for the devices. */ | 101 | /* The descriptor page for the devices. */ |
97 | u8 *descpage; | 102 | u8 *descpage; |
98 | 103 | ||
99 | /* The tail of the last descriptor. */ | ||
100 | unsigned int desc_used; | ||
101 | |||
102 | /* A single linked list of devices. */ | 104 | /* A single linked list of devices. */ |
103 | struct device *dev; | 105 | struct device *dev; |
104 | /* ... And an end pointer so we can easily append new devices */ | 106 | /* And a pointer to the last device for easy append and also for |
105 | struct device **lastdev; | 107 | * configuration appending. */ |
108 | struct device *lastdev; | ||
106 | }; | 109 | }; |
107 | 110 | ||
108 | /* The list of Guest devices, based on command line arguments. */ | 111 | /* The list of Guest devices, based on command line arguments. */ |
@@ -153,6 +156,9 @@ struct virtqueue | |||
153 | void (*handle_output)(int fd, struct virtqueue *me); | 156 | void (*handle_output)(int fd, struct virtqueue *me); |
154 | }; | 157 | }; |
155 | 158 | ||
159 | /* Remember the arguments to the program so we can "reboot" */ | ||
160 | static char **main_args; | ||
161 | |||
156 | /* Since guest is UP and we don't run at the same time, we don't need barriers. | 162 | /* Since guest is UP and we don't run at the same time, we don't need barriers. |
157 | * But I include them in the code in case others copy it. */ | 163 | * But I include them in the code in case others copy it. */ |
158 | #define wmb() | 164 | #define wmb() |
@@ -185,7 +191,14 @@ static void *_convert(struct iovec *iov, size_t size, size_t align, | |||
185 | #define cpu_to_le64(v64) (v64) | 191 | #define cpu_to_le64(v64) (v64) |
186 | #define le16_to_cpu(v16) (v16) | 192 | #define le16_to_cpu(v16) (v16) |
187 | #define le32_to_cpu(v32) (v32) | 193 | #define le32_to_cpu(v32) (v32) |
188 | #define le64_to_cpu(v32) (v64) | 194 | #define le64_to_cpu(v64) (v64) |
195 | |||
196 | /* The device virtqueue descriptors are followed by feature bitmasks. */ | ||
197 | static u8 *get_feature_bits(struct device *dev) | ||
198 | { | ||
199 | return (u8 *)(dev->desc + 1) | ||
200 | + dev->desc->num_vq * sizeof(struct lguest_vqconfig); | ||
201 | } | ||
189 | 202 | ||
190 | /*L:100 The Launcher code itself takes us out into userspace, that scary place | 203 | /*L:100 The Launcher code itself takes us out into userspace, that scary place |
191 | * where pointers run wild and free! Unfortunately, like most userspace | 204 | * where pointers run wild and free! Unfortunately, like most userspace |
@@ -554,7 +567,7 @@ static void wake_parent(int pipefd, int lguest_fd) | |||
554 | else | 567 | else |
555 | FD_CLR(-fd - 1, &devices.infds); | 568 | FD_CLR(-fd - 1, &devices.infds); |
556 | } else /* Send LHREQ_BREAK command. */ | 569 | } else /* Send LHREQ_BREAK command. */ |
557 | write(lguest_fd, args, sizeof(args)); | 570 | pwrite(lguest_fd, args, sizeof(args), cpu_id); |
558 | } | 571 | } |
559 | } | 572 | } |
560 | 573 | ||
@@ -908,21 +921,58 @@ static void enable_fd(int fd, struct virtqueue *vq) | |||
908 | write(waker_fd, &vq->dev->fd, sizeof(vq->dev->fd)); | 921 | write(waker_fd, &vq->dev->fd, sizeof(vq->dev->fd)); |
909 | } | 922 | } |
910 | 923 | ||
924 | /* Resetting a device is fairly easy. */ | ||
925 | static void reset_device(struct device *dev) | ||
926 | { | ||
927 | struct virtqueue *vq; | ||
928 | |||
929 | verbose("Resetting device %s\n", dev->name); | ||
930 | /* Clear the status. */ | ||
931 | dev->desc->status = 0; | ||
932 | |||
933 | /* Clear any features they've acked. */ | ||
934 | memset(get_feature_bits(dev) + dev->desc->feature_len, 0, | ||
935 | dev->desc->feature_len); | ||
936 | |||
937 | /* Zero out the virtqueues. */ | ||
938 | for (vq = dev->vq; vq; vq = vq->next) { | ||
939 | memset(vq->vring.desc, 0, | ||
940 | vring_size(vq->config.num, getpagesize())); | ||
941 | vq->last_avail_idx = 0; | ||
942 | } | ||
943 | } | ||
944 | |||
911 | /* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */ | 945 | /* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */ |
912 | static void handle_output(int fd, unsigned long addr) | 946 | static void handle_output(int fd, unsigned long addr) |
913 | { | 947 | { |
914 | struct device *i; | 948 | struct device *i; |
915 | struct virtqueue *vq; | 949 | struct virtqueue *vq; |
916 | 950 | ||
917 | /* Check each virtqueue. */ | 951 | /* Check each device and virtqueue. */ |
918 | for (i = devices.dev; i; i = i->next) { | 952 | for (i = devices.dev; i; i = i->next) { |
953 | /* Notifications to device descriptors reset the device. */ | ||
954 | if (from_guest_phys(addr) == i->desc) { | ||
955 | reset_device(i); | ||
956 | return; | ||
957 | } | ||
958 | |||
959 | /* Notifications to virtqueues mean output has occurred. */ | ||
919 | for (vq = i->vq; vq; vq = vq->next) { | 960 | for (vq = i->vq; vq; vq = vq->next) { |
920 | if (vq->config.pfn == addr/getpagesize() | 961 | if (vq->config.pfn != addr/getpagesize()) |
921 | && vq->handle_output) { | 962 | continue; |
922 | verbose("Output to %s\n", vq->dev->name); | 963 | |
923 | vq->handle_output(fd, vq); | 964 | /* Guest should acknowledge (and set features!) before |
965 | * using the device. */ | ||
966 | if (i->desc->status == 0) { | ||
967 | warnx("%s gave early output", i->name); | ||
924 | return; | 968 | return; |
925 | } | 969 | } |
970 | |||
971 | if (strcmp(vq->dev->name, "console") != 0) | ||
972 | verbose("Output to %s\n", vq->dev->name); | ||
973 | if (vq->handle_output) | ||
974 | vq->handle_output(fd, vq); | ||
975 | return; | ||
926 | } | 976 | } |
927 | } | 977 | } |
928 | 978 | ||
@@ -980,54 +1030,44 @@ static void handle_input(int fd) | |||
980 | * | 1030 | * |
981 | * All devices need a descriptor so the Guest knows it exists, and a "struct | 1031 | * All devices need a descriptor so the Guest knows it exists, and a "struct |
982 | * device" so the Launcher can keep track of it. We have common helper | 1032 | * device" so the Launcher can keep track of it. We have common helper |
983 | * routines to allocate them. | 1033 | * routines to allocate and manage them. */ |
984 | * | ||
985 | * This routine allocates a new "struct lguest_device_desc" from descriptor | ||
986 | * table just above the Guest's normal memory. It returns a pointer to that | ||
987 | * descriptor. */ | ||
988 | static struct lguest_device_desc *new_dev_desc(u16 type) | ||
989 | { | ||
990 | struct lguest_device_desc *d; | ||
991 | 1034 | ||
992 | /* We only have one page for all the descriptors. */ | 1035 | /* The layout of the device page is a "struct lguest_device_desc" followed by a |
993 | if (devices.desc_used + sizeof(*d) > getpagesize()) | 1036 | * number of virtqueue descriptors, then two sets of feature bits, then an |
994 | errx(1, "Too many devices"); | 1037 | * array of configuration bytes. This routine returns the configuration |
995 | 1038 | * pointer. */ | |
996 | /* We don't need to set config_len or status: page is 0 already. */ | 1039 | static u8 *device_config(const struct device *dev) |
997 | d = (void *)devices.descpage + devices.desc_used; | 1040 | { |
998 | d->type = type; | 1041 | return (void *)(dev->desc + 1) |
999 | devices.desc_used += sizeof(*d); | 1042 | + dev->desc->num_vq * sizeof(struct lguest_vqconfig) |
1000 | 1043 | + dev->desc->feature_len * 2; | |
1001 | return d; | ||
1002 | } | 1044 | } |
1003 | 1045 | ||
1004 | /* Each device descriptor is followed by some configuration information. | 1046 | /* This routine allocates a new "struct lguest_device_desc" from descriptor |
1005 | * Each configuration field looks like: u8 type, u8 len, [... len bytes...]. | 1047 | * table page just above the Guest's normal memory. It returns a pointer to |
1006 | * | 1048 | * that descriptor. */ |
1007 | * This routine adds a new field to an existing device's descriptor. It only | 1049 | static struct lguest_device_desc *new_dev_desc(u16 type) |
1008 | * works for the last device, but that's OK because that's how we use it. */ | ||
1009 | static void add_desc_field(struct device *dev, u8 type, u8 len, const void *c) | ||
1010 | { | 1050 | { |
1011 | /* This is the last descriptor, right? */ | 1051 | struct lguest_device_desc d = { .type = type }; |
1012 | assert(devices.descpage + devices.desc_used | 1052 | void *p; |
1013 | == (u8 *)(dev->desc + 1) + dev->desc->config_len); | ||
1014 | 1053 | ||
1015 | /* We only have one page of device descriptions. */ | 1054 | /* Figure out where the next device config is, based on the last one. */ |
1016 | if (devices.desc_used + 2 + len > getpagesize()) | 1055 | if (devices.lastdev) |
1017 | errx(1, "Too many devices"); | 1056 | p = device_config(devices.lastdev) |
1057 | + devices.lastdev->desc->config_len; | ||
1058 | else | ||
1059 | p = devices.descpage; | ||
1018 | 1060 | ||
1019 | /* Copy in the new config header: type then length. */ | 1061 | /* We only have one page for all the descriptors. */ |
1020 | devices.descpage[devices.desc_used++] = type; | 1062 | if (p + sizeof(d) > (void *)devices.descpage + getpagesize()) |
1021 | devices.descpage[devices.desc_used++] = len; | 1063 | errx(1, "Too many devices"); |
1022 | memcpy(devices.descpage + devices.desc_used, c, len); | ||
1023 | devices.desc_used += len; | ||
1024 | 1064 | ||
1025 | /* Update the device descriptor length: two byte head then data. */ | 1065 | /* p might not be aligned, so we memcpy in. */ |
1026 | dev->desc->config_len += 2 + len; | 1066 | return memcpy(p, &d, sizeof(d)); |
1027 | } | 1067 | } |
1028 | 1068 | ||
1029 | /* This routine adds a virtqueue to a device. We specify how many descriptors | 1069 | /* Each device descriptor is followed by the description of its virtqueues. We |
1030 | * the virtqueue is to have. */ | 1070 | * specify how many descriptors the virtqueue is to have. */ |
1031 | static void add_virtqueue(struct device *dev, unsigned int num_descs, | 1071 | static void add_virtqueue(struct device *dev, unsigned int num_descs, |
1032 | void (*handle_output)(int fd, struct virtqueue *me)) | 1072 | void (*handle_output)(int fd, struct virtqueue *me)) |
1033 | { | 1073 | { |
@@ -1053,9 +1093,15 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, | |||
1053 | /* Initialize the vring. */ | 1093 | /* Initialize the vring. */ |
1054 | vring_init(&vq->vring, num_descs, p, getpagesize()); | 1094 | vring_init(&vq->vring, num_descs, p, getpagesize()); |
1055 | 1095 | ||
1056 | /* Add the configuration information to this device's descriptor. */ | 1096 | /* Append virtqueue to this device's descriptor. We use |
1057 | add_desc_field(dev, VIRTIO_CONFIG_F_VIRTQUEUE, | 1097 | * device_config() to get the end of the device's current virtqueues; |
1058 | sizeof(vq->config), &vq->config); | 1098 | * we check that we haven't added any config or feature information |
1099 | * yet, otherwise we'd be overwriting them. */ | ||
1100 | assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0); | ||
1101 | memcpy(device_config(dev), &vq->config, sizeof(vq->config)); | ||
1102 | dev->desc->num_vq++; | ||
1103 | |||
1104 | verbose("Virtqueue page %#lx\n", to_guest_phys(p)); | ||
1059 | 1105 | ||
1060 | /* Add to tail of list, so dev->vq is first vq, dev->vq->next is | 1106 | /* Add to tail of list, so dev->vq is first vq, dev->vq->next is |
1061 | * second. */ | 1107 | * second. */ |
@@ -1066,11 +1112,41 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, | |||
1066 | * virtqueue. */ | 1112 | * virtqueue. */ |
1067 | vq->handle_output = handle_output; | 1113 | vq->handle_output = handle_output; |
1068 | 1114 | ||
1069 | /* Set the "Don't Notify Me" flag if we don't have a handler */ | 1115 | /* As an optimization, set the advisory "Don't Notify Me" flag if we |
1116 | * don't have a handler */ | ||
1070 | if (!handle_output) | 1117 | if (!handle_output) |
1071 | vq->vring.used->flags = VRING_USED_F_NO_NOTIFY; | 1118 | vq->vring.used->flags = VRING_USED_F_NO_NOTIFY; |
1072 | } | 1119 | } |
1073 | 1120 | ||
1121 | /* The first half of the feature bitmask is for us to advertise features. The | ||
1122 | * second half is for the Guest to accept features. */ | ||
1123 | static void add_feature(struct device *dev, unsigned bit) | ||
1124 | { | ||
1125 | u8 *features = get_feature_bits(dev); | ||
1126 | |||
1127 | /* We can't extend the feature bits once we've added config bytes */ | ||
1128 | if (dev->desc->feature_len <= bit / CHAR_BIT) { | ||
1129 | assert(dev->desc->config_len == 0); | ||
1130 | dev->desc->feature_len = (bit / CHAR_BIT) + 1; | ||
1131 | } | ||
1132 | |||
1133 | features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT)); | ||
1134 | } | ||
1135 | |||
1136 | /* This routine sets the configuration fields for an existing device's | ||
1137 | * descriptor. It only works for the last device, but that's OK because that's | ||
1138 | * how we use it. */ | ||
1139 | static void set_config(struct device *dev, unsigned len, const void *conf) | ||
1140 | { | ||
1141 | /* Check we haven't overflowed our single page. */ | ||
1142 | if (device_config(dev) + len > devices.descpage + getpagesize()) | ||
1143 | errx(1, "Too many devices"); | ||
1144 | |||
1145 | /* Copy in the config information, and store the length. */ | ||
1146 | memcpy(device_config(dev), conf, len); | ||
1147 | dev->desc->config_len = len; | ||
1148 | } | ||
1149 | |||
1074 | /* This routine does all the creation and setup of a new device, including | 1150 | /* This routine does all the creation and setup of a new device, including |
1075 | * calling new_dev_desc() to allocate the descriptor and device memory. */ | 1151 | * calling new_dev_desc() to allocate the descriptor and device memory. */ |
1076 | static struct device *new_device(const char *name, u16 type, int fd, | 1152 | static struct device *new_device(const char *name, u16 type, int fd, |
@@ -1078,14 +1154,6 @@ static struct device *new_device(const char *name, u16 type, int fd, | |||
1078 | { | 1154 | { |
1079 | struct device *dev = malloc(sizeof(*dev)); | 1155 | struct device *dev = malloc(sizeof(*dev)); |
1080 | 1156 | ||
1081 | /* Append to device list. Prepending to a single-linked list is | ||
1082 | * easier, but the user expects the devices to be arranged on the bus | ||
1083 | * in command-line order. The first network device on the command line | ||
1084 | * is eth0, the first block device /dev/vda, etc. */ | ||
1085 | *devices.lastdev = dev; | ||
1086 | dev->next = NULL; | ||
1087 | devices.lastdev = &dev->next; | ||
1088 | |||
1089 | /* Now we populate the fields one at a time. */ | 1157 | /* Now we populate the fields one at a time. */ |
1090 | dev->fd = fd; | 1158 | dev->fd = fd; |
1091 | /* If we have an input handler for this file descriptor, then we add it | 1159 | /* If we have an input handler for this file descriptor, then we add it |
@@ -1096,6 +1164,17 @@ static struct device *new_device(const char *name, u16 type, int fd, | |||
1096 | dev->handle_input = handle_input; | 1164 | dev->handle_input = handle_input; |
1097 | dev->name = name; | 1165 | dev->name = name; |
1098 | dev->vq = NULL; | 1166 | dev->vq = NULL; |
1167 | |||
1168 | /* Append to device list. Prepending to a single-linked list is | ||
1169 | * easier, but the user expects the devices to be arranged on the bus | ||
1170 | * in command-line order. The first network device on the command line | ||
1171 | * is eth0, the first block device /dev/vda, etc. */ | ||
1172 | if (devices.lastdev) | ||
1173 | devices.lastdev->next = dev; | ||
1174 | else | ||
1175 | devices.dev = dev; | ||
1176 | devices.lastdev = dev; | ||
1177 | |||
1099 | return dev; | 1178 | return dev; |
1100 | } | 1179 | } |
1101 | 1180 | ||
@@ -1220,7 +1299,7 @@ static void setup_tun_net(const char *arg) | |||
1220 | int netfd, ipfd; | 1299 | int netfd, ipfd; |
1221 | u32 ip; | 1300 | u32 ip; |
1222 | const char *br_name = NULL; | 1301 | const char *br_name = NULL; |
1223 | u8 hwaddr[6]; | 1302 | struct virtio_net_config conf; |
1224 | 1303 | ||
1225 | /* We open the /dev/net/tun device and tell it we want a tap device. A | 1304 | /* We open the /dev/net/tun device and tell it we want a tap device. A |
1226 | * tap device is like a tun device, only somehow different. To tell | 1305 | * tap device is like a tun device, only somehow different. To tell |
@@ -1259,12 +1338,13 @@ static void setup_tun_net(const char *arg) | |||
1259 | ip = str2ip(arg); | 1338 | ip = str2ip(arg); |
1260 | 1339 | ||
1261 | /* Set up the tun device, and get the mac address for the interface. */ | 1340 | /* Set up the tun device, and get the mac address for the interface. */ |
1262 | configure_device(ipfd, ifr.ifr_name, ip, hwaddr); | 1341 | configure_device(ipfd, ifr.ifr_name, ip, conf.mac); |
1263 | 1342 | ||
1264 | /* Tell Guest what MAC address to use. */ | 1343 | /* Tell Guest what MAC address to use. */ |
1265 | add_desc_field(dev, VIRTIO_CONFIG_NET_MAC_F, sizeof(hwaddr), hwaddr); | 1344 | add_feature(dev, VIRTIO_NET_F_MAC); |
1345 | set_config(dev, sizeof(conf), &conf); | ||
1266 | 1346 | ||
1267 | /* We don't seed the socket any more; setup is done. */ | 1347 | /* We don't need the socket any more; setup is done. */ |
1268 | close(ipfd); | 1348 | close(ipfd); |
1269 | 1349 | ||
1270 | verbose("device %u: tun net %u.%u.%u.%u\n", | 1350 | verbose("device %u: tun net %u.%u.%u.%u\n", |
@@ -1452,8 +1532,7 @@ static void setup_block_file(const char *filename) | |||
1452 | struct device *dev; | 1532 | struct device *dev; |
1453 | struct vblk_info *vblk; | 1533 | struct vblk_info *vblk; |
1454 | void *stack; | 1534 | void *stack; |
1455 | u64 cap; | 1535 | struct virtio_blk_config conf; |
1456 | unsigned int val; | ||
1457 | 1536 | ||
1458 | /* This is the pipe the I/O thread will use to tell us I/O is done. */ | 1537 | /* This is the pipe the I/O thread will use to tell us I/O is done. */ |
1459 | pipe(p); | 1538 | pipe(p); |
@@ -1471,14 +1550,18 @@ static void setup_block_file(const char *filename) | |||
1471 | vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE); | 1550 | vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE); |
1472 | vblk->len = lseek64(vblk->fd, 0, SEEK_END); | 1551 | vblk->len = lseek64(vblk->fd, 0, SEEK_END); |
1473 | 1552 | ||
1553 | /* We support barriers. */ | ||
1554 | add_feature(dev, VIRTIO_BLK_F_BARRIER); | ||
1555 | |||
1474 | /* Tell Guest how many sectors this device has. */ | 1556 | /* Tell Guest how many sectors this device has. */ |
1475 | cap = cpu_to_le64(vblk->len / 512); | 1557 | conf.capacity = cpu_to_le64(vblk->len / 512); |
1476 | add_desc_field(dev, VIRTIO_CONFIG_BLK_F_CAPACITY, sizeof(cap), &cap); | ||
1477 | 1558 | ||
1478 | /* Tell Guest not to put in too many descriptors at once: two are used | 1559 | /* Tell Guest not to put in too many descriptors at once: two are used |
1479 | * for the in and out elements. */ | 1560 | * for the in and out elements. */ |
1480 | val = cpu_to_le32(VIRTQUEUE_NUM - 2); | 1561 | add_feature(dev, VIRTIO_BLK_F_SEG_MAX); |
1481 | add_desc_field(dev, VIRTIO_CONFIG_BLK_F_SEG_MAX, sizeof(val), &val); | 1562 | conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2); |
1563 | |||
1564 | set_config(dev, sizeof(conf), &conf); | ||
1482 | 1565 | ||
1483 | /* The I/O thread writes to this end of the pipe when done. */ | 1566 | /* The I/O thread writes to this end of the pipe when done. */ |
1484 | vblk->done_fd = p[1]; | 1567 | vblk->done_fd = p[1]; |
@@ -1489,7 +1572,9 @@ static void setup_block_file(const char *filename) | |||
1489 | 1572 | ||
1490 | /* Create stack for thread and run it */ | 1573 | /* Create stack for thread and run it */ |
1491 | stack = malloc(32768); | 1574 | stack = malloc(32768); |
1492 | if (clone(io_thread, stack + 32768, CLONE_VM, dev) == -1) | 1575 | /* SIGCHLD - We don't "wait" for our cloned thread, so prevent it from |
1576 | * becoming a zombie. */ | ||
1577 | if (clone(io_thread, stack + 32768, CLONE_VM | SIGCHLD, dev) == -1) | ||
1493 | err(1, "Creating clone"); | 1578 | err(1, "Creating clone"); |
1494 | 1579 | ||
1495 | /* We don't need to keep the I/O thread's end of the pipes open. */ | 1580 | /* We don't need to keep the I/O thread's end of the pipes open. */ |
@@ -1497,9 +1582,23 @@ static void setup_block_file(const char *filename) | |||
1497 | close(vblk->workpipe[0]); | 1582 | close(vblk->workpipe[0]); |
1498 | 1583 | ||
1499 | verbose("device %u: virtblock %llu sectors\n", | 1584 | verbose("device %u: virtblock %llu sectors\n", |
1500 | devices.device_num, cap); | 1585 | devices.device_num, le64_to_cpu(conf.capacity)); |
1586 | } | ||
1587 | /* That's the end of device setup. :*/ | ||
1588 | |||
1589 | /* Reboot */ | ||
1590 | static void __attribute__((noreturn)) restart_guest(void) | ||
1591 | { | ||
1592 | unsigned int i; | ||
1593 | |||
1594 | /* Closing pipes causes the waker thread and io_threads to die, and | ||
1595 | * closing /dev/lguest cleans up the Guest. Since we don't track all | ||
1596 | * open fds, we simply close everything beyond stderr. */ | ||
1597 | for (i = 3; i < FD_SETSIZE; i++) | ||
1598 | close(i); | ||
1599 | execv(main_args[0], main_args); | ||
1600 | err(1, "Could not exec %s", main_args[0]); | ||
1501 | } | 1601 | } |
1502 | /* That's the end of device setup. */ | ||
1503 | 1602 | ||
1504 | /*L:220 Finally we reach the core of the Launcher, which runs the Guest, serves | 1603 | /*L:220 Finally we reach the core of the Launcher, which runs the Guest, serves |
1505 | * its input and output, and finally, lays it to rest. */ | 1604 | * its input and output, and finally, lays it to rest. */ |
@@ -1511,7 +1610,8 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd) | |||
1511 | int readval; | 1610 | int readval; |
1512 | 1611 | ||
1513 | /* We read from the /dev/lguest device to run the Guest. */ | 1612 | /* We read from the /dev/lguest device to run the Guest. */ |
1514 | readval = read(lguest_fd, &notify_addr, sizeof(notify_addr)); | 1613 | readval = pread(lguest_fd, &notify_addr, |
1614 | sizeof(notify_addr), cpu_id); | ||
1515 | 1615 | ||
1516 | /* One unsigned long means the Guest did HCALL_NOTIFY */ | 1616 | /* One unsigned long means the Guest did HCALL_NOTIFY */ |
1517 | if (readval == sizeof(notify_addr)) { | 1617 | if (readval == sizeof(notify_addr)) { |
@@ -1521,16 +1621,23 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd) | |||
1521 | /* ENOENT means the Guest died. Reading tells us why. */ | 1621 | /* ENOENT means the Guest died. Reading tells us why. */ |
1522 | } else if (errno == ENOENT) { | 1622 | } else if (errno == ENOENT) { |
1523 | char reason[1024] = { 0 }; | 1623 | char reason[1024] = { 0 }; |
1524 | read(lguest_fd, reason, sizeof(reason)-1); | 1624 | pread(lguest_fd, reason, sizeof(reason)-1, cpu_id); |
1525 | errx(1, "%s", reason); | 1625 | errx(1, "%s", reason); |
1626 | /* ERESTART means that we need to reboot the guest */ | ||
1627 | } else if (errno == ERESTART) { | ||
1628 | restart_guest(); | ||
1526 | /* EAGAIN means the Waker wanted us to look at some input. | 1629 | /* EAGAIN means the Waker wanted us to look at some input. |
1527 | * Anything else means a bug or incompatible change. */ | 1630 | * Anything else means a bug or incompatible change. */ |
1528 | } else if (errno != EAGAIN) | 1631 | } else if (errno != EAGAIN) |
1529 | err(1, "Running guest failed"); | 1632 | err(1, "Running guest failed"); |
1530 | 1633 | ||
1634 | /* Only service input on thread for CPU 0. */ | ||
1635 | if (cpu_id != 0) | ||
1636 | continue; | ||
1637 | |||
1531 | /* Service input, then unset the BREAK to release the Waker. */ | 1638 | /* Service input, then unset the BREAK to release the Waker. */ |
1532 | handle_input(lguest_fd); | 1639 | handle_input(lguest_fd); |
1533 | if (write(lguest_fd, args, sizeof(args)) < 0) | 1640 | if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0) |
1534 | err(1, "Resetting break"); | 1641 | err(1, "Resetting break"); |
1535 | } | 1642 | } |
1536 | } | 1643 | } |
@@ -1571,17 +1678,24 @@ int main(int argc, char *argv[]) | |||
1571 | /* If they specify an initrd file to load. */ | 1678 | /* If they specify an initrd file to load. */ |
1572 | const char *initrd_name = NULL; | 1679 | const char *initrd_name = NULL; |
1573 | 1680 | ||
1681 | /* Save the args: we "reboot" by execing ourselves again. */ | ||
1682 | main_args = argv; | ||
1683 | /* We don't "wait" for the children, so prevent them from becoming | ||
1684 | * zombies. */ | ||
1685 | signal(SIGCHLD, SIG_IGN); | ||
1686 | |||
1574 | /* First we initialize the device list. Since console and network | 1687 | /* First we initialize the device list. Since console and network |
1575 | * device receive input from a file descriptor, we keep an fdset | 1688 | * device receive input from a file descriptor, we keep an fdset |
1576 | * (infds) and the maximum fd number (max_infd) with the head of the | 1689 | * (infds) and the maximum fd number (max_infd) with the head of the |
1577 | * list. We also keep a pointer to the last device, for easy appending | 1690 | * list. We also keep a pointer to the last device. Finally, we keep |
1578 | * to the list. Finally, we keep the next interrupt number to hand out | 1691 | * the next interrupt number to hand out (1: remember that 0 is used by |
1579 | * (1: remember that 0 is used by the timer). */ | 1692 | * the timer). */ |
1580 | FD_ZERO(&devices.infds); | 1693 | FD_ZERO(&devices.infds); |
1581 | devices.max_infd = -1; | 1694 | devices.max_infd = -1; |
1582 | devices.lastdev = &devices.dev; | 1695 | devices.lastdev = NULL; |
1583 | devices.next_irq = 1; | 1696 | devices.next_irq = 1; |
1584 | 1697 | ||
1698 | cpu_id = 0; | ||
1585 | /* We need to know how much memory so we can set up the device | 1699 | /* We need to know how much memory so we can set up the device |
1586 | * descriptor and memory pages for the devices as we parse the command | 1700 | * descriptor and memory pages for the devices as we parse the command |
1587 | * line. So we quickly look through the arguments to find the amount | 1701 | * line. So we quickly look through the arguments to find the amount |