diff options
Diffstat (limited to 'Documentation/lguest/lguest.c')
-rw-r--r-- | Documentation/lguest/lguest.c | 178 |
1 files changed, 106 insertions, 72 deletions
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index 5bdc37f81842..f2668390e8f7 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c | |||
@@ -34,25 +34,24 @@ | |||
34 | #include <zlib.h> | 34 | #include <zlib.h> |
35 | #include <assert.h> | 35 | #include <assert.h> |
36 | #include <sched.h> | 36 | #include <sched.h> |
37 | /*L:110 We can ignore the 30 include files we need for this program, but I do | ||
38 | * want to draw attention to the use of kernel-style types. | ||
39 | * | ||
40 | * As Linus said, "C is a Spartan language, and so should your naming be." I | ||
41 | * like these abbreviations and the header we need uses them, so we define them | ||
42 | * here. | ||
43 | */ | ||
44 | typedef unsigned long long u64; | ||
45 | typedef uint32_t u32; | ||
46 | typedef uint16_t u16; | ||
47 | typedef uint8_t u8; | ||
48 | #include "linux/lguest_launcher.h" | 37 | #include "linux/lguest_launcher.h" |
49 | #include "linux/pci_ids.h" | ||
50 | #include "linux/virtio_config.h" | 38 | #include "linux/virtio_config.h" |
51 | #include "linux/virtio_net.h" | 39 | #include "linux/virtio_net.h" |
52 | #include "linux/virtio_blk.h" | 40 | #include "linux/virtio_blk.h" |
53 | #include "linux/virtio_console.h" | 41 | #include "linux/virtio_console.h" |
54 | #include "linux/virtio_ring.h" | 42 | #include "linux/virtio_ring.h" |
55 | #include "asm-x86/bootparam.h" | 43 | #include "asm-x86/bootparam.h" |
44 | /*L:110 We can ignore the 38 include files we need for this program, but I do | ||
45 | * want to draw attention to the use of kernel-style types. | ||
46 | * | ||
47 | * As Linus said, "C is a Spartan language, and so should your naming be." I | ||
48 | * like these abbreviations, so we define them here. Note that u64 is always | ||
49 | * unsigned long long, which works on all Linux systems: this means that we can | ||
50 | * use %llu in printf for any u64. */ | ||
51 | typedef unsigned long long u64; | ||
52 | typedef uint32_t u32; | ||
53 | typedef uint16_t u16; | ||
54 | typedef uint8_t u8; | ||
56 | /*:*/ | 55 | /*:*/ |
57 | 56 | ||
58 | #define PAGE_PRESENT 0x7 /* Present, RW, Execute */ | 57 | #define PAGE_PRESENT 0x7 /* Present, RW, Execute */ |
@@ -361,8 +360,8 @@ static unsigned long load_bzimage(int fd) | |||
361 | } | 360 | } |
362 | 361 | ||
363 | /*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels | 362 | /*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels |
364 | * come wrapped up in the self-decompressing "bzImage" format. With some funky | 363 | * come wrapped up in the self-decompressing "bzImage" format. With a little |
365 | * coding, we can load those, too. */ | 364 | * work, we can load those, too. */ |
366 | static unsigned long load_kernel(int fd) | 365 | static unsigned long load_kernel(int fd) |
367 | { | 366 | { |
368 | Elf32_Ehdr hdr; | 367 | Elf32_Ehdr hdr; |
@@ -465,6 +464,7 @@ static unsigned long setup_pagetables(unsigned long mem, | |||
465 | * to know where it is. */ | 464 | * to know where it is. */ |
466 | return to_guest_phys(pgdir); | 465 | return to_guest_phys(pgdir); |
467 | } | 466 | } |
467 | /*:*/ | ||
468 | 468 | ||
469 | /* Simple routine to roll all the commandline arguments together with spaces | 469 | /* Simple routine to roll all the commandline arguments together with spaces |
470 | * between them. */ | 470 | * between them. */ |
@@ -481,9 +481,9 @@ static void concat(char *dst, char *args[]) | |||
481 | dst[len] = '\0'; | 481 | dst[len] = '\0'; |
482 | } | 482 | } |
483 | 483 | ||
484 | /* This is where we actually tell the kernel to initialize the Guest. We saw | 484 | /*L:185 This is where we actually tell the kernel to initialize the Guest. We |
485 | * the arguments it expects when we looked at initialize() in lguest_user.c: | 485 | * saw the arguments it expects when we looked at initialize() in lguest_user.c: |
486 | * the base of guest "physical" memory, the top physical page to allow, the | 486 | * the base of Guest "physical" memory, the top physical page to allow, the |
487 | * top level pagetable and the entry point for the Guest. */ | 487 | * top level pagetable and the entry point for the Guest. */ |
488 | static int tell_kernel(unsigned long pgdir, unsigned long start) | 488 | static int tell_kernel(unsigned long pgdir, unsigned long start) |
489 | { | 489 | { |
@@ -513,13 +513,14 @@ static void add_device_fd(int fd) | |||
513 | /*L:200 | 513 | /*L:200 |
514 | * The Waker. | 514 | * The Waker. |
515 | * | 515 | * |
516 | * With a console and network devices, we can have lots of input which we need | 516 | * With console, block and network devices, we can have lots of input which we |
517 | * to process. We could try to tell the kernel what file descriptors to watch, | 517 | * need to process. We could try to tell the kernel what file descriptors to |
518 | * but handing a file descriptor mask through to the kernel is fairly icky. | 518 | * watch, but handing a file descriptor mask through to the kernel is fairly |
519 | * icky. | ||
519 | * | 520 | * |
520 | * Instead, we fork off a process which watches the file descriptors and writes | 521 | * Instead, we fork off a process which watches the file descriptors and writes |
521 | * the LHREQ_BREAK command to the /dev/lguest filedescriptor to tell the Host | 522 | * the LHREQ_BREAK command to the /dev/lguest file descriptor to tell the Host |
522 | * loop to stop running the Guest. This causes it to return from the | 523 | * to stop running the Guest. This causes the Launcher to return from the |
523 | * /dev/lguest read with -EAGAIN, where it will write to /dev/lguest to reset | 524 | * /dev/lguest read with -EAGAIN, where it will write to /dev/lguest to reset |
524 | * the LHREQ_BREAK and wake us up again. | 525 | * the LHREQ_BREAK and wake us up again. |
525 | * | 526 | * |
@@ -545,7 +546,9 @@ static void wake_parent(int pipefd, int lguest_fd) | |||
545 | if (read(pipefd, &fd, sizeof(fd)) == 0) | 546 | if (read(pipefd, &fd, sizeof(fd)) == 0) |
546 | exit(0); | 547 | exit(0); |
547 | /* Otherwise it's telling us to change what file | 548 | /* Otherwise it's telling us to change what file |
548 | * descriptors we're to listen to. */ | 549 | * descriptors we're to listen to. Positive means |
550 | * listen to a new one, negative means stop | ||
551 | * listening. */ | ||
549 | if (fd >= 0) | 552 | if (fd >= 0) |
550 | FD_SET(fd, &devices.infds); | 553 | FD_SET(fd, &devices.infds); |
551 | else | 554 | else |
@@ -560,7 +563,7 @@ static int setup_waker(int lguest_fd) | |||
560 | { | 563 | { |
561 | int pipefd[2], child; | 564 | int pipefd[2], child; |
562 | 565 | ||
563 | /* We create a pipe to talk to the waker, and also so it knows when the | 566 | /* We create a pipe to talk to the Waker, and also so it knows when the |
564 | * Launcher dies (and closes pipe). */ | 567 | * Launcher dies (and closes pipe). */ |
565 | pipe(pipefd); | 568 | pipe(pipefd); |
566 | child = fork(); | 569 | child = fork(); |
@@ -568,7 +571,8 @@ static int setup_waker(int lguest_fd) | |||
568 | err(1, "forking"); | 571 | err(1, "forking"); |
569 | 572 | ||
570 | if (child == 0) { | 573 | if (child == 0) { |
571 | /* Close the "writing" end of our copy of the pipe */ | 574 | /* We are the Waker: close the "writing" end of our copy of the |
575 | * pipe and start waiting for input. */ | ||
572 | close(pipefd[1]); | 576 | close(pipefd[1]); |
573 | wake_parent(pipefd[0], lguest_fd); | 577 | wake_parent(pipefd[0], lguest_fd); |
574 | } | 578 | } |
@@ -579,12 +583,12 @@ static int setup_waker(int lguest_fd) | |||
579 | return pipefd[1]; | 583 | return pipefd[1]; |
580 | } | 584 | } |
581 | 585 | ||
582 | /*L:210 | 586 | /* |
583 | * Device Handling. | 587 | * Device Handling. |
584 | * | 588 | * |
585 | * When the Guest sends DMA to us, it sends us an array of addresses and sizes. | 589 | * When the Guest gives us a buffer, it sends an array of addresses and sizes. |
586 | * We need to make sure it's not trying to reach into the Launcher itself, so | 590 | * We need to make sure it's not trying to reach into the Launcher itself, so |
587 | * we have a convenient routine which check it and exits with an error message | 591 | * we have a convenient routine which checks it and exits with an error message |
588 | * if something funny is going on: | 592 | * if something funny is going on: |
589 | */ | 593 | */ |
590 | static void *_check_pointer(unsigned long addr, unsigned int size, | 594 | static void *_check_pointer(unsigned long addr, unsigned int size, |
@@ -601,7 +605,9 @@ static void *_check_pointer(unsigned long addr, unsigned int size, | |||
601 | /* A macro which transparently hands the line number to the real function. */ | 605 | /* A macro which transparently hands the line number to the real function. */ |
602 | #define check_pointer(addr,size) _check_pointer(addr, size, __LINE__) | 606 | #define check_pointer(addr,size) _check_pointer(addr, size, __LINE__) |
603 | 607 | ||
604 | /* This function returns the next descriptor in the chain, or vq->vring.num. */ | 608 | /* Each buffer in the virtqueues is actually a chain of descriptors. This |
609 | * function returns the next descriptor in the chain, or vq->vring.num if we're | ||
610 | * at the end. */ | ||
605 | static unsigned next_desc(struct virtqueue *vq, unsigned int i) | 611 | static unsigned next_desc(struct virtqueue *vq, unsigned int i) |
606 | { | 612 | { |
607 | unsigned int next; | 613 | unsigned int next; |
@@ -680,13 +686,14 @@ static unsigned get_vq_desc(struct virtqueue *vq, | |||
680 | return head; | 686 | return head; |
681 | } | 687 | } |
682 | 688 | ||
683 | /* Once we've used one of their buffers, we tell them about it. We'll then | 689 | /* After we've used one of their buffers, we tell them about it. We'll then |
684 | * want to send them an interrupt, using trigger_irq(). */ | 690 | * want to send them an interrupt, using trigger_irq(). */ |
685 | static void add_used(struct virtqueue *vq, unsigned int head, int len) | 691 | static void add_used(struct virtqueue *vq, unsigned int head, int len) |
686 | { | 692 | { |
687 | struct vring_used_elem *used; | 693 | struct vring_used_elem *used; |
688 | 694 | ||
689 | /* Get a pointer to the next entry in the used ring. */ | 695 | /* The virtqueue contains a ring of used buffers. Get a pointer to the |
696 | * next entry in that used ring. */ | ||
690 | used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num]; | 697 | used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num]; |
691 | used->id = head; | 698 | used->id = head; |
692 | used->len = len; | 699 | used->len = len; |
@@ -700,6 +707,7 @@ static void trigger_irq(int fd, struct virtqueue *vq) | |||
700 | { | 707 | { |
701 | unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; | 708 | unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; |
702 | 709 | ||
710 | /* If they don't want an interrupt, don't send one. */ | ||
703 | if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) | 711 | if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) |
704 | return; | 712 | return; |
705 | 713 | ||
@@ -716,8 +724,11 @@ static void add_used_and_trigger(int fd, struct virtqueue *vq, | |||
716 | trigger_irq(fd, vq); | 724 | trigger_irq(fd, vq); |
717 | } | 725 | } |
718 | 726 | ||
719 | /* Here is the input terminal setting we save, and the routine to restore them | 727 | /* |
720 | * on exit so the user can see what they type next. */ | 728 | * The Console |
729 | * | ||
730 | * Here is the input terminal setting we save, and the routine to restore them | ||
731 | * on exit so the user gets their terminal back. */ | ||
721 | static struct termios orig_term; | 732 | static struct termios orig_term; |
722 | static void restore_term(void) | 733 | static void restore_term(void) |
723 | { | 734 | { |
@@ -818,7 +829,10 @@ static void handle_console_output(int fd, struct virtqueue *vq) | |||
818 | } | 829 | } |
819 | } | 830 | } |
820 | 831 | ||
821 | /* Handling output for network is also simple: we get all the output buffers | 832 | /* |
833 | * The Network | ||
834 | * | ||
835 | * Handling output for network is also simple: we get all the output buffers | ||
822 | * and write them (ignoring the first element) to this device's file descriptor | 836 | * and write them (ignoring the first element) to this device's file descriptor |
823 | * (stdout). */ | 837 | * (stdout). */ |
824 | static void handle_net_output(int fd, struct virtqueue *vq) | 838 | static void handle_net_output(int fd, struct virtqueue *vq) |
@@ -831,8 +845,9 @@ static void handle_net_output(int fd, struct virtqueue *vq) | |||
831 | while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) { | 845 | while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) { |
832 | if (in) | 846 | if (in) |
833 | errx(1, "Input buffers in output queue?"); | 847 | errx(1, "Input buffers in output queue?"); |
834 | /* Check header, but otherwise ignore it (we said we supported | 848 | /* Check header, but otherwise ignore it (we told the Guest we |
835 | * no features). */ | 849 | * supported no features, so it shouldn't have anything |
850 | * interesting). */ | ||
836 | (void)convert(&iov[0], struct virtio_net_hdr); | 851 | (void)convert(&iov[0], struct virtio_net_hdr); |
837 | len = writev(vq->dev->fd, iov+1, out-1); | 852 | len = writev(vq->dev->fd, iov+1, out-1); |
838 | add_used_and_trigger(fd, vq, head, len); | 853 | add_used_and_trigger(fd, vq, head, len); |
@@ -883,7 +898,8 @@ static bool handle_tun_input(int fd, struct device *dev) | |||
883 | return true; | 898 | return true; |
884 | } | 899 | } |
885 | 900 | ||
886 | /* This callback ensures we try again, in case we stopped console or net | 901 | /*L:215 This is the callback attached to the network and console input |
902 | * virtqueues: it ensures we try again, in case we stopped console or net | ||
887 | * delivery because the Guest didn't have any buffers. */ | 903 | * delivery because the Guest didn't have any buffers. */ |
888 | static void enable_fd(int fd, struct virtqueue *vq) | 904 | static void enable_fd(int fd, struct virtqueue *vq) |
889 | { | 905 | { |
@@ -919,7 +935,7 @@ static void handle_output(int fd, unsigned long addr) | |||
919 | strnlen(from_guest_phys(addr), guest_limit - addr)); | 935 | strnlen(from_guest_phys(addr), guest_limit - addr)); |
920 | } | 936 | } |
921 | 937 | ||
922 | /* This is called when the waker wakes us up: check for incoming file | 938 | /* This is called when the Waker wakes us up: check for incoming file |
923 | * descriptors. */ | 939 | * descriptors. */ |
924 | static void handle_input(int fd) | 940 | static void handle_input(int fd) |
925 | { | 941 | { |
@@ -986,8 +1002,7 @@ static struct lguest_device_desc *new_dev_desc(u16 type) | |||
986 | } | 1002 | } |
987 | 1003 | ||
988 | /* Each device descriptor is followed by some configuration information. | 1004 | /* Each device descriptor is followed by some configuration information. |
989 | * The first byte is a "status" byte for the Guest to report what's happening. | 1005 | * Each configuration field looks like: u8 type, u8 len, [... len bytes...]. |
990 | * After that are fields: u8 type, u8 len, [... len bytes...]. | ||
991 | * | 1006 | * |
992 | * This routine adds a new field to an existing device's descriptor. It only | 1007 | * This routine adds a new field to an existing device's descriptor. It only |
993 | * works for the last device, but that's OK because that's how we use it. */ | 1008 | * works for the last device, but that's OK because that's how we use it. */ |
@@ -1044,14 +1059,17 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, | |||
1044 | /* Link virtqueue back to device. */ | 1059 | /* Link virtqueue back to device. */ |
1045 | vq->dev = dev; | 1060 | vq->dev = dev; |
1046 | 1061 | ||
1047 | /* Set up handler. */ | 1062 | /* Set the routine to call when the Guest does something to this |
1063 | * virtqueue. */ | ||
1048 | vq->handle_output = handle_output; | 1064 | vq->handle_output = handle_output; |
1065 | |||
1066 | /* Set the "Don't Notify Me" flag if we don't have a handler */ | ||
1049 | if (!handle_output) | 1067 | if (!handle_output) |
1050 | vq->vring.used->flags = VRING_USED_F_NO_NOTIFY; | 1068 | vq->vring.used->flags = VRING_USED_F_NO_NOTIFY; |
1051 | } | 1069 | } |
1052 | 1070 | ||
1053 | /* This routine does all the creation and setup of a new device, including | 1071 | /* This routine does all the creation and setup of a new device, including |
1054 | * caling new_dev_desc() to allocate the descriptor and device memory. */ | 1072 | * calling new_dev_desc() to allocate the descriptor and device memory. */ |
1055 | static struct device *new_device(const char *name, u16 type, int fd, | 1073 | static struct device *new_device(const char *name, u16 type, int fd, |
1056 | bool (*handle_input)(int, struct device *)) | 1074 | bool (*handle_input)(int, struct device *)) |
1057 | { | 1075 | { |
@@ -1060,7 +1078,7 @@ static struct device *new_device(const char *name, u16 type, int fd, | |||
1060 | /* Append to device list. Prepending to a single-linked list is | 1078 | /* Append to device list. Prepending to a single-linked list is |
1061 | * easier, but the user expects the devices to be arranged on the bus | 1079 | * easier, but the user expects the devices to be arranged on the bus |
1062 | * in command-line order. The first network device on the command line | 1080 | * in command-line order. The first network device on the command line |
1063 | * is eth0, the first block device /dev/lgba, etc. */ | 1081 | * is eth0, the first block device /dev/vda, etc. */ |
1064 | *devices.lastdev = dev; | 1082 | *devices.lastdev = dev; |
1065 | dev->next = NULL; | 1083 | dev->next = NULL; |
1066 | devices.lastdev = &dev->next; | 1084 | devices.lastdev = &dev->next; |
@@ -1104,7 +1122,7 @@ static void setup_console(void) | |||
1104 | /* The console needs two virtqueues: the input then the output. When | 1122 | /* The console needs two virtqueues: the input then the output. When |
1105 | * they put something in the input queue, we make sure we're listening to | 1123 | * they put something in the input queue, we make sure we're listening to |
1106 | * stdin. When they put something in the output queue, we write it to | 1124 | * stdin. When they put something in the output queue, we write it to |
1107 | * stdout. */ | 1125 | * stdout. */ |
1108 | add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd); | 1126 | add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd); |
1109 | add_virtqueue(dev, VIRTQUEUE_NUM, handle_console_output); | 1127 | add_virtqueue(dev, VIRTQUEUE_NUM, handle_console_output); |
1110 | 1128 | ||
@@ -1252,21 +1270,17 @@ static void setup_tun_net(const char *arg) | |||
1252 | verbose("attached to bridge: %s\n", br_name); | 1270 | verbose("attached to bridge: %s\n", br_name); |
1253 | } | 1271 | } |
1254 | 1272 | ||
1255 | 1273 | /* Our block (disk) device should be really simple: the Guest asks for a block | |
1256 | /* | 1274 | * number and we read or write that position in the file. Unfortunately, that |
1257 | * Block device. | 1275 | * was amazingly slow: the Guest waits until the read is finished before |
1276 | * running anything else, even if it could have been doing useful work. | ||
1258 | * | 1277 | * |
1259 | * Serving a block device is really easy: the Guest asks for a block number and | 1278 | * We could use async I/O, except it's reputed to suck so hard that characters |
1260 | * we read or write that position in the file. | 1279 | * actually go missing from your code when you try to use it. |
1261 | * | ||
1262 | * Unfortunately, this is amazingly slow: the Guest waits until the read is | ||
1263 | * finished before running anything else, even if it could be doing useful | ||
1264 | * work. We could use async I/O, except it's reputed to suck so hard that | ||
1265 | * characters actually go missing from your code when you try to use it. | ||
1266 | * | 1280 | * |
1267 | * So we farm the I/O out to a thread, and communicate with it via a pipe. */ | 1281 | * So we farm the I/O out to a thread, and communicate with it via a pipe. */ |
1268 | 1282 | ||
1269 | /* This hangs off device->priv, with the data. */ | 1283 | /* This hangs off device->priv. */ |
1270 | struct vblk_info | 1284 | struct vblk_info |
1271 | { | 1285 | { |
1272 | /* The size of the file. */ | 1286 | /* The size of the file. */ |
@@ -1282,8 +1296,14 @@ struct vblk_info | |||
1282 | * Launcher triggers interrupt to Guest. */ | 1296 | * Launcher triggers interrupt to Guest. */ |
1283 | int done_fd; | 1297 | int done_fd; |
1284 | }; | 1298 | }; |
1299 | /*:*/ | ||
1285 | 1300 | ||
1286 | /* This is the core of the I/O thread. It returns true if it did something. */ | 1301 | /*L:210 |
1302 | * The Disk | ||
1303 | * | ||
1304 | * Remember that the block device is handled by a separate I/O thread. We head | ||
1305 | * straight into the core of that thread here: | ||
1306 | */ | ||
1287 | static bool service_io(struct device *dev) | 1307 | static bool service_io(struct device *dev) |
1288 | { | 1308 | { |
1289 | struct vblk_info *vblk = dev->priv; | 1309 | struct vblk_info *vblk = dev->priv; |
@@ -1294,10 +1314,14 @@ static bool service_io(struct device *dev) | |||
1294 | struct iovec iov[dev->vq->vring.num]; | 1314 | struct iovec iov[dev->vq->vring.num]; |
1295 | off64_t off; | 1315 | off64_t off; |
1296 | 1316 | ||
1317 | /* See if there's a request waiting. If not, nothing to do. */ | ||
1297 | head = get_vq_desc(dev->vq, iov, &out_num, &in_num); | 1318 | head = get_vq_desc(dev->vq, iov, &out_num, &in_num); |
1298 | if (head == dev->vq->vring.num) | 1319 | if (head == dev->vq->vring.num) |
1299 | return false; | 1320 | return false; |
1300 | 1321 | ||
1322 | /* Every block request should contain at least one output buffer | ||
1323 | * (detailing the location on disk and the type of request) and one | ||
1324 | * input buffer (to hold the result). */ | ||
1301 | if (out_num == 0 || in_num == 0) | 1325 | if (out_num == 0 || in_num == 0) |
1302 | errx(1, "Bad virtblk cmd %u out=%u in=%u", | 1326 | errx(1, "Bad virtblk cmd %u out=%u in=%u", |
1303 | head, out_num, in_num); | 1327 | head, out_num, in_num); |
@@ -1306,10 +1330,15 @@ static bool service_io(struct device *dev) | |||
1306 | in = convert(&iov[out_num+in_num-1], struct virtio_blk_inhdr); | 1330 | in = convert(&iov[out_num+in_num-1], struct virtio_blk_inhdr); |
1307 | off = out->sector * 512; | 1331 | off = out->sector * 512; |
1308 | 1332 | ||
1309 | /* This is how we implement barriers. Pretty poor, no? */ | 1333 | /* The block device implements "barriers", where the Guest indicates |
1334 | * that it wants all previous writes to occur before this write. We | ||
1335 | * don't have a way of asking our kernel to do a barrier, so we just | ||
1336 | * synchronize all the data in the file. Pretty poor, no? */ | ||
1310 | if (out->type & VIRTIO_BLK_T_BARRIER) | 1337 | if (out->type & VIRTIO_BLK_T_BARRIER) |
1311 | fdatasync(vblk->fd); | 1338 | fdatasync(vblk->fd); |
1312 | 1339 | ||
1340 | /* In general the virtio block driver is allowed to try SCSI commands. | ||
1341 | * It'd be nice if we supported eject, for example, but we don't. */ | ||
1313 | if (out->type & VIRTIO_BLK_T_SCSI_CMD) { | 1342 | if (out->type & VIRTIO_BLK_T_SCSI_CMD) { |
1314 | fprintf(stderr, "Scsi commands unsupported\n"); | 1343 | fprintf(stderr, "Scsi commands unsupported\n"); |
1315 | in->status = VIRTIO_BLK_S_UNSUPP; | 1344 | in->status = VIRTIO_BLK_S_UNSUPP; |
@@ -1375,7 +1404,7 @@ static int io_thread(void *_dev) | |||
1375 | 1404 | ||
1376 | /* When this read fails, it means Launcher died, so we follow. */ | 1405 | /* When this read fails, it means Launcher died, so we follow. */ |
1377 | while (read(vblk->workpipe[0], &c, 1) == 1) { | 1406 | while (read(vblk->workpipe[0], &c, 1) == 1) { |
1378 | /* We acknowledge each request immediately, to reduce latency, | 1407 | /* We acknowledge each request immediately to reduce latency, |
1379 | * rather than waiting until we've done them all. I haven't | 1408 | * rather than waiting until we've done them all. I haven't |
1380 | * measured to see if it makes any difference. */ | 1409 | * measured to see if it makes any difference. */ |
1381 | while (service_io(dev)) | 1410 | while (service_io(dev)) |
@@ -1384,12 +1413,14 @@ static int io_thread(void *_dev) | |||
1384 | return 0; | 1413 | return 0; |
1385 | } | 1414 | } |
1386 | 1415 | ||
1387 | /* When the thread says some I/O is done, we interrupt the Guest. */ | 1416 | /* Now we've seen the I/O thread, we return to the Launcher to see what happens |
1417 | * when the thread tells us it's completed some I/O. */ | ||
1388 | static bool handle_io_finish(int fd, struct device *dev) | 1418 | static bool handle_io_finish(int fd, struct device *dev) |
1389 | { | 1419 | { |
1390 | char c; | 1420 | char c; |
1391 | 1421 | ||
1392 | /* If child died, presumably it printed message. */ | 1422 | /* If the I/O thread died, presumably it printed the error, so we |
1423 | * simply exit. */ | ||
1393 | if (read(dev->fd, &c, 1) != 1) | 1424 | if (read(dev->fd, &c, 1) != 1) |
1394 | exit(1); | 1425 | exit(1); |
1395 | 1426 | ||
@@ -1398,7 +1429,7 @@ static bool handle_io_finish(int fd, struct device *dev) | |||
1398 | return true; | 1429 | return true; |
1399 | } | 1430 | } |
1400 | 1431 | ||
1401 | /* When the Guest submits some I/O, we wake the I/O thread. */ | 1432 | /* When the Guest submits some I/O, we just need to wake the I/O thread. */ |
1402 | static void handle_virtblk_output(int fd, struct virtqueue *vq) | 1433 | static void handle_virtblk_output(int fd, struct virtqueue *vq) |
1403 | { | 1434 | { |
1404 | struct vblk_info *vblk = vq->dev->priv; | 1435 | struct vblk_info *vblk = vq->dev->priv; |
@@ -1410,7 +1441,7 @@ static void handle_virtblk_output(int fd, struct virtqueue *vq) | |||
1410 | exit(1); | 1441 | exit(1); |
1411 | } | 1442 | } |
1412 | 1443 | ||
1413 | /* This creates a virtual block device. */ | 1444 | /*L:198 This actually sets up a virtual block device. */ |
1414 | static void setup_block_file(const char *filename) | 1445 | static void setup_block_file(const char *filename) |
1415 | { | 1446 | { |
1416 | int p[2]; | 1447 | int p[2]; |
@@ -1426,7 +1457,7 @@ static void setup_block_file(const char *filename) | |||
1426 | /* The device responds to return from I/O thread. */ | 1457 | /* The device responds to return from I/O thread. */ |
1427 | dev = new_device("block", VIRTIO_ID_BLOCK, p[0], handle_io_finish); | 1458 | dev = new_device("block", VIRTIO_ID_BLOCK, p[0], handle_io_finish); |
1428 | 1459 | ||
1429 | /* The device has a virtqueue. */ | 1460 | /* The device has one virtqueue, where the Guest places requests. */ |
1430 | add_virtqueue(dev, VIRTQUEUE_NUM, handle_virtblk_output); | 1461 | add_virtqueue(dev, VIRTQUEUE_NUM, handle_virtblk_output); |
1431 | 1462 | ||
1432 | /* Allocate the room for our own bookkeeping */ | 1463 | /* Allocate the room for our own bookkeeping */ |
@@ -1448,7 +1479,8 @@ static void setup_block_file(const char *filename) | |||
1448 | /* The I/O thread writes to this end of the pipe when done. */ | 1479 | /* The I/O thread writes to this end of the pipe when done. */ |
1449 | vblk->done_fd = p[1]; | 1480 | vblk->done_fd = p[1]; |
1450 | 1481 | ||
1451 | /* This is how we tell the I/O thread about more work. */ | 1482 | /* This is the second pipe, which is how we tell the I/O thread about |
1483 | * more work. */ | ||
1452 | pipe(vblk->workpipe); | 1484 | pipe(vblk->workpipe); |
1453 | 1485 | ||
1454 | /* Create stack for thread and run it */ | 1486 | /* Create stack for thread and run it */ |
@@ -1487,24 +1519,25 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd) | |||
1487 | char reason[1024] = { 0 }; | 1519 | char reason[1024] = { 0 }; |
1488 | read(lguest_fd, reason, sizeof(reason)-1); | 1520 | read(lguest_fd, reason, sizeof(reason)-1); |
1489 | errx(1, "%s", reason); | 1521 | errx(1, "%s", reason); |
1490 | /* EAGAIN means the waker wanted us to look at some input. | 1522 | /* EAGAIN means the Waker wanted us to look at some input. |
1491 | * Anything else means a bug or incompatible change. */ | 1523 | * Anything else means a bug or incompatible change. */ |
1492 | } else if (errno != EAGAIN) | 1524 | } else if (errno != EAGAIN) |
1493 | err(1, "Running guest failed"); | 1525 | err(1, "Running guest failed"); |
1494 | 1526 | ||
1495 | /* Service input, then unset the BREAK which releases | 1527 | /* Service input, then unset the BREAK to release the Waker. */ |
1496 | * the Waker. */ | ||
1497 | handle_input(lguest_fd); | 1528 | handle_input(lguest_fd); |
1498 | if (write(lguest_fd, args, sizeof(args)) < 0) | 1529 | if (write(lguest_fd, args, sizeof(args)) < 0) |
1499 | err(1, "Resetting break"); | 1530 | err(1, "Resetting break"); |
1500 | } | 1531 | } |
1501 | } | 1532 | } |
1502 | /* | 1533 | /* |
1503 | * This is the end of the Launcher. | 1534 | * This is the end of the Launcher. The good news: we are over halfway |
1535 | * through! The bad news: the most fiendish part of the code still lies ahead | ||
1536 | * of us. | ||
1504 | * | 1537 | * |
1505 | * But wait! We've seen I/O from the Launcher, and we've seen I/O from the | 1538 | * Are you ready? Take a deep breath and join me in the core of the Host, in |
1506 | * Drivers. If we were to see the Host kernel I/O code, our understanding | 1539 | * "make Host". |
1507 | * would be complete... :*/ | 1540 | :*/ |
1508 | 1541 | ||
1509 | static struct option opts[] = { | 1542 | static struct option opts[] = { |
1510 | { "verbose", 0, NULL, 'v' }, | 1543 | { "verbose", 0, NULL, 'v' }, |
@@ -1527,7 +1560,7 @@ int main(int argc, char *argv[]) | |||
1527 | /* Memory, top-level pagetable, code startpoint and size of the | 1560 | /* Memory, top-level pagetable, code startpoint and size of the |
1528 | * (optional) initrd. */ | 1561 | * (optional) initrd. */ |
1529 | unsigned long mem = 0, pgdir, start, initrd_size = 0; | 1562 | unsigned long mem = 0, pgdir, start, initrd_size = 0; |
1530 | /* A temporary and the /dev/lguest file descriptor. */ | 1563 | /* Two temporaries and the /dev/lguest file descriptor. */ |
1531 | int i, c, lguest_fd; | 1564 | int i, c, lguest_fd; |
1532 | /* The boot information for the Guest. */ | 1565 | /* The boot information for the Guest. */ |
1533 | struct boot_params *boot; | 1566 | struct boot_params *boot; |
@@ -1622,6 +1655,7 @@ int main(int argc, char *argv[]) | |||
1622 | /* The boot header contains a command line pointer: we put the command | 1655 | /* The boot header contains a command line pointer: we put the command |
1623 | * line after the boot header. */ | 1656 | * line after the boot header. */ |
1624 | boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1); | 1657 | boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1); |
1658 | /* We use a simple helper to copy the arguments separated by spaces. */ | ||
1625 | concat((char *)(boot + 1), argv+optind+2); | 1659 | concat((char *)(boot + 1), argv+optind+2); |
1626 | 1660 | ||
1627 | /* Boot protocol version: 2.07 supports the fields for lguest. */ | 1661 | /* Boot protocol version: 2.07 supports the fields for lguest. */ |