 Documentation/lguest/lguest.c       | 184
 arch/x86/include/asm/lguest_hcall.h |   8
 arch/x86/lguest/boot.c              |  99
 arch/x86/lguest/i386_head.S         |   2
 drivers/lguest/core.c               |   7
 drivers/lguest/hypercalls.c         |   6
 drivers/lguest/lguest_device.c      |  11
 drivers/lguest/lguest_user.c        | 100
 drivers/lguest/page_tables.c        |  84
 drivers/lguest/x86/core.c           |   2
 drivers/lguest/x86/switcher_32.S    |   6
 11 files changed, 398 insertions(+), 111 deletions(-)
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index aa66a52b73e9..45163651b519 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
| @@ -49,7 +49,7 @@ | |||
| 49 | #include "linux/virtio_ring.h" | 49 | #include "linux/virtio_ring.h" |
| 50 | #include "asm/bootparam.h" | 50 | #include "asm/bootparam.h" |
| 51 | /*L:110 | 51 | /*L:110 |
| 52 | * We can ignore the 39 include files we need for this program, but I do want | 52 | * We can ignore the 42 include files we need for this program, but I do want |
| 53 | * to draw attention to the use of kernel-style types. | 53 | * to draw attention to the use of kernel-style types. |
| 54 | * | 54 | * |
| 55 | * As Linus said, "C is a Spartan language, and so should your naming be." I | 55 | * As Linus said, "C is a Spartan language, and so should your naming be." I |
| @@ -305,6 +305,11 @@ static void *map_zeroed_pages(unsigned int num) | |||
| 305 | PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0); | 305 | PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0); |
| 306 | if (addr == MAP_FAILED) | 306 | if (addr == MAP_FAILED) |
| 307 | err(1, "Mmaping %u pages of /dev/zero", num); | 307 | err(1, "Mmaping %u pages of /dev/zero", num); |
| 308 | |||
| 309 | /* | ||
| 310 | * One neat mmap feature is that you can close the fd, and it | ||
| 311 | * stays mapped. | ||
| 312 | */ | ||
| 308 | close(fd); | 313 | close(fd); |
| 309 | 314 | ||
| 310 | return addr; | 315 | return addr; |
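The "close the fd, and it stays mapped" remark deserves a tiny standalone demonstration, since it surprises people. A minimal sketch (illustrative, not part of the patch):

#include <err.h>
#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/zero", O_RDONLY);
	if (fd < 0)
		err(1, "Opening /dev/zero");

	/* A private, writable mapping of one page of zeroes. */
	char *p = mmap(NULL, getpagesize(), PROT_READ|PROT_WRITE,
		       MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED)
		err(1, "Mmaping /dev/zero");

	/* The mapping holds its own reference, so the fd can go. */
	close(fd);

	strcpy(p, "still mapped\n");	/* Works fine after the close. */
	write(STDOUT_FILENO, p, strlen(p));
	return munmap(p, getpagesize());
}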
| @@ -557,7 +562,7 @@ static void tell_kernel(unsigned long start) | |||
| 557 | } | 562 | } |
| 558 | /*:*/ | 563 | /*:*/ |
| 559 | 564 | ||
| 560 | /* | 565 | /*L:200 |
| 561 | * Device Handling. | 566 | * Device Handling. |
| 562 | * | 567 | * |
| 563 | * When the Guest gives us a buffer, it sends an array of addresses and sizes. | 568 | * When the Guest gives us a buffer, it sends an array of addresses and sizes. |
| @@ -608,7 +613,10 @@ static unsigned next_desc(struct vring_desc *desc, | |||
| 608 | return next; | 613 | return next; |
| 609 | } | 614 | } |
| 610 | 615 | ||
| 611 | /* This actually sends the interrupt for this virtqueue */ | 616 | /* |
| 617 | * This actually sends the interrupt for this virtqueue, if we've used a | ||
| 618 | * buffer. | ||
| 619 | */ | ||
| 612 | static void trigger_irq(struct virtqueue *vq) | 620 | static void trigger_irq(struct virtqueue *vq) |
| 613 | { | 621 | { |
| 614 | unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; | 622 | unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; |
| @@ -629,12 +637,12 @@ static void trigger_irq(struct virtqueue *vq) | |||
| 629 | } | 637 | } |
| 630 | 638 | ||
| 631 | /* | 639 | /* |
| 632 | * This looks in the virtqueue and for the first available buffer, and converts | 640 | * This looks in the virtqueue for the first available buffer, and converts |
| 633 | * it to an iovec for convenient access. Since descriptors consist of some | 641 | * it to an iovec for convenient access. Since descriptors consist of some |
| 634 | * number of output then some number of input descriptors, it's actually two | 642 | * number of output then some number of input descriptors, it's actually two |
| 635 | * iovecs, but we pack them into one and note how many of each there were. | 643 | * iovecs, but we pack them into one and note how many of each there were. |
| 636 | * | 644 | * |
| 637 | * This function returns the descriptor number found. | 645 | * This function waits if necessary, and returns the descriptor number found. |
| 638 | */ | 646 | */ |
| 639 | static unsigned wait_for_vq_desc(struct virtqueue *vq, | 647 | static unsigned wait_for_vq_desc(struct virtqueue *vq, |
| 640 | struct iovec iov[], | 648 | struct iovec iov[], |
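To make "it's actually two iovecs, but we pack them into one" concrete, here is a hedged sketch of the conversion loop at the heart of wait_for_vq_desc(); chain_to_iov() is a hypothetical name, and from_guest_phys() is the Launcher helper that turns a Guest-physical address into one we can dereference:

static void chain_to_iov(struct vring_desc *desc, unsigned int head,
			 unsigned int max, struct iovec iov[],
			 unsigned int *out_num, unsigned int *in_num)
{
	unsigned int i = head;

	*out_num = *in_num = 0;
	for (;;) {
		/* Each descriptor becomes one iovec entry. */
		iov[*out_num + *in_num].iov_base = from_guest_phys(desc[i].addr);
		iov[*out_num + *in_num].iov_len = desc[i].len;
		if (desc[i].flags & VRING_DESC_F_WRITE)
			(*in_num)++;	/* We write here: an input buffer. */
		else
			(*out_num)++;	/* Guest wrote it: an output buffer. */
		if (!(desc[i].flags & VRING_DESC_F_NEXT))
			break;		/* End of the chain. */
		i = desc[i].next;
		if (i >= max)
			errx(1, "Desc next is %u", i);
	}
}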
| @@ -644,10 +652,14 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq, | |||
| 644 | struct vring_desc *desc; | 652 | struct vring_desc *desc; |
| 645 | u16 last_avail = lg_last_avail(vq); | 653 | u16 last_avail = lg_last_avail(vq); |
| 646 | 654 | ||
| 655 | /* There's nothing available? */ | ||
| 647 | while (last_avail == vq->vring.avail->idx) { | 656 | while (last_avail == vq->vring.avail->idx) { |
| 648 | u64 event; | 657 | u64 event; |
| 649 | 658 | ||
| 650 | /* OK, tell Guest about progress up to now. */ | 659 | /* |
| 660 | * Since we're about to sleep, now is a good time to tell the | ||
| 661 | * Guest about what we've used up to now. | ||
| 662 | */ | ||
| 651 | trigger_irq(vq); | 663 | trigger_irq(vq); |
| 652 | 664 | ||
| 653 | /* OK, now we need to know about added descriptors. */ | 665 | /* OK, now we need to know about added descriptors. */ |
| @@ -734,8 +746,9 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq, | |||
| 734 | } | 746 | } |
| 735 | 747 | ||
| 736 | /* | 748 | /* |
| 737 | * After we've used one of their buffers, we tell them about it. We'll then | 749 | * After we've used one of their buffers, we tell the Guest about it. Sometime |
| 738 | * want to send them an interrupt, using trigger_irq(). | 750 | * later we'll want to send them an interrupt using trigger_irq(); note that |
| 751 | * wait_for_vq_desc() does that for us if it has to wait. | ||
| 739 | */ | 752 | */ |
| 740 | static void add_used(struct virtqueue *vq, unsigned int head, int len) | 753 | static void add_used(struct virtqueue *vq, unsigned int head, int len) |
| 741 | { | 754 | { |
| @@ -782,12 +795,12 @@ static void console_input(struct virtqueue *vq) | |||
| 782 | struct console_abort *abort = vq->dev->priv; | 795 | struct console_abort *abort = vq->dev->priv; |
| 783 | struct iovec iov[vq->vring.num]; | 796 | struct iovec iov[vq->vring.num]; |
| 784 | 797 | ||
| 785 | /* Make sure there's a descriptor waiting. */ | 798 | /* Make sure there's a descriptor available. */ |
| 786 | head = wait_for_vq_desc(vq, iov, &out_num, &in_num); | 799 | head = wait_for_vq_desc(vq, iov, &out_num, &in_num); |
| 787 | if (out_num) | 800 | if (out_num) |
| 788 | errx(1, "Output buffers in console in queue?"); | 801 | errx(1, "Output buffers in console in queue?"); |
| 789 | 802 | ||
| 790 | /* Read it in. */ | 803 | /* Read into it. This is where we usually wait. */ |
| 791 | len = readv(STDIN_FILENO, iov, in_num); | 804 | len = readv(STDIN_FILENO, iov, in_num); |
| 792 | if (len <= 0) { | 805 | if (len <= 0) { |
| 793 | /* Ran out of input? */ | 806 | /* Ran out of input? */ |
| @@ -800,6 +813,7 @@ static void console_input(struct virtqueue *vq) | |||
| 800 | pause(); | 813 | pause(); |
| 801 | } | 814 | } |
| 802 | 815 | ||
| 816 | /* Tell the Guest we used a buffer. */ | ||
| 803 | add_used_and_trigger(vq, head, len); | 817 | add_used_and_trigger(vq, head, len); |
| 804 | 818 | ||
| 805 | /* | 819 | /* |
| @@ -834,15 +848,23 @@ static void console_output(struct virtqueue *vq) | |||
| 834 | unsigned int head, out, in; | 848 | unsigned int head, out, in; |
| 835 | struct iovec iov[vq->vring.num]; | 849 | struct iovec iov[vq->vring.num]; |
| 836 | 850 | ||
| 851 | /* We usually wait in here, for the Guest to give us something. */ | ||
| 837 | head = wait_for_vq_desc(vq, iov, &out, &in); | 852 | head = wait_for_vq_desc(vq, iov, &out, &in); |
| 838 | if (in) | 853 | if (in) |
| 839 | errx(1, "Input buffers in console output queue?"); | 854 | errx(1, "Input buffers in console output queue?"); |
| 855 | |||
| 856 | /* writev can return a partial write, so we loop here. */ | ||
| 840 | while (!iov_empty(iov, out)) { | 857 | while (!iov_empty(iov, out)) { |
| 841 | int len = writev(STDOUT_FILENO, iov, out); | 858 | int len = writev(STDOUT_FILENO, iov, out); |
| 842 | if (len <= 0) | 859 | if (len <= 0) |
| 843 | err(1, "Write to stdout gave %i", len); | 860 | err(1, "Write to stdout gave %i", len); |
| 844 | iov_consume(iov, out, len); | 861 | iov_consume(iov, out, len); |
| 845 | } | 862 | } |
| 863 | |||
| 864 | /* | ||
| 865 | * We're finished with that buffer: if we're going to sleep, | ||
| 866 | * wait_for_vq_desc() will prod the Guest with an interrupt. | ||
| 867 | */ | ||
| 846 | add_used(vq, head, 0); | 868 | add_used(vq, head, 0); |
| 847 | } | 869 | } |
| 848 | 870 | ||
| @@ -862,15 +884,30 @@ static void net_output(struct virtqueue *vq) | |||
| 862 | unsigned int head, out, in; | 884 | unsigned int head, out, in; |
| 863 | struct iovec iov[vq->vring.num]; | 885 | struct iovec iov[vq->vring.num]; |
| 864 | 886 | ||
| 887 | /* We usually wait in here for the Guest to give us a packet. */ | ||
| 865 | head = wait_for_vq_desc(vq, iov, &out, &in); | 888 | head = wait_for_vq_desc(vq, iov, &out, &in); |
| 866 | if (in) | 889 | if (in) |
| 867 | errx(1, "Input buffers in net output queue?"); | 890 | errx(1, "Input buffers in net output queue?"); |
| 891 | /* | ||
| 892 | * Send the whole thing through to /dev/net/tun. It expects the exact | ||
| 893 | * same format: what a coincidence! | ||
| 894 | */ | ||
| 868 | if (writev(net_info->tunfd, iov, out) < 0) | 895 | if (writev(net_info->tunfd, iov, out) < 0) |
| 869 | errx(1, "Write to tun failed?"); | 896 | errx(1, "Write to tun failed?"); |
| 897 | |||
| 898 | /* | ||
| 899 | * Done with that one; wait_for_vq_desc() will send the interrupt if | ||
| 900 | * all packets are processed. | ||
| 901 | */ | ||
| 870 | add_used(vq, head, 0); | 902 | add_used(vq, head, 0); |
| 871 | } | 903 | } |
| 872 | 904 | ||
| 873 | /* Will reading from this file descriptor block? */ | 905 | /* |
| 906 | * Handling network input is a bit trickier, because I've tried to optimize it. | ||
| 907 | * | ||
| 908 | * First we have a helper routine which tells us if reading from this file | ||
| 909 | * descriptor (ie. the /dev/net/tun device) will block: | ||
| 910 | */ | ||
| 874 | static bool will_block(int fd) | 911 | static bool will_block(int fd) |
| 875 | { | 912 | { |
| 876 | fd_set fdset; | 913 | fd_set fdset; |
| @@ -880,7 +917,11 @@ static bool will_block(int fd) | |||
| 880 | return select(fd+1, &fdset, NULL, NULL, &zero) != 1; | 917 | return select(fd+1, &fdset, NULL, NULL, &zero) != 1; |
| 881 | } | 918 | } |
| 882 | 919 | ||
| 883 | /* This handles packets coming in from the tun device to our Guest. */ | 920 | /* |
| 921 | * This handles packets coming in from the tun device to our Guest. Like all | ||
| 922 | * service routines, it gets called again as soon as it returns, so you don't | ||
| 923 | * see a while(1) loop here. | ||
| 924 | */ | ||
| 884 | static void net_input(struct virtqueue *vq) | 925 | static void net_input(struct virtqueue *vq) |
| 885 | { | 926 | { |
| 886 | int len; | 927 | int len; |
| @@ -888,21 +929,38 @@ static void net_input(struct virtqueue *vq) | |||
| 888 | struct iovec iov[vq->vring.num]; | 929 | struct iovec iov[vq->vring.num]; |
| 889 | struct net_info *net_info = vq->dev->priv; | 930 | struct net_info *net_info = vq->dev->priv; |
| 890 | 931 | ||
| 932 | /* | ||
| 933 | * Get a descriptor to write an incoming packet into. This will also | ||
| 934 | * send an interrupt if they're out of descriptors. | ||
| 935 | */ | ||
| 891 | head = wait_for_vq_desc(vq, iov, &out, &in); | 936 | head = wait_for_vq_desc(vq, iov, &out, &in); |
| 892 | if (out) | 937 | if (out) |
| 893 | errx(1, "Output buffers in net input queue?"); | 938 | errx(1, "Output buffers in net input queue?"); |
| 894 | 939 | ||
| 895 | /* Deliver interrupt now, since we're about to sleep. */ | 940 | /* |
| 941 | * If it looks like we'll block reading from the tun device, send them | ||
| 942 | * an interrupt. | ||
| 943 | */ | ||
| 896 | if (vq->pending_used && will_block(net_info->tunfd)) | 944 | if (vq->pending_used && will_block(net_info->tunfd)) |
| 897 | trigger_irq(vq); | 945 | trigger_irq(vq); |
| 898 | 946 | ||
| 947 | /* | ||
| 948 | * Read in the packet. This is where we normally wait (when there's no | ||
| 949 | * incoming network traffic). | ||
| 950 | */ | ||
| 899 | len = readv(net_info->tunfd, iov, in); | 951 | len = readv(net_info->tunfd, iov, in); |
| 900 | if (len <= 0) | 952 | if (len <= 0) |
| 901 | err(1, "Failed to read from tun."); | 953 | err(1, "Failed to read from tun."); |
| 954 | |||
| 955 | /* | ||
| 956 | * Mark that packet buffer as used, but don't interrupt here. We want | ||
| 957 | * to wait until we've done as much work as we can. | ||
| 958 | */ | ||
| 902 | add_used(vq, head, len); | 959 | add_used(vq, head, len); |
| 903 | } | 960 | } |
| 961 | /*:*/ | ||
| 904 | 962 | ||
| 905 | /* This is the helper to create threads. */ | 963 | /* This is the helper to create threads: run the service routine in a loop. */ |
| 906 | static int do_thread(void *_vq) | 964 | static int do_thread(void *_vq) |
| 907 | { | 965 | { |
| 908 | struct virtqueue *vq = _vq; | 966 | struct virtqueue *vq = _vq; |
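The hunk cuts do_thread() off after its first statement, but there is not much more to see; in this version of lguest.c the whole helper is roughly:

static int do_thread(void *_vq)
{
	struct virtqueue *vq = _vq;

	/* The blocking happens inside the service routine (in
	 * wait_for_vq_desc()), so this loop isn't a busy-wait. */
	for (;;)
		vq->service(vq);
	return 0;
}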
| @@ -950,11 +1008,14 @@ static void reset_device(struct device *dev) | |||
| 950 | signal(SIGCHLD, (void *)kill_launcher); | 1008 | signal(SIGCHLD, (void *)kill_launcher); |
| 951 | } | 1009 | } |
| 952 | 1010 | ||
| 1011 | /*L:216 | ||
| 1012 | * This actually creates the thread which services the virtqueue for a device. | ||
| 1013 | */ | ||
| 953 | static void create_thread(struct virtqueue *vq) | 1014 | static void create_thread(struct virtqueue *vq) |
| 954 | { | 1015 | { |
| 955 | /* | 1016 | /* |
| 956 | * Create stack for thread and run it. Since the stack grows downwards, | 1017 | * Create stack for thread. Since the stack grows downwards, we point |
| 957 | * we point the stack pointer to the end of this region. | 1018 | * the stack pointer to the end of this region. |
| 958 | */ | 1019 | */ |
| 959 | char *stack = malloc(32768); | 1020 | char *stack = malloc(32768); |
| 960 | unsigned long args[] = { LHREQ_EVENTFD, | 1021 | unsigned long args[] = { LHREQ_EVENTFD, |
| @@ -966,17 +1027,22 @@ static void create_thread(struct virtqueue *vq) | |||
| 966 | err(1, "Creating eventfd"); | 1027 | err(1, "Creating eventfd"); |
| 967 | args[2] = vq->eventfd; | 1028 | args[2] = vq->eventfd; |
| 968 | 1029 | ||
| 969 | /* Attach an eventfd to this virtqueue: it will go off | 1030 | /* |
| 970 | * when the Guest does an LHCALL_NOTIFY for this vq. */ | 1031 | * Attach an eventfd to this virtqueue: it will go off when the Guest |
| 1032 | * does an LHCALL_NOTIFY for this vq. | ||
| 1033 | */ | ||
| 971 | if (write(lguest_fd, &args, sizeof(args)) != 0) | 1034 | if (write(lguest_fd, &args, sizeof(args)) != 0) |
| 972 | err(1, "Attaching eventfd"); | 1035 | err(1, "Attaching eventfd"); |
| 973 | 1036 | ||
| 974 | /* CLONE_VM: because it has to access the Guest memory, and | 1037 | /* |
| 975 | * SIGCHLD so we get a signal if it dies. */ | 1038 | * CLONE_VM: because it has to access the Guest memory, and SIGCHLD so |
| 1039 | * we get a signal if it dies. | ||
| 1040 | */ | ||
| 976 | vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq); | 1041 | vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq); |
| 977 | if (vq->thread == (pid_t)-1) | 1042 | if (vq->thread == (pid_t)-1) |
| 978 | err(1, "Creating clone"); | 1043 | err(1, "Creating clone"); |
| 979 | /* We close our local copy, now the child has it. */ | 1044 | |
| 1045 | /* We close our local copy now the child has it. */ | ||
| 980 | close(vq->eventfd); | 1046 | close(vq->eventfd); |
| 981 | } | 1047 | } |
| 982 | 1048 | ||
| @@ -1028,7 +1094,10 @@ static void update_device_status(struct device *dev) | |||
| 1028 | } | 1094 | } |
| 1029 | } | 1095 | } |
| 1030 | 1096 | ||
| 1031 | /* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */ | 1097 | /*L:215 |
| 1098 | * This is the generic routine we call when the Guest uses LHCALL_NOTIFY. In | ||
| 1099 | * particular, it's used to notify us of device status changes during boot. | ||
| 1100 | */ | ||
| 1032 | static void handle_output(unsigned long addr) | 1101 | static void handle_output(unsigned long addr) |
| 1033 | { | 1102 | { |
| 1034 | struct device *i; | 1103 | struct device *i; |
| @@ -1037,18 +1106,32 @@ static void handle_output(unsigned long addr) | |||
| 1037 | for (i = devices.dev; i; i = i->next) { | 1106 | for (i = devices.dev; i; i = i->next) { |
| 1038 | struct virtqueue *vq; | 1107 | struct virtqueue *vq; |
| 1039 | 1108 | ||
| 1040 | /* Notifications to device descriptors update device status. */ | 1109 | /* |
| 1110 | * Notifications to device descriptors mean they updated the | ||
| 1111 | * device status. | ||
| 1112 | */ | ||
| 1041 | if (from_guest_phys(addr) == i->desc) { | 1113 | if (from_guest_phys(addr) == i->desc) { |
| 1042 | update_device_status(i); | 1114 | update_device_status(i); |
| 1043 | return; | 1115 | return; |
| 1044 | } | 1116 | } |
| 1045 | 1117 | ||
| 1046 | /* Devices *can* be used before status is set to DRIVER_OK. */ | 1118 | /* |
| 1119 | * Devices *can* be used before status is set to DRIVER_OK. | ||
| 1120 | * The original plan was that they would never do this: they | ||
| 1121 | * would always finish setting up their status bits before | ||
| 1122 | * actually touching the virtqueues. In practice, we allowed | ||
| 1123 | * them to, and they do (eg. the disk probes for partition | ||
| 1124 | * tables as part of initialization). | ||
| 1125 | * | ||
| 1126 | * If we see this, we start the device: once it's running, we | ||
| 1127 | * expect the device to catch all the notifications. | ||
| 1128 | */ | ||
| 1047 | for (vq = i->vq; vq; vq = vq->next) { | 1129 | for (vq = i->vq; vq; vq = vq->next) { |
| 1048 | if (addr != vq->config.pfn*getpagesize()) | 1130 | if (addr != vq->config.pfn*getpagesize()) |
| 1049 | continue; | 1131 | continue; |
| 1050 | if (i->running) | 1132 | if (i->running) |
| 1051 | errx(1, "Notification on running %s", i->name); | 1133 | errx(1, "Notification on running %s", i->name); |
| 1134 | /* This just calls create_thread() for each virtqueue */ | ||
| 1052 | start_device(i); | 1135 | start_device(i); |
| 1053 | return; | 1136 | return; |
| 1054 | } | 1137 | } |
| @@ -1132,6 +1215,11 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, | |||
| 1132 | vq->next = NULL; | 1215 | vq->next = NULL; |
| 1133 | vq->last_avail_idx = 0; | 1216 | vq->last_avail_idx = 0; |
| 1134 | vq->dev = dev; | 1217 | vq->dev = dev; |
| 1218 | |||
| 1219 | /* | ||
| 1220 | * This is the routine the service thread will run, and its Process ID | ||
| 1221 | * once it's running. | ||
| 1222 | */ | ||
| 1135 | vq->service = service; | 1223 | vq->service = service; |
| 1136 | vq->thread = (pid_t)-1; | 1224 | vq->thread = (pid_t)-1; |
| 1137 | 1225 | ||
| @@ -1202,7 +1290,8 @@ static void set_config(struct device *dev, unsigned len, const void *conf) | |||
| 1202 | 1290 | ||
| 1203 | /* | 1291 | /* |
| 1204 | * This routine does all the creation and setup of a new device, including | 1292 | * This routine does all the creation and setup of a new device, including |
| 1205 | * calling new_dev_desc() to allocate the descriptor and device memory. | 1293 | * calling new_dev_desc() to allocate the descriptor and device memory. We |
| 1294 | * don't actually start the service threads until later. | ||
| 1206 | * | 1295 | * |
| 1207 | * See what I mean about userspace being boring? | 1296 | * See what I mean about userspace being boring? |
| 1208 | */ | 1297 | */ |
| @@ -1478,19 +1567,7 @@ static void setup_tun_net(char *arg) | |||
| 1478 | verbose("device %u: tun %s: %s\n", | 1567 | verbose("device %u: tun %s: %s\n", |
| 1479 | devices.device_num, tapif, arg); | 1568 | devices.device_num, tapif, arg); |
| 1480 | } | 1569 | } |
| 1481 | 1570 | /*:*/ | |
| 1482 | /* | ||
| 1483 | * Our block (disk) device should be really simple: the Guest asks for a block | ||
| 1484 | * number and we read or write that position in the file. Unfortunately, that | ||
| 1485 | * was amazingly slow: the Guest waits until the read is finished before | ||
| 1486 | * running anything else, even if it could have been doing useful work. | ||
| 1487 | * | ||
| 1488 | * We could use async I/O, except it's reputed to suck so hard that characters | ||
| 1489 | * actually go missing from your code when you try to use it. | ||
| 1490 | * | ||
| 1491 | * So this was one reason why lguest now does all virtqueue servicing in | ||
| 1492 | * separate threads: it's more efficient and more like a real device. | ||
| 1493 | */ | ||
| 1494 | 1571 | ||
| 1495 | /* This hangs off device->priv. */ | 1572 | /* This hangs off device->priv. */ |
| 1496 | struct vblk_info | 1573 | struct vblk_info |
| @@ -1512,8 +1589,16 @@ struct vblk_info | |||
| 1512 | /*L:210 | 1589 | /*L:210 |
| 1513 | * The Disk | 1590 | * The Disk |
| 1514 | * | 1591 | * |
| 1515 | * Remember that the block device is handled by a separate I/O thread. We head | 1592 | * The disk only has one virtqueue, so it only has one thread. It is really |
| 1516 | * straight into the core of that thread here: | 1593 | * simple: the Guest asks for a block number and we read or write that position |
| 1594 | * in the file. | ||
| 1595 | * | ||
| 1596 | * Before we serviced each virtqueue in a separate thread, that was unacceptably | ||
| 1597 | * slow: the Guest waits until the read is finished before running anything | ||
| 1598 | * else, even if it could have been doing useful work. | ||
| 1599 | * | ||
| 1600 | * We could have used async I/O, except it's reputed to suck so hard that | ||
| 1601 | * characters actually go missing from your code when you try to use it. | ||
| 1517 | */ | 1602 | */ |
| 1518 | static void blk_request(struct virtqueue *vq) | 1603 | static void blk_request(struct virtqueue *vq) |
| 1519 | { | 1604 | { |
| @@ -1525,7 +1610,10 @@ static void blk_request(struct virtqueue *vq) | |||
| 1525 | struct iovec iov[vq->vring.num]; | 1610 | struct iovec iov[vq->vring.num]; |
| 1526 | off64_t off; | 1611 | off64_t off; |
| 1527 | 1612 | ||
| 1528 | /* Get the next request. */ | 1613 | /* |
| 1614 | * Get the next request, where we normally wait. It triggers the | ||
| 1615 | * interrupt to acknowledge previously serviced requests (if any). | ||
| 1616 | */ | ||
| 1529 | head = wait_for_vq_desc(vq, iov, &out_num, &in_num); | 1617 | head = wait_for_vq_desc(vq, iov, &out_num, &in_num); |
| 1530 | 1618 | ||
| 1531 | /* | 1619 | /* |
| @@ -1539,6 +1627,10 @@ static void blk_request(struct virtqueue *vq) | |||
| 1539 | 1627 | ||
| 1540 | out = convert(&iov[0], struct virtio_blk_outhdr); | 1628 | out = convert(&iov[0], struct virtio_blk_outhdr); |
| 1541 | in = convert(&iov[out_num+in_num-1], u8); | 1629 | in = convert(&iov[out_num+in_num-1], u8); |
| 1630 | /* | ||
| 1631 | * For historical reasons, block operations are expressed in 512 byte | ||
| 1632 | * "sectors". | ||
| 1633 | */ | ||
| 1542 | off = out->sector * 512; | 1634 | off = out->sector * 512; |
| 1543 | 1635 | ||
| 1544 | /* | 1636 | /* |
| @@ -1614,6 +1706,7 @@ static void blk_request(struct virtqueue *vq) | |||
| 1614 | if (out->type & VIRTIO_BLK_T_BARRIER) | 1706 | if (out->type & VIRTIO_BLK_T_BARRIER) |
| 1615 | fdatasync(vblk->fd); | 1707 | fdatasync(vblk->fd); |
| 1616 | 1708 | ||
| 1709 | /* Finished that request. */ | ||
| 1617 | add_used(vq, head, wlen); | 1710 | add_used(vq, head, wlen); |
| 1618 | } | 1711 | } |
| 1619 | 1712 | ||
| @@ -1682,9 +1775,8 @@ static void rng_input(struct virtqueue *vq) | |||
| 1682 | errx(1, "Output buffers in rng?"); | 1775 | errx(1, "Output buffers in rng?"); |
| 1683 | 1776 | ||
| 1684 | /* | 1777 | /* |
| 1685 | * This is why we convert to iovecs: the readv() call uses them, and so | 1778 | * Just like the console write, we loop to cover the whole iovec. |
| 1686 | * it reads straight into the Guest's buffer. We loop to make sure we | 1779 | * In this case, short reads actually happen quite a bit. |
| 1687 | * fill it. | ||
| 1688 | */ | 1780 | */ |
| 1689 | while (!iov_empty(iov, in_num)) { | 1781 | while (!iov_empty(iov, in_num)) { |
| 1690 | len = readv(rng_info->rfd, iov, in_num); | 1782 | len = readv(rng_info->rfd, iov, in_num); |
| @@ -1818,7 +1910,9 @@ int main(int argc, char *argv[]) | |||
| 1818 | devices.lastdev = NULL; | 1910 | devices.lastdev = NULL; |
| 1819 | devices.next_irq = 1; | 1911 | devices.next_irq = 1; |
| 1820 | 1912 | ||
| 1913 | /* We're CPU 0. In fact, that's the only CPU possible right now. */ | ||
| 1821 | cpu_id = 0; | 1914 | cpu_id = 0; |
| 1915 | |||
| 1822 | /* | 1916 | /* |
| 1823 | * We need to know how much memory so we can set up the device | 1917 | * We need to know how much memory so we can set up the device |
| 1824 | * descriptor and memory pages for the devices as we parse the command | 1918 | * descriptor and memory pages for the devices as we parse the command |
| @@ -1926,7 +2020,7 @@ int main(int argc, char *argv[]) | |||
| 1926 | */ | 2020 | */ |
| 1927 | tell_kernel(start); | 2021 | tell_kernel(start); |
| 1928 | 2022 | ||
| 1929 | /* Ensure that we terminate if a child dies. */ | 2023 | /* Ensure that we terminate if a device-servicing child dies. */ |
| 1930 | signal(SIGCHLD, kill_launcher); | 2024 | signal(SIGCHLD, kill_launcher); |
| 1931 | 2025 | ||
| 1932 | /* If we exit via err(), this kills all the threads, restores tty. */ | 2026 | /* If we exit via err(), this kills all the threads, restores tty. */ |
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index cceb73e12e50..ba0eed8aa1a6 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
| @@ -35,10 +35,10 @@ | |||
| 35 | * operations? There are two ways: the direct way is to make a "hypercall", | 35 | * operations? There are two ways: the direct way is to make a "hypercall", |
| 36 | * to make requests of the Host Itself. | 36 | * to make requests of the Host Itself. |
| 37 | * | 37 | * |
| 38 | * We use the KVM hypercall mechanism. Seventeen hypercalls are | 38 | * We use the KVM hypercall mechanism, though completely different hypercall |
| 39 | * available: the hypercall number is put in the %eax register, and the | 39 | * numbers. Seventeen hypercalls are available: the hypercall number is put in |
| 40 | * arguments (when required) are placed in %ebx, %ecx, %edx and %esi. | 40 | * the %eax register, and the arguments (when required) are placed in %ebx, |
| 41 | * If a return value makes sense, it's returned in %eax. | 41 | * %ecx, %edx and %esi. If a return value makes sense, it's returned in %eax. |
| 42 | * | 42 | * |
| 43 | * Grossly invalid calls result in Sudden Death at the hands of the vengeful | 43 | * Grossly invalid calls result in Sudden Death at the hands of the vengeful |
| 44 | * Host, rather than returning failure. This reflects Winston Churchill's | 44 | * Host, rather than returning failure. This reflects Winston Churchill's |
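That register convention maps directly onto inline assembly. A hedged sketch of a four-argument hypercall wrapper in the style of this header (the hand-assembled 0x0f,0x01,0xc1 is "vmcall"; compare the KVM_HYPERCALL byte sequence in i386_head.S below):

static inline unsigned long
hcall(unsigned long call, unsigned long arg1, unsigned long arg2,
      unsigned long arg3, unsigned long arg4)
{
	/* The number goes in %eax, which also carries the return value;
	 * the arguments travel in %ebx, %ecx, %edx and %esi. */
	asm volatile(".byte 0x0f,0x01,0xc1"	/* KVM_HYPERCALL */
		     : "=a"(call)
		     : "a"(call), "b"(arg1), "c"(arg2), "d"(arg3), "S"(arg4)
		     : "memory");
	return call;
}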
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 025c04d18f2b..d677fa9ca650 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
| @@ -154,6 +154,7 @@ static void lazy_hcall1(unsigned long call, | |||
| 154 | async_hcall(call, arg1, 0, 0, 0); | 154 | async_hcall(call, arg1, 0, 0, 0); |
| 155 | } | 155 | } |
| 156 | 156 | ||
| 157 | /* You can imagine what lazy_hcall2, 3 and 4 look like. :*/ | ||
| 157 | static void lazy_hcall2(unsigned long call, | 158 | static void lazy_hcall2(unsigned long call, |
| 158 | unsigned long arg1, | 159 | unsigned long arg1, |
| 159 | unsigned long arg2) | 160 | unsigned long arg2) |
| @@ -189,8 +190,10 @@ static void lazy_hcall4(unsigned long call, | |||
| 189 | } | 190 | } |
| 190 | #endif | 191 | #endif |
| 191 | 192 | ||
| 192 | /* When lazy mode is turned off reset the per-cpu lazy mode variable and then | 193 | /*G:036 |
| 193 | * issue the do-nothing hypercall to flush any stored calls. */ | 194 | * When lazy mode is turned off reset the per-cpu lazy mode variable and then |
| 195 | * issue the do-nothing hypercall to flush any stored calls. | ||
| 196 | :*/ | ||
| 194 | static void lguest_leave_lazy_mmu_mode(void) | 197 | static void lguest_leave_lazy_mmu_mode(void) |
| 195 | { | 198 | { |
| 196 | kvm_hypercall0(LHCALL_FLUSH_ASYNC); | 199 | kvm_hypercall0(LHCALL_FLUSH_ASYNC); |
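The "stored calls" being flushed are the ones async_hcall() batches up while lazy mode is on. For context, that routine looks roughly like this in this era's boot.c (condensed; the ring buffer lives in struct lguest_data, shared with the Host, so treat field names as illustrative):

static void async_hcall(unsigned long call, unsigned long arg1,
			unsigned long arg2, unsigned long arg3,
			unsigned long arg4)
{
	/* Note: this assumes we're uniprocessor, which we are. */
	static unsigned int next_call;
	unsigned long flags;

	local_irq_save(flags);
	if (lguest_data.hcall_status[next_call] != 0xFF) {
		/* Table full: make a real hypercall, which flushes it. */
		kvm_hypercall4(call, arg1, arg2, arg3, arg4);
	} else {
		lguest_data.hcalls[next_call].arg0 = call;
		lguest_data.hcalls[next_call].arg1 = arg1;
		lguest_data.hcalls[next_call].arg2 = arg2;
		lguest_data.hcalls[next_call].arg3 = arg3;
		lguest_data.hcalls[next_call].arg4 = arg4;
		/* The arguments must be written before we mark it to go. */
		wmb();
		lguest_data.hcall_status[next_call] = 0;
		if (++next_call == LHCALL_RING_SIZE)
			next_call = 0;
	}
	local_irq_restore(flags);
}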
| @@ -250,13 +253,11 @@ extern void lg_irq_enable(void); | |||
| 250 | extern void lg_restore_fl(unsigned long flags); | 253 | extern void lg_restore_fl(unsigned long flags); |
| 251 | 254 | ||
| 252 | /*M:003 | 255 | /*M:003 |
| 253 | * Note that we don't check for outstanding interrupts when we re-enable them | 256 | * We could be more efficient in our checking of outstanding interrupts, rather |
| 254 | * (or when we unmask an interrupt). This seems to work for the moment, since | 257 | * than using a branch. One way would be to put the "irq_enabled" field in a |
| 255 | * interrupts are rare and we'll just get the interrupt on the next timer tick, | 258 | * page by itself, and have the Host write-protect it when an interrupt comes |
| 256 | * but now we can run with CONFIG_NO_HZ, we should revisit this. One way would | 259 | * in when irqs are disabled. There will then be a page fault as soon as |
| 257 | * be to put the "irq_enabled" field in a page by itself, and have the Host | 260 | * interrupts are re-enabled. |
| 258 | * write-protect it when an interrupt comes in when irqs are disabled. There | ||
| 259 | * will then be a page fault as soon as interrupts are re-enabled. | ||
| 260 | * | 261 | * |
| 261 | * A better method is to implement soft interrupt disable generally for x86: | 262 | * A better method is to implement soft interrupt disable generally for x86: |
| 262 | * instead of disabling interrupts, we set a flag. If an interrupt does come | 263 | * instead of disabling interrupts, we set a flag. If an interrupt does come |
| @@ -568,7 +569,7 @@ static void lguest_write_cr4(unsigned long val) | |||
| 568 | * cr3 ---> +---------+ | 569 | * cr3 ---> +---------+ |
| 569 | * | --------->+---------+ | 570 | * | --------->+---------+ |
| 570 | * | | | PADDR1 | | 571 | * | | | PADDR1 | |
| 571 | * Top-level | | PADDR2 | | 572 | * Mid-level | | PADDR2 | |
| 572 | * (PMD) page | | | | 573 | * (PMD) page | | | |
| 573 | * | | Lower-level | | 574 | * | | Lower-level | |
| 574 | * | | (PTE) page | | 575 | * | | (PTE) page | |
| @@ -588,23 +589,62 @@ static void lguest_write_cr4(unsigned long val) | |||
| 588 | * Index into top Index into second Offset within page | 589 | * Index into top Index into second Offset within page |
| 589 | * page directory page pagetable page | 590 | * page directory page pagetable page |
| 590 | * | 591 | * |
| 591 | * The kernel spends a lot of time changing both the top-level page directory | 592 | * Now, unfortunately, this isn't the whole story: Intel added Physical Address |
| 592 | * and lower-level pagetable pages. The Guest doesn't know physical addresses, | 593 | * Extension (PAE) to allow 32 bit systems to use 64GB of memory (ie. 36 bits). |
| 593 | * so while it maintains these page tables exactly like normal, it also needs | 594 | * These are held in 64-bit page table entries, so we can now only fit 512 |
| 594 | * to keep the Host informed whenever it makes a change: the Host will create | 595 | * entries in a page, and the neat three-level tree breaks down. |
| 595 | * the real page tables based on the Guests'. | 596 | * |
| 597 | * The result is a four level page table: | ||
| 598 | * | ||
| 599 | * cr3 --> [ 4 Upper ] | ||
| 600 | * [ Level ] | ||
| 601 | * [ Entries ] | ||
| 602 | * [(PUD Page)]---> +---------+ | ||
| 603 | * | --------->+---------+ | ||
| 604 | * | | | PADDR1 | | ||
| 605 | * Mid-level | | PADDR2 | | ||
| 606 | * (PMD) page | | | | ||
| 607 | * | | Lower-level | | ||
| 608 | * | | (PTE) page | | ||
| 609 | * | | | | | ||
| 610 | * .... .... | ||
| 611 | * | ||
| 612 | * | ||
| 613 | * And the virtual address is decoded as: | ||
| 614 | * | ||
| 615 | * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 | ||
| 616 | * |<-2->|<--- 9 bits ---->|<---- 9 bits --->|<------ 12 bits ------>| | ||
| 617 | * Index into Index into mid Index into lower Offset within page | ||
| 618 | * top entries directory page pagetable page | ||
| 619 | * | ||
| 620 | * It's too hard to switch between these two formats at runtime, so Linux only | ||
| 621 | * supports one or the other depending on whether CONFIG_X86_PAE is set. Many | ||
| 622 | * distributions turn it on, and not just for people with silly amounts of | ||
| 623 | * memory: the larger PTE entries allow room for the NX bit, which lets the | ||
| 624 | * kernel disable execution of pages and increase security. | ||
| 625 | * | ||
| 626 | * This was a problem for lguest, which couldn't run on these distributions; | ||
| 627 | * then Matias Zabaljauregui figured it all out and implemented it, and only a | ||
| 628 | * handful of puppies were crushed in the process! | ||
| 629 | * | ||
| 630 | * Back to our point: the kernel spends a lot of time changing both the | ||
| 631 | * top-level page directory and lower-level pagetable pages. The Guest doesn't | ||
| 632 | * know physical addresses, so while it maintains these page tables exactly | ||
| 633 | * like normal, it also needs to keep the Host informed whenever it makes a | ||
| 634 | * change: the Host will create the real page tables based on the Guests'. | ||
| 596 | */ | 635 | */ |
| 597 | 636 | ||
| 598 | /* | 637 | /* |
| 599 | * The Guest calls this to set a second-level entry (pte), ie. to map a page | 638 | * The Guest calls this after it has set a second-level entry (pte), ie. to map |
| 600 | * into a process' address space. We set the entry then tell the Host the | 639 | * a page into a process' address space. We tell the Host the toplevel and |
| 601 | * toplevel and address this corresponds to. The Guest uses one pagetable per | 640 | * address this corresponds to. The Guest uses one pagetable per process, so |
| 602 | * process, so we need to tell the Host which one we're changing (mm->pgd). | 641 | * we need to tell the Host which one we're changing (mm->pgd). |
| 603 | */ | 642 | */ |
| 604 | static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, | 643 | static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, |
| 605 | pte_t *ptep) | 644 | pte_t *ptep) |
| 606 | { | 645 | { |
| 607 | #ifdef CONFIG_X86_PAE | 646 | #ifdef CONFIG_X86_PAE |
| 647 | /* PAE needs to hand a 64 bit page table entry, so it uses two args. */ | ||
| 608 | lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr, | 648 | lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr, |
| 609 | ptep->pte_low, ptep->pte_high); | 649 | ptep->pte_low, ptep->pte_high); |
| 610 | #else | 650 | #else |
| @@ -612,6 +652,7 @@ static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, | |||
| 612 | #endif | 652 | #endif |
| 613 | } | 653 | } |
| 614 | 654 | ||
| 655 | /* This is the "set and update" combo-meal-deal version. */ | ||
| 615 | static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, | 656 | static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, |
| 616 | pte_t *ptep, pte_t pteval) | 657 | pte_t *ptep, pte_t pteval) |
| 617 | { | 658 | { |
| @@ -672,6 +713,11 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval) | |||
| 672 | } | 713 | } |
| 673 | 714 | ||
| 674 | #ifdef CONFIG_X86_PAE | 715 | #ifdef CONFIG_X86_PAE |
| 716 | /* | ||
| 717 | * With 64-bit PTE values, we need to be careful setting them: if we set 32 | ||
| 718 | * bits at a time, the hardware could see a weird half-set entry. These | ||
| 719 | * versions ensure we update all 64 bits at once. | ||
| 720 | */ | ||
| 675 | static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) | 721 | static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) |
| 676 | { | 722 | { |
| 677 | native_set_pte_atomic(ptep, pte); | 723 | native_set_pte_atomic(ptep, pte); |
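The hazard the new comment describes comes from a PAE PTE being two 32-bit words: a naive assignment could leave a momentarily half-set entry for the hardware walker to trip over. For entries that were not previously present, the native helper's trick is to write the word holding the present bit last; a sketch of that ordering (the truly atomic case above relies on a cmpxchg8b-based set_64bit underneath native_set_pte_atomic()):

static void set_pte_64bit(pte_t *ptep, pte_t pte)
{
	/* The low word holds the present bit, so store the high word
	 * first: the entry stays non-present until the final store. */
	ptep->pte_high = pte.pte_high;
	smp_wmb();	/* Don't let the two stores be reordered. */
	ptep->pte_low = pte.pte_low;
}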
| @@ -679,13 +725,14 @@ static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) | |||
| 679 | lazy_hcall1(LHCALL_FLUSH_TLB, 1); | 725 | lazy_hcall1(LHCALL_FLUSH_TLB, 1); |
| 680 | } | 726 | } |
| 681 | 727 | ||
| 682 | void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) | 728 | static void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, |
| 729 | pte_t *ptep) | ||
| 683 | { | 730 | { |
| 684 | native_pte_clear(mm, addr, ptep); | 731 | native_pte_clear(mm, addr, ptep); |
| 685 | lguest_pte_update(mm, addr, ptep); | 732 | lguest_pte_update(mm, addr, ptep); |
| 686 | } | 733 | } |
| 687 | 734 | ||
| 688 | void lguest_pmd_clear(pmd_t *pmdp) | 735 | static void lguest_pmd_clear(pmd_t *pmdp) |
| 689 | { | 736 | { |
| 690 | lguest_set_pmd(pmdp, __pmd(0)); | 737 | lguest_set_pmd(pmdp, __pmd(0)); |
| 691 | } | 738 | } |
| @@ -784,6 +831,14 @@ static void __init lguest_init_IRQ(void) | |||
| 784 | irq_ctx_init(smp_processor_id()); | 831 | irq_ctx_init(smp_processor_id()); |
| 785 | } | 832 | } |
| 786 | 833 | ||
| 834 | /* | ||
| 835 | * With CONFIG_SPARSE_IRQ, interrupt descriptors are allocated as-needed, so | ||
| 836 | * rather than set them in lguest_init_IRQ we are called here every time an | ||
| 837 | * lguest device needs an interrupt. | ||
| 838 | * | ||
| 839 | * FIXME: irq_to_desc_alloc_node() can fail due to lack of memory, we should | ||
| 840 | * pass that up! | ||
| 841 | */ | ||
| 787 | void lguest_setup_irq(unsigned int irq) | 842 | void lguest_setup_irq(unsigned int irq) |
| 788 | { | 843 | { |
| 789 | irq_to_desc_alloc_node(irq, 0); | 844 | irq_to_desc_alloc_node(irq, 0); |
| @@ -1298,7 +1353,7 @@ __init void lguest_init(void) | |||
| 1298 | */ | 1353 | */ |
| 1299 | switch_to_new_gdt(0); | 1354 | switch_to_new_gdt(0); |
| 1300 | 1355 | ||
| 1301 | /* As described in head_32.S, we map the first 128M of memory. */ | 1356 | /* We actually boot with all memory mapped, but let's say 128MB. */ |
| 1302 | max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; | 1357 | max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; |
| 1303 | 1358 | ||
| 1304 | /* | 1359 | /* |
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index db6aa95eb054..27eac0faee48 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
| @@ -102,6 +102,7 @@ send_interrupts: | |||
| 102 | * create one manually here. | 102 | * create one manually here. |
| 103 | */ | 103 | */ |
| 104 | .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ | 104 | .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ |
| 105 | /* Put eax back the way we found it. */ | ||
| 105 | popl %eax | 106 | popl %eax |
| 106 | ret | 107 | ret |
| 107 | 108 | ||
| @@ -125,6 +126,7 @@ ENTRY(lg_restore_fl) | |||
| 125 | jnz send_interrupts | 126 | jnz send_interrupts |
| 126 | /* Again, the normal path has used no extra registers. Clever, huh? */ | 127 | /* Again, the normal path has used no extra registers. Clever, huh? */ |
| 127 | ret | 128 | ret |
| 129 | /*:*/ | ||
| 128 | 130 | ||
| 129 | /* These demark the EIP range where host should never deliver interrupts. */ | 131 | /* These demark the EIP range where host should never deliver interrupts. */ |
| 130 | .global lguest_noirq_start | 132 | .global lguest_noirq_start |
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index cd058bc903ff..1e2cb846b3c9 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
| @@ -217,10 +217,15 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) | |||
| 217 | 217 | ||
| 218 | /* | 218 | /* |
| 219 | * It's possible the Guest did a NOTIFY hypercall to the | 219 | * It's possible the Guest did a NOTIFY hypercall to the |
| 220 | * Launcher, in which case we return from the read() now. | 220 | * Launcher. |
| 221 | */ | 221 | */ |
| 222 | if (cpu->pending_notify) { | 222 | if (cpu->pending_notify) { |
| 223 | /* | ||
| 224 | * Does it just need to write to a registered | ||
| 225 | * eventfd (ie. the appropriate virtqueue thread)? | ||
| 226 | */ | ||
| 223 | if (!send_notify_to_eventfd(cpu)) { | 227 | if (!send_notify_to_eventfd(cpu)) { |
| 228 | /* OK, we tell the main Launcher. */ | ||
| 224 | if (put_user(cpu->pending_notify, user)) | 229 | if (put_user(cpu->pending_notify, user)) |
| 225 | return -EFAULT; | 230 | return -EFAULT; |
| 226 | return sizeof(cpu->pending_notify); | 231 | return sizeof(cpu->pending_notify); |
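For the other side of this conversation, the Launcher's read loop in lguest.c looks roughly like the following (condensed; note the file offset doing double duty as the CPU number, a convention lguest_user.c spells out further down):

static void __attribute__((noreturn)) run_guest(void)
{
	for (;;) {
		unsigned long notify_addr;
		int readval;

		/* We read from the /dev/lguest device to run the Guest. */
		readval = pread(lguest_fd, &notify_addr,
				sizeof(notify_addr), cpu_id);

		/* One unsigned long means the Guest did HCALL_NOTIFY. */
		if (readval == sizeof(notify_addr)) {
			verbose("Notify on address %#lx\n", notify_addr);
			handle_output(notify_addr);
		/* ENOENT means the Guest died; reading tells us why. */
		} else if (errno == ENOENT) {
			char reason[1024] = { 0 };
			pread(lguest_fd, reason, sizeof(reason) - 1, cpu_id);
			errx(1, "%s", reason);
		} else
			err(1, "Running guest failed");
	}
}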
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
index 787ab4bc09f0..83511eb0923d 100644
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
| @@ -59,7 +59,7 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) | |||
| 59 | case LHCALL_SHUTDOWN: { | 59 | case LHCALL_SHUTDOWN: { |
| 60 | char msg[128]; | 60 | char msg[128]; |
| 61 | /* | 61 | /* |
| 62 | * Shutdown is such a trivial hypercall that we do it in four | 62 | * Shutdown is such a trivial hypercall that we do it in five |
| 63 | * lines right here. | 63 | * lines right here. |
| 64 | * | 64 | * |
| 65 | * If the lgread fails, it will call kill_guest() itself; the | 65 | * If the lgread fails, it will call kill_guest() itself; the |
| @@ -245,6 +245,10 @@ static void initialize(struct lg_cpu *cpu) | |||
| 245 | * device), the Guest will still see the old page. In practice, this never | 245 | * device), the Guest will still see the old page. In practice, this never |
| 246 | * happens: why would the Guest read a page which it has never written to? But | 246 | * happens: why would the Guest read a page which it has never written to? But |
| 247 | * a similar scenario might one day bite us, so it's worth mentioning. | 247 | * a similar scenario might one day bite us, so it's worth mentioning. |
| 248 | * | ||
| 249 | * Note that if we used a shared anonymous mapping in the Launcher instead of | ||
| 250 | * mapping /dev/zero private, we wouldn't worry about copy-on-write. And we | ||
| 251 | * need that to switch the Launcher to processes (away from threads) anyway. | ||
| 248 | :*/ | 252 | :*/ |
| 249 | 253 | ||
| 250 | /*H:100 | 254 | /*H:100 |
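The alternative that note about a shared anonymous mapping alludes to is a one-liner on the Launcher side. A hedged sketch (mem_size stands in for the Launcher's real size calculation):

/* A shared anonymous mapping: no backing file, no copy-on-write
 * surprises, and (unlike MAP_PRIVATE) visible across fork()ed
 * processes. */
void *guest_mem = mmap(NULL, mem_size, PROT_READ|PROT_WRITE|PROT_EXEC,
		       MAP_SHARED|MAP_ANONYMOUS, -1, 0);
if (guest_mem == MAP_FAILED)
	err(1, "Mmaping anonymous guest memory");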
diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c
index cc000e79c3d1..1401c1ace1ec 100644
--- a/drivers/lguest/lguest_device.c
+++ b/drivers/lguest/lguest_device.c
| @@ -236,7 +236,7 @@ static void lg_notify(struct virtqueue *vq) | |||
| 236 | extern void lguest_setup_irq(unsigned int irq); | 236 | extern void lguest_setup_irq(unsigned int irq); |
| 237 | 237 | ||
| 238 | /* | 238 | /* |
| 239 | * This routine finds the first virtqueue described in the configuration of | 239 | * This routine finds the Nth virtqueue described in the configuration of |
| 240 | * this device and sets it up. | 240 | * this device and sets it up. |
| 241 | * | 241 | * |
| 242 | * This is kind of an ugly duckling. It'd be nicer to have a standard | 242 | * This is kind of an ugly duckling. It'd be nicer to have a standard |
| @@ -244,9 +244,6 @@ extern void lguest_setup_irq(unsigned int irq); | |||
| 244 | * everyone wants to do it differently. The KVM coders want the Guest to | 244 | * everyone wants to do it differently. The KVM coders want the Guest to |
| 245 | * allocate its own pages and tell the Host where they are, but for lguest it's | 245 | * allocate its own pages and tell the Host where they are, but for lguest it's |
| 246 | * simpler for the Host to simply tell us where the pages are. | 246 | * simpler for the Host to simply tell us where the pages are. |
| 247 | * | ||
| 248 | * So we provide drivers with a "find the Nth virtqueue and set it up" | ||
| 249 | * function. | ||
| 250 | */ | 247 | */ |
| 251 | static struct virtqueue *lg_find_vq(struct virtio_device *vdev, | 248 | static struct virtqueue *lg_find_vq(struct virtio_device *vdev, |
| 252 | unsigned index, | 249 | unsigned index, |
| @@ -422,7 +419,11 @@ static void add_lguest_device(struct lguest_device_desc *d, | |||
| 422 | 419 | ||
| 423 | /* This device's parent is the lguest/ dir. */ | 420 | /* This device's parent is the lguest/ dir. */ |
| 424 | ldev->vdev.dev.parent = lguest_root; | 421 | ldev->vdev.dev.parent = lguest_root; |
| 425 | /* We have a unique device index thanks to the dev_index counter. */ | 422 | /* |
| 423 | * The device type comes straight from the descriptor. There's also a | ||
| 424 | * device vendor field in the virtio_device struct, which we leave as | ||
| 425 | * 0. | ||
| 426 | */ | ||
| 426 | ldev->vdev.id.device = d->type; | 427 | ldev->vdev.id.device = d->type; |
| 427 | /* | 428 | /* |
| 428 | * We have a simple set of routines for querying the device's | 429 | * We have a simple set of routines for querying the device's |
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index 7e92017103dc..b4d3f7ca554f 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
| @@ -1,9 +1,8 @@ | |||
| 1 | /*P:200 | 1 | /*P:200 This contains all the /dev/lguest code, whereby the userspace launcher |
| 2 | * This contains all the /dev/lguest code, whereby the userspace launcher | ||
| 3 | * controls and communicates with the Guest. For example, the first write will | 2 | * controls and communicates with the Guest. For example, the first write will |
| 4 | * tell us the Guest's memory layout, pagetable, entry point and kernel address | 3 | * tell us the Guest's memory layout and entry point. A read will run the |
| 5 | * offset. A read will run the Guest until something happens, such as a signal | 4 | * Guest until something happens, such as a signal or the Guest doing a NOTIFY |
| 6 | * or the Guest doing a NOTIFY out to the Launcher. | 5 | * out to the Launcher. |
| 7 | :*/ | 6 | :*/ |
| 8 | #include <linux/uaccess.h> | 7 | #include <linux/uaccess.h> |
| 9 | #include <linux/miscdevice.h> | 8 | #include <linux/miscdevice.h> |
| @@ -13,14 +12,41 @@ | |||
| 13 | #include <linux/file.h> | 12 | #include <linux/file.h> |
| 14 | #include "lg.h" | 13 | #include "lg.h" |
| 15 | 14 | ||
| 15 | /*L:056 | ||
| 16 | * Before we move on, let's jump ahead and look at what the kernel does when | ||
| 17 | * it needs to look up the eventfds. That will complete our picture of how we | ||
| 18 | * use RCU. | ||
| 19 | * | ||
| 20 | * The notification value is in cpu->pending_notify: we return true if it went | ||
| 21 | * to an eventfd. | ||
| 22 | */ | ||
| 16 | bool send_notify_to_eventfd(struct lg_cpu *cpu) | 23 | bool send_notify_to_eventfd(struct lg_cpu *cpu) |
| 17 | { | 24 | { |
| 18 | unsigned int i; | 25 | unsigned int i; |
| 19 | struct lg_eventfd_map *map; | 26 | struct lg_eventfd_map *map; |
| 20 | 27 | ||
| 21 | /* lg->eventfds is RCU-protected */ | 28 | /* |
| 29 | * This "rcu_read_lock()" helps track when someone is still looking at | ||
| 30 | * the (RCU-using) eventfds array. It's not actually a lock at all; | ||
| 31 | * indeed it's a noop in many configurations. (You didn't expect me to | ||
| 32 | * explain all the RCU secrets here, did you?) | ||
| 33 | */ | ||
| 22 | rcu_read_lock(); | 34 | rcu_read_lock(); |
| 35 | /* | ||
| 36 | * rcu_dereference is the counter-side of rcu_assign_pointer(); it | ||
| 37 | * makes sure we don't access the memory pointed to by | ||
| 38 | * cpu->lg->eventfds before cpu->lg->eventfds is set. Sounds crazy, | ||
| 39 | * but Alpha allows this! Paul McKenney points out that a really | ||
| 40 | * aggressive compiler could have the same effect: | ||
| 41 | * http://lists.ozlabs.org/pipermail/lguest/2009-July/001560.html | ||
| 42 | * | ||
| 43 | * So play safe, use rcu_dereference to get the rcu-protected pointer: | ||
| 44 | */ | ||
| 23 | map = rcu_dereference(cpu->lg->eventfds); | 45 | map = rcu_dereference(cpu->lg->eventfds); |
| 46 | /* | ||
| 47 | * Simple array search: even if they add an eventfd while we do this, | ||
| 48 | * we'll continue to use the old array and just won't see the new one. | ||
| 49 | */ | ||
| 24 | for (i = 0; i < map->num; i++) { | 50 | for (i = 0; i < map->num; i++) { |
| 25 | if (map->map[i].addr == cpu->pending_notify) { | 51 | if (map->map[i].addr == cpu->pending_notify) { |
| 26 | eventfd_signal(map->map[i].event, 1); | 52 | eventfd_signal(map->map[i].event, 1); |
| @@ -28,14 +54,43 @@ bool send_notify_to_eventfd(struct lg_cpu *cpu) | |||
| 28 | break; | 54 | break; |
| 29 | } | 55 | } |
| 30 | } | 56 | } |
| 57 | /* We're done with the rcu-protected variable cpu->lg->eventfds. */ | ||
| 31 | rcu_read_unlock(); | 58 | rcu_read_unlock(); |
| 59 | |||
| 60 | /* If we cleared the notification, it's because we found a match. */ | ||
| 32 | return cpu->pending_notify == 0; | 61 | return cpu->pending_notify == 0; |
| 33 | } | 62 | } |
| 34 | 63 | ||
| 64 | /*L:055 | ||
| 65 | * One of the more tricksy tricks in the Linux Kernel is a technique called | ||
| 66 | * Read Copy Update. Since one point of lguest is to teach lguest journeyers | ||
| 67 | * about kernel coding, I use it here. (In case you're curious, other purposes | ||
| 68 | * include learning about virtualization and instilling a deep appreciation for | ||
| 69 | * simplicity and puppies). | ||
| 70 | * | ||
| 71 | * We keep a simple array which maps LHCALL_NOTIFY values to eventfds, but we | ||
| 72 | * add new eventfds without ever blocking readers from accessing the array. | ||
| 73 | * The current Launcher only does this during boot, so that never happens. But | ||
| 74 | * Read Copy Update is cool, and adding a lock risks damaging even more puppies | ||
| 75 | * than this code does. | ||
| 76 | * | ||
| 77 | * We allocate a brand new one-larger array, copy the old one and add our new | ||
| 78 | * element. Then we make the lg eventfd pointer point to the new array. | ||
| 79 | * That's the easy part: now we need to free the old one, but we need to make | ||
| 80 | * sure no slow CPU somewhere is still looking at it. That's what | ||
| 81 | * synchronize_rcu does for us: waits until every CPU has indicated that it has | ||
| 82 | * moved on, so we know it's no longer using the old one. | ||
| 83 | * | ||
| 84 | * If that's unclear, see http://en.wikipedia.org/wiki/Read-copy-update. | ||
| 85 | */ | ||
| 35 | static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) | 86 | static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) |
| 36 | { | 87 | { |
| 37 | struct lg_eventfd_map *new, *old = lg->eventfds; | 88 | struct lg_eventfd_map *new, *old = lg->eventfds; |
| 38 | 89 | ||
| 90 | /* | ||
| 91 | * We don't allow notifications on value 0 anyway (pending_notify of | ||
| 92 | * 0 means "nothing pending"). | ||
| 93 | */ | ||
| 39 | if (!addr) | 94 | if (!addr) |
| 40 | return -EINVAL; | 95 | return -EINVAL; |
| 41 | 96 | ||
| @@ -62,12 +117,20 @@ static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) | |||
| 62 | } | 117 | } |
| 63 | new->num++; | 118 | new->num++; |
| 64 | 119 | ||
| 65 | /* Now put new one in place. */ | 120 | /* |
| 121 | * Now put new one in place: rcu_assign_pointer() is a fancy way of | ||
| 122 | * doing "lg->eventfds = new", but it uses memory barriers to make | ||
| 123 | * absolutely sure that the contents of "new" written above are nailed | ||
| 124 | * down before we actually do the assignment. | ||
| 125 | * | ||
| 126 | * We have to think about these kinds of things when we're operating on | ||
| 127 | * live data without locks. | ||
| 128 | */ | ||
| 66 | rcu_assign_pointer(lg->eventfds, new); | 129 | rcu_assign_pointer(lg->eventfds, new); |
| 67 | 130 | ||
| 68 | /* | 131 | /* |
| 69 | * We're not in a big hurry. Wait until no one's looking at the old | 132 | * We're not in a big hurry. Wait until no one's looking at the old |
| 70 | * version, then delete it. | 133 | * version, then free it. |
| 71 | */ | 134 | */ |
| 72 | synchronize_rcu(); | 135 | synchronize_rcu(); |
| 73 | kfree(old); | 136 | kfree(old); |
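Stripped of the eventfd specifics, the update side of the pattern is a four-step dance: fill in, publish, wait, free. A generic sketch (hypothetical helper; the real steps are interleaved through add_eventfd() above):

static void rcu_swap_and_free(struct lguest *lg, struct lg_eventfd_map *new)
{
	struct lg_eventfd_map *old = lg->eventfds;

	/* 1. "new" was filled in while nobody else could see it.     */
	/* 2. Publish: readers from this point on see the new array.  */
	rcu_assign_pointer(lg->eventfds, new);

	/* 3. Wait out any reader who might still be holding "old"... */
	synchronize_rcu();

	/* 4. ...after which nobody can be using it, so free it.      */
	kfree(old);
}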
| @@ -75,6 +138,14 @@ static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) | |||
| 75 | return 0; | 138 | return 0; |
| 76 | } | 139 | } |
| 77 | 140 | ||
| 141 | /*L:052 | ||
| 142 | * Receiving notifications from the Guest is usually done by attaching a | ||
| 143 | * particular LHCALL_NOTIFY value to an event filedescriptor. The eventfd will | ||
| 144 | * become readable when the Guest does an LHCALL_NOTIFY with that value. | ||
| 145 | * | ||
| 146 | * This is really convenient for processing each virtqueue in a separate | ||
| 147 | * thread. | ||
| 148 | */ | ||
| 78 | static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) | 149 | static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) |
| 79 | { | 150 | { |
| 80 | unsigned long addr, fd; | 151 | unsigned long addr, fd; |
| @@ -86,6 +157,11 @@ static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) | |||
| 86 | if (get_user(fd, input) != 0) | 157 | if (get_user(fd, input) != 0) |
| 87 | return -EFAULT; | 158 | return -EFAULT; |
| 88 | 159 | ||
| 160 | /* | ||
| 161 | * Just make sure two callers don't add eventfds at once. We really | ||
| 162 | * only need to lock against callers adding to the same Guest, so using | ||
| 163 | * the Big Lguest Lock is overkill. But this is setup, not a fast path. | ||
| 164 | */ | ||
| 89 | mutex_lock(&lguest_lock); | 165 | mutex_lock(&lguest_lock); |
| 90 | err = add_eventfd(lg, addr, fd); | 166 | err = add_eventfd(lg, addr, fd); |
| 91 | mutex_unlock(&lguest_lock); | 167 | mutex_unlock(&lguest_lock); |
| @@ -106,6 +182,10 @@ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) | |||
| 106 | if (irq >= LGUEST_IRQS) | 182 | if (irq >= LGUEST_IRQS) |
| 107 | return -EINVAL; | 183 | return -EINVAL; |
| 108 | 184 | ||
| 185 | /* | ||
| 186 | * Next time the Guest runs, the core code will see if it can deliver | ||
| 187 | * this interrupt. | ||
| 188 | */ | ||
| 109 | set_interrupt(cpu, irq); | 189 | set_interrupt(cpu, irq); |
| 110 | return 0; | 190 | return 0; |
| 111 | } | 191 | } |
| @@ -307,10 +387,10 @@ unlock: | |||
| 307 | * The first operation the Launcher does must be a write. All writes | 387 | * The first operation the Launcher does must be a write. All writes |
| 308 | * start with an unsigned long number: for the first write this must be | 388 | * start with an unsigned long number: for the first write this must be |
| 309 | * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use | 389 | * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use |
| 310 | * writes of other values to send interrupts. | 390 | * writes of other values to send interrupts or set up receipt of notifications. |
| 311 | * | 391 | * |
| 312 | * Note that we overload the "offset" in the /dev/lguest file to indicate what | 392 | * Note that we overload the "offset" in the /dev/lguest file to indicate what |
| 313 | * CPU number we're dealing with. Currently this is always 0, since we only | 393 | * CPU number we're dealing with. Currently this is always 0 since we only |
| 314 | * support uniprocessor Guests, but you can see the beginnings of SMP support | 394 | * support uniprocessor Guests, but you can see the beginnings of SMP support |
| 315 | * here. | 395 | * here. |
| 316 | */ | 396 | */ |
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index 3da902e4b4cb..a8d0aee3bc0e 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
| @@ -29,10 +29,10 @@ | |||
| 29 | /*H:300 | 29 | /*H:300 |
| 30 | * The Page Table Code | 30 | * The Page Table Code |
| 31 | * | 31 | * |
| 32 | * We use two-level page tables for the Guest. If you're not entirely | 32 | * We use two-level page tables for the Guest, or three-level with PAE. If |
| 33 | * comfortable with virtual addresses, physical addresses and page tables then | 33 | * you're not entirely comfortable with virtual addresses, physical addresses |
| 34 | * I recommend you review arch/x86/lguest/boot.c's "Page Table Handling" (with | 34 | * and page tables then I recommend you review arch/x86/lguest/boot.c's "Page |
| 35 | * diagrams!). | 35 | * Table Handling" (with diagrams!). |
| 36 | * | 36 | * |
| 37 | * The Guest keeps page tables, but we maintain the actual ones here: these are | 37 | * The Guest keeps page tables, but we maintain the actual ones here: these are |
| 38 | * called "shadow" page tables. Which is a very Guest-centric name: these are | 38 | * called "shadow" page tables. Which is a very Guest-centric name: these are |
| @@ -52,9 +52,8 @@ | |||
| 52 | :*/ | 52 | :*/ |
| 53 | 53 | ||
| 54 | /* | 54 | /* |
| 55 | * 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is | 55 | * The Switcher uses the complete top PTE page. That's 1024 PTE entries (4MB) |
| 56 | * conveniently placed at the top 4MB, so it uses a separate, complete PTE | 56 | * or 512 PTE entries with PAE (2MB). |
| 57 | * page. | ||
| 58 | */ | 57 | */ |
| 59 | #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) | 58 | #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) |
| 60 | 59 | ||
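As a compile-time restatement of that arithmetic (C11; assuming 4K pages, 4-byte PTEs without PAE and 8-byte PTEs with it):

    #include <assert.h>

    static_assert((4096 / 4) * 4096UL == 4UL << 20, "!PAE: 1024 PTEs map 4MB");
    static_assert((4096 / 8) * 4096UL == 2UL << 20, " PAE:  512 PTEs map 2MB");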
| @@ -81,7 +80,8 @@ static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); | |||
| 81 | 80 | ||
| 82 | /*H:320 | 81 | /*H:320 |
| 83 | * The page table code is curly enough to need helper functions to keep it | 82 | * The page table code is curly enough to need helper functions to keep it |
| 84 | * clear and clean. | 83 | * clear and clean. The kernel itself provides many of them, which is one
| 84 | * advantage of insisting that the Guest and Host use the same CONFIG_PAE setting. | ||
| 85 | * | 85 | * |
| 86 | * There are two functions which return pointers to the shadow (aka "real") | 86 | * There are two functions which return pointers to the shadow (aka "real") |
| 87 | * page tables. | 87 | * page tables. |
| @@ -155,7 +155,7 @@ static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) | |||
| 155 | } | 155 | } |
| 156 | 156 | ||
| 157 | /* | 157 | /* |
| 158 | * These two functions just like the above two, except they access the Guest | 158 | * These functions are just like the above two, except they access the Guest |
| 159 | * page tables. Hence they return a Guest address. | 159 | * page tables. Hence they return a Guest address. |
| 160 | */ | 160 | */ |
| 161 | static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) | 161 | static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) |
| @@ -165,6 +165,7 @@ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) | |||
| 165 | } | 165 | } |
| 166 | 166 | ||
| 167 | #ifdef CONFIG_X86_PAE | 167 | #ifdef CONFIG_X86_PAE |
| 168 | /* Follow the PGD to the PMD. */ | ||
| 168 | static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) | 169 | static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) |
| 169 | { | 170 | { |
| 170 | unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; | 171 | unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; |
| @@ -172,6 +173,7 @@ static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) | |||
| 172 | return gpage + pmd_index(vaddr) * sizeof(pmd_t); | 173 | return gpage + pmd_index(vaddr) * sizeof(pmd_t); |
| 173 | } | 174 | } |
| 174 | 175 | ||
| 176 | /* Follow the PMD to the PTE. */ | ||
| 175 | static unsigned long gpte_addr(struct lg_cpu *cpu, | 177 | static unsigned long gpte_addr(struct lg_cpu *cpu, |
| 176 | pmd_t gpmd, unsigned long vaddr) | 178 | pmd_t gpmd, unsigned long vaddr) |
| 177 | { | 179 | { |
| @@ -181,6 +183,7 @@ static unsigned long gpte_addr(struct lg_cpu *cpu, | |||
| 181 | return gpage + pte_index(vaddr) * sizeof(pte_t); | 183 | return gpage + pte_index(vaddr) * sizeof(pte_t); |
| 182 | } | 184 | } |
| 183 | #else | 185 | #else |
| 186 | /* Follow the PGD to the PTE (no mid-level for !PAE). */ | ||
| 184 | static unsigned long gpte_addr(struct lg_cpu *cpu, | 187 | static unsigned long gpte_addr(struct lg_cpu *cpu, |
| 185 | pgd_t gpgd, unsigned long vaddr) | 188 | pgd_t gpgd, unsigned long vaddr) |
| 186 | { | 189 | { |
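To make the address arithmetic concrete, here's a standalone (!PAE) illustration; pte_index() mirrors the kernel's definition, while gpage and vaddr are made-up values:

    #include <stdio.h>

    #define PAGE_SHIFT      12
    #define PTRS_PER_PTE    1024

    static unsigned long pte_index(unsigned long vaddr)
    {
            return (vaddr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
    }

    int main(void)
    {
            unsigned long gpage = 0x1000000;        /* guest PTE page (assumed) */
            unsigned long vaddr = 0xC0123456;

            /* Index 0x123, so the PTE lives at gpage + 0x123 * 4 = 0x100048c. */
            printf("gpte at %#lx\n",
                   gpage + pte_index(vaddr) * sizeof(unsigned int));
            return 0;
    }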
| @@ -314,6 +317,7 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | |||
| 314 | pte_t gpte; | 317 | pte_t gpte; |
| 315 | pte_t *spte; | 318 | pte_t *spte; |
| 316 | 319 | ||
| 320 | /* Mid level for PAE. */ | ||
| 317 | #ifdef CONFIG_X86_PAE | 321 | #ifdef CONFIG_X86_PAE |
| 318 | pmd_t *spmd; | 322 | pmd_t *spmd; |
| 319 | pmd_t gpmd; | 323 | pmd_t gpmd; |
| @@ -391,6 +395,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | |||
| 391 | */ | 395 | */ |
| 392 | gpte_ptr = gpte_addr(cpu, gpgd, vaddr); | 396 | gpte_ptr = gpte_addr(cpu, gpgd, vaddr); |
| 393 | #endif | 397 | #endif |
| 398 | |||
| 399 | /* Read the actual PTE value. */ | ||
| 394 | gpte = lgread(cpu, gpte_ptr, pte_t); | 400 | gpte = lgread(cpu, gpte_ptr, pte_t); |
| 395 | 401 | ||
| 396 | /* If this page isn't in the Guest page tables, we can't page it in. */ | 402 | /* If this page isn't in the Guest page tables, we can't page it in. */ |
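The checks that follow this read decide whether the fault is ours to fix up or genuinely the Guest's; condensed from memory (the errcode bits follow the x86 page-fault convention: bit 1 = write, bit 2 = user-mode access):

    /* Not present in the Guest's own tables?  A genuine fault. */
    if (!(pte_flags(gpte) & _PAGE_PRESENT))
            return false;

    /* Writing to a page the Guest maps read-only?  Also genuine. */
    if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW))
            return false;

    /* Userspace touching a kernel-only page?  Genuine again. */
    if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER))
            return false;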
| @@ -507,6 +513,7 @@ void pin_page(struct lg_cpu *cpu, unsigned long vaddr) | |||
| 507 | if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2)) | 513 | if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2)) |
| 508 | kill_guest(cpu, "bad stack page %#lx", vaddr); | 514 | kill_guest(cpu, "bad stack page %#lx", vaddr); |
| 509 | } | 515 | } |
| 516 | /*:*/ | ||
| 510 | 517 | ||
| 511 | #ifdef CONFIG_X86_PAE | 518 | #ifdef CONFIG_X86_PAE |
| 512 | static void release_pmd(pmd_t *spmd) | 519 | static void release_pmd(pmd_t *spmd) |
| @@ -543,7 +550,11 @@ static void release_pgd(pgd_t *spgd) | |||
| 543 | } | 550 | } |
| 544 | 551 | ||
| 545 | #else /* !CONFIG_X86_PAE */ | 552 | #else /* !CONFIG_X86_PAE */ |
| 546 | /*H:450 If we chase down the release_pgd() code, it looks like this: */ | 553 | /*H:450 |
| 554 | * If we chase down the release_pgd() code, the non-PAE version looks like | ||
| 555 | * this. The PAE version is almost identical, but instead of calling | ||
| 556 | * release_pte() it calls release_pmd(), which looks much like this. | ||
| 557 | */ | ||
| 547 | static void release_pgd(pgd_t *spgd) | 558 | static void release_pgd(pgd_t *spgd) |
| 548 | { | 559 | { |
| 549 | /* If the entry's not present, there's nothing to release. */ | 560 | /* If the entry's not present, there's nothing to release. */ |
| @@ -898,17 +909,21 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx) | |||
| 898 | /* ... throw it away. */ | 909 | /* ... throw it away. */ |
| 899 | release_pgd(lg->pgdirs[pgdir].pgdir + idx); | 910 | release_pgd(lg->pgdirs[pgdir].pgdir + idx); |
| 900 | } | 911 | } |
| 912 | |||
| 901 | #ifdef CONFIG_X86_PAE | 913 | #ifdef CONFIG_X86_PAE |
| 914 | /* For setting a mid-level, we just throw everything away. It's easy. */ | ||
| 902 | void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) | 915 | void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) |
| 903 | { | 916 | { |
| 904 | guest_pagetable_clear_all(&lg->cpus[0]); | 917 | guest_pagetable_clear_all(&lg->cpus[0]); |
| 905 | } | 918 | } |
| 906 | #endif | 919 | #endif |
| 907 | 920 | ||
| 908 | /* | 921 | /*H:505 |
| 909 | * Once we know how much memory we have we can construct simple identity (which | 922 | * To get through boot, we construct simple identity page mappings (which |
| 910 | * set virtual == physical) and linear mappings which will get the Guest far | 923 | * set virtual == physical) and linear mappings which will get the Guest far |
| 911 | * enough into the boot to create its own. | 924 | * enough into the boot to create its own. The linear mapping lets us
| 925 | * simplify the Guest boot, but it makes assumptions about the Guest's | ||
| 926 | * PAGE_OFFSET, as you'll see. | ||
| 912 | * | 927 | * |
| 913 | * We lay them out of the way, just below the initrd (which is why we need to | 928 | * We lay them out of the way, just below the initrd (which is why we need to |
| 914 | * know its size here). | 929 | * know its size here). |
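In other words, the identity entries map virtual address V to physical address V, and the linear entries map PAGE_OFFSET + V to physical V, so the same physical pages show up twice. A toy userspace illustration (the PAGE_OFFSET and page address are assumptions):

    #include <stdio.h>

    #define GUEST_PAGE_OFFSET 0xC0000000UL  /* assumed; see the FIXME below */

    int main(void)
    {
            unsigned long phys = 0x123000;  /* some physical page */

            printf("identity: virt %#lx -> phys %#lx\n", phys, phys);
            printf("linear:   virt %#lx -> phys %#lx\n",
                   phys + GUEST_PAGE_OFFSET, phys);
            return 0;
    }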
| @@ -944,6 +959,10 @@ static unsigned long setup_pagetables(struct lguest *lg, | |||
| 944 | linear = (void *)pgdir - linear_pages * PAGE_SIZE; | 959 | linear = (void *)pgdir - linear_pages * PAGE_SIZE; |
| 945 | 960 | ||
| 946 | #ifdef CONFIG_X86_PAE | 961 | #ifdef CONFIG_X86_PAE |
| 962 | /* | ||
| 963 | * And the single mid page goes below that. We only use one, but | ||
| 964 | * that's enough to map 1G, which definitely gets us through boot. | ||
| 965 | */ | ||
| 947 | pmds = (void *)linear - PAGE_SIZE; | 966 | pmds = (void *)linear - PAGE_SIZE; |
| 948 | #endif | 967 | #endif |
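That 1G figure is easy to verify: 512 PMD entries, each pointing at a PTE page which maps 512 * 4KB = 2MB. As a compile-time check (C11, sizes assumed):

    #include <assert.h>

    static_assert(512UL * 512 * 4096 == 1UL << 30, "one PAE PMD page covers 1GB");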
| 949 | /* | 968 | /* |
| @@ -957,13 +976,14 @@ static unsigned long setup_pagetables(struct lguest *lg, | |||
| 957 | return -EFAULT; | 976 | return -EFAULT; |
| 958 | } | 977 | } |
| 959 | 978 | ||
| 979 | #ifdef CONFIG_X86_PAE | ||
| 960 | /* | 980 | /* |
| 961 | * The top level points to the linear page table pages above. | 981 | * Make the Guest PMD entries point to the corresponding place in the |
| 962 | * We setup the identity and linear mappings here. | 982 | * linear mapping (up to one page's worth of PMD).
| 963 | */ | 983 | */ |
| 964 | #ifdef CONFIG_X86_PAE | ||
| 965 | for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD; | 984 | for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD; |
| 966 | i += PTRS_PER_PTE, j++) { | 985 | i += PTRS_PER_PTE, j++) { |
| 986 | /* FIXME: native_set_pmd is overkill here. */ | ||
| 967 | native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i) | 987 | native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i) |
| 968 | - mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); | 988 | - mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); |
| 969 | 989 | ||
| @@ -971,18 +991,36 @@ static unsigned long setup_pagetables(struct lguest *lg, | |||
| 971 | return -EFAULT; | 991 | return -EFAULT; |
| 972 | } | 992 | } |
| 973 | 993 | ||
| 994 | /* One PGD entry, pointing to that PMD page. */ | ||
| 974 | set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT)); | 995 | set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT)); |
| 996 | /* Copy it in as the first PGD entry (ie. addresses 0-1G). */ | ||
| 975 | if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0) | 997 | if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0) |
| 976 | return -EFAULT; | 998 | return -EFAULT; |
| 999 | /* | ||
| 1000 | * And the third PGD entry (ie. addresses 3G-4G). | ||
| 1001 | * | ||
| 1002 | * FIXME: This assumes that PAGE_OFFSET for the Guest is 0xC0000000. | ||
| 1003 | */ | ||
| 977 | if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0) | 1004 | if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0) |
| 978 | return -EFAULT; | 1005 | return -EFAULT; |
| 979 | #else | 1006 | #else |
| 1007 | /* | ||
| 1008 | * The top level points to the linear page table pages above. | ||
| 1009 | * We set up the identity and linear mappings here. | ||
| 1010 | */ | ||
| 980 | phys_linear = (unsigned long)linear - mem_base; | 1011 | phys_linear = (unsigned long)linear - mem_base; |
| 981 | for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) { | 1012 | for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) { |
| 982 | pgd_t pgd; | 1013 | pgd_t pgd; |
| 1014 | /* | ||
| 1015 | * Create a PGD entry which points to the right part of the | ||
| 1016 | * linear PTE pages. | ||
| 1017 | */ | ||
| 983 | pgd = __pgd((phys_linear + i * sizeof(pte_t)) | | 1018 | pgd = __pgd((phys_linear + i * sizeof(pte_t)) | |
| 984 | (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); | 1019 | (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); |
| 985 | 1020 | ||
| 1021 | /* | ||
| 1022 | * Copy it into the PGD page at 0 and PAGE_OFFSET. | ||
| 1023 | */ | ||
| 986 | if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd)) | 1024 | if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd)) |
| 987 | || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET) | 1025 | || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET) |
| 988 | + i / PTRS_PER_PTE], | 1026 | + i / PTRS_PER_PTE], |
| @@ -992,8 +1030,8 @@ static unsigned long setup_pagetables(struct lguest *lg, | |||
| 992 | #endif | 1030 | #endif |
| 993 | 1031 | ||
| 994 | /* | 1032 | /* |
| 995 | * We return the top level (guest-physical) address: remember where | 1033 | * We return the top level (guest-physical) address: we remember where |
| 996 | * this is. | 1034 | * this is to write it into lguest_data when the Guest initializes. |
| 997 | */ | 1035 | */ |
| 998 | return (unsigned long)pgdir - mem_base; | 1036 | return (unsigned long)pgdir - mem_base; |
| 999 | } | 1037 | } |
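A worked example of the indexing in the two branches above, assuming the Guest's PAGE_OFFSET is the usual 0xC0000000: a !PAE PGD slot covers 1024 * 4KB = 4MB (a 22-bit shift), while a PAE top-level slot covers 1GB (a 30-bit shift):

    #include <stdio.h>

    int main(void)
    {
            unsigned long page_offset = 0xC0000000UL;       /* assumed */

            /* !PAE: 1024 slots of 4MB each; PAGE_OFFSET lands in slot 768. */
            printf("!PAE pgd_index = %lu\n", page_offset >> 22);

            /* PAE: 4 slots of 1GB each; PAGE_OFFSET lands in slot 3,
             * which is why the code copies the PMD pointer to pgdir[3]. */
            printf(" PAE pgd_index = %lu\n", page_offset >> 30);
            return 0;
    }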
| @@ -1031,7 +1069,9 @@ int init_guest_pagetable(struct lguest *lg) | |||
| 1031 | lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); | 1069 | lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); |
| 1032 | if (!lg->pgdirs[0].pgdir) | 1070 | if (!lg->pgdirs[0].pgdir) |
| 1033 | return -ENOMEM; | 1071 | return -ENOMEM; |
| 1072 | |||
| 1034 | #ifdef CONFIG_X86_PAE | 1073 | #ifdef CONFIG_X86_PAE |
| 1074 | /* For PAE, we also create the initial mid-level. */ | ||
| 1035 | pgd = lg->pgdirs[0].pgdir; | 1075 | pgd = lg->pgdirs[0].pgdir; |
| 1036 | pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL); | 1076 | pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL); |
| 1037 | if (!pmd_table) | 1077 | if (!pmd_table) |
| @@ -1040,11 +1080,13 @@ int init_guest_pagetable(struct lguest *lg) | |||
| 1040 | set_pgd(pgd + SWITCHER_PGD_INDEX, | 1080 | set_pgd(pgd + SWITCHER_PGD_INDEX, |
| 1041 | __pgd(__pa(pmd_table) | _PAGE_PRESENT)); | 1081 | __pgd(__pa(pmd_table) | _PAGE_PRESENT)); |
| 1042 | #endif | 1082 | #endif |
| 1083 | |||
| 1084 | /* This is the current page table. */ | ||
| 1043 | lg->cpus[0].cpu_pgd = 0; | 1085 | lg->cpus[0].cpu_pgd = 0; |
| 1044 | return 0; | 1086 | return 0; |
| 1045 | } | 1087 | } |
| 1046 | 1088 | ||
| 1047 | /* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ | 1089 | /*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ |
| 1048 | void page_table_guest_data_init(struct lg_cpu *cpu) | 1090 | void page_table_guest_data_init(struct lg_cpu *cpu) |
| 1049 | { | 1091 | { |
| 1050 | /* We get the kernel address: above this is all kernel memory. */ | 1092 | /* We get the kernel address: above this is all kernel memory. */ |
| @@ -1105,12 +1147,16 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) | |||
| 1105 | pmd_t switcher_pmd; | 1147 | pmd_t switcher_pmd; |
| 1106 | pmd_t *pmd_table; | 1148 | pmd_t *pmd_table; |
| 1107 | 1149 | ||
| 1150 | /* FIXME: native_set_pmd is overkill here. */ | ||
| 1108 | native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >> | 1151 | native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >> |
| 1109 | PAGE_SHIFT, PAGE_KERNEL_EXEC)); | 1152 | PAGE_SHIFT, PAGE_KERNEL_EXEC)); |
| 1110 | 1153 | ||
| 1154 | /* Figure out where the pmd page is by reading the PGD and converting | ||
| 1155 | * the result to a virtual address. */ | ||
| 1111 | pmd_table = __va(pgd_pfn(cpu->lg-> | 1156 | pmd_table = __va(pgd_pfn(cpu->lg-> |
| 1112 | pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX]) | 1157 | pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX]) |
| 1113 | << PAGE_SHIFT); | 1158 | << PAGE_SHIFT); |
| 1159 | /* Now write it into the shadow page table. */ | ||
| 1114 | native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd); | 1160 | native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd); |
| 1115 | #else | 1161 | #else |
| 1116 | pgd_t switcher_pgd; | 1162 | pgd_t switcher_pgd; |
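That pmd_table lookup is pure address conversion: pgd_pfn() strips the flag bits from the entry to leave a page frame number, the shift turns that into a physical address, and __va() yields the Host virtual pointer. A userspace illustration with a hypothetical helper and a made-up entry value:

    #include <stdio.h>

    #define PAGE_SHIFT 12

    /* Stand-in for the kernel's pgd_pfn(): drop the low flag bits. */
    static unsigned long pgd_pfn_demo(unsigned long pgd_val)
    {
            return pgd_val >> PAGE_SHIFT;
    }

    int main(void)
    {
            unsigned long pgd_val = 0x00345067;     /* pfn 0x345 + flags 0x067 */

            printf("PMD page at physical %#lx\n",
                   pgd_pfn_demo(pgd_val) << PAGE_SHIFT);    /* 0x345000 */
            return 0;
    }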
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 96f7d88ec7f8..6ae388849a3b 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c | |||
| @@ -187,7 +187,7 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages) | |||
| 187 | * also simplify copy_in_guest_info(). Note that we'd still need to restore | 187 | * also simplify copy_in_guest_info(). Note that we'd still need to restore |
| 188 | * things when we exit to Launcher userspace, but that's fairly easy. | 188 | * things when we exit to Launcher userspace, but that's fairly easy. |
| 189 | * | 189 | * |
| 190 | * We could also try using this hooks for PGE, but that might be too expensive. | 190 | * We could also try using these hooks for PGE, but that might be too expensive. |
| 191 | * | 191 | * |
| 192 | * The hooks were designed for KVM, but we can also put them to good use. | 192 | * The hooks were designed for KVM, but we can also put them to good use. |
| 193 | :*/ | 193 | :*/ |
diff --git a/drivers/lguest/x86/switcher_32.S b/drivers/lguest/x86/switcher_32.S index 6dec09793836..40634b0db9f7 100644 --- a/drivers/lguest/x86/switcher_32.S +++ b/drivers/lguest/x86/switcher_32.S | |||
| @@ -1,7 +1,7 @@ | |||
| 1 | /*P:900 | 1 | /*P:900 |
| 2 | * This is the Switcher: code which sits at 0xFFC00000 astride both the | 2 | * This is the Switcher: code which sits at 0xFFC00000 (or 0xFFE00000) astride |
| 3 | * Host and Guest to do the low-level Guest<->Host switch. It is as simple as | 3 | * both the Host and Guest to do the low-level Guest<->Host switch. It is as |
| 4 | * it can be made, but it's naturally very specific to x86. | 4 | * simple as it can be made, but it's naturally very specific to x86. |
| 5 | * | 5 | * |
| 6 | * You have now completed Preparation. If this has whetted your appetite; if you | 6 | * You have now completed Preparation. If this has whetted your appetite; if you |
| 7 | * are feeling invigorated and refreshed then the next, more challenging stage | 7 | * are feeling invigorated and refreshed then the next, more challenging stage |
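Those two addresses are simply 4GB minus the Switcher mapping sizes from page_tables.c: the top PTE page covers 4MB without PAE and 2MB with it. A quick compile-time check (C11):

    #include <assert.h>

    static_assert(0x100000000ULL - 0xFFC00000UL == 4UL << 20, "!PAE: top 4MB");
    static_assert(0x100000000ULL - 0xFFE00000UL == 2UL << 20, " PAE: top 2MB");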
