commit    a91d74a3c4de8115295ee87350c13a329164aaaf
parent    2e04ef76916d1e29a077ea9d0f2003c8fd86724d
tree      02c862fccc9abedf7fc354061e69c4b5fbcce06d
author    Rusty Russell <rusty@rustcorp.com.au>  2009-07-30 18:03:45 -0400
committer Rusty Russell <rusty@rustcorp.com.au>  2009-07-30 02:33:46 -0400

lguest: update commentary
Every so often, after code shuffles, I need to go through and unbitrot
the Lguest Journey (see drivers/lguest/README). Since we now use RCU in
a simple form in one place I took the opportunity to expand that explanation.
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
 Documentation/lguest/lguest.c        | 184
 arch/x86/include/asm/lguest_hcall.h  |   8
 arch/x86/lguest/boot.c               |  99
 arch/x86/lguest/i386_head.S          |   2
 drivers/lguest/core.c                |   7
 drivers/lguest/hypercalls.c          |   6
 drivers/lguest/lguest_device.c       |  11
 drivers/lguest/lguest_user.c         | 100
 drivers/lguest/page_tables.c         |  84
 drivers/lguest/x86/core.c            |   2
 drivers/lguest/x86/switcher_32.S     |   6
 11 files changed, 398 insertions(+), 111 deletions(-)
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index aa66a52b73e9..45163651b519 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -49,7 +49,7 @@
 #include "linux/virtio_ring.h"
 #include "asm/bootparam.h"
 /*L:110
- * We can ignore the 39 include files we need for this program, but I do want
+ * We can ignore the 42 include files we need for this program, but I do want
  * to draw attention to the use of kernel-style types.
  *
  * As Linus said, "C is a Spartan language, and so should your naming be."  I
@@ -305,6 +305,11 @@ static void *map_zeroed_pages(unsigned int num)
 			  PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0);
 	if (addr == MAP_FAILED)
 		err(1, "Mmaping %u pages of /dev/zero", num);
+
+	/*
+	 * One neat mmap feature is that you can close the fd, and it
+	 * stays mapped.
+	 */
 	close(fd);
 
 	return addr;
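As an aside for readers following the Journey: the "close the fd, and it stays mapped" trick in the hunk above is easy to try outside lguest. A minimal standalone sketch (plain POSIX, not part of this patch; note that PROT_WRITE on a read-only fd is fine because MAP_PRIVATE gives us our own copy-on-write pages):

	#include <err.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/dev/zero", O_RDONLY);
		if (fd < 0)
			err(1, "Opening /dev/zero");

		char *addr = mmap(NULL, getpagesize(), PROT_READ|PROT_WRITE,
				  MAP_PRIVATE, fd, 0);
		if (addr == MAP_FAILED)
			err(1, "Mmaping /dev/zero");

		/* The mapping outlives the fd: this close doesn't unmap it. */
		close(fd);

		strcpy(addr, "still mapped");
		printf("%s\n", addr);
		return 0;
	}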
@@ -557,7 +562,7 @@ static void tell_kernel(unsigned long start)
 }
 /*:*/
 
-/*
+/*L:200
  * Device Handling.
  *
  * When the Guest gives us a buffer, it sends an array of addresses and sizes.
@@ -608,7 +613,10 @@ static unsigned next_desc(struct vring_desc *desc,
 	return next;
 }
 
-/* This actually sends the interrupt for this virtqueue */
+/*
+ * This actually sends the interrupt for this virtqueue, if we've used a
+ * buffer.
+ */
 static void trigger_irq(struct virtqueue *vq)
 {
 	unsigned long buf[] = { LHREQ_IRQ, vq->config.irq };
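For context, LHREQ_IRQ is one of the /dev/lguest write commands: the Launcher writes an array of unsigned longs, and a return of 0 means success. A hedged sketch of the pattern trigger_irq() uses (lguest_fd and LHREQ_IRQ are as used elsewhere in lguest.c; this mirrors the code above rather than adding anything new):

	/* Sketch only: ask the Host to deliver interrupt "irq" to the Guest. */
	static void send_guest_irq(int lguest_fd, unsigned long irq)
	{
		unsigned long buf[] = { LHREQ_IRQ, irq };

		/* A successful command returns 0; anything else is fatal. */
		if (write(lguest_fd, buf, sizeof(buf)) != 0)
			err(1, "Triggering irq %lu", irq);
	}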
@@ -629,12 +637,12 @@ static void trigger_irq(struct virtqueue *vq)
 }
 
 /*
- * This looks in the virtqueue and for the first available buffer, and converts
+ * This looks in the virtqueue for the first available buffer, and converts
  * it to an iovec for convenient access.  Since descriptors consist of some
  * number of output then some number of input descriptors, it's actually two
  * iovecs, but we pack them into one and note how many of each there were.
  *
- * This function returns the descriptor number found.
+ * This function waits if necessary, and returns the descriptor number found.
  */
 static unsigned wait_for_vq_desc(struct virtqueue *vq,
 				 struct iovec iov[],
@@ -644,10 +652,14 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq,
 	struct vring_desc *desc;
 	u16 last_avail = lg_last_avail(vq);
 
+	/* There's nothing available? */
 	while (last_avail == vq->vring.avail->idx) {
 		u64 event;
 
-		/* OK, tell Guest about progress up to now. */
+		/*
+		 * Since we're about to sleep, now is a good time to tell the
+		 * Guest about what we've used up to now.
+		 */
 		trigger_irq(vq);
 
 		/* OK, now we need to know about added descriptors. */
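To make the iovec-packing comment above concrete, here's a hedged sketch of walking one descriptor chain into an iovec array, using the standard virtio ring layout (VRING_DESC_F_NEXT chains descriptors; VRING_DESC_F_WRITE marks buffers the device writes, i.e. "input"). It reuses lguest.c's from_guest_phys(); the real wait_for_vq_desc() also range-checks addresses and descriptor indices, which is omitted here:

	/*
	 * Sketch only (simplified from what wait_for_vq_desc() really does):
	 * translate the descriptor chain starting at "head" into iov[].
	 */
	static void chain_to_iovec(struct virtqueue *vq, unsigned int head,
				   struct iovec iov[],
				   unsigned int *out_num, unsigned int *in_num)
	{
		unsigned int i = head;

		*out_num = *in_num = 0;
		for (;;) {
			struct vring_desc *desc = &vq->vring.desc[i];

			/* Guest gives physical addresses; map into ours. */
			iov[*out_num + *in_num].iov_base
				= from_guest_phys(desc->addr);
			iov[*out_num + *in_num].iov_len = desc->len;

			/* VRING_DESC_F_WRITE means we write it: input. */
			if (desc->flags & VRING_DESC_F_WRITE)
				(*in_num)++;
			else
				(*out_num)++;

			if (!(desc->flags & VRING_DESC_F_NEXT))
				break;
			i = desc->next;
		}
	}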
@@ -734,8 +746,9 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq,
 }
 
 /*
- * After we've used one of their buffers, we tell them about it.  We'll then
- * want to send them an interrupt, using trigger_irq().
+ * After we've used one of their buffers, we tell the Guest about it.  Sometime
+ * later we'll want to send them an interrupt using trigger_irq(); note that
+ * wait_for_vq_desc() does that for us if it has to wait.
  */
 static void add_used(struct virtqueue *vq, unsigned int head, int len)
 {
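For reference, telling the Guest about a consumed buffer means filling the next slot in the vring's "used" ring and bumping its index, with a store barrier in between so the Guest never sees the index move before the entry is written. A hedged sketch along the lines of the standard virtio layout (wmb() stands for whatever write barrier the Launcher defines; the patch's add_used() also tracks vq->pending_used, not shown):

	/* Sketch of the used-ring update inside add_used() (simplified). */
	static void add_used_sketch(struct virtqueue *vq, unsigned int head,
				    int len)
	{
		struct vring_used_elem *used;

		/* The next slot in the circular "used" ring. */
		used = &vq->vring.used->ring[vq->vring.used->idx
					     % vq->vring.num];
		used->id = head;
		used->len = len;

		/* Entry must be visible before the index says it exists. */
		wmb();
		vq->vring.used->idx++;
	}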
@@ -782,12 +795,12 @@ static void console_input(struct virtqueue *vq)
 	struct console_abort *abort = vq->dev->priv;
 	struct iovec iov[vq->vring.num];
 
-	/* Make sure there's a descriptor waiting. */
+	/* Make sure there's a descriptor available. */
 	head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
 	if (out_num)
 		errx(1, "Output buffers in console in queue?");
 
-	/* Read it in. */
+	/* Read into it.  This is where we usually wait. */
 	len = readv(STDIN_FILENO, iov, in_num);
 	if (len <= 0) {
 		/* Ran out of input? */
@@ -800,6 +813,7 @@ static void console_input(struct virtqueue *vq)
 		pause();
 	}
 
+	/* Tell the Guest we used a buffer. */
 	add_used_and_trigger(vq, head, len);
 
 	/*
@@ -834,15 +848,23 @@ static void console_output(struct virtqueue *vq)
 	unsigned int head, out, in;
 	struct iovec iov[vq->vring.num];
 
+	/* We usually wait in here, for the Guest to give us something. */
 	head = wait_for_vq_desc(vq, iov, &out, &in);
 	if (in)
 		errx(1, "Input buffers in console output queue?");
+
+	/* writev can return a partial write, so we loop here. */
 	while (!iov_empty(iov, out)) {
 		int len = writev(STDOUT_FILENO, iov, out);
 		if (len <= 0)
 			err(1, "Write to stdout gave %i", len);
 		iov_consume(iov, out, len);
 	}
+
+	/*
+	 * We're finished with that buffer: if we're going to sleep,
+	 * wait_for_vq_desc() will prod the Guest with an interrupt.
+	 */
 	add_used(vq, head, 0);
 }
 
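iov_consume() itself isn't part of this hunk, but the partial-write loop only makes sense once you see what it must do: advance the iovec array past the len bytes already written. A plausible implementation, assuming exactly the semantics the loop above relies on:

	/* Sketch: skip "len" bytes already handled at the iovec's front. */
	static void iov_consume_sketch(struct iovec iov[], unsigned num_iov,
				       unsigned len)
	{
		unsigned int i;

		for (i = 0; i < num_iov; i++) {
			unsigned int used;

			used = len < iov[i].iov_len ? len : iov[i].iov_len;
			iov[i].iov_base = (char *)iov[i].iov_base + used;
			iov[i].iov_len -= used;
			len -= used;
		}
		if (len != 0)
			errx(1, "iovec too short!");
	}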
@@ -862,15 +884,30 @@ static void net_output(struct virtqueue *vq)
 	unsigned int head, out, in;
 	struct iovec iov[vq->vring.num];
 
+	/* We usually wait in here for the Guest to give us a packet. */
 	head = wait_for_vq_desc(vq, iov, &out, &in);
 	if (in)
 		errx(1, "Input buffers in net output queue?");
+	/*
+	 * Send the whole thing through to /dev/net/tun.  It expects the exact
+	 * same format: what a coincidence!
+	 */
 	if (writev(net_info->tunfd, iov, out) < 0)
 		errx(1, "Write to tun failed?");
+
+	/*
+	 * Done with that one; wait_for_vq_desc() will send the interrupt if
+	 * all packets are processed.
+	 */
 	add_used(vq, head, 0);
 }
 
-/* Will reading from this file descriptor block? */
+/*
+ * Handling network input is a bit trickier, because I've tried to optimize it.
+ *
+ * First we have a helper routine which tells us if reading from this file
+ * descriptor (ie. the /dev/net/tun device) will block:
+ */
 static bool will_block(int fd)
 {
 	fd_set fdset;
@@ -880,7 +917,11 @@ static bool will_block(int fd)
 	return select(fd+1, &fdset, NULL, NULL, &zero) != 1;
 }
 
-/* This handles packets coming in from the tun device to our Guest. */
+/*
+ * This handles packets coming in from the tun device to our Guest.  Like all
+ * service routines, it gets called again as soon as it returns, so you don't
+ * see a while(1) loop here.
+ */
 static void net_input(struct virtqueue *vq)
 {
 	int len;
@@ -888,21 +929,38 @@ static void net_input(struct virtqueue *vq)
 	struct iovec iov[vq->vring.num];
 	struct net_info *net_info = vq->dev->priv;
 
+	/*
+	 * Get a descriptor to write an incoming packet into.  This will also
+	 * send an interrupt if they're out of descriptors.
+	 */
 	head = wait_for_vq_desc(vq, iov, &out, &in);
 	if (out)
 		errx(1, "Output buffers in net input queue?");
 
-	/* Deliver interrupt now, since we're about to sleep. */
+	/*
+	 * If it looks like we'll block reading from the tun device, send them
+	 * an interrupt.
+	 */
 	if (vq->pending_used && will_block(net_info->tunfd))
 		trigger_irq(vq);
 
+	/*
+	 * Read in the packet.  This is where we normally wait (when there's no
+	 * incoming network traffic).
+	 */
 	len = readv(net_info->tunfd, iov, in);
 	if (len <= 0)
 		err(1, "Failed to read from tun.");
+
+	/*
+	 * Mark that packet buffer as used, but don't interrupt here.  We want
+	 * to wait until we've done as much work as we can.
+	 */
 	add_used(vq, head, len);
 }
+/*:*/
 
-/* This is the helper to create threads. */
+/* This is the helper to create threads: run the service routine in a loop. */
 static int do_thread(void *_vq)
 {
 	struct virtqueue *vq = _vq;
@@ -950,11 +1008,14 @@ static void reset_device(struct device *dev)
 	signal(SIGCHLD, (void *)kill_launcher);
 }
 
+/*L:216
+ * This actually creates the thread which services the virtqueue for a device.
+ */
 static void create_thread(struct virtqueue *vq)
 {
 	/*
-	 * Create stack for thread and run it.  Since the stack grows upwards,
-	 * we point the stack pointer to the end of this region.
+	 * Create stack for thread.  Since the stack grows downwards, we point
+	 * the stack pointer to the end of this region.
 	 */
 	char *stack = malloc(32768);
 	unsigned long args[] = { LHREQ_EVENTFD,
@@ -966,17 +1027,22 @@ static void create_thread(struct virtqueue *vq)
 		err(1, "Creating eventfd");
 	args[2] = vq->eventfd;
 
-	/* Attach an eventfd to this virtqueue: it will go off
-	 * when the Guest does an LHCALL_NOTIFY for this vq. */
+	/*
+	 * Attach an eventfd to this virtqueue: it will go off when the Guest
+	 * does an LHCALL_NOTIFY for this vq.
+	 */
 	if (write(lguest_fd, &args, sizeof(args)) != 0)
 		err(1, "Attaching eventfd");
 
-	/* CLONE_VM: because it has to access the Guest memory, and
-	 * SIGCHLD so we get a signal if it dies. */
+	/*
+	 * CLONE_VM: because it has to access the Guest memory, and SIGCHLD so
+	 * we get a signal if it dies.
+	 */
 	vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq);
 	if (vq->thread == (pid_t)-1)
 		err(1, "Creating clone");
-	/* We close our local copy, now the child has it. */
+
+	/* We close our local copy now the child has it. */
 	close(vq->eventfd);
 }
 
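The clone() call above is the whole threading story: a shared address space and a SIGCHLD on exit, nothing more. A self-contained sketch of the same pattern, separate from the patch (Linux-specific; _GNU_SOURCE is needed for clone()):

	#define _GNU_SOURCE
	#include <err.h>
	#include <sched.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/wait.h>

	static int worker(void *arg)
	{
		printf("worker sees %s\n", (char *)arg);
		return 0;
	}

	int main(void)
	{
		char *stack = malloc(32768);
		char *msg = "the parent's memory";
		pid_t pid;

		if (!stack)
			err(1, "Allocating stack");

		/* The stack grows down on x86: pass the top of the region. */
		pid = clone(worker, stack + 32768, CLONE_VM | SIGCHLD, msg);
		if (pid == (pid_t)-1)
			err(1, "Creating clone");

		/* SIGCHLD means we can wait for it like a normal child. */
		waitpid(pid, NULL, 0);
		free(stack);
		return 0;
	}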
@@ -1028,7 +1094,10 @@ static void update_device_status(struct device *dev)
 	}
 }
 
-/* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */
+/*L:215
+ * This is the generic routine we call when the Guest uses LHCALL_NOTIFY.  In
+ * particular, it's used to notify us of device status changes during boot.
+ */
 static void handle_output(unsigned long addr)
 {
 	struct device *i;
@@ -1037,18 +1106,32 @@ static void handle_output(unsigned long addr)
 	for (i = devices.dev; i; i = i->next) {
 		struct virtqueue *vq;
 
-		/* Notifications to device descriptors update device status. */
+		/*
+		 * Notifications to device descriptors mean they updated the
+		 * device status.
+		 */
 		if (from_guest_phys(addr) == i->desc) {
 			update_device_status(i);
 			return;
 		}
 
-		/* Devices *can* be used before status is set to DRIVER_OK. */
+		/*
+		 * Devices *can* be used before status is set to DRIVER_OK.
+		 * The original plan was that they would never do this: they
+		 * would always finish setting up their status bits before
+		 * actually touching the virtqueues.  In practice, we allowed
+		 * them to, and they do (eg. the disk probes for partition
+		 * tables as part of initialization).
+		 *
+		 * If we see this, we start the device: once it's running, we
+		 * expect the device to catch all the notifications.
+		 */
 		for (vq = i->vq; vq; vq = vq->next) {
 			if (addr != vq->config.pfn*getpagesize())
 				continue;
 			if (i->running)
 				errx(1, "Notification on running %s", i->name);
+			/* This just calls create_thread() for each virtqueue */
 			start_device(i);
 			return;
 		}
@@ -1132,6 +1215,11 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
 	vq->next = NULL;
 	vq->last_avail_idx = 0;
 	vq->dev = dev;
+
+	/*
+	 * This is the routine the service thread will run, and its Process ID
+	 * once it's running.
+	 */
 	vq->service = service;
 	vq->thread = (pid_t)-1;
 
@@ -1202,7 +1290,8 @@ static void set_config(struct device *dev, unsigned len, const void *conf)
 
 /*
  * This routine does all the creation and setup of a new device, including
- * calling new_dev_desc() to allocate the descriptor and device memory.
+ * calling new_dev_desc() to allocate the descriptor and device memory.  We
+ * don't actually start the service threads until later.
  *
  * See what I mean about userspace being boring?
  */
@@ -1478,19 +1567,7 @@ static void setup_tun_net(char *arg)
 	verbose("device %u: tun %s: %s\n",
 		devices.device_num, tapif, arg);
 }
-
-/*
- * Our block (disk) device should be really simple: the Guest asks for a block
- * number and we read or write that position in the file.  Unfortunately, that
- * was amazingly slow: the Guest waits until the read is finished before
- * running anything else, even if it could have been doing useful work.
- *
- * We could use async I/O, except it's reputed to suck so hard that characters
- * actually go missing from your code when you try to use it.
- *
- * So this was one reason why lguest now does all virtqueue servicing in
- * separate threads: it's more efficient and more like a real device.
- */
+/*:*/
 
 /* This hangs off device->priv. */
 struct vblk_info
@@ -1512,8 +1589,16 @@ struct vblk_info
 /*L:210
  * The Disk
  *
- * Remember that the block device is handled by a separate I/O thread.  We head
- * straight into the core of that thread here:
+ * The disk only has one virtqueue, so it only has one thread.  It is really
+ * simple: the Guest asks for a block number and we read or write that position
+ * in the file.
+ *
+ * Before we serviced each virtqueue in a separate thread, that was unacceptably
+ * slow: the Guest waits until the read is finished before running anything
+ * else, even if it could have been doing useful work.
+ *
+ * We could have used async I/O, except it's reputed to suck so hard that
+ * characters actually go missing from your code when you try to use it.
  */
 static void blk_request(struct virtqueue *vq)
 {
@@ -1525,7 +1610,10 @@ static void blk_request(struct virtqueue *vq)
 	struct iovec iov[vq->vring.num];
 	off64_t off;
 
-	/* Get the next request. */
+	/*
+	 * Get the next request, where we normally wait.  It triggers the
+	 * interrupt to acknowledge previously serviced requests (if any).
+	 */
 	head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
 
 	/*
@@ -1539,6 +1627,10 @@ static void blk_request(struct virtqueue *vq)
 
 	out = convert(&iov[0], struct virtio_blk_outhdr);
 	in = convert(&iov[out_num+in_num-1], u8);
+	/*
+	 * For historical reasons, block operations are expressed in 512 byte
+	 * "sectors".
+	 */
 	off = out->sector * 512;
 
 	/*
@@ -1614,6 +1706,7 @@ static void blk_request(struct virtqueue *vq)
 	if (out->type & VIRTIO_BLK_T_BARRIER)
 		fdatasync(vblk->fd);
 
+	/* Finished that request. */
 	add_used(vq, head, wlen);
 }
 
@@ -1682,9 +1775,8 @@ static void rng_input(struct virtqueue *vq)
 		errx(1, "Output buffers in rng?");
 
 	/*
-	 * This is why we convert to iovecs: the readv() call uses them, and so
-	 * it reads straight into the Guest's buffer.  We loop to make sure we
-	 * fill it.
+	 * Just like the console write, we loop to cover the whole iovec.
+	 * In this case, short reads actually happen quite a bit.
 	 */
 	while (!iov_empty(iov, in_num)) {
 		len = readv(rng_info->rfd, iov, in_num);
@@ -1818,7 +1910,9 @@ int main(int argc, char *argv[])
 	devices.lastdev = NULL;
 	devices.next_irq = 1;
 
+	/* We're CPU 0.  In fact, that's the only CPU possible right now. */
 	cpu_id = 0;
+
 	/*
 	 * We need to know how much memory so we can set up the device
 	 * descriptor and memory pages for the devices as we parse the command
@@ -1926,7 +2020,7 @@ int main(int argc, char *argv[])
 	 */
 	tell_kernel(start);
 
-	/* Ensure that we terminate if a child dies. */
+	/* Ensure that we terminate if a device-servicing child dies. */
 	signal(SIGCHLD, kill_launcher);
 
 	/* If we exit via err(), this kills all the threads, restores tty. */
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index cceb73e12e50..ba0eed8aa1a6 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -35,10 +35,10 @@
  * operations?  There are two ways: the direct way is to make a "hypercall",
  * to make requests of the Host Itself.
  *
- * We use the KVM hypercall mechanism.  Seventeen hypercalls are
- * available: the hypercall number is put in the %eax register, and the
- * arguments (when required) are placed in %ebx, %ecx, %edx and %esi.
- * If a return value makes sense, it's returned in %eax.
+ * We use the KVM hypercall mechanism, though completely different hypercall
+ * numbers.  Seventeen hypercalls are available: the hypercall number is put in
+ * the %eax register, and the arguments (when required) are placed in %ebx,
+ * %ecx, %edx and %esi.  If a return value makes sense, it's returned in %eax.
  *
  * Grossly invalid calls result in Sudden Death at the hands of the vengeful
  * Host, rather than returning failure.  This reflects Winston Churchill's
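As a sketch of what that register convention looks like from the Guest side: the hypercall is a vmcall-style instruction with the number in %eax and up to four arguments in %ebx, %ecx, %edx and %esi. Something along the lines of the kernel's kvm_hypercall helpers, shown here as illustration only (the real definitions live in arch/x86/include/asm/kvm_para.h; the .byte sequence is the same KVM_HYPERCALL encoding that appears in i386_head.S below):

	/* Illustrative 4-argument hypercall on 32-bit x86. */
	static inline unsigned long
	hcall(unsigned long nr, unsigned long a1, unsigned long a2,
	      unsigned long a3, unsigned long a4)
	{
		unsigned long ret;

		asm volatile(".byte 0x0f,0x01,0xc1"	/* KVM_HYPERCALL */
			     : "=a"(ret)
			     : "a"(nr), "b"(a1), "c"(a2), "d"(a3), "S"(a4)
			     : "memory");
		return ret;
	}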
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 025c04d18f2b..d677fa9ca650 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -154,6 +154,7 @@ static void lazy_hcall1(unsigned long call,
 		async_hcall(call, arg1, 0, 0, 0);
 }
 
+/* You can imagine what lazy_hcall2, 3 and 4 look like. :*/
 static void lazy_hcall2(unsigned long call,
 			unsigned long arg1,
 			unsigned long arg2)
@@ -189,8 +190,10 @@ static void lazy_hcall4(unsigned long call,
 }
 #endif
 
-/* When lazy mode is turned off reset the per-cpu lazy mode variable and then
- * issue the do-nothing hypercall to flush any stored calls. */
+/*G:036
+ * When lazy mode is turned off reset the per-cpu lazy mode variable and then
+ * issue the do-nothing hypercall to flush any stored calls.
+:*/
 static void lguest_leave_lazy_mmu_mode(void)
 {
 	kvm_hypercall0(LHCALL_FLUSH_ASYNC);
@@ -250,13 +253,11 @@ extern void lg_irq_enable(void);
 extern void lg_restore_fl(unsigned long flags);
 
 /*M:003
- * Note that we don't check for outstanding interrupts when we re-enable them
- * (or when we unmask an interrupt).  This seems to work for the moment, since
- * interrupts are rare and we'll just get the interrupt on the next timer tick,
- * but now we can run with CONFIG_NO_HZ, we should revisit this.  One way would
- * be to put the "irq_enabled" field in a page by itself, and have the Host
- * write-protect it when an interrupt comes in when irqs are disabled.  There
- * will then be a page fault as soon as interrupts are re-enabled.
+ * We could be more efficient in our checking of outstanding interrupts, rather
+ * than using a branch.  One way would be to put the "irq_enabled" field in a
+ * page by itself, and have the Host write-protect it when an interrupt comes
+ * in when irqs are disabled.  There will then be a page fault as soon as
+ * interrupts are re-enabled.
  *
  * A better method is to implement soft interrupt disable generally for x86:
  * instead of disabling interrupts, we set a flag.  If an interrupt does come
@@ -568,7 +569,7 @@ static void lguest_write_cr4(unsigned long val)
 * cr3 ---> +---------+
 *          |         --------->+---------+
 *          |         |         | PADDR1  |
- *   Top-level       |         | PADDR2  |
+ *   Mid-level       |         | PADDR2  |
 *   (PMD) page      |         |         |
 *          |         |         Lower-level
 *          |         |         (PTE) page
@@ -588,23 +589,62 @@ static void lguest_write_cr4(unsigned long val)
 * Index into top     Index into second    Offset within page
 * page directory     page                 pagetable page
 *
- * The kernel spends a lot of time changing both the top-level page directory
- * and lower-level pagetable pages.  The Guest doesn't know physical addresses,
- * so while it maintains these page tables exactly like normal, it also needs
- * to keep the Host informed whenever it makes a change: the Host will create
- * the real page tables based on the Guests'.
+ * Now, unfortunately, this isn't the whole story: Intel added Physical Address
+ * Extension (PAE) to allow 32 bit systems to use 64GB of memory (ie. 36 bits).
+ * These are held in 64-bit page table entries, so we can now only fit 512
+ * entries in a page, and the neat three-level tree breaks down.
+ *
+ * The result is a four level page table:
+ *
+ * cr3 --> [ 4 Upper  ]
+ *         [   Level  ]
+ *         [  Entries ]
+ *         [(PUD Page)]---> +---------+
+ *                          |         --------->+---------+
+ *                          |         |         | PADDR1  |
+ *                Mid-level |         | PADDR2  |
+ *                (PMD) page|         |         |
+ *                          |         Lower-level
+ *                          |         (PTE) page
+ *                          |         |         |
+ *                            ....         ....
+ *
+ * And the virtual address is decoded as:
+ *
+ *      1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+ *      |<-2->|<--- 9 bits ---->|<---- 9 bits --->|<------ 12 bits ------>|
+ *     Index into   Index into mid   Index into lower    Offset within page
+ *     top entries  directory page   pagetable page
+ *
+ * It's too hard to switch between these two formats at runtime, so Linux only
+ * supports one or the other depending on whether CONFIG_X86_PAE is set.  Many
+ * distributions turn it on, and not just for people with silly amounts of
+ * memory: the larger PTE entries allow room for the NX bit, which lets the
+ * kernel disable execution of pages and increase security.
+ *
+ * This was a problem for lguest, which couldn't run on these distributions;
+ * then Matias Zabaljauregui figured it all out and implemented it, and only a
+ * handful of puppies were crushed in the process!
+ *
+ * Back to our point: the kernel spends a lot of time changing both the
+ * top-level page directory and lower-level pagetable pages.  The Guest doesn't
+ * know physical addresses, so while it maintains these page tables exactly
+ * like normal, it also needs to keep the Host informed whenever it makes a
+ * change: the Host will create the real page tables based on the Guests'.
 */
 
 /*
- * The Guest calls this to set a second-level entry (pte), ie. to map a page
- * into a process' address space.  We set the entry then tell the Host the
- * toplevel and address this corresponds to.  The Guest uses one pagetable per
- * process, so we need to tell the Host which one we're changing (mm->pgd).
+ * The Guest calls this after it has set a second-level entry (pte), ie. to map
+ * a page into a process' address space.  We tell the Host the toplevel and
+ * address this corresponds to.  The Guest uses one pagetable per process, so
+ * we need to tell the Host which one we're changing (mm->pgd).
 */
 static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
 			      pte_t *ptep)
 {
 #ifdef CONFIG_X86_PAE
+	/* PAE needs to hand a 64 bit page table entry, so it uses two args. */
 	lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr,
 		    ptep->pte_low, ptep->pte_high);
 #else
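To make that 2/9/9/12 split concrete, here's a small worked decode. This is illustration only, not kernel code (the kernel uses pgd_index(), pmd_index() and pte_index() for the same job):

	#include <stdio.h>

	int main(void)
	{
		unsigned long vaddr = 0xC0101000; /* a typical kernel address */

		unsigned int pgd = (vaddr >> 30) & 0x3;   /* top 2 bits    */
		unsigned int pmd = (vaddr >> 21) & 0x1FF; /* next 9 bits   */
		unsigned int pte = (vaddr >> 12) & 0x1FF; /* next 9 bits   */
		unsigned int off = vaddr & 0xFFF;         /* final 12 bits */

		/* Prints: pgd 3, pmd 0, pte 257, offset 0x0 */
		printf("pgd %u, pmd %u, pte %u, offset 0x%x\n",
		       pgd, pmd, pte, off);
		return 0;
	}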
@@ -612,6 +652,7 @@ static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
 #endif
 }
 
+/* This is the "set and update" combo-meal-deal version. */
 static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
 			      pte_t *ptep, pte_t pteval)
 {
@@ -672,6 +713,11 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval)
 }
 
 #ifdef CONFIG_X86_PAE
+/*
+ * With 64-bit PTE values, we need to be careful setting them: if we set 32
+ * bits at a time, the hardware could see a weird half-set entry.  These
+ * versions ensure we update all 64 bits at once.
+ */
 static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte)
 {
 	native_set_pte_atomic(ptep, pte);
@@ -679,13 +725,14 @@ static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte)
 	lazy_hcall1(LHCALL_FLUSH_TLB, 1);
 }
 
-void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+static void lguest_pte_clear(struct mm_struct *mm, unsigned long addr,
+			     pte_t *ptep)
 {
 	native_pte_clear(mm, addr, ptep);
 	lguest_pte_update(mm, addr, ptep);
 }
 
-void lguest_pmd_clear(pmd_t *pmdp)
+static void lguest_pmd_clear(pmd_t *pmdp)
 {
 	lguest_set_pmd(pmdp, __pmd(0));
 }
@@ -784,6 +831,14 @@ static void __init lguest_init_IRQ(void)
 	irq_ctx_init(smp_processor_id());
 }
 
+/*
+ * With CONFIG_SPARSE_IRQ, interrupt descriptors are allocated as-needed, so
+ * rather than set them in lguest_init_IRQ we are called here every time an
+ * lguest device needs an interrupt.
+ *
+ * FIXME: irq_to_desc_alloc_node() can fail due to lack of memory, we should
+ * pass that up!
+ */
 void lguest_setup_irq(unsigned int irq)
 {
 	irq_to_desc_alloc_node(irq, 0);
@@ -1298,7 +1353,7 @@ __init void lguest_init(void)
 	 */
 	switch_to_new_gdt(0);
 
-	/* As described in head_32.S, we map the first 128M of memory. */
+	/* We actually boot with all memory mapped, but let's say 128MB. */
 	max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
 
 	/*
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index db6aa95eb054..27eac0faee48 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -102,6 +102,7 @@ send_interrupts:
	 * create one manually here.
	 */
	.byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
+	/* Put eax back the way we found it. */
	popl %eax
	ret
 
@@ -125,6 +126,7 @@ ENTRY(lg_restore_fl)
	jnz send_interrupts
	/* Again, the normal path has used no extra registers.  Clever, huh? */
	ret
+/*:*/
 
 /* These demark the EIP range where host should never deliver interrupts. */
 .global lguest_noirq_start
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index cd058bc903ff..1e2cb846b3c9 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -217,10 +217,15 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
 
		/*
		 * It's possible the Guest did a NOTIFY hypercall to the
-		 * Launcher, in which case we return from the read() now.
+		 * Launcher.
		 */
		if (cpu->pending_notify) {
+			/*
+			 * Does it just need to write to a registered
+			 * eventfd (ie. the appropriate virtqueue thread)?
+			 */
			if (!send_notify_to_eventfd(cpu)) {
+				/* OK, we tell the main Launcher. */
				if (put_user(cpu->pending_notify, user))
					return -EFAULT;
				return sizeof(cpu->pending_notify);
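So from the Launcher's point of view, reading /dev/lguest runs the Guest, and the read only returns a value early when no eventfd claimed the notification. A hedged sketch of that consumer loop, reusing lguest.c's lguest_fd, cpu_id and handle_output() (the real Launcher's run_guest() is close to this shape, but treat the details as illustrative):

	/* Sketch of the Launcher side: run the Guest, service notifications. */
	for (;;) {
		unsigned long notify_addr;
		int readval;

		/* The offset selects which virtual CPU we're running. */
		readval = pread(lguest_fd, &notify_addr,
				sizeof(notify_addr), cpu_id);

		/* A full answer is a NOTIFY address no eventfd wanted. */
		if (readval == sizeof(notify_addr))
			handle_output(notify_addr);
		else
			err(1, "Running guest failed");
	}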
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
index 787ab4bc09f0..83511eb0923d 100644
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -59,7 +59,7 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
	case LHCALL_SHUTDOWN: {
		char msg[128];
		/*
-		 * Shutdown is such a trivial hypercall that we do it in four
+		 * Shutdown is such a trivial hypercall that we do it in five
		 * lines right here.
		 *
		 * If the lgread fails, it will call kill_guest() itself; the
@@ -245,6 +245,10 @@ static void initialize(struct lg_cpu *cpu)
 * device), the Guest will still see the old page.  In practice, this never
 * happens: why would the Guest read a page which it has never written to?  But
 * a similar scenario might one day bite us, so it's worth mentioning.
+ *
+ * Note that if we used a shared anonymous mapping in the Launcher instead of
+ * mapping /dev/zero private, we wouldn't worry about copy-on-write.  And we
+ * need that to switch the Launcher to processes (away from threads) anyway.
 :*/
 
 /*H:100
diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c
index cc000e79c3d1..1401c1ace1ec 100644
--- a/drivers/lguest/lguest_device.c
+++ b/drivers/lguest/lguest_device.c
@@ -236,7 +236,7 @@ static void lg_notify(struct virtqueue *vq)
 extern void lguest_setup_irq(unsigned int irq);
 
 /*
- * This routine finds the first virtqueue described in the configuration of
+ * This routine finds the Nth virtqueue described in the configuration of
 * this device and sets it up.
 *
 * This is kind of an ugly duckling.  It'd be nicer to have a standard
@@ -244,9 +244,6 @@ extern void lguest_setup_irq(unsigned int irq);
 * everyone wants to do it differently.  The KVM coders want the Guest to
 * allocate its own pages and tell the Host where they are, but for lguest it's
 * simpler for the Host to simply tell us where the pages are.
- *
- * So we provide drivers with a "find the Nth virtqueue and set it up"
- * function.
 */
 static struct virtqueue *lg_find_vq(struct virtio_device *vdev,
				    unsigned index,
@@ -422,7 +419,11 @@ static void add_lguest_device(struct lguest_device_desc *d,
 
	/* This device's parent is the lguest/ dir. */
	ldev->vdev.dev.parent = lguest_root;
-	/* We have a unique device index thanks to the dev_index counter. */
+	/*
+	 * The device type comes straight from the descriptor.  There's also a
+	 * device vendor field in the virtio_device struct, which we leave as
+	 * 0.
+	 */
	ldev->vdev.id.device = d->type;
	/*
	 * We have a simple set of routines for querying the device's
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index 7e92017103dc..b4d3f7ca554f 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -1,9 +1,8 @@
-/*P:200
- * This contains all the /dev/lguest code, whereby the userspace launcher
+/*P:200 This contains all the /dev/lguest code, whereby the userspace launcher
 * controls and communicates with the Guest.  For example, the first write will
- * tell us the Guest's memory layout, pagetable, entry point and kernel address
- * offset.  A read will run the Guest until something happens, such as a signal
- * or the Guest doing a NOTIFY out to the Launcher.
+ * tell us the Guest's memory layout and entry point.  A read will run the
+ * Guest until something happens, such as a signal or the Guest doing a NOTIFY
+ * out to the Launcher.
 :*/
 #include <linux/uaccess.h>
 #include <linux/miscdevice.h>
@@ -13,14 +12,41 @@
 #include <linux/file.h>
 #include "lg.h"
 
+/*L:056
+ * Before we move on, let's jump ahead and look at what the kernel does when
+ * it needs to look up the eventfds.  That will complete our picture of how we
+ * use RCU.
+ *
+ * The notification value is in cpu->pending_notify: we return true if it went
+ * to an eventfd.
+ */
 bool send_notify_to_eventfd(struct lg_cpu *cpu)
 {
	unsigned int i;
	struct lg_eventfd_map *map;
 
-	/* lg->eventfds is RCU-protected */
+	/*
+	 * This "rcu_read_lock()" helps track when someone is still looking at
+	 * the (RCU-using) eventfds array.  It's not actually a lock at all;
+	 * indeed it's a noop in many configurations.  (You didn't expect me to
+	 * explain all the RCU secrets here, did you?)
+	 */
	rcu_read_lock();
+	/*
+	 * rcu_dereference is the counter-side of rcu_assign_pointer(); it
+	 * makes sure we don't access the memory pointed to by
+	 * cpu->lg->eventfds before cpu->lg->eventfds is set.  Sounds crazy,
+	 * but Alpha allows this!  Paul McKenney points out that a really
+	 * aggressive compiler could have the same effect:
+	 * http://lists.ozlabs.org/pipermail/lguest/2009-July/001560.html
+	 *
+	 * So play safe, use rcu_dereference to get the rcu-protected pointer:
+	 */
	map = rcu_dereference(cpu->lg->eventfds);
+	/*
+	 * Simple array search: even if they add an eventfd while we do this,
+	 * we'll continue to use the old array and just won't see the new one.
+	 */
	for (i = 0; i < map->num; i++) {
		if (map->map[i].addr == cpu->pending_notify) {
			eventfd_signal(map->map[i].event, 1);
@@ -28,14 +54,43 @@ bool send_notify_to_eventfd(struct lg_cpu *cpu)
			break;
		}
	}
+	/* We're done with the rcu-protected variable cpu->lg->eventfds. */
	rcu_read_unlock();
+
+	/* If we cleared the notification, it's because we found a match. */
	return cpu->pending_notify == 0;
 }
 
+/*L:055
+ * One of the more tricksy tricks in the Linux Kernel is a technique called
+ * Read Copy Update.  Since one point of lguest is to teach lguest journeyers
+ * about kernel coding, I use it here.  (In case you're curious, other purposes
+ * include learning about virtualization and instilling a deep appreciation for
+ * simplicity and puppies).
+ *
+ * We keep a simple array which maps LHCALL_NOTIFY values to eventfds, but we
+ * add new eventfds without ever blocking readers from accessing the array.
+ * The current Launcher only does this during boot, so that never happens.  But
+ * Read Copy Update is cool, and adding a lock risks damaging even more puppies
+ * than this code does.
+ *
+ * We allocate a brand new one-larger array, copy the old one and add our new
+ * element.  Then we make the lg eventfd pointer point to the new array.
+ * That's the easy part: now we need to free the old one, but we need to make
+ * sure no slow CPU somewhere is still looking at it.  That's what
+ * synchronize_rcu does for us: it waits until every CPU has indicated that it
+ * has moved on, so we know no one is still using the old one.
+ *
+ * If that's unclear, see http://en.wikipedia.org/wiki/Read-copy-update.
+ */
 static int add_eventfd(struct lguest *lg, unsigned long addr, int fd)
 {
	struct lg_eventfd_map *new, *old = lg->eventfds;
 
+	/*
+	 * We don't allow notifications on value 0 anyway (pending_notify of
+	 * 0 means "nothing pending").
+	 */
	if (!addr)
		return -EINVAL;
 
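The grow-by-copy update the /*L:055 comment describes is short enough to sketch end to end. This is the shape of add_eventfd()'s body under the same assumptions (lg->eventfds as the RCU-protected array, callers serialized by a mutex, the eventfd looked up with eventfd_ctx_fdget(); allocation-failure paths trimmed to the essentials):

	/* Sketch of the RCU publish-then-reclaim pattern described above. */
	static int grow_eventfd_map(struct lguest *lg, unsigned long addr,
				    int fd)
	{
		struct lg_eventfd_map *new, *old = lg->eventfds;

		/* A fresh copy, one element larger. */
		new = kmalloc(sizeof(*new)
			      + sizeof(new->map[0]) * (old->num + 1),
			      GFP_KERNEL);
		if (!new)
			return -ENOMEM;
		memcpy(new->map, old->map, sizeof(old->map[0]) * old->num);
		new->map[old->num].addr = addr;
		new->map[old->num].event = eventfd_ctx_fdget(fd);
		new->num = old->num + 1;

		/* Publish: barriers stop readers seeing a half-built array. */
		rcu_assign_pointer(lg->eventfds, new);

		/* Wait out readers still inside rcu_read_lock(), then free. */
		synchronize_rcu();
		kfree(old);
		return 0;
	}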
@@ -62,12 +117,20 @@ static int add_eventfd(struct lguest *lg, unsigned long addr, int fd)
	}
	new->num++;
 
-	/* Now put new one in place. */
+	/*
+	 * Now put new one in place: rcu_assign_pointer() is a fancy way of
+	 * doing "lg->eventfds = new", but it uses memory barriers to make
+	 * absolutely sure that the contents of "new" written above is nailed
+	 * down before we actually do the assignment.
+	 *
+	 * We have to think about these kinds of things when we're operating on
+	 * live data without locks.
+	 */
	rcu_assign_pointer(lg->eventfds, new);
 
	/*
	 * We're not in a big hurry.  Wait until no one's looking at the old
-	 * version, then delete it.
+	 * version, then free it.
	 */
	synchronize_rcu();
	kfree(old);
@@ -75,6 +138,14 @@ static int add_eventfd(struct lguest *lg, unsigned long addr, int fd)
	return 0;
 }
 
+/*L:052
+ * Receiving notifications from the Guest is usually done by attaching a
+ * particular LHCALL_NOTIFY value to an event file descriptor.  The eventfd
+ * will become readable when the Guest does an LHCALL_NOTIFY with that value.
+ *
+ * This is really convenient for processing each virtqueue in a separate
+ * thread.
+ */
 static int attach_eventfd(struct lguest *lg, const unsigned long __user *input)
 {
	unsigned long addr, fd;
@@ -86,6 +157,11 @@ static int attach_eventfd(struct lguest *lg, const unsigned long __user *input)
	if (get_user(fd, input) != 0)
		return -EFAULT;
 
+	/*
+	 * Just make sure two callers don't add eventfds at once.  We really
+	 * only need to lock against callers adding to the same Guest, so using
+	 * the Big Lguest Lock is overkill.  But this is setup, not a fast path.
+	 */
	mutex_lock(&lguest_lock);
	err = add_eventfd(lg, addr, fd);
	mutex_unlock(&lguest_lock);
@@ -106,6 +182,10 @@ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input)
	if (irq >= LGUEST_IRQS)
		return -EINVAL;
 
+	/*
+	 * Next time the Guest runs, the core code will see if it can deliver
+	 * this interrupt.
+	 */
	set_interrupt(cpu, irq);
	return 0;
 }
@@ -307,10 +387,10 @@ unlock:
 * The first operation the Launcher does must be a write.  All writes
 * start with an unsigned long number: for the first write this must be
 * LHREQ_INITIALIZE to set up the Guest.  After that the Launcher can use
- * writes of other values to send interrupts.
+ * writes of other values to send interrupts or set up receipt of notifications.
 *
 * Note that we overload the "offset" in the /dev/lguest file to indicate what
- * CPU number we're dealing with.  Currently this is always 0, since we only
+ * CPU number we're dealing with.  Currently this is always 0 since we only
 * support uniprocessor Guests, but you can see the beginnings of SMP support
 * here.
 */
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index 3da902e4b4cb..a8d0aee3bc0e 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c | |||
@@ -29,10 +29,10 @@ | |||
29 | /*H:300 | 29 | /*H:300 |
30 | * The Page Table Code | 30 | * The Page Table Code |
31 | * | 31 | * |
32 | * We use two-level page tables for the Guest. If you're not entirely | 32 | * We use two-level page tables for the Guest, or three-level with PAE. If |
33 | * comfortable with virtual addresses, physical addresses and page tables then | 33 | * you're not entirely comfortable with virtual addresses, physical addresses |
34 | * I recommend you review arch/x86/lguest/boot.c's "Page Table Handling" (with | 34 | * and page tables then I recommend you review arch/x86/lguest/boot.c's "Page |
35 | * diagrams!). | 35 | * Table Handling" (with diagrams!). |
36 | * | 36 | * |
37 | * The Guest keeps page tables, but we maintain the actual ones here: these are | 37 | * The Guest keeps page tables, but we maintain the actual ones here: these are |
38 | * called "shadow" page tables. Which is a very Guest-centric name: these are | 38 | * called "shadow" page tables. Which is a very Guest-centric name: these are |
@@ -52,9 +52,8 @@ | |||
52 | :*/ | 52 | :*/ |
53 | 53 | ||
54 | /* | 54 | /* |
55 | * 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is | 55 | * The Switcher uses the complete top PTE page. That's 1024 PTE entries (4MB) |
56 | * conveniently placed at the top 4MB, so it uses a separate, complete PTE | 56 | * or 512 PTE entries with PAE (2MB). |
57 | * page. | ||
58 | */ | 57 | */ |
59 | #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) | 58 | #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) |
60 | 59 | ||
@@ -81,7 +80,8 @@ static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); | |||
81 | 80 | ||
82 | /*H:320 | 81 | /*H:320 |
83 | * The page table code is curly enough to need helper functions to keep it | 82 | * The page table code is curly enough to need helper functions to keep it |
84 | * clear and clean. | 83 | * clear and clean. The kernel itself provides many of them; one advantage |
84 | * of insisting that the Guest and Host use the same CONFIG_PAE setting. | ||
85 | * | 85 | * |
86 | * There are two functions which return pointers to the shadow (aka "real") | 86 | * There are two functions which return pointers to the shadow (aka "real") |
87 | * page tables. | 87 | * page tables. |
@@ -155,7 +155,7 @@ static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) | |||
155 | } | 155 | } |
156 | 156 | ||
157 | /* | 157 | /* |
158 | * These two functions just like the above two, except they access the Guest | 158 | * These functions are just like the above two, except they access the Guest |
159 | * page tables. Hence they return a Guest address. | 159 | * page tables. Hence they return a Guest address. |
160 | */ | 160 | */ |
161 | static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) | 161 | static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) |
@@ -165,6 +165,7 @@ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) | |||
165 | } | 165 | } |
166 | 166 | ||
167 | #ifdef CONFIG_X86_PAE | 167 | #ifdef CONFIG_X86_PAE |
168 | /* Follow the PGD to the PMD. */ | ||
168 | static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) | 169 | static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) |
169 | { | 170 | { |
170 | unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; | 171 | unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; |
@@ -172,6 +173,7 @@ static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) | |||
172 | return gpage + pmd_index(vaddr) * sizeof(pmd_t); | 173 | return gpage + pmd_index(vaddr) * sizeof(pmd_t); |
173 | } | 174 | } |
174 | 175 | ||
176 | /* Follow the PMD to the PTE. */ | ||
175 | static unsigned long gpte_addr(struct lg_cpu *cpu, | 177 | static unsigned long gpte_addr(struct lg_cpu *cpu, |
176 | pmd_t gpmd, unsigned long vaddr) | 178 | pmd_t gpmd, unsigned long vaddr) |
177 | { | 179 | { |
@@ -181,6 +183,7 @@ static unsigned long gpte_addr(struct lg_cpu *cpu, | |||
181 | return gpage + pte_index(vaddr) * sizeof(pte_t); | 183 | return gpage + pte_index(vaddr) * sizeof(pte_t); |
182 | } | 184 | } |
183 | #else | 185 | #else |
186 | /* Follow the PGD to the PTE (no mid-level for !PAE). */ | ||
184 | static unsigned long gpte_addr(struct lg_cpu *cpu, | 187 | static unsigned long gpte_addr(struct lg_cpu *cpu, |
185 | pgd_t gpgd, unsigned long vaddr) | 188 | pgd_t gpgd, unsigned long vaddr) |
186 | { | 189 | { |
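Both versions are plain index arithmetic. As a hedged illustration, here is the non-PAE walk as a standalone sketch with invented sketch_ names; the real functions use pgd_index(), pmd_index() and pte_index(), and the values themselves are fetched through lgread():

typedef unsigned long addr_t;

#define SKETCH_PAGE_SHIFT   12
#define SKETCH_PTRS_PER_PTE 1024

/* Where the Guest's PGD entry for vaddr lives: top 10 bits, 4-byte entries. */
static addr_t sketch_gpgd_addr(addr_t gpgdir, addr_t vaddr)
{
        return gpgdir + (vaddr >> 22) * 4;
}

/* Where the Guest's PTE for vaddr lives inside the PTE page at gpage. */
static addr_t sketch_gpte_addr(addr_t gpage, addr_t vaddr)
{
        return gpage
               + ((vaddr >> SKETCH_PAGE_SHIFT) & (SKETCH_PTRS_PER_PTE - 1)) * 4;
}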
@@ -314,6 +317,7 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | |||
314 | pte_t gpte; | 317 | pte_t gpte; |
315 | pte_t *spte; | 318 | pte_t *spte; |
316 | 319 | ||
320 | /* Mid level for PAE. */ | ||
317 | #ifdef CONFIG_X86_PAE | 321 | #ifdef CONFIG_X86_PAE |
318 | pmd_t *spmd; | 322 | pmd_t *spmd; |
319 | pmd_t gpmd; | 323 | pmd_t gpmd; |
@@ -391,6 +395,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | |||
391 | */ | 395 | */ |
392 | gpte_ptr = gpte_addr(cpu, gpgd, vaddr); | 396 | gpte_ptr = gpte_addr(cpu, gpgd, vaddr); |
393 | #endif | 397 | #endif |
398 | |||
399 | /* Read the actual PTE value. */ | ||
394 | gpte = lgread(cpu, gpte_ptr, pte_t); | 400 | gpte = lgread(cpu, gpte_ptr, pte_t); |
395 | 401 | ||
396 | /* If this page isn't in the Guest page tables, we can't page it in. */ | 402 | /* If this page isn't in the Guest page tables, we can't page it in. */ |
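Once the PTE is in hand, demand_page() decides whether the fault is really the Guest's own problem. The checks have the shape of this standalone sketch (the flag values are the real x86 ones; the fault error code carries present, write and user bits):

#include <stdbool.h>

#define _PAGE_PRESENT   0x001
#define _PAGE_RW        0x002
#define _PAGE_USER      0x004

static bool sketch_gpte_allows(unsigned long gpte_flags, int errcode)
{
        /* Not in the Guest's tables at all: reflect the fault back. */
        if (!(gpte_flags & _PAGE_PRESENT))
                return false;
        /* Write fault to a read-only page: also the Guest's problem. */
        if ((errcode & 2) && !(gpte_flags & _PAGE_RW))
                return false;
        /* Userspace access to a kernel-only page: likewise. */
        if ((errcode & 4) && !(gpte_flags & _PAGE_USER))
                return false;
        return true;
}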
@@ -507,6 +513,7 @@ void pin_page(struct lg_cpu *cpu, unsigned long vaddr) | |||
507 | if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2)) | 513 | if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2)) |
508 | kill_guest(cpu, "bad stack page %#lx", vaddr); | 514 | kill_guest(cpu, "bad stack page %#lx", vaddr); |
509 | } | 515 | } |
516 | /*:*/ | ||
510 | 517 | ||
511 | #ifdef CONFIG_X86_PAE | 518 | #ifdef CONFIG_X86_PAE |
512 | static void release_pmd(pmd_t *spmd) | 519 | static void release_pmd(pmd_t *spmd) |
@@ -543,7 +550,11 @@ static void release_pgd(pgd_t *spgd) | |||
543 | } | 550 | } |
544 | 551 | ||
545 | #else /* !CONFIG_X86_PAE */ | 552 | #else /* !CONFIG_X86_PAE */ |
546 | /*H:450 If we chase down the release_pgd() code, it looks like this: */ | 553 | /*H:450 |
554 | * If we chase down the release_pgd() code, the non-PAE version looks like | ||
555 | * this. The PAE version is almost identical, but instead of calling | ||
556 | * release_pte it calls release_pmd(), which looks much like this. | ||
557 | */ | ||
547 | static void release_pgd(pgd_t *spgd) | 558 | static void release_pgd(pgd_t *spgd) |
548 | { | 559 | { |
549 | /* If the entry's not present, there's nothing to release. */ | 560 | /* If the entry's not present, there's nothing to release. */ |
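The release pattern itself boils down to this standalone sketch (invented sketch_ names; the low bit stands in for _PAGE_PRESENT, and the real code also walks the child entries dropping page references before freeing):

#include <stdlib.h>

struct sketch_entry { unsigned long val; };     /* bit 0 == present */

static void sketch_release(struct sketch_entry *entry)
{
        if (entry->val & 1) {
                /* Recover the child page pointer stored in the entry... */
                void *child = (void *)(entry->val & ~1UL);
                /* ...free it, and mark the entry not-present. */
                free(child);
                entry->val = 0;
        }
}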
@@ -898,17 +909,21 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx) | |||
898 | /* ... throw it away. */ | 909 | /* ... throw it away. */ |
899 | release_pgd(lg->pgdirs[pgdir].pgdir + idx); | 910 | release_pgd(lg->pgdirs[pgdir].pgdir + idx); |
900 | } | 911 | } |
912 | |||
901 | #ifdef CONFIG_X86_PAE | 913 | #ifdef CONFIG_X86_PAE |
914 | /* For setting a mid-level, we just throw everything away. It's easy. */ | ||
902 | void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) | 915 | void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) |
903 | { | 916 | { |
904 | guest_pagetable_clear_all(&lg->cpus[0]); | 917 | guest_pagetable_clear_all(&lg->cpus[0]); |
905 | } | 918 | } |
906 | #endif | 919 | #endif |
907 | 920 | ||
908 | /* | 921 | /*H:505 |
909 | * Once we know how much memory we have we can construct simple identity (which | 922 | * To get through boot, we construct simple identity page mappings (which |
910 | * set virtual == physical) and linear mappings which will get the Guest far | 923 | * set virtual == physical) and linear mappings which will get the Guest far |
911 | * enough into the boot to create its own. | 924 | * enough into the boot to create its own. The linear mapping |
925 | * simplifies the Guest's boot, but it makes assumptions about its PAGE_OFFSET, ||
926 | * as you'll see. ||
912 | * | 927 | * |
913 | * We lay them out of the way, just below the initrd (which is why we need to | 928 | * We lay them out of the way, just below the initrd (which is why we need to |
914 | * know its size here). | 929 | * know its size here). |
@@ -944,6 +959,10 @@ static unsigned long setup_pagetables(struct lguest *lg, | |||
944 | linear = (void *)pgdir - linear_pages * PAGE_SIZE; | 959 | linear = (void *)pgdir - linear_pages * PAGE_SIZE; |
945 | 960 | ||
946 | #ifdef CONFIG_X86_PAE | 961 | #ifdef CONFIG_X86_PAE |
962 | /* | ||
963 | * And the single mid page goes below that. We only use one, but | ||
964 | * that's enough to map 1G, which definitely gets us through boot. | ||
965 | */ | ||
947 | pmds = (void *)linear - PAGE_SIZE; | 966 | pmds = (void *)linear - PAGE_SIZE; |
948 | #endif | 967 | #endif |
949 | /* | 968 | /* |
@@ -957,13 +976,14 @@ static unsigned long setup_pagetables(struct lguest *lg, | |||
957 | return -EFAULT; | 976 | return -EFAULT; |
958 | } | 977 | } |
959 | 978 | ||
979 | #ifdef CONFIG_X86_PAE | ||
960 | /* | 980 | /* |
961 | * The top level points to the linear page table pages above. | 981 | * Make the Guest PMD entries point to the corresponding place in the |
962 | * We setup the identity and linear mappings here. | 982 | * linear mapping (up to one page worth of PMD). |
963 | */ | 983 | */ |
964 | #ifdef CONFIG_X86_PAE | ||
965 | for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD; | 984 | for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD; |
966 | i += PTRS_PER_PTE, j++) { | 985 | i += PTRS_PER_PTE, j++) { |
986 | /* FIXME: native_set_pmd is overkill here. */ | ||
967 | native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i) | 987 | native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i) |
968 | - mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); | 988 | - mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); |
969 | 989 | ||
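A quick standalone check of the "one mid page maps 1G" claim above: a PAE PMD page holds 512 eight-byte entries, and each entry covers one PTE page's worth of memory, 512 PTEs of 4K each:

#include <assert.h>

int main(void)
{
        unsigned long pmd_entries = 4096 / 8;           /* 512 */
        unsigned long per_entry = 512 * 4096UL;         /* 2MB per PMD entry */

        assert(pmd_entries * per_entry == 1UL << 30);   /* 1G per PMD page */
        return 0;
}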
@@ -971,18 +991,36 @@ static unsigned long setup_pagetables(struct lguest *lg, | |||
971 | return -EFAULT; | 991 | return -EFAULT; |
972 | } | 992 | } |
973 | 993 | ||
994 | /* One PGD entry, pointing to that PMD page. */ | ||
974 | set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT)); | 995 | set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT)); |
996 | /* Copy it in as the first PGD entry (ie. addresses 0-1G). */ | ||
975 | if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0) | 997 | if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0) |
976 | return -EFAULT; | 998 | return -EFAULT; |
999 | /* | ||
1000 | * And the third PGD entry (ie. addresses 3G-4G). | ||
1001 | * | ||
1002 | * FIXME: This assumes that PAGE_OFFSET for the Guest is 0xC0000000. | ||
1003 | */ | ||
977 | if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0) | 1004 | if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0) |
978 | return -EFAULT; | 1005 | return -EFAULT; |
979 | #else | 1006 | #else |
1007 | /* | ||
1008 | * The top level points to the linear page table pages above. | ||
1009 | * We setup the identity and linear mappings here. | ||
1010 | */ | ||
980 | phys_linear = (unsigned long)linear - mem_base; | 1011 | phys_linear = (unsigned long)linear - mem_base; |
981 | for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) { | 1012 | for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) { |
982 | pgd_t pgd; | 1013 | pgd_t pgd; |
1014 | /* | ||
1015 | * Create a PGD entry which points to the right part of the | ||
1016 | * linear PTE pages. | ||
1017 | */ | ||
983 | pgd = __pgd((phys_linear + i * sizeof(pte_t)) | | 1018 | pgd = __pgd((phys_linear + i * sizeof(pte_t)) | |
984 | (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); | 1019 | (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); |
985 | 1020 | ||
1021 | /* | ||
1022 | * Copy it into the PGD page at 0 and PAGE_OFFSET. | ||
1023 | */ | ||
986 | if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd)) | 1024 | if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd)) |
987 | || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET) | 1025 | || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET) |
988 | + i / PTRS_PER_PTE], | 1026 | + i / PTRS_PER_PTE], |
@@ -992,8 +1030,8 @@ static unsigned long setup_pagetables(struct lguest *lg, | |||
992 | #endif | 1030 | #endif |
993 | 1031 | ||
994 | /* | 1032 | /* |
995 | * We return the top level (guest-physical) address: remember where | 1033 | * We return the top level (guest-physical) address: we remember where |
996 | * this is. | 1034 | * this is to write it into lguest_data when the Guest initializes. |
997 | */ | 1035 | */ |
998 | return (unsigned long)pgdir - mem_base; | 1036 | return (unsigned long)pgdir - mem_base; |
999 | } | 1037 | } |
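And a standalone check of the PGD indices used above: with PAE the top level holds only 4 entries, each covering 1G, so PAGE_OFFSET at 0xC0000000 lands in entry 3; without PAE there are 1024 entries of 4MB each, which puts it at entry 768 (what pgd_index(PAGE_OFFSET) works out to):

#include <assert.h>

int main(void)
{
        unsigned long page_offset = 0xC0000000UL;

        assert(page_offset / (1UL << 30) == 3); /* PAE: 4 x 1G entries */
        assert((page_offset >> 22) == 768);     /* non-PAE: 1024 x 4MB entries */
        return 0;
}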
@@ -1031,7 +1069,9 @@ int init_guest_pagetable(struct lguest *lg) | |||
1031 | lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); | 1069 | lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); |
1032 | if (!lg->pgdirs[0].pgdir) | 1070 | if (!lg->pgdirs[0].pgdir) |
1033 | return -ENOMEM; | 1071 | return -ENOMEM; |
1072 | |||
1034 | #ifdef CONFIG_X86_PAE | 1073 | #ifdef CONFIG_X86_PAE |
1074 | /* For PAE, we also create the initial mid-level. */ | ||
1035 | pgd = lg->pgdirs[0].pgdir; | 1075 | pgd = lg->pgdirs[0].pgdir; |
1036 | pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL); | 1076 | pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL); |
1037 | if (!pmd_table) | 1077 | if (!pmd_table) |
@@ -1040,11 +1080,13 @@ int init_guest_pagetable(struct lguest *lg) | |||
1040 | set_pgd(pgd + SWITCHER_PGD_INDEX, | 1080 | set_pgd(pgd + SWITCHER_PGD_INDEX, |
1041 | __pgd(__pa(pmd_table) | _PAGE_PRESENT)); | 1081 | __pgd(__pa(pmd_table) | _PAGE_PRESENT)); |
1042 | #endif | 1082 | #endif |
1083 | |||
1084 | /* This is the current page table. */ | ||
1043 | lg->cpus[0].cpu_pgd = 0; | 1085 | lg->cpus[0].cpu_pgd = 0; |
1044 | return 0; | 1086 | return 0; |
1045 | } | 1087 | } |
1046 | 1088 | ||
1047 | /* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ | 1089 | /*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ |
1048 | void page_table_guest_data_init(struct lg_cpu *cpu) | 1090 | void page_table_guest_data_init(struct lg_cpu *cpu) |
1049 | { | 1091 | { |
1050 | /* We get the kernel address: above this is all kernel memory. */ | 1092 | /* We get the kernel address: above this is all kernel memory. */ |
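Since the Switcher owns the top virtual megabytes, one sanity check this setup step implies is that the Guest's idea of kernel memory leaves that region alone. A hedged sketch with invented names; the real checks live in page_table_guest_data_init() and differ in detail:

#include <stdbool.h>

/* Invented stand-in for the non-PAE Switcher base at the top of memory. */
#define SKETCH_SWITCHER_ADDR 0xFFC00000UL

static bool sketch_kernel_address_ok(unsigned long kernel_address)
{
        /* Everything above kernel_address is kernel memory; it must
         * still stop short of the Switcher's slot. */
        return kernel_address < SKETCH_SWITCHER_ADDR;
}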
@@ -1105,12 +1147,16 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) | |||
1105 | pmd_t switcher_pmd; | 1147 | pmd_t switcher_pmd; |
1106 | pmd_t *pmd_table; | 1148 | pmd_t *pmd_table; |
1107 | 1149 | ||
1150 | /* FIXME: native_set_pmd is overkill here. */ | ||
1108 | native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >> | 1151 | native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >> |
1109 | PAGE_SHIFT, PAGE_KERNEL_EXEC)); | 1152 | PAGE_SHIFT, PAGE_KERNEL_EXEC)); |
1110 | 1153 | ||
1154 | /* Figure out where the pmd page is by reading the PGD and converting ||
1155 | * it to a virtual address. */ | ||
1111 | pmd_table = __va(pgd_pfn(cpu->lg-> | 1156 | pmd_table = __va(pgd_pfn(cpu->lg-> |
1112 | pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX]) | 1157 | pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX]) |
1113 | << PAGE_SHIFT); | 1158 | << PAGE_SHIFT); |
1159 | /* Now write it into the shadow page table. */ | ||
1114 | native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd); | 1160 | native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd); |
1115 | #else | 1161 | #else |
1116 | pgd_t switcher_pgd; | 1162 | pgd_t switcher_pgd; |
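The conversions in that snippet are worth spelling out: a table entry holds a page frame number, shifting by PAGE_SHIFT turns that into a physical address, and __va() adds the Host's linear offset to reach a virtual address. A standalone sketch, with a made-up offset standing in for __va():

#include <assert.h>

#define SKETCH_PAGE_SHIFT  12
#define SKETCH_VA_OFFSET   0xC0000000UL  /* stand-in for the Host's __va() base */

int main(void)
{
        unsigned long pfn = 0x1234;
        unsigned long phys = pfn << SKETCH_PAGE_SHIFT;

        assert(phys == 0x1234000UL);
        assert(phys + SKETCH_VA_OFFSET == 0xC1234000UL);
        return 0;
}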
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 96f7d88ec7f8..6ae388849a3b 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c | |||
@@ -187,7 +187,7 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages) | |||
187 | * also simplify copy_in_guest_info(). Note that we'd still need to restore | 187 | * also simplify copy_in_guest_info(). Note that we'd still need to restore |
188 | * things when we exit to Launcher userspace, but that's fairly easy. | 188 | * things when we exit to Launcher userspace, but that's fairly easy. |
189 | * | 189 | * |
190 | * We could also try using this hooks for PGE, but that might be too expensive. | 190 | * We could also try using these hooks for PGE, but that might be too expensive. |
191 | * | 191 | * |
192 | * The hooks were designed for KVM, but we can also put them to good use. | 192 | * The hooks were designed for KVM, but we can also put them to good use. |
193 | :*/ | 193 | :*/ |
diff --git a/drivers/lguest/x86/switcher_32.S b/drivers/lguest/x86/switcher_32.S index 6dec09793836..40634b0db9f7 100644 --- a/drivers/lguest/x86/switcher_32.S +++ b/drivers/lguest/x86/switcher_32.S | |||
@@ -1,7 +1,7 @@ | |||
1 | /*P:900 | 1 | /*P:900 |
2 | * This is the Switcher: code which sits at 0xFFC00000 astride both the | 2 | * This is the Switcher: code which sits at 0xFFC00000 (or 0xFFE00000) astride |
3 | * Host and Guest to do the low-level Guest<->Host switch. It is as simple as | 3 | * both the Host and Guest to do the low-level Guest<->Host switch. It is as |
4 | * it can be made, but it's naturally very specific to x86. | 4 | * simple as it can be made, but it's naturally very specific to x86. |
5 | * | 5 | * |
6 | * You have now completed Preparation. If this has whetted your appetite; if you | 6 | * You have now completed Preparation. If this has whetted your appetite; if you |
7 | * are feeling invigorated and refreshed then the next, more challenging stage | 7 | * are feeling invigorated and refreshed then the next, more challenging stage |
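Those two bases are exactly the last 4MB and last 2MB of the 32-bit address space, matching the "complete top PTE page" arithmetic earlier. A standalone check:

#include <assert.h>

int main(void)
{
        assert(0x100000000ULL - 0xFFC00000UL == 4UL << 20); /* non-PAE: top 4MB */
        assert(0x100000000ULL - 0xFFE00000UL == 2UL << 20); /* PAE: top 2MB */
        return 0;
}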