-rw-r--r--  Documentation/lguest/Makefile          |    3
-rw-r--r--  Documentation/lguest/lguest.c          | 1008
-rw-r--r--  Documentation/lguest/lguest.txt        |    1
-rw-r--r--  arch/x86/include/asm/lguest.h          |    7
-rw-r--r--  arch/x86/include/asm/lguest_hcall.h    |   15
-rw-r--r--  arch/x86/kernel/asm-offsets_32.c       |    1
-rw-r--r--  arch/x86/lguest/Kconfig                |    1
-rw-r--r--  arch/x86/lguest/boot.c                 |  158
-rw-r--r--  arch/x86/lguest/i386_head.S            |   60
-rw-r--r--  drivers/lguest/Kconfig                 |    2
-rw-r--r--  drivers/lguest/core.c                  |   30
-rw-r--r--  drivers/lguest/hypercalls.c            |   14
-rw-r--r--  drivers/lguest/interrupts_and_traps.c  |   57
-rw-r--r--  drivers/lguest/lg.h                    |   28
-rw-r--r--  drivers/lguest/lguest_user.c           |  127
-rw-r--r--  drivers/lguest/page_tables.c           |  396
-rw-r--r--  drivers/lguest/segments.c              |    2
-rw-r--r--  fs/eventfd.c                           |    3
-rw-r--r--  include/linux/lguest.h                 |    4
-rw-r--r--  include/linux/lguest_launcher.h        |    3
-rw-r--r--  kernel/sched.c                         |    1
21 files changed, 1103 insertions, 818 deletions
diff --git a/Documentation/lguest/Makefile b/Documentation/lguest/Makefile
index 1f4f9e888bd1..28c8cdfcafd8 100644
--- a/Documentation/lguest/Makefile
+++ b/Documentation/lguest/Makefile
| @@ -1,6 +1,5 @@ | |||
| 1 | # This creates the demonstration utility "lguest" which runs a Linux guest. | 1 | # This creates the demonstration utility "lguest" which runs a Linux guest. |
| 2 | CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -I../../include -I../../arch/x86/include -U_FORTIFY_SOURCE | 2 | CFLAGS:=-m32 -Wall -Wmissing-declarations -Wmissing-prototypes -O3 -I../../include -I../../arch/x86/include -U_FORTIFY_SOURCE |
| 3 | LDLIBS:=-lz | ||
| 4 | 3 | ||
| 5 | all: lguest | 4 | all: lguest |
| 6 | 5 | ||
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index d36fcc0f2715..9ebcd6ef361b 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
| @@ -16,6 +16,7 @@ | |||
| 16 | #include <sys/types.h> | 16 | #include <sys/types.h> |
| 17 | #include <sys/stat.h> | 17 | #include <sys/stat.h> |
| 18 | #include <sys/wait.h> | 18 | #include <sys/wait.h> |
| 19 | #include <sys/eventfd.h> | ||
| 19 | #include <fcntl.h> | 20 | #include <fcntl.h> |
| 20 | #include <stdbool.h> | 21 | #include <stdbool.h> |
| 21 | #include <errno.h> | 22 | #include <errno.h> |
| @@ -59,7 +60,6 @@ typedef uint8_t u8; | |||
| 59 | /*:*/ | 60 | /*:*/ |
| 60 | 61 | ||
| 61 | #define PAGE_PRESENT 0x7 /* Present, RW, Execute */ | 62 | #define PAGE_PRESENT 0x7 /* Present, RW, Execute */ |
| 62 | #define NET_PEERNUM 1 | ||
| 63 | #define BRIDGE_PFX "bridge:" | 63 | #define BRIDGE_PFX "bridge:" |
| 64 | #ifndef SIOCBRADDIF | 64 | #ifndef SIOCBRADDIF |
| 65 | #define SIOCBRADDIF 0x89a2 /* add interface to bridge */ | 65 | #define SIOCBRADDIF 0x89a2 /* add interface to bridge */ |
| @@ -76,19 +76,12 @@ static bool verbose; | |||
| 76 | do { if (verbose) printf(args); } while(0) | 76 | do { if (verbose) printf(args); } while(0) |
| 77 | /*:*/ | 77 | /*:*/ |
| 78 | 78 | ||
| 79 | /* File descriptors for the Waker. */ | ||
| 80 | struct { | ||
| 81 | int pipe[2]; | ||
| 82 | int lguest_fd; | ||
| 83 | } waker_fds; | ||
| 84 | |||
| 85 | /* The pointer to the start of guest memory. */ | 79 | /* The pointer to the start of guest memory. */ |
| 86 | static void *guest_base; | 80 | static void *guest_base; |
| 87 | /* The maximum guest physical address allowed, and maximum possible. */ | 81 | /* The maximum guest physical address allowed, and maximum possible. */ |
| 88 | static unsigned long guest_limit, guest_max; | 82 | static unsigned long guest_limit, guest_max; |
| 89 | /* The pipe for signal hander to write to. */ | 83 | /* The /dev/lguest file descriptor. */ |
| 90 | static int timeoutpipe[2]; | 84 | static int lguest_fd; |
| 91 | static unsigned int timeout_usec = 500; | ||
| 92 | 85 | ||
| 93 | /* a per-cpu variable indicating whose vcpu is currently running */ | 86 | /* a per-cpu variable indicating whose vcpu is currently running */ |
| 94 | static unsigned int __thread cpu_id; | 87 | static unsigned int __thread cpu_id; |
| @@ -96,11 +89,6 @@ static unsigned int __thread cpu_id; | |||
| 96 | /* This is our list of devices. */ | 89 | /* This is our list of devices. */ |
| 97 | struct device_list | 90 | struct device_list |
| 98 | { | 91 | { |
| 99 | /* Summary information about the devices in our list: ready to pass to | ||
| 100 | * select() to ask which need servicing.*/ | ||
| 101 | fd_set infds; | ||
| 102 | int max_infd; | ||
| 103 | |||
| 104 | /* Counter to assign interrupt numbers. */ | 92 | /* Counter to assign interrupt numbers. */ |
| 105 | unsigned int next_irq; | 93 | unsigned int next_irq; |
| 106 | 94 | ||
| @@ -126,22 +114,21 @@ struct device | |||
| 126 | /* The linked-list pointer. */ | 114 | /* The linked-list pointer. */ |
| 127 | struct device *next; | 115 | struct device *next; |
| 128 | 116 | ||
| 129 | /* The this device's descriptor, as mapped into the Guest. */ | 117 | /* The device's descriptor, as mapped into the Guest. */ |
| 130 | struct lguest_device_desc *desc; | 118 | struct lguest_device_desc *desc; |
| 131 | 119 | ||
| 120 | /* We can't trust desc values once Guest has booted: we use these. */ | ||
| 121 | unsigned int feature_len; | ||
| 122 | unsigned int num_vq; | ||
| 123 | |||
| 132 | /* The name of this device, for --verbose. */ | 124 | /* The name of this device, for --verbose. */ |
| 133 | const char *name; | 125 | const char *name; |
| 134 | 126 | ||
| 135 | /* If handle_input is set, it wants to be called when this file | ||
| 136 | * descriptor is ready. */ | ||
| 137 | int fd; | ||
| 138 | bool (*handle_input)(int fd, struct device *me); | ||
| 139 | |||
| 140 | /* Any queues attached to this device */ | 127 | /* Any queues attached to this device */ |
| 141 | struct virtqueue *vq; | 128 | struct virtqueue *vq; |
| 142 | 129 | ||
| 143 | /* Handle status being finalized (ie. feature bits stable). */ | 130 | /* Is it operational */ |
| 144 | void (*ready)(struct device *me); | 131 | bool running; |
| 145 | 132 | ||
| 146 | /* Device-specific data. */ | 133 | /* Device-specific data. */ |
| 147 | void *priv; | 134 | void *priv; |
| @@ -164,22 +151,28 @@ struct virtqueue | |||
| 164 | /* Last available index we saw. */ | 151 | /* Last available index we saw. */ |
| 165 | u16 last_avail_idx; | 152 | u16 last_avail_idx; |
| 166 | 153 | ||
| 167 | /* The routine to call when the Guest pings us, or timeout. */ | 154 | /* How many are used since we sent last irq? */ |
| 168 | void (*handle_output)(int fd, struct virtqueue *me, bool timeout); | 155 | unsigned int pending_used; |
| 169 | 156 | ||
| 170 | /* Outstanding buffers */ | 157 | /* Eventfd where Guest notifications arrive. */ |
| 171 | unsigned int inflight; | 158 | int eventfd; |
| 172 | 159 | ||
| 173 | /* Is this blocked awaiting a timer? */ | 160 | /* Function for the thread which is servicing this virtqueue. */ |
| 174 | bool blocked; | 161 | void (*service)(struct virtqueue *vq); |
| 162 | pid_t thread; | ||
| 175 | }; | 163 | }; |
| 176 | 164 | ||
| 177 | /* Remember the arguments to the program so we can "reboot" */ | 165 | /* Remember the arguments to the program so we can "reboot" */ |
| 178 | static char **main_args; | 166 | static char **main_args; |
| 179 | 167 | ||
| 180 | /* Since guest is UP and we don't run at the same time, we don't need barriers. | 168 | /* The original tty settings to restore on exit. */ |
| 181 | * But I include them in the code in case others copy it. */ | 169 | static struct termios orig_term; |
| 182 | #define wmb() | 170 | |
| 171 | /* We have to be careful with barriers: our devices are all run in separate | ||
| 172 | * threads and so we need to make sure that changes visible to the Guest happen | ||
| 173 | * in precise order. */ | ||
| 174 | #define wmb() __asm__ __volatile__("" : : : "memory") | ||
| 175 | #define mb() __asm__ __volatile__("" : : : "memory") | ||
| 183 | 176 | ||
| 184 | /* Convert an iovec element to the given type. | 177 | /* Convert an iovec element to the given type. |
| 185 | * | 178 | * |
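The wmb()/mb() added above are compiler-only barriers: an empty asm with a "memory" clobber stops the compiler from reordering or caching memory accesses across it, which is what keeps ring updates ordered between the Guest and the per-device threads on x86. A minimal standalone sketch of the idiom (illustrative, not part of the patch):

    #include <stdio.h>

    /* Compiler-only barrier: the "memory" clobber tells the compiler that
     * memory may change here, so loads and stores cannot be reordered or
     * cached across this point. */
    #define wmb() __asm__ __volatile__("" : : : "memory")

    static char ring[16];                 /* shared slots (think: a vring entry) */
    static volatile unsigned int idx;     /* index the other thread polls        */

    static void publish(char val)
    {
        ring[idx % sizeof(ring)] = val;   /* 1: fill the slot                */
        wmb();                            /* 2: make sure it is written...   */
        idx++;                            /* 3: ...before the index moves    */
    }

    int main(void)
    {
        publish('A');
        printf("published %u entries\n", idx);
        return 0;
    }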
| @@ -245,7 +238,7 @@ static void iov_consume(struct iovec iov[], unsigned num_iov, unsigned len) | |||
| 245 | static u8 *get_feature_bits(struct device *dev) | 238 | static u8 *get_feature_bits(struct device *dev) |
| 246 | { | 239 | { |
| 247 | return (u8 *)(dev->desc + 1) | 240 | return (u8 *)(dev->desc + 1) |
| 248 | + dev->desc->num_vq * sizeof(struct lguest_vqconfig); | 241 | + dev->num_vq * sizeof(struct lguest_vqconfig); |
| 249 | } | 242 | } |
| 250 | 243 | ||
| 251 | /*L:100 The Launcher code itself takes us out into userspace, that scary place | 244 | /*L:100 The Launcher code itself takes us out into userspace, that scary place |
| @@ -505,99 +498,19 @@ static void concat(char *dst, char *args[]) | |||
| 505 | * saw the arguments it expects when we looked at initialize() in lguest_user.c: | 498 | * saw the arguments it expects when we looked at initialize() in lguest_user.c: |
| 506 | * the base of Guest "physical" memory, the top physical page to allow and the | 499 | * the base of Guest "physical" memory, the top physical page to allow and the |
| 507 | * entry point for the Guest. */ | 500 | * entry point for the Guest. */ |
| 508 | static int tell_kernel(unsigned long start) | 501 | static void tell_kernel(unsigned long start) |
| 509 | { | 502 | { |
| 510 | unsigned long args[] = { LHREQ_INITIALIZE, | 503 | unsigned long args[] = { LHREQ_INITIALIZE, |
| 511 | (unsigned long)guest_base, | 504 | (unsigned long)guest_base, |
| 512 | guest_limit / getpagesize(), start }; | 505 | guest_limit / getpagesize(), start }; |
| 513 | int fd; | ||
| 514 | |||
| 515 | verbose("Guest: %p - %p (%#lx)\n", | 506 | verbose("Guest: %p - %p (%#lx)\n", |
| 516 | guest_base, guest_base + guest_limit, guest_limit); | 507 | guest_base, guest_base + guest_limit, guest_limit); |
| 517 | fd = open_or_die("/dev/lguest", O_RDWR); | 508 | lguest_fd = open_or_die("/dev/lguest", O_RDWR); |
| 518 | if (write(fd, args, sizeof(args)) < 0) | 509 | if (write(lguest_fd, args, sizeof(args)) < 0) |
| 519 | err(1, "Writing to /dev/lguest"); | 510 | err(1, "Writing to /dev/lguest"); |
| 520 | |||
| 521 | /* We return the /dev/lguest file descriptor to control this Guest */ | ||
| 522 | return fd; | ||
| 523 | } | 511 | } |
| 524 | /*:*/ | 512 | /*:*/ |
| 525 | 513 | ||
| 526 | static void add_device_fd(int fd) | ||
| 527 | { | ||
| 528 | FD_SET(fd, &devices.infds); | ||
| 529 | if (fd > devices.max_infd) | ||
| 530 | devices.max_infd = fd; | ||
| 531 | } | ||
| 532 | |||
| 533 | /*L:200 | ||
| 534 | * The Waker. | ||
| 535 | * | ||
| 536 | * With console, block and network devices, we can have lots of input which we | ||
| 537 | * need to process. We could try to tell the kernel what file descriptors to | ||
| 538 | * watch, but handing a file descriptor mask through to the kernel is fairly | ||
| 539 | * icky. | ||
| 540 | * | ||
| 541 | * Instead, we clone off a thread which watches the file descriptors and writes | ||
| 542 | * the LHREQ_BREAK command to the /dev/lguest file descriptor to tell the Host | ||
| 543 | * stop running the Guest. This causes the Launcher to return from the | ||
| 544 | * /dev/lguest read with -EAGAIN, where it will write to /dev/lguest to reset | ||
| 545 | * the LHREQ_BREAK and wake us up again. | ||
| 546 | * | ||
| 547 | * This, of course, is merely a different *kind* of icky. | ||
| 548 | * | ||
| 549 | * Given my well-known antipathy to threads, I'd prefer to use processes. But | ||
| 550 | * it's easier to share Guest memory with threads, and trivial to share the | ||
| 551 | * devices.infds as the Launcher changes it. | ||
| 552 | */ | ||
| 553 | static int waker(void *unused) | ||
| 554 | { | ||
| 555 | /* Close the write end of the pipe: only the Launcher has it open. */ | ||
| 556 | close(waker_fds.pipe[1]); | ||
| 557 | |||
| 558 | for (;;) { | ||
| 559 | fd_set rfds = devices.infds; | ||
| 560 | unsigned long args[] = { LHREQ_BREAK, 1 }; | ||
| 561 | unsigned int maxfd = devices.max_infd; | ||
| 562 | |||
| 563 | /* We also listen to the pipe from the Launcher. */ | ||
| 564 | FD_SET(waker_fds.pipe[0], &rfds); | ||
| 565 | if (waker_fds.pipe[0] > maxfd) | ||
| 566 | maxfd = waker_fds.pipe[0]; | ||
| 567 | |||
| 568 | /* Wait until input is ready from one of the devices. */ | ||
| 569 | select(maxfd+1, &rfds, NULL, NULL, NULL); | ||
| 570 | |||
| 571 | /* Message from Launcher? */ | ||
| 572 | if (FD_ISSET(waker_fds.pipe[0], &rfds)) { | ||
| 573 | char c; | ||
| 574 | /* If this fails, then assume Launcher has exited. | ||
| 575 | * Don't do anything on exit: we're just a thread! */ | ||
| 576 | if (read(waker_fds.pipe[0], &c, 1) != 1) | ||
| 577 | _exit(0); | ||
| 578 | continue; | ||
| 579 | } | ||
| 580 | |||
| 581 | /* Send LHREQ_BREAK command to snap the Launcher out of it. */ | ||
| 582 | pwrite(waker_fds.lguest_fd, args, sizeof(args), cpu_id); | ||
| 583 | } | ||
| 584 | return 0; | ||
| 585 | } | ||
| 586 | |||
| 587 | /* This routine just sets up a pipe to the Waker process. */ | ||
| 588 | static void setup_waker(int lguest_fd) | ||
| 589 | { | ||
| 590 | /* This pipe is closed when Launcher dies, telling Waker. */ | ||
| 591 | if (pipe(waker_fds.pipe) != 0) | ||
| 592 | err(1, "Creating pipe for Waker"); | ||
| 593 | |||
| 594 | /* Waker also needs to know the lguest fd */ | ||
| 595 | waker_fds.lguest_fd = lguest_fd; | ||
| 596 | |||
| 597 | if (clone(waker, malloc(4096) + 4096, CLONE_VM | SIGCHLD, NULL) == -1) | ||
| 598 | err(1, "Creating Waker"); | ||
| 599 | } | ||
| 600 | |||
| 601 | /* | 514 | /* |
| 602 | * Device Handling. | 515 | * Device Handling. |
| 603 | * | 516 | * |
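tell_kernel() above is the Launcher/kernel control protocol in miniature: each request is a plain write() to /dev/lguest of an array of unsigned longs whose first word is an LHREQ_* code from include/linux/lguest_launcher.h, and a successful command write returns 0. A hedged sketch of the same pattern for LHREQ_IRQ (assumes the kernel headers are on the include path, as the launcher's Makefile arranges; the interrupt number is made up):

    #include <err.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <linux/lguest_launcher.h>      /* LHREQ_* request codes */

    /* Illustrative: inject interrupt "irq" into the Guest by writing an
     * { LHREQ_IRQ, irq } command to the /dev/lguest control fd, the same
     * pattern tell_kernel() and trigger_irq() use. */
    static void send_irq(int lguest_fd, unsigned long irq)
    {
        unsigned long args[] = { LHREQ_IRQ, irq };

        if (write(lguest_fd, args, sizeof(args)) != 0)
            err(1, "Triggering irq %lu", irq);
    }

    int main(void)
    {
        int fd = open("/dev/lguest", O_RDWR);

        if (fd < 0)
            err(1, "Opening /dev/lguest");
        send_irq(fd, 1);                    /* made-up interrupt number */
        return 0;
    }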
| @@ -623,49 +536,90 @@ static void *_check_pointer(unsigned long addr, unsigned int size, | |||
| 623 | /* Each buffer in the virtqueues is actually a chain of descriptors. This | 536 | /* Each buffer in the virtqueues is actually a chain of descriptors. This |
| 624 | * function returns the next descriptor in the chain, or vq->vring.num if we're | 537 | * function returns the next descriptor in the chain, or vq->vring.num if we're |
| 625 | * at the end. */ | 538 | * at the end. */ |
| 626 | static unsigned next_desc(struct virtqueue *vq, unsigned int i) | 539 | static unsigned next_desc(struct vring_desc *desc, |
| 540 | unsigned int i, unsigned int max) | ||
| 627 | { | 541 | { |
| 628 | unsigned int next; | 542 | unsigned int next; |
| 629 | 543 | ||
| 630 | /* If this descriptor says it doesn't chain, we're done. */ | 544 | /* If this descriptor says it doesn't chain, we're done. */ |
| 631 | if (!(vq->vring.desc[i].flags & VRING_DESC_F_NEXT)) | 545 | if (!(desc[i].flags & VRING_DESC_F_NEXT)) |
| 632 | return vq->vring.num; | 546 | return max; |
| 633 | 547 | ||
| 634 | /* Check they're not leading us off end of descriptors. */ | 548 | /* Check they're not leading us off end of descriptors. */ |
| 635 | next = vq->vring.desc[i].next; | 549 | next = desc[i].next; |
| 636 | /* Make sure compiler knows to grab that: we don't want it changing! */ | 550 | /* Make sure compiler knows to grab that: we don't want it changing! */ |
| 637 | wmb(); | 551 | wmb(); |
| 638 | 552 | ||
| 639 | if (next >= vq->vring.num) | 553 | if (next >= max) |
| 640 | errx(1, "Desc next is %u", next); | 554 | errx(1, "Desc next is %u", next); |
| 641 | 555 | ||
| 642 | return next; | 556 | return next; |
| 643 | } | 557 | } |
| 644 | 558 | ||
| 559 | /* This actually sends the interrupt for this virtqueue */ | ||
| 560 | static void trigger_irq(struct virtqueue *vq) | ||
| 561 | { | ||
| 562 | unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; | ||
| 563 | |||
| 564 | /* Don't inform them if nothing used. */ | ||
| 565 | if (!vq->pending_used) | ||
| 566 | return; | ||
| 567 | vq->pending_used = 0; | ||
| 568 | |||
| 569 | /* If they don't want an interrupt, don't send one, unless empty. */ | ||
| 570 | if ((vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) | ||
| 571 | && lg_last_avail(vq) != vq->vring.avail->idx) | ||
| 572 | return; | ||
| 573 | |||
| 574 | /* Send the Guest an interrupt tell them we used something up. */ | ||
| 575 | if (write(lguest_fd, buf, sizeof(buf)) != 0) | ||
| 576 | err(1, "Triggering irq %i", vq->config.irq); | ||
| 577 | } | ||
| 578 | |||
| 645 | /* This looks in the virtqueue and for the first available buffer, and converts | 579 | /* This looks in the virtqueue and for the first available buffer, and converts |
| 646 | * it to an iovec for convenient access. Since descriptors consist of some | 580 | * it to an iovec for convenient access. Since descriptors consist of some |
| 647 | * number of output then some number of input descriptors, it's actually two | 581 | * number of output then some number of input descriptors, it's actually two |
| 648 | * iovecs, but we pack them into one and note how many of each there were. | 582 | * iovecs, but we pack them into one and note how many of each there were. |
| 649 | * | 583 | * |
| 650 | * This function returns the descriptor number found, or vq->vring.num (which | 584 | * This function returns the descriptor number found. */ |
| 651 | * is never a valid descriptor number) if none was found. */ | 585 | static unsigned wait_for_vq_desc(struct virtqueue *vq, |
| 652 | static unsigned get_vq_desc(struct virtqueue *vq, | 586 | struct iovec iov[], |
| 653 | struct iovec iov[], | 587 | unsigned int *out_num, unsigned int *in_num) |
| 654 | unsigned int *out_num, unsigned int *in_num) | ||
| 655 | { | 588 | { |
| 656 | unsigned int i, head; | 589 | unsigned int i, head, max; |
| 657 | u16 last_avail; | 590 | struct vring_desc *desc; |
| 591 | u16 last_avail = lg_last_avail(vq); | ||
| 592 | |||
| 593 | while (last_avail == vq->vring.avail->idx) { | ||
| 594 | u64 event; | ||
| 595 | |||
| 596 | /* OK, tell Guest about progress up to now. */ | ||
| 597 | trigger_irq(vq); | ||
| 598 | |||
| 599 | /* OK, now we need to know about added descriptors. */ | ||
| 600 | vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; | ||
| 601 | |||
| 602 | /* They could have slipped one in as we were doing that: make | ||
| 603 | * sure it's written, then check again. */ | ||
| 604 | mb(); | ||
| 605 | if (last_avail != vq->vring.avail->idx) { | ||
| 606 | vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; | ||
| 607 | break; | ||
| 608 | } | ||
| 609 | |||
| 610 | /* Nothing new? Wait for eventfd to tell us they refilled. */ | ||
| 611 | if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event)) | ||
| 612 | errx(1, "Event read failed?"); | ||
| 613 | |||
| 614 | /* We don't need to be notified again. */ | ||
| 615 | vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; | ||
| 616 | } | ||
| 658 | 617 | ||
| 659 | /* Check it isn't doing very strange things with descriptor numbers. */ | 618 | /* Check it isn't doing very strange things with descriptor numbers. */ |
| 660 | last_avail = lg_last_avail(vq); | ||
| 661 | if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num) | 619 | if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num) |
| 662 | errx(1, "Guest moved used index from %u to %u", | 620 | errx(1, "Guest moved used index from %u to %u", |
| 663 | last_avail, vq->vring.avail->idx); | 621 | last_avail, vq->vring.avail->idx); |
| 664 | 622 | ||
| 665 | /* If there's nothing new since last we looked, return invalid. */ | ||
| 666 | if (vq->vring.avail->idx == last_avail) | ||
| 667 | return vq->vring.num; | ||
| 668 | |||
| 669 | /* Grab the next descriptor number they're advertising, and increment | 623 | /* Grab the next descriptor number they're advertising, and increment |
| 670 | * the index we've seen. */ | 624 | * the index we've seen. */ |
| 671 | head = vq->vring.avail->ring[last_avail % vq->vring.num]; | 625 | head = vq->vring.avail->ring[last_avail % vq->vring.num]; |
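The loop above is where a virtqueue's service thread now sleeps: instead of the old select()-based Waker it blocks in read() on an eventfd that the kernel signals when the Guest does LHCALL_NOTIFY. Reading an eventfd blocks until its counter is non-zero, then returns the 8-byte value and clears it. A standalone sketch of just that blocking pattern, with the fd signalled locally so the program runs on its own:

    #include <err.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/eventfd.h>
    #include <unistd.h>

    int main(void)
    {
        uint64_t event;
        int efd = eventfd(0, 0);            /* zero-initialized counter */

        if (efd < 0)
            err(1, "Creating eventfd");

        /* Normally another thread (or lg.ko, via LHREQ_EVENTFD) signals the
         * fd; signal it ourselves so the blocking read below returns. */
        if (eventfd_write(efd, 1) != 0)
            err(1, "Signalling eventfd");

        /* This is the sleep in wait_for_vq_desc(): block until kicked. */
        if (read(efd, &event, sizeof(event)) != sizeof(event))
            errx(1, "Event read failed?");

        printf("woken, counter was %llu\n", (unsigned long long)event);
        close(efd);
        return 0;
    }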
| @@ -678,15 +632,28 @@ static unsigned get_vq_desc(struct virtqueue *vq, | |||
| 678 | /* When we start there are none of either input nor output. */ | 632 | /* When we start there are none of either input nor output. */ |
| 679 | *out_num = *in_num = 0; | 633 | *out_num = *in_num = 0; |
| 680 | 634 | ||
| 635 | max = vq->vring.num; | ||
| 636 | desc = vq->vring.desc; | ||
| 681 | i = head; | 637 | i = head; |
| 638 | |||
| 639 | /* If this is an indirect entry, then this buffer contains a descriptor | ||
| 640 | * table which we handle as if it's any normal descriptor chain. */ | ||
| 641 | if (desc[i].flags & VRING_DESC_F_INDIRECT) { | ||
| 642 | if (desc[i].len % sizeof(struct vring_desc)) | ||
| 643 | errx(1, "Invalid size for indirect buffer table"); | ||
| 644 | |||
| 645 | max = desc[i].len / sizeof(struct vring_desc); | ||
| 646 | desc = check_pointer(desc[i].addr, desc[i].len); | ||
| 647 | i = 0; | ||
| 648 | } | ||
| 649 | |||
| 682 | do { | 650 | do { |
| 683 | /* Grab the first descriptor, and check it's OK. */ | 651 | /* Grab the first descriptor, and check it's OK. */ |
| 684 | iov[*out_num + *in_num].iov_len = vq->vring.desc[i].len; | 652 | iov[*out_num + *in_num].iov_len = desc[i].len; |
| 685 | iov[*out_num + *in_num].iov_base | 653 | iov[*out_num + *in_num].iov_base |
| 686 | = check_pointer(vq->vring.desc[i].addr, | 654 | = check_pointer(desc[i].addr, desc[i].len); |
| 687 | vq->vring.desc[i].len); | ||
| 688 | /* If this is an input descriptor, increment that count. */ | 655 | /* If this is an input descriptor, increment that count. */ |
| 689 | if (vq->vring.desc[i].flags & VRING_DESC_F_WRITE) | 656 | if (desc[i].flags & VRING_DESC_F_WRITE) |
| 690 | (*in_num)++; | 657 | (*in_num)++; |
| 691 | else { | 658 | else { |
| 692 | /* If it's an output descriptor, they're all supposed | 659 | /* If it's an output descriptor, they're all supposed |
| @@ -697,11 +664,10 @@ static unsigned get_vq_desc(struct virtqueue *vq, | |||
| 697 | } | 664 | } |
| 698 | 665 | ||
| 699 | /* If we've got too many, that implies a descriptor loop. */ | 666 | /* If we've got too many, that implies a descriptor loop. */ |
| 700 | if (*out_num + *in_num > vq->vring.num) | 667 | if (*out_num + *in_num > max) |
| 701 | errx(1, "Looped descriptor"); | 668 | errx(1, "Looped descriptor"); |
| 702 | } while ((i = next_desc(vq, i)) != vq->vring.num); | 669 | } while ((i = next_desc(desc, i, max)) != max); |
| 703 | 670 | ||
| 704 | vq->inflight++; | ||
| 705 | return head; | 671 | return head; |
| 706 | } | 672 | } |
| 707 | 673 | ||
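Indirect descriptors are the new wrinkle in the walk above: a descriptor flagged VRING_DESC_F_INDIRECT points at a buffer that is itself a small descriptor table, which is then walked exactly like the main ring. A rough standalone sketch of the chain-to-iovec walk (struct layout and flag values from linux/virtio_ring.h; the check_pointer()-style validation and guest-address translation are left out):

    #include <stdio.h>
    #include <sys/uio.h>
    #include <linux/virtio_ring.h>

    /* Walk one descriptor chain into an iovec, following an indirect table
     * if the head descriptor has VRING_DESC_F_INDIRECT set.  Sketch only:
     * addresses are treated as host pointers with no validation. */
    static unsigned int chain_to_iov(struct vring_desc *desc, unsigned int max,
                                     unsigned int head, struct iovec iov[])
    {
        unsigned int i = head, n = 0;

        if (desc[i].flags & VRING_DESC_F_INDIRECT) {
            max = desc[i].len / sizeof(struct vring_desc);
            desc = (void *)(unsigned long)desc[i].addr;
            i = 0;
        }

        do {
            iov[n].iov_base = (void *)(unsigned long)desc[i].addr;
            iov[n].iov_len = desc[i].len;
            n++;
            if (!(desc[i].flags & VRING_DESC_F_NEXT))
                break;
            i = desc[i].next;
        } while (i < max);

        return n;                           /* iovec entries filled */
    }

    int main(void)
    {
        char buf[64];
        struct vring_desc table[2] = {
            { .addr = (unsigned long)buf, .len = 32,
              .flags = VRING_DESC_F_NEXT, .next = 1 },
            { .addr = (unsigned long)(buf + 32), .len = 32 },
        };
        struct iovec iov[2];

        printf("chain has %u buffers\n", chain_to_iov(table, 2, 0, iov));
        return 0;
    }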
| @@ -719,44 +685,20 @@ static void add_used(struct virtqueue *vq, unsigned int head, int len) | |||
| 719 | /* Make sure buffer is written before we update index. */ | 685 | /* Make sure buffer is written before we update index. */ |
| 720 | wmb(); | 686 | wmb(); |
| 721 | vq->vring.used->idx++; | 687 | vq->vring.used->idx++; |
| 722 | vq->inflight--; | 688 | vq->pending_used++; |
| 723 | } | ||
| 724 | |||
| 725 | /* This actually sends the interrupt for this virtqueue */ | ||
| 726 | static void trigger_irq(int fd, struct virtqueue *vq) | ||
| 727 | { | ||
| 728 | unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; | ||
| 729 | |||
| 730 | /* If they don't want an interrupt, don't send one, unless empty. */ | ||
| 731 | if ((vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) | ||
| 732 | && vq->inflight) | ||
| 733 | return; | ||
| 734 | |||
| 735 | /* Send the Guest an interrupt tell them we used something up. */ | ||
| 736 | if (write(fd, buf, sizeof(buf)) != 0) | ||
| 737 | err(1, "Triggering irq %i", vq->config.irq); | ||
| 738 | } | 689 | } |
| 739 | 690 | ||
| 740 | /* And here's the combo meal deal. Supersize me! */ | 691 | /* And here's the combo meal deal. Supersize me! */ |
| 741 | static void add_used_and_trigger(int fd, struct virtqueue *vq, | 692 | static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len) |
| 742 | unsigned int head, int len) | ||
| 743 | { | 693 | { |
| 744 | add_used(vq, head, len); | 694 | add_used(vq, head, len); |
| 745 | trigger_irq(fd, vq); | 695 | trigger_irq(vq); |
| 746 | } | 696 | } |
| 747 | 697 | ||
| 748 | /* | 698 | /* |
| 749 | * The Console | 699 | * The Console |
| 750 | * | 700 | * |
| 751 | * Here is the input terminal setting we save, and the routine to restore them | 701 | * We associate some data with the console for our exit hack. */ |
| 752 | * on exit so the user gets their terminal back. */ | ||
| 753 | static struct termios orig_term; | ||
| 754 | static void restore_term(void) | ||
| 755 | { | ||
| 756 | tcsetattr(STDIN_FILENO, TCSANOW, &orig_term); | ||
| 757 | } | ||
| 758 | |||
| 759 | /* We associate some data with the console for our exit hack. */ | ||
| 760 | struct console_abort | 702 | struct console_abort |
| 761 | { | 703 | { |
| 762 | /* How many times have they hit ^C? */ | 704 | /* How many times have they hit ^C? */ |
| @@ -766,276 +708,275 @@ struct console_abort | |||
| 766 | }; | 708 | }; |
| 767 | 709 | ||
| 768 | /* This is the routine which handles console input (ie. stdin). */ | 710 | /* This is the routine which handles console input (ie. stdin). */ |
| 769 | static bool handle_console_input(int fd, struct device *dev) | 711 | static void console_input(struct virtqueue *vq) |
| 770 | { | 712 | { |
| 771 | int len; | 713 | int len; |
| 772 | unsigned int head, in_num, out_num; | 714 | unsigned int head, in_num, out_num; |
| 773 | struct iovec iov[dev->vq->vring.num]; | 715 | struct console_abort *abort = vq->dev->priv; |
| 774 | struct console_abort *abort = dev->priv; | 716 | struct iovec iov[vq->vring.num]; |
| 775 | |||
| 776 | /* First we need a console buffer from the Guests's input virtqueue. */ | ||
| 777 | head = get_vq_desc(dev->vq, iov, &out_num, &in_num); | ||
| 778 | |||
| 779 | /* If they're not ready for input, stop listening to this file | ||
| 780 | * descriptor. We'll start again once they add an input buffer. */ | ||
| 781 | if (head == dev->vq->vring.num) | ||
| 782 | return false; | ||
| 783 | 717 | ||
| 718 | /* Make sure there's a descriptor waiting. */ | ||
| 719 | head = wait_for_vq_desc(vq, iov, &out_num, &in_num); | ||
| 784 | if (out_num) | 720 | if (out_num) |
| 785 | errx(1, "Output buffers in console in queue?"); | 721 | errx(1, "Output buffers in console in queue?"); |
| 786 | 722 | ||
| 787 | /* This is why we convert to iovecs: the readv() call uses them, and so | 723 | /* Read it in. */ |
| 788 | * it reads straight into the Guest's buffer. */ | 724 | len = readv(STDIN_FILENO, iov, in_num); |
| 789 | len = readv(dev->fd, iov, in_num); | ||
| 790 | if (len <= 0) { | 725 | if (len <= 0) { |
| 791 | /* This implies that the console is closed, is /dev/null, or | 726 | /* Ran out of input? */ |
| 792 | * something went terribly wrong. */ | ||
| 793 | warnx("Failed to get console input, ignoring console."); | 727 | warnx("Failed to get console input, ignoring console."); |
| 794 | /* Put the input terminal back. */ | 728 | /* For simplicity, dying threads kill the whole Launcher. So |
| 795 | restore_term(); | 729 | * just nap here. */ |
| 796 | /* Remove callback from input vq, so it doesn't restart us. */ | 730 | for (;;) |
| 797 | dev->vq->handle_output = NULL; | 731 | pause(); |
| 798 | /* Stop listening to this fd: don't call us again. */ | ||
| 799 | return false; | ||
| 800 | } | 732 | } |
| 801 | 733 | ||
| 802 | /* Tell the Guest about the new input. */ | 734 | add_used_and_trigger(vq, head, len); |
| 803 | add_used_and_trigger(fd, dev->vq, head, len); | ||
| 804 | 735 | ||
| 805 | /* Three ^C within one second? Exit. | 736 | /* Three ^C within one second? Exit. |
| 806 | * | 737 | * |
| 807 | * This is such a hack, but works surprisingly well. Each ^C has to be | 738 | * This is such a hack, but works surprisingly well. Each ^C has to |
| 808 | * in a buffer by itself, so they can't be too fast. But we check that | 739 | * be in a buffer by itself, so they can't be too fast. But we check |
| 809 | * we get three within about a second, so they can't be too slow. */ | 740 | * that we get three within about a second, so they can't be too |
| 810 | if (len == 1 && ((char *)iov[0].iov_base)[0] == 3) { | 741 | * slow. */ |
| 811 | if (!abort->count++) | 742 | if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) { |
| 812 | gettimeofday(&abort->start, NULL); | ||
| 813 | else if (abort->count == 3) { | ||
| 814 | struct timeval now; | ||
| 815 | gettimeofday(&now, NULL); | ||
| 816 | if (now.tv_sec <= abort->start.tv_sec+1) { | ||
| 817 | unsigned long args[] = { LHREQ_BREAK, 0 }; | ||
| 818 | /* Close the fd so Waker will know it has to | ||
| 819 | * exit. */ | ||
| 820 | close(waker_fds.pipe[1]); | ||
| 821 | /* Just in case Waker is blocked in BREAK, send | ||
| 822 | * unbreak now. */ | ||
| 823 | write(fd, args, sizeof(args)); | ||
| 824 | exit(2); | ||
| 825 | } | ||
| 826 | abort->count = 0; | ||
| 827 | } | ||
| 828 | } else | ||
| 829 | /* Any other key resets the abort counter. */ | ||
| 830 | abort->count = 0; | 743 | abort->count = 0; |
| 744 | return; | ||
| 745 | } | ||
| 831 | 746 | ||
| 832 | /* Everything went OK! */ | 747 | abort->count++; |
| 833 | return true; | 748 | if (abort->count == 1) |
| 749 | gettimeofday(&abort->start, NULL); | ||
| 750 | else if (abort->count == 3) { | ||
| 751 | struct timeval now; | ||
| 752 | gettimeofday(&now, NULL); | ||
| 753 | /* Kill all Launcher processes with SIGINT, like normal ^C */ | ||
| 754 | if (now.tv_sec <= abort->start.tv_sec+1) | ||
| 755 | kill(0, SIGINT); | ||
| 756 | abort->count = 0; | ||
| 757 | } | ||
| 834 | } | 758 | } |
| 835 | 759 | ||
| 836 | /* Handling output for console is simple: we just get all the output buffers | 760 | /* This is the routine which handles console output (ie. stdout). */ |
| 837 | * and write them to stdout. */ | 761 | static void console_output(struct virtqueue *vq) |
| 838 | static void handle_console_output(int fd, struct virtqueue *vq, bool timeout) | ||
| 839 | { | 762 | { |
| 840 | unsigned int head, out, in; | 763 | unsigned int head, out, in; |
| 841 | int len; | ||
| 842 | struct iovec iov[vq->vring.num]; | 764 | struct iovec iov[vq->vring.num]; |
| 843 | 765 | ||
| 844 | /* Keep getting output buffers from the Guest until we run out. */ | 766 | head = wait_for_vq_desc(vq, iov, &out, &in); |
| 845 | while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) { | 767 | if (in) |
| 846 | if (in) | 768 | errx(1, "Input buffers in console output queue?"); |
| 847 | errx(1, "Input buffers in output queue?"); | 769 | while (!iov_empty(iov, out)) { |
| 848 | len = writev(STDOUT_FILENO, iov, out); | 770 | int len = writev(STDOUT_FILENO, iov, out); |
| 849 | add_used_and_trigger(fd, vq, head, len); | 771 | if (len <= 0) |
| 772 | err(1, "Write to stdout gave %i", len); | ||
| 773 | iov_consume(iov, out, len); | ||
| 850 | } | 774 | } |
| 851 | } | 775 | add_used(vq, head, 0); |
| 852 | |||
| 853 | /* This is called when we no longer want to hear about Guest changes to a | ||
| 854 | * virtqueue. This is more efficient in high-traffic cases, but it means we | ||
| 855 | * have to set a timer to check if any more changes have occurred. */ | ||
| 856 | static void block_vq(struct virtqueue *vq) | ||
| 857 | { | ||
| 858 | struct itimerval itm; | ||
| 859 | |||
| 860 | vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; | ||
| 861 | vq->blocked = true; | ||
| 862 | |||
| 863 | itm.it_interval.tv_sec = 0; | ||
| 864 | itm.it_interval.tv_usec = 0; | ||
| 865 | itm.it_value.tv_sec = 0; | ||
| 866 | itm.it_value.tv_usec = timeout_usec; | ||
| 867 | |||
| 868 | setitimer(ITIMER_REAL, &itm, NULL); | ||
| 869 | } | 776 | } |
| 870 | 777 | ||
| 871 | /* | 778 | /* |
| 872 | * The Network | 779 | * The Network |
| 873 | * | 780 | * |
| 874 | * Handling output for network is also simple: we get all the output buffers | 781 | * Handling output for network is also simple: we get all the output buffers |
| 875 | * and write them (ignoring the first element) to this device's file descriptor | 782 | * and write them to /dev/net/tun. |
| 876 | * (/dev/net/tun). | ||
| 877 | */ | 783 | */ |
| 878 | static void handle_net_output(int fd, struct virtqueue *vq, bool timeout) | 784 | struct net_info { |
| 785 | int tunfd; | ||
| 786 | }; | ||
| 787 | |||
| 788 | static void net_output(struct virtqueue *vq) | ||
| 879 | { | 789 | { |
| 880 | unsigned int head, out, in, num = 0; | 790 | struct net_info *net_info = vq->dev->priv; |
| 881 | int len; | 791 | unsigned int head, out, in; |
| 882 | struct iovec iov[vq->vring.num]; | 792 | struct iovec iov[vq->vring.num]; |
| 883 | static int last_timeout_num; | ||
| 884 | |||
| 885 | /* Keep getting output buffers from the Guest until we run out. */ | ||
| 886 | while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) { | ||
| 887 | if (in) | ||
| 888 | errx(1, "Input buffers in output queue?"); | ||
| 889 | len = writev(vq->dev->fd, iov, out); | ||
| 890 | if (len < 0) | ||
| 891 | err(1, "Writing network packet to tun"); | ||
| 892 | add_used_and_trigger(fd, vq, head, len); | ||
| 893 | num++; | ||
| 894 | } | ||
| 895 | 793 | ||
| 896 | /* Block further kicks and set up a timer if we saw anything. */ | 794 | head = wait_for_vq_desc(vq, iov, &out, &in); |
| 897 | if (!timeout && num) | 795 | if (in) |
| 898 | block_vq(vq); | 796 | errx(1, "Input buffers in net output queue?"); |
| 899 | 797 | if (writev(net_info->tunfd, iov, out) < 0) | |
| 900 | /* We never quite know how long should we wait before we check the | 798 | errx(1, "Write to tun failed?"); |
| 901 | * queue again for more packets. We start at 500 microseconds, and if | 799 | add_used(vq, head, 0); |
| 902 | * we get fewer packets than last time, we assume we made the timeout | 800 | } |
| 903 | * too small and increase it by 10 microseconds. Otherwise, we drop it | 801 | |
| 904 | * by one microsecond every time. It seems to work well enough. */ | 802 | /* Will reading from this file descriptor block? */ |
| 905 | if (timeout) { | 803 | static bool will_block(int fd) |
| 906 | if (num < last_timeout_num) | 804 | { |
| 907 | timeout_usec += 10; | 805 | fd_set fdset; |
| 908 | else if (timeout_usec > 1) | 806 | struct timeval zero = { 0, 0 }; |
| 909 | timeout_usec--; | 807 | FD_ZERO(&fdset); |
| 910 | last_timeout_num = num; | 808 | FD_SET(fd, &fdset); |
| 911 | } | 809 | return select(fd+1, &fdset, NULL, NULL, &zero) != 1; |
| 912 | } | 810 | } |
| 913 | 811 | ||
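will_block() above is a zero-timeout select() probe: with a zeroed struct timeval, select() returns immediately, so anything other than 1 ready descriptor means a read would block. The same probe as a standalone program (helper name is illustrative):

    #include <stdbool.h>
    #include <stdio.h>
    #include <sys/select.h>
    #include <unistd.h>

    /* Poll one fd for readability without sleeping: a zeroed timeval makes
     * select() return at once, so "not ready" means a read would block. */
    static bool would_block(int fd)
    {
        fd_set fdset;
        struct timeval zero = { 0, 0 };

        FD_ZERO(&fdset);
        FD_SET(fd, &fdset);
        return select(fd + 1, &fdset, NULL, NULL, &zero) != 1;
    }

    int main(void)
    {
        printf("stdin %s block\n",
               would_block(STDIN_FILENO) ? "would" : "would not");
        return 0;
    }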
| 914 | /* This is where we handle a packet coming in from the tun device to our | 812 | /* This is where we handle packets coming in from the tun device to our |
| 915 | * Guest. */ | 813 | * Guest. */ |
| 916 | static bool handle_tun_input(int fd, struct device *dev) | 814 | static void net_input(struct virtqueue *vq) |
| 917 | { | 815 | { |
| 918 | unsigned int head, in_num, out_num; | ||
| 919 | int len; | 816 | int len; |
| 920 | struct iovec iov[dev->vq->vring.num]; | 817 | unsigned int head, out, in; |
| 921 | 818 | struct iovec iov[vq->vring.num]; | |
| 922 | /* First we need a network buffer from the Guests's recv virtqueue. */ | 819 | struct net_info *net_info = vq->dev->priv; |
| 923 | head = get_vq_desc(dev->vq, iov, &out_num, &in_num); | ||
| 924 | if (head == dev->vq->vring.num) { | ||
| 925 | /* Now, it's expected that if we try to send a packet too | ||
| 926 | * early, the Guest won't be ready yet. Wait until the device | ||
| 927 | * status says it's ready. */ | ||
| 928 | /* FIXME: Actually want DRIVER_ACTIVE here. */ | ||
| 929 | |||
| 930 | /* Now tell it we want to know if new things appear. */ | ||
| 931 | dev->vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; | ||
| 932 | wmb(); | ||
| 933 | |||
| 934 | /* We'll turn this back on if input buffers are registered. */ | ||
| 935 | return false; | ||
| 936 | } else if (out_num) | ||
| 937 | errx(1, "Output buffers in network recv queue?"); | ||
| 938 | |||
| 939 | /* Read the packet from the device directly into the Guest's buffer. */ | ||
| 940 | len = readv(dev->fd, iov, in_num); | ||
| 941 | if (len <= 0) | ||
| 942 | err(1, "reading network"); | ||
| 943 | 820 | ||
| 944 | /* Tell the Guest about the new packet. */ | 821 | head = wait_for_vq_desc(vq, iov, &out, &in); |
| 945 | add_used_and_trigger(fd, dev->vq, head, len); | 822 | if (out) |
| 823 | errx(1, "Output buffers in net input queue?"); | ||
| 946 | 824 | ||
| 947 | verbose("tun input packet len %i [%02x %02x] (%s)\n", len, | 825 | /* Deliver interrupt now, since we're about to sleep. */ |
| 948 | ((u8 *)iov[1].iov_base)[0], ((u8 *)iov[1].iov_base)[1], | 826 | if (vq->pending_used && will_block(net_info->tunfd)) |
| 949 | head != dev->vq->vring.num ? "sent" : "discarded"); | 827 | trigger_irq(vq); |
| 950 | 828 | ||
| 951 | /* All good. */ | 829 | len = readv(net_info->tunfd, iov, in); |
| 952 | return true; | 830 | if (len <= 0) |
| 831 | err(1, "Failed to read from tun."); | ||
| 832 | add_used(vq, head, len); | ||
| 953 | } | 833 | } |
| 954 | 834 | ||
| 955 | /*L:215 This is the callback attached to the network and console input | 835 | /* This is the helper to create threads. */ |
| 956 | * virtqueues: it ensures we try again, in case we stopped console or net | 836 | static int do_thread(void *_vq) |
| 957 | * delivery because Guest didn't have any buffers. */ | ||
| 958 | static void enable_fd(int fd, struct virtqueue *vq, bool timeout) | ||
| 959 | { | 837 | { |
| 960 | add_device_fd(vq->dev->fd); | 838 | struct virtqueue *vq = _vq; |
| 961 | /* Snap the Waker out of its select loop. */ | 839 | |
| 962 | write(waker_fds.pipe[1], "", 1); | 840 | for (;;) |
| 841 | vq->service(vq); | ||
| 842 | return 0; | ||
| 963 | } | 843 | } |
| 964 | 844 | ||
| 965 | static void net_enable_fd(int fd, struct virtqueue *vq, bool timeout) | 845 | /* When a child dies, we kill our entire process group with SIGTERM. This |
| 846 | * also has the side effect that the shell restores the console for us! */ | ||
| 847 | static void kill_launcher(int signal) | ||
| 966 | { | 848 | { |
| 967 | /* We don't need to know again when Guest refills receive buffer. */ | 849 | kill(0, SIGTERM); |
| 968 | vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; | ||
| 969 | enable_fd(fd, vq, timeout); | ||
| 970 | } | 850 | } |
| 971 | 851 | ||
| 972 | /* When the Guest tells us they updated the status field, we handle it. */ | 852 | static void reset_device(struct device *dev) |
| 973 | static void update_device_status(struct device *dev) | ||
| 974 | { | 853 | { |
| 975 | struct virtqueue *vq; | 854 | struct virtqueue *vq; |
| 976 | 855 | ||
| 977 | /* This is a reset. */ | 856 | verbose("Resetting device %s\n", dev->name); |
| 978 | if (dev->desc->status == 0) { | ||
| 979 | verbose("Resetting device %s\n", dev->name); | ||
| 980 | 857 | ||
| 981 | /* Clear any features they've acked. */ | 858 | /* Clear any features they've acked. */ |
| 982 | memset(get_feature_bits(dev) + dev->desc->feature_len, 0, | 859 | memset(get_feature_bits(dev) + dev->feature_len, 0, dev->feature_len); |
| 983 | dev->desc->feature_len); | ||
| 984 | 860 | ||
| 985 | /* Zero out the virtqueues. */ | 861 | /* We're going to be explicitly killing threads, so ignore them. */ |
| 986 | for (vq = dev->vq; vq; vq = vq->next) { | 862 | signal(SIGCHLD, SIG_IGN); |
| 987 | memset(vq->vring.desc, 0, | 863 | |
| 988 | vring_size(vq->config.num, LGUEST_VRING_ALIGN)); | 864 | /* Zero out the virtqueues, get rid of their threads */ |
| 989 | lg_last_avail(vq) = 0; | 865 | for (vq = dev->vq; vq; vq = vq->next) { |
| 866 | if (vq->thread != (pid_t)-1) { | ||
| 867 | kill(vq->thread, SIGTERM); | ||
| 868 | waitpid(vq->thread, NULL, 0); | ||
| 869 | vq->thread = (pid_t)-1; | ||
| 990 | } | 870 | } |
| 991 | } else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) { | 871 | memset(vq->vring.desc, 0, |
| 872 | vring_size(vq->config.num, LGUEST_VRING_ALIGN)); | ||
| 873 | lg_last_avail(vq) = 0; | ||
| 874 | } | ||
| 875 | dev->running = false; | ||
| 876 | |||
| 877 | /* Now we care if threads die. */ | ||
| 878 | signal(SIGCHLD, (void *)kill_launcher); | ||
| 879 | } | ||
| 880 | |||
| 881 | static void create_thread(struct virtqueue *vq) | ||
| 882 | { | ||
| 883 | /* Create stack for thread and run it. Since stack grows | ||
| 884 | * upwards, we point the stack pointer to the end of this | ||
| 885 | * region. */ | ||
| 886 | char *stack = malloc(32768); | ||
| 887 | unsigned long args[] = { LHREQ_EVENTFD, | ||
| 888 | vq->config.pfn*getpagesize(), 0 }; | ||
| 889 | |||
| 890 | /* Create a zero-initialized eventfd. */ | ||
| 891 | vq->eventfd = eventfd(0, 0); | ||
| 892 | if (vq->eventfd < 0) | ||
| 893 | err(1, "Creating eventfd"); | ||
| 894 | args[2] = vq->eventfd; | ||
| 895 | |||
| 896 | /* Attach an eventfd to this virtqueue: it will go off | ||
| 897 | * when the Guest does an LHCALL_NOTIFY for this vq. */ | ||
| 898 | if (write(lguest_fd, &args, sizeof(args)) != 0) | ||
| 899 | err(1, "Attaching eventfd"); | ||
| 900 | |||
| 901 | /* CLONE_VM: because it has to access the Guest memory, and | ||
| 902 | * SIGCHLD so we get a signal if it dies. */ | ||
| 903 | vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq); | ||
| 904 | if (vq->thread == (pid_t)-1) | ||
| 905 | err(1, "Creating clone"); | ||
| 906 | /* We close our local copy, now the child has it. */ | ||
| 907 | close(vq->eventfd); | ||
| 908 | } | ||
| 909 | |||
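create_thread() above uses raw clone() rather than pthreads: CLONE_VM shares the Launcher's address space so the child can touch Guest memory, SIGCHLD lets the parent notice if it dies, and the end of a small malloc()ed region is passed as the initial stack pointer. A standalone sketch of that pattern with a stand-in thread function:

    #define _GNU_SOURCE
    #include <err.h>
    #include <sched.h>
    #include <signal.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/types.h>
    #include <sys/wait.h>

    /* Stand-in thread body; a launcher thread would loop on vq->service(vq). */
    static int worker(void *arg)
    {
        printf("worker running, arg=%s\n", (const char *)arg);
        return 0;
    }

    int main(void)
    {
        char *stack = malloc(32768);
        pid_t child;

        if (!stack)
            err(1, "Allocating stack");

        /* CLONE_VM: share our memory; SIGCHLD: let waitpid() see the child.
         * The end of the region is passed as the initial stack pointer. */
        child = clone(worker, stack + 32768, CLONE_VM | SIGCHLD, "hello");
        if (child == (pid_t)-1)
            err(1, "Creating clone");

        waitpid(child, NULL, 0);
        free(stack);
        return 0;
    }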
| 910 | static void start_device(struct device *dev) | ||
| 911 | { | ||
| 912 | unsigned int i; | ||
| 913 | struct virtqueue *vq; | ||
| 914 | |||
| 915 | verbose("Device %s OK: offered", dev->name); | ||
| 916 | for (i = 0; i < dev->feature_len; i++) | ||
| 917 | verbose(" %02x", get_feature_bits(dev)[i]); | ||
| 918 | verbose(", accepted"); | ||
| 919 | for (i = 0; i < dev->feature_len; i++) | ||
| 920 | verbose(" %02x", get_feature_bits(dev) | ||
| 921 | [dev->feature_len+i]); | ||
| 922 | |||
| 923 | for (vq = dev->vq; vq; vq = vq->next) { | ||
| 924 | if (vq->service) | ||
| 925 | create_thread(vq); | ||
| 926 | } | ||
| 927 | dev->running = true; | ||
| 928 | } | ||
| 929 | |||
| 930 | static void cleanup_devices(void) | ||
| 931 | { | ||
| 932 | struct device *dev; | ||
| 933 | |||
| 934 | for (dev = devices.dev; dev; dev = dev->next) | ||
| 935 | reset_device(dev); | ||
| 936 | |||
| 937 | /* If we saved off the original terminal settings, restore them now. */ | ||
| 938 | if (orig_term.c_lflag & (ISIG|ICANON|ECHO)) | ||
| 939 | tcsetattr(STDIN_FILENO, TCSANOW, &orig_term); | ||
| 940 | } | ||
| 941 | |||
| 942 | /* When the Guest tells us they updated the status field, we handle it. */ | ||
| 943 | static void update_device_status(struct device *dev) | ||
| 944 | { | ||
| 945 | /* A zero status is a reset, otherwise it's a set of flags. */ | ||
| 946 | if (dev->desc->status == 0) | ||
| 947 | reset_device(dev); | ||
| 948 | else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) { | ||
| 992 | warnx("Device %s configuration FAILED", dev->name); | 949 | warnx("Device %s configuration FAILED", dev->name); |
| 950 | if (dev->running) | ||
| 951 | reset_device(dev); | ||
| 993 | } else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) { | 952 | } else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) { |
| 994 | unsigned int i; | 953 | if (!dev->running) |
| 995 | 954 | start_device(dev); | |
| 996 | verbose("Device %s OK: offered", dev->name); | ||
| 997 | for (i = 0; i < dev->desc->feature_len; i++) | ||
| 998 | verbose(" %02x", get_feature_bits(dev)[i]); | ||
| 999 | verbose(", accepted"); | ||
| 1000 | for (i = 0; i < dev->desc->feature_len; i++) | ||
| 1001 | verbose(" %02x", get_feature_bits(dev) | ||
| 1002 | [dev->desc->feature_len+i]); | ||
| 1003 | |||
| 1004 | if (dev->ready) | ||
| 1005 | dev->ready(dev); | ||
| 1006 | } | 955 | } |
| 1007 | } | 956 | } |
| 1008 | 957 | ||
| 1009 | /* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */ | 958 | /* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */ |
| 1010 | static void handle_output(int fd, unsigned long addr) | 959 | static void handle_output(unsigned long addr) |
| 1011 | { | 960 | { |
| 1012 | struct device *i; | 961 | struct device *i; |
| 1013 | struct virtqueue *vq; | ||
| 1014 | 962 | ||
| 1015 | /* Check each device and virtqueue. */ | 963 | /* Check each device. */ |
| 1016 | for (i = devices.dev; i; i = i->next) { | 964 | for (i = devices.dev; i; i = i->next) { |
| 965 | struct virtqueue *vq; | ||
| 966 | |||
| 1017 | /* Notifications to device descriptors update device status. */ | 967 | /* Notifications to device descriptors update device status. */ |
| 1018 | if (from_guest_phys(addr) == i->desc) { | 968 | if (from_guest_phys(addr) == i->desc) { |
| 1019 | update_device_status(i); | 969 | update_device_status(i); |
| 1020 | return; | 970 | return; |
| 1021 | } | 971 | } |
| 1022 | 972 | ||
| 1023 | /* Notifications to virtqueues mean output has occurred. */ | 973 | /* Devices *can* be used before status is set to DRIVER_OK. */ |
| 1024 | for (vq = i->vq; vq; vq = vq->next) { | 974 | for (vq = i->vq; vq; vq = vq->next) { |
| 1025 | if (vq->config.pfn != addr/getpagesize()) | 975 | if (addr != vq->config.pfn*getpagesize()) |
| 1026 | continue; | 976 | continue; |
| 1027 | 977 | if (i->running) | |
| 1028 | /* Guest should acknowledge (and set features!) before | 978 | errx(1, "Notification on running %s", i->name); |
| 1029 | * using the device. */ | 979 | start_device(i); |
| 1030 | if (i->desc->status == 0) { | ||
| 1031 | warnx("%s gave early output", i->name); | ||
| 1032 | return; | ||
| 1033 | } | ||
| 1034 | |||
| 1035 | if (strcmp(vq->dev->name, "console") != 0) | ||
| 1036 | verbose("Output to %s\n", vq->dev->name); | ||
| 1037 | if (vq->handle_output) | ||
| 1038 | vq->handle_output(fd, vq, false); | ||
| 1039 | return; | 980 | return; |
| 1040 | } | 981 | } |
| 1041 | } | 982 | } |
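update_device_status() above is the whole device lifecycle seen from the Launcher: a status of 0 is a reset, VIRTIO_CONFIG_S_FAILED tears down a running device, and VIRTIO_CONFIG_S_DRIVER_OK is what finally starts the service threads. A compact standalone sketch of that dispatch (constants from linux/virtio_config.h; reset()/start() are stand-ins for reset_device()/start_device()):

    #include <stdbool.h>
    #include <stdio.h>
    #include <linux/virtio_config.h>

    static bool running;

    /* Stand-ins for the launcher's reset_device()/start_device(). */
    static void reset(void) { running = false; puts("reset"); }
    static void start(void) { running = true;  puts("start"); }

    /* React to the Guest writing the virtio status byte. */
    static void status_changed(unsigned char status)
    {
        if (status == 0)
            reset();
        else if (status & VIRTIO_CONFIG_S_FAILED) {
            puts("configuration FAILED");
            if (running)
                reset();
        } else if ((status & VIRTIO_CONFIG_S_DRIVER_OK) && !running)
            start();
    }

    int main(void)
    {
        status_changed(VIRTIO_CONFIG_S_ACKNOWLEDGE | VIRTIO_CONFIG_S_DRIVER);
        status_changed(VIRTIO_CONFIG_S_ACKNOWLEDGE | VIRTIO_CONFIG_S_DRIVER
                       | VIRTIO_CONFIG_S_DRIVER_OK);
        status_changed(0);
        return 0;
    }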
| @@ -1049,71 +990,6 @@ static void handle_output(int fd, unsigned long addr) | |||
| 1049 | strnlen(from_guest_phys(addr), guest_limit - addr)); | 990 | strnlen(from_guest_phys(addr), guest_limit - addr)); |
| 1050 | } | 991 | } |
| 1051 | 992 | ||
| 1052 | static void handle_timeout(int fd) | ||
| 1053 | { | ||
| 1054 | char buf[32]; | ||
| 1055 | struct device *i; | ||
| 1056 | struct virtqueue *vq; | ||
| 1057 | |||
| 1058 | /* Clear the pipe */ | ||
| 1059 | read(timeoutpipe[0], buf, sizeof(buf)); | ||
| 1060 | |||
| 1061 | /* Check each device and virtqueue: flush blocked ones. */ | ||
| 1062 | for (i = devices.dev; i; i = i->next) { | ||
| 1063 | for (vq = i->vq; vq; vq = vq->next) { | ||
| 1064 | if (!vq->blocked) | ||
| 1065 | continue; | ||
| 1066 | |||
| 1067 | vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; | ||
| 1068 | vq->blocked = false; | ||
| 1069 | if (vq->handle_output) | ||
| 1070 | vq->handle_output(fd, vq, true); | ||
| 1071 | } | ||
| 1072 | } | ||
| 1073 | } | ||
| 1074 | |||
| 1075 | /* This is called when the Waker wakes us up: check for incoming file | ||
| 1076 | * descriptors. */ | ||
| 1077 | static void handle_input(int fd) | ||
| 1078 | { | ||
| 1079 | /* select() wants a zeroed timeval to mean "don't wait". */ | ||
| 1080 | struct timeval poll = { .tv_sec = 0, .tv_usec = 0 }; | ||
| 1081 | |||
| 1082 | for (;;) { | ||
| 1083 | struct device *i; | ||
| 1084 | fd_set fds = devices.infds; | ||
| 1085 | int num; | ||
| 1086 | |||
| 1087 | num = select(devices.max_infd+1, &fds, NULL, NULL, &poll); | ||
| 1088 | /* Could get interrupted */ | ||
| 1089 | if (num < 0) | ||
| 1090 | continue; | ||
| 1091 | /* If nothing is ready, we're done. */ | ||
| 1092 | if (num == 0) | ||
| 1093 | break; | ||
| 1094 | |||
| 1095 | /* Otherwise, call the device(s) which have readable file | ||
| 1096 | * descriptors and a method of handling them. */ | ||
| 1097 | for (i = devices.dev; i; i = i->next) { | ||
| 1098 | if (i->handle_input && FD_ISSET(i->fd, &fds)) { | ||
| 1099 | if (i->handle_input(fd, i)) | ||
| 1100 | continue; | ||
| 1101 | |||
| 1102 | /* If handle_input() returns false, it means we | ||
| 1103 | * should no longer service it. Networking and | ||
| 1104 | * console do this when there's no input | ||
| 1105 | * buffers to deliver into. Console also uses | ||
| 1106 | * it when it discovers that stdin is closed. */ | ||
| 1107 | FD_CLR(i->fd, &devices.infds); | ||
| 1108 | } | ||
| 1109 | } | ||
| 1110 | |||
| 1111 | /* Is this the timeout fd? */ | ||
| 1112 | if (FD_ISSET(timeoutpipe[0], &fds)) | ||
| 1113 | handle_timeout(fd); | ||
| 1114 | } | ||
| 1115 | } | ||
| 1116 | |||
| 1117 | /*L:190 | 993 | /*L:190 |
| 1118 | * Device Setup | 994 | * Device Setup |
| 1119 | * | 995 | * |
| @@ -1129,8 +1005,8 @@ static void handle_input(int fd) | |||
| 1129 | static u8 *device_config(const struct device *dev) | 1005 | static u8 *device_config(const struct device *dev) |
| 1130 | { | 1006 | { |
| 1131 | return (void *)(dev->desc + 1) | 1007 | return (void *)(dev->desc + 1) |
| 1132 | + dev->desc->num_vq * sizeof(struct lguest_vqconfig) | 1008 | + dev->num_vq * sizeof(struct lguest_vqconfig) |
| 1133 | + dev->desc->feature_len * 2; | 1009 | + dev->feature_len * 2; |
| 1134 | } | 1010 | } |
| 1135 | 1011 | ||
| 1136 | /* This routine allocates a new "struct lguest_device_desc" from descriptor | 1012 | /* This routine allocates a new "struct lguest_device_desc" from descriptor |
| @@ -1159,7 +1035,7 @@ static struct lguest_device_desc *new_dev_desc(u16 type) | |||
| 1159 | /* Each device descriptor is followed by the description of its virtqueues. We | 1035 | /* Each device descriptor is followed by the description of its virtqueues. We |
| 1160 | * specify how many descriptors the virtqueue is to have. */ | 1036 | * specify how many descriptors the virtqueue is to have. */ |
| 1161 | static void add_virtqueue(struct device *dev, unsigned int num_descs, | 1037 | static void add_virtqueue(struct device *dev, unsigned int num_descs, |
| 1162 | void (*handle_output)(int, struct virtqueue *, bool)) | 1038 | void (*service)(struct virtqueue *)) |
| 1163 | { | 1039 | { |
| 1164 | unsigned int pages; | 1040 | unsigned int pages; |
| 1165 | struct virtqueue **i, *vq = malloc(sizeof(*vq)); | 1041 | struct virtqueue **i, *vq = malloc(sizeof(*vq)); |
| @@ -1174,8 +1050,8 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, | |||
| 1174 | vq->next = NULL; | 1050 | vq->next = NULL; |
| 1175 | vq->last_avail_idx = 0; | 1051 | vq->last_avail_idx = 0; |
| 1176 | vq->dev = dev; | 1052 | vq->dev = dev; |
| 1177 | vq->inflight = 0; | 1053 | vq->service = service; |
| 1178 | vq->blocked = false; | 1054 | vq->thread = (pid_t)-1; |
| 1179 | 1055 | ||
| 1180 | /* Initialize the configuration. */ | 1056 | /* Initialize the configuration. */ |
| 1181 | vq->config.num = num_descs; | 1057 | vq->config.num = num_descs; |
| @@ -1191,6 +1067,7 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, | |||
| 1191 | * yet, otherwise we'd be overwriting them. */ | 1067 | * yet, otherwise we'd be overwriting them. */ |
| 1192 | assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0); | 1068 | assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0); |
| 1193 | memcpy(device_config(dev), &vq->config, sizeof(vq->config)); | 1069 | memcpy(device_config(dev), &vq->config, sizeof(vq->config)); |
| 1070 | dev->num_vq++; | ||
| 1194 | dev->desc->num_vq++; | 1071 | dev->desc->num_vq++; |
| 1195 | 1072 | ||
| 1196 | verbose("Virtqueue page %#lx\n", to_guest_phys(p)); | 1073 | verbose("Virtqueue page %#lx\n", to_guest_phys(p)); |
| @@ -1199,15 +1076,6 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, | |||
| 1199 | * second. */ | 1076 | * second. */ |
| 1200 | for (i = &dev->vq; *i; i = &(*i)->next); | 1077 | for (i = &dev->vq; *i; i = &(*i)->next); |
| 1201 | *i = vq; | 1078 | *i = vq; |
| 1202 | |||
| 1203 | /* Set the routine to call when the Guest does something to this | ||
| 1204 | * virtqueue. */ | ||
| 1205 | vq->handle_output = handle_output; | ||
| 1206 | |||
| 1207 | /* As an optimization, set the advisory "Don't Notify Me" flag if we | ||
| 1208 | * don't have a handler */ | ||
| 1209 | if (!handle_output) | ||
| 1210 | vq->vring.used->flags = VRING_USED_F_NO_NOTIFY; | ||
| 1211 | } | 1079 | } |
| 1212 | 1080 | ||
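add_virtqueue() above sizes each queue with vring_size() and lays the descriptor table, available ring and used ring out in one aligned block; vring_init() from linux/virtio_ring.h computes the same split. A standalone sketch of allocating and mapping a ring that way (ordinary page-aligned host memory standing in for the Guest-memory placement):

    #include <err.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>
    #include <linux/virtio_ring.h>

    int main(void)
    {
        struct vring vr;
        unsigned int num = 256;                 /* descriptors per queue */
        unsigned long align = getpagesize();
        size_t bytes = vring_size(num, align);  /* desc + avail + used   */
        void *p;

        /* The launcher carves this out of Guest memory; plain page-aligned
         * host memory is enough to show the layout. */
        if (posix_memalign(&p, align, bytes) != 0)
            err(1, "Allocating virtqueue");
        memset(p, 0, bytes);

        vring_init(&vr, num, p, align);
        printf("%u-entry ring needs %zu bytes; used ring at offset %zu\n",
               num, bytes, (size_t)((char *)vr.used - (char *)p));
        return 0;
    }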
| 1213 | /* The first half of the feature bitmask is for us to advertise features. The | 1081 | /* The first half of the feature bitmask is for us to advertise features. The |
| @@ -1219,7 +1087,7 @@ static void add_feature(struct device *dev, unsigned bit) | |||
| 1219 | /* We can't extend the feature bits once we've added config bytes */ | 1087 | /* We can't extend the feature bits once we've added config bytes */ |
| 1220 | if (dev->desc->feature_len <= bit / CHAR_BIT) { | 1088 | if (dev->desc->feature_len <= bit / CHAR_BIT) { |
| 1221 | assert(dev->desc->config_len == 0); | 1089 | assert(dev->desc->config_len == 0); |
| 1222 | dev->desc->feature_len = (bit / CHAR_BIT) + 1; | 1090 | dev->feature_len = dev->desc->feature_len = (bit/CHAR_BIT) + 1; |
| 1223 | } | 1091 | } |
| 1224 | 1092 | ||
| 1225 | features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT)); | 1093 | features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT)); |
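add_feature() above keeps feature bits in a plain byte array (host-offered bits first, Guest-acked bits right after), so bit n lives in byte n/CHAR_BIT. The same set/test arithmetic as a tiny standalone sketch:

    #include <limits.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    static void set_bit(unsigned char *features, unsigned int bit)
    {
        features[bit / CHAR_BIT] |= 1 << (bit % CHAR_BIT);
    }

    static bool test_bit(const unsigned char *features, unsigned int bit)
    {
        return features[bit / CHAR_BIT] & (1 << (bit % CHAR_BIT));
    }

    int main(void)
    {
        unsigned char offered[4];               /* room for 32 feature bits */

        memset(offered, 0, sizeof(offered));
        set_bit(offered, 7);                    /* an example bit number */
        printf("bit 7 %s, bit 8 %s\n",
               test_bit(offered, 7) ? "set" : "clear",
               test_bit(offered, 8) ? "set" : "clear");
        return 0;
    }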
| @@ -1243,22 +1111,17 @@ static void set_config(struct device *dev, unsigned len, const void *conf) | |||
| 1243 | * calling new_dev_desc() to allocate the descriptor and device memory. | 1111 | * calling new_dev_desc() to allocate the descriptor and device memory. |
| 1244 | * | 1112 | * |
| 1245 | * See what I mean about userspace being boring? */ | 1113 | * See what I mean about userspace being boring? */ |
| 1246 | static struct device *new_device(const char *name, u16 type, int fd, | 1114 | static struct device *new_device(const char *name, u16 type) |
| 1247 | bool (*handle_input)(int, struct device *)) | ||
| 1248 | { | 1115 | { |
| 1249 | struct device *dev = malloc(sizeof(*dev)); | 1116 | struct device *dev = malloc(sizeof(*dev)); |
| 1250 | 1117 | ||
| 1251 | /* Now we populate the fields one at a time. */ | 1118 | /* Now we populate the fields one at a time. */ |
| 1252 | dev->fd = fd; | ||
| 1253 | /* If we have an input handler for this file descriptor, then we add it | ||
| 1254 | * to the device_list's fdset and maxfd. */ | ||
| 1255 | if (handle_input) | ||
| 1256 | add_device_fd(dev->fd); | ||
| 1257 | dev->desc = new_dev_desc(type); | 1119 | dev->desc = new_dev_desc(type); |
| 1258 | dev->handle_input = handle_input; | ||
| 1259 | dev->name = name; | 1120 | dev->name = name; |
| 1260 | dev->vq = NULL; | 1121 | dev->vq = NULL; |
| 1261 | dev->ready = NULL; | 1122 | dev->feature_len = 0; |
| 1123 | dev->num_vq = 0; | ||
| 1124 | dev->running = false; | ||
| 1262 | 1125 | ||
| 1263 | /* Append to device list. Prepending to a single-linked list is | 1126 | /* Append to device list. Prepending to a single-linked list is |
| 1264 | * easier, but the user expects the devices to be arranged on the bus | 1127 | * easier, but the user expects the devices to be arranged on the bus |
| @@ -1286,13 +1149,10 @@ static void setup_console(void) | |||
| 1286 | * raw input stream to the Guest. */ | 1149 | * raw input stream to the Guest. */ |
| 1287 | term.c_lflag &= ~(ISIG|ICANON|ECHO); | 1150 | term.c_lflag &= ~(ISIG|ICANON|ECHO); |
| 1288 | tcsetattr(STDIN_FILENO, TCSANOW, &term); | 1151 | tcsetattr(STDIN_FILENO, TCSANOW, &term); |
| 1289 | /* If we exit gracefully, the original settings will be | ||
| 1290 | * restored so the user can see what they're typing. */ | ||
| 1291 | atexit(restore_term); | ||
| 1292 | } | 1152 | } |
| 1293 | 1153 | ||
| 1294 | dev = new_device("console", VIRTIO_ID_CONSOLE, | 1154 | dev = new_device("console", VIRTIO_ID_CONSOLE); |
| 1295 | STDIN_FILENO, handle_console_input); | 1155 | |
| 1296 | /* We store the console state in dev->priv, and initialize it. */ | 1156 | /* We store the console state in dev->priv, and initialize it. */ |
| 1297 | dev->priv = malloc(sizeof(struct console_abort)); | 1157 | dev->priv = malloc(sizeof(struct console_abort)); |
| 1298 | ((struct console_abort *)dev->priv)->count = 0; | 1158 | ((struct console_abort *)dev->priv)->count = 0; |
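setup_console() above clears ISIG, ICANON and ECHO so keystrokes reach the Guest raw, and the saved orig_term is what cleanup_devices() restores on the way out. A standalone sketch of that save/raw/restore sequence:

    #include <err.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <termios.h>
    #include <unistd.h>

    static struct termios orig_term;

    static void restore_term(void)
    {
        tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
    }

    int main(void)
    {
        struct termios term;

        if (tcgetattr(STDIN_FILENO, &orig_term) != 0)
            err(1, "Getting terminal settings");
        atexit(restore_term);

        term = orig_term;
        term.c_lflag &= ~(ISIG | ICANON | ECHO);  /* no ^C, no line buffering, no echo */
        if (tcsetattr(STDIN_FILENO, TCSANOW, &term) != 0)
            err(1, "Setting raw terminal mode");

        puts("terminal is raw; press a key");
        getchar();
        return 0;       /* the atexit() handler puts the terminal back */
    }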
| @@ -1301,31 +1161,13 @@ static void setup_console(void) | |||
| 1301 | * they put something the input queue, we make sure we're listening to | 1161 | * they put something the input queue, we make sure we're listening to |
| 1302 | * stdin. When they put something in the output queue, we write it to | 1162 | * stdin. When they put something in the output queue, we write it to |
| 1303 | * stdout. */ | 1163 | * stdout. */ |
| 1304 | add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd); | 1164 | add_virtqueue(dev, VIRTQUEUE_NUM, console_input); |
| 1305 | add_virtqueue(dev, VIRTQUEUE_NUM, handle_console_output); | 1165 | add_virtqueue(dev, VIRTQUEUE_NUM, console_output); |
| 1306 | 1166 | ||
| 1307 | verbose("device %u: console\n", devices.device_num++); | 1167 | verbose("device %u: console\n", ++devices.device_num); |
| 1308 | } | 1168 | } |
| 1309 | /*:*/ | 1169 | /*:*/ |
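In the rewritten Launcher every virtqueue gets its own service thread, so console_input and console_output (not shown in this hunk) simply block until there is work. A rough sketch of the output side, reconstructed from the surrounding code rather than quoted from the patch:

    static void console_output(struct virtqueue *vq)
    {
            unsigned int head, out, in;
            struct iovec iov[vq->vring.num];

            /* Block until the Guest kicks the queue with an output buffer. */
            head = wait_for_vq_desc(vq, iov, &out, &in);
            if (in)
                    errx(1, "Input buffers in console output queue?");

            /* Write the whole descriptor chain to stdout. */
            while (!iov_empty(iov, out)) {
                    int len = writev(STDOUT_FILENO, iov, out);
                    if (len <= 0)
                            err(1, "Write to stdout gave %i", len);
                    iov_consume(iov, out, len);
            }

            /* Hand the buffer back to the Guest; we wrote nothing into it. */
            add_used(vq, head, 0);
    }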
| 1310 | 1170 | ||
| 1311 | static void timeout_alarm(int sig) | ||
| 1312 | { | ||
| 1313 | write(timeoutpipe[1], "", 1); | ||
| 1314 | } | ||
| 1315 | |||
| 1316 | static void setup_timeout(void) | ||
| 1317 | { | ||
| 1318 | if (pipe(timeoutpipe) != 0) | ||
| 1319 | err(1, "Creating timeout pipe"); | ||
| 1320 | |||
| 1321 | if (fcntl(timeoutpipe[1], F_SETFL, | ||
| 1322 | fcntl(timeoutpipe[1], F_GETFL) | O_NONBLOCK) != 0) | ||
| 1323 | err(1, "Making timeout pipe nonblocking"); | ||
| 1324 | |||
| 1325 | add_device_fd(timeoutpipe[0]); | ||
| 1326 | signal(SIGALRM, timeout_alarm); | ||
| 1327 | } | ||
| 1328 | |||
| 1329 | /*M:010 Inter-guest networking is an interesting area. Simplest is to have a | 1171 | /*M:010 Inter-guest networking is an interesting area. Simplest is to have a |
| 1330 | * --sharenet=<name> option which opens or creates a named pipe. This can be | 1172 | * --sharenet=<name> option which opens or creates a named pipe. This can be |
| 1331 | * used to send packets to another guest in a 1:1 manner. | 1173 | * used to send packets to another guest in a 1:1 manner. |
| @@ -1447,21 +1289,23 @@ static int get_tun_device(char tapif[IFNAMSIZ]) | |||
| 1447 | static void setup_tun_net(char *arg) | 1289 | static void setup_tun_net(char *arg) |
| 1448 | { | 1290 | { |
| 1449 | struct device *dev; | 1291 | struct device *dev; |
| 1450 | int netfd, ipfd; | 1292 | struct net_info *net_info = malloc(sizeof(*net_info)); |
| 1293 | int ipfd; | ||
| 1451 | u32 ip = INADDR_ANY; | 1294 | u32 ip = INADDR_ANY; |
| 1452 | bool bridging = false; | 1295 | bool bridging = false; |
| 1453 | char tapif[IFNAMSIZ], *p; | 1296 | char tapif[IFNAMSIZ], *p; |
| 1454 | struct virtio_net_config conf; | 1297 | struct virtio_net_config conf; |
| 1455 | 1298 | ||
| 1456 | netfd = get_tun_device(tapif); | 1299 | net_info->tunfd = get_tun_device(tapif); |
| 1457 | 1300 | ||
| 1458 | /* First we create a new network device. */ | 1301 | /* First we create a new network device. */ |
| 1459 | dev = new_device("net", VIRTIO_ID_NET, netfd, handle_tun_input); | 1302 | dev = new_device("net", VIRTIO_ID_NET); |
| 1303 | dev->priv = net_info; | ||
| 1460 | 1304 | ||
| 1461 | /* Network devices need a receive and a send queue, just like | 1305 | /* Network devices need a receive and a send queue, just like |
| 1462 | * console. */ | 1306 | * console. */ |
| 1463 | add_virtqueue(dev, VIRTQUEUE_NUM, net_enable_fd); | 1307 | add_virtqueue(dev, VIRTQUEUE_NUM, net_input); |
| 1464 | add_virtqueue(dev, VIRTQUEUE_NUM, handle_net_output); | 1308 | add_virtqueue(dev, VIRTQUEUE_NUM, net_output); |
| 1465 | 1309 | ||
| 1466 | /* We need a socket to perform the magic network ioctls to bring up the | 1310 | /* We need a socket to perform the magic network ioctls to bring up the |
| 1467 | * tap interface, connect to the bridge etc. Any socket will do! */ | 1311 | * tap interface, connect to the bridge etc. Any socket will do! */ |
| @@ -1502,6 +1346,8 @@ static void setup_tun_net(char *arg) | |||
| 1502 | add_feature(dev, VIRTIO_NET_F_HOST_TSO4); | 1346 | add_feature(dev, VIRTIO_NET_F_HOST_TSO4); |
| 1503 | add_feature(dev, VIRTIO_NET_F_HOST_TSO6); | 1347 | add_feature(dev, VIRTIO_NET_F_HOST_TSO6); |
| 1504 | add_feature(dev, VIRTIO_NET_F_HOST_ECN); | 1348 | add_feature(dev, VIRTIO_NET_F_HOST_ECN); |
| 1349 | /* We handle indirect ring entries */ | ||
| 1350 | add_feature(dev, VIRTIO_RING_F_INDIRECT_DESC); | ||
| 1505 | set_config(dev, sizeof(conf), &conf); | 1351 | set_config(dev, sizeof(conf), &conf); |
| 1506 | 1352 | ||
| 1507 | /* We don't need the socket any more; setup is done. */ | 1353 | /* We don't need the socket any more; setup is done. */ |
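The VIRTIO_RING_F_INDIRECT_DESC feature added above lets the Guest place a single ring entry that points at a whole table of descriptors instead of one buffer. For orientation, this is the descriptor layout involved (as declared in linux/virtio_ring.h):

    struct vring_desc {
            __u64 addr;     /* guest-physical address of the buffer
                             * (or of the indirect descriptor table) */
            __u32 len;      /* length of the buffer (or table), in bytes */
            __u16 flags;    /* VRING_DESC_F_NEXT / _WRITE / _INDIRECT */
            __u16 next;     /* index of the next descriptor in the chain */
    };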
| @@ -1550,20 +1396,18 @@ struct vblk_info | |||
| 1550 | * Remember that the block device is handled by a separate I/O thread. We head | 1396 | * Remember that the block device is handled by a separate I/O thread. We head |
| 1551 | * straight into the core of that thread here: | 1397 | * straight into the core of that thread here: |
| 1552 | */ | 1398 | */ |
| 1553 | static bool service_io(struct device *dev) | 1399 | static void blk_request(struct virtqueue *vq) |
| 1554 | { | 1400 | { |
| 1555 | struct vblk_info *vblk = dev->priv; | 1401 | struct vblk_info *vblk = vq->dev->priv; |
| 1556 | unsigned int head, out_num, in_num, wlen; | 1402 | unsigned int head, out_num, in_num, wlen; |
| 1557 | int ret; | 1403 | int ret; |
| 1558 | u8 *in; | 1404 | u8 *in; |
| 1559 | struct virtio_blk_outhdr *out; | 1405 | struct virtio_blk_outhdr *out; |
| 1560 | struct iovec iov[dev->vq->vring.num]; | 1406 | struct iovec iov[vq->vring.num]; |
| 1561 | off64_t off; | 1407 | off64_t off; |
| 1562 | 1408 | ||
| 1563 | /* See if there's a request waiting. If not, nothing to do. */ | 1409 | /* Get the next request. */ |
| 1564 | head = get_vq_desc(dev->vq, iov, &out_num, &in_num); | 1410 | head = wait_for_vq_desc(vq, iov, &out_num, &in_num); |
| 1565 | if (head == dev->vq->vring.num) | ||
| 1566 | return false; | ||
| 1567 | 1411 | ||
| 1568 | /* Every block request should contain at least one output buffer | 1412 | /* Every block request should contain at least one output buffer |
| 1569 | * (detailing the location on disk and the type of request) and one | 1413 | * (detailing the location on disk and the type of request) and one |
| @@ -1637,83 +1481,21 @@ static bool service_io(struct device *dev) | |||
| 1637 | if (out->type & VIRTIO_BLK_T_BARRIER) | 1481 | if (out->type & VIRTIO_BLK_T_BARRIER) |
| 1638 | fdatasync(vblk->fd); | 1482 | fdatasync(vblk->fd); |
| 1639 | 1483 | ||
| 1640 | /* We can't trigger an IRQ, because we're not the Launcher. It does | 1484 | add_used(vq, head, wlen); |
| 1641 | * that when we tell it we're done. */ | ||
| 1642 | add_used(dev->vq, head, wlen); | ||
| 1643 | return true; | ||
| 1644 | } | ||
| 1645 | |||
| 1646 | /* This is the thread which actually services the I/O. */ | ||
| 1647 | static int io_thread(void *_dev) | ||
| 1648 | { | ||
| 1649 | struct device *dev = _dev; | ||
| 1650 | struct vblk_info *vblk = dev->priv; | ||
| 1651 | char c; | ||
| 1652 | |||
| 1653 | /* Close other side of workpipe so we get 0 read when main dies. */ | ||
| 1654 | close(vblk->workpipe[1]); | ||
| 1655 | /* Close the other side of the done_fd pipe. */ | ||
| 1656 | close(dev->fd); | ||
| 1657 | |||
| 1658 | /* When this read fails, it means Launcher died, so we follow. */ | ||
| 1659 | while (read(vblk->workpipe[0], &c, 1) == 1) { | ||
| 1660 | /* We acknowledge each request immediately to reduce latency, | ||
| 1661 | * rather than waiting until we've done them all. I haven't | ||
| 1662 | * measured to see if it makes any difference. | ||
| 1663 | * | ||
| 1664 | * That would be an interesting test, wouldn't it? You could | ||
| 1665 | * also try having more than one I/O thread. */ | ||
| 1666 | while (service_io(dev)) | ||
| 1667 | write(vblk->done_fd, &c, 1); | ||
| 1668 | } | ||
| 1669 | return 0; | ||
| 1670 | } | ||
| 1671 | |||
| 1672 | /* Now we've seen the I/O thread, we return to the Launcher to see what happens | ||
| 1673 | * when that thread tells us it's completed some I/O. */ | ||
| 1674 | static bool handle_io_finish(int fd, struct device *dev) | ||
| 1675 | { | ||
| 1676 | char c; | ||
| 1677 | |||
| 1678 | /* If the I/O thread died, presumably it printed the error, so we | ||
| 1679 | * simply exit. */ | ||
| 1680 | if (read(dev->fd, &c, 1) != 1) | ||
| 1681 | exit(1); | ||
| 1682 | |||
| 1683 | /* It did some work, so trigger the irq. */ | ||
| 1684 | trigger_irq(fd, dev->vq); | ||
| 1685 | return true; | ||
| 1686 | } | ||
| 1687 | |||
| 1688 | /* When the Guest submits some I/O, we just need to wake the I/O thread. */ | ||
| 1689 | static void handle_virtblk_output(int fd, struct virtqueue *vq, bool timeout) | ||
| 1690 | { | ||
| 1691 | struct vblk_info *vblk = vq->dev->priv; | ||
| 1692 | char c = 0; | ||
| 1693 | |||
| 1694 | /* Wake up I/O thread and tell it to go to work! */ | ||
| 1695 | if (write(vblk->workpipe[1], &c, 1) != 1) | ||
| 1696 | /* Presumably it indicated why it died. */ | ||
| 1697 | exit(1); | ||
| 1698 | } | 1485 | } |
| 1699 | 1486 | ||
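All the workpipe-and-clone plumbing deleted above is replaced by one service thread per virtqueue: each thread sits in a loop calling its queue's callback, and wait_for_vq_desc() blocks on the queue's eventfd until the Guest notifies it. Roughly (treat the "service" field name as following the new struct virtqueue; this is a sketch of the shape, not a quotation of the patch):

    /* Each virtqueue's thread is just this loop; blk_request, net_output and
     * friends never return until the Launcher tears the device down. */
    static int do_thread(void *_vq)
    {
            struct virtqueue *vq = _vq;

            for (;;)
                    vq->service(vq);
            return 0;
    }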
| 1700 | /*L:198 This actually sets up a virtual block device. */ | 1487 | /*L:198 This actually sets up a virtual block device. */ |
| 1701 | static void setup_block_file(const char *filename) | 1488 | static void setup_block_file(const char *filename) |
| 1702 | { | 1489 | { |
| 1703 | int p[2]; | ||
| 1704 | struct device *dev; | 1490 | struct device *dev; |
| 1705 | struct vblk_info *vblk; | 1491 | struct vblk_info *vblk; |
| 1706 | void *stack; | ||
| 1707 | struct virtio_blk_config conf; | 1492 | struct virtio_blk_config conf; |
| 1708 | 1493 | ||
| 1709 | /* This is the pipe the I/O thread will use to tell us I/O is done. */ | ||
| 1710 | pipe(p); | ||
| 1711 | |||
| 1712 | /* The device responds to return from I/O thread. */ | 1494 | /* The device responds to return from I/O thread. */ |
| 1713 | dev = new_device("block", VIRTIO_ID_BLOCK, p[0], handle_io_finish); | 1495 | dev = new_device("block", VIRTIO_ID_BLOCK); |
| 1714 | 1496 | ||
| 1715 | /* The device has one virtqueue, where the Guest places requests. */ | 1497 | /* The device has one virtqueue, where the Guest places requests. */ |
| 1716 | add_virtqueue(dev, VIRTQUEUE_NUM, handle_virtblk_output); | 1498 | add_virtqueue(dev, VIRTQUEUE_NUM, blk_request); |
| 1717 | 1499 | ||
| 1718 | /* Allocate the room for our own bookkeeping */ | 1500 | /* Allocate the room for our own bookkeeping */ |
| 1719 | vblk = dev->priv = malloc(sizeof(*vblk)); | 1501 | vblk = dev->priv = malloc(sizeof(*vblk)); |
| @@ -1735,49 +1517,29 @@ static void setup_block_file(const char *filename) | |||
| 1735 | 1517 | ||
| 1736 | set_config(dev, sizeof(conf), &conf); | 1518 | set_config(dev, sizeof(conf), &conf); |
| 1737 | 1519 | ||
| 1738 | /* The I/O thread writes to this end of the pipe when done. */ | ||
| 1739 | vblk->done_fd = p[1]; | ||
| 1740 | |||
| 1741 | /* This is the second pipe, which is how we tell the I/O thread about | ||
| 1742 | * more work. */ | ||
| 1743 | pipe(vblk->workpipe); | ||
| 1744 | |||
| 1745 | /* Create stack for thread and run it. Since stack grows upwards, we | ||
| 1746 | * point the stack pointer to the end of this region. */ | ||
| 1747 | stack = malloc(32768); | ||
| 1748 | /* SIGCHLD - We don't "wait" for our cloned thread, so prevent it from | ||
| 1749 | * becoming a zombie. */ | ||
| 1750 | if (clone(io_thread, stack + 32768, CLONE_VM | SIGCHLD, dev) == -1) | ||
| 1751 | err(1, "Creating clone"); | ||
| 1752 | |||
| 1753 | /* We don't need to keep the I/O thread's end of the pipes open. */ | ||
| 1754 | close(vblk->done_fd); | ||
| 1755 | close(vblk->workpipe[0]); | ||
| 1756 | |||
| 1757 | verbose("device %u: virtblock %llu sectors\n", | 1520 | verbose("device %u: virtblock %llu sectors\n", |
| 1758 | devices.device_num, le64_to_cpu(conf.capacity)); | 1521 | ++devices.device_num, le64_to_cpu(conf.capacity)); |
| 1759 | } | 1522 | } |
| 1760 | 1523 | ||
| 1524 | struct rng_info { | ||
| 1525 | int rfd; | ||
| 1526 | }; | ||
| 1527 | |||
| 1761 | /* Our random number generator device reads from /dev/random into the Guest's | 1528 | /* Our random number generator device reads from /dev/random into the Guest's |
| 1762 | * input buffers. The usual case is that the Guest doesn't want random numbers | 1529 | * input buffers. The usual case is that the Guest doesn't want random numbers |
| 1763 | * and so has no buffers although /dev/random is still readable, whereas | 1530 | * and so has no buffers although /dev/random is still readable, whereas |
| 1764 | * console is the reverse. | 1531 | * console is the reverse. |
| 1765 | * | 1532 | * |
| 1766 | * The same logic applies, however. */ | 1533 | * The same logic applies, however. */ |
| 1767 | static bool handle_rng_input(int fd, struct device *dev) | 1534 | static void rng_input(struct virtqueue *vq) |
| 1768 | { | 1535 | { |
| 1769 | int len; | 1536 | int len; |
| 1770 | unsigned int head, in_num, out_num, totlen = 0; | 1537 | unsigned int head, in_num, out_num, totlen = 0; |
| 1771 | struct iovec iov[dev->vq->vring.num]; | 1538 | struct rng_info *rng_info = vq->dev->priv; |
| 1539 | struct iovec iov[vq->vring.num]; | ||
| 1772 | 1540 | ||
| 1773 | /* First we need a buffer from the Guest's virtqueue. */ | 1541 | /* First we need a buffer from the Guest's virtqueue. */ |
| 1774 | head = get_vq_desc(dev->vq, iov, &out_num, &in_num); | 1542 | head = wait_for_vq_desc(vq, iov, &out_num, &in_num); |
| 1775 | |||
| 1776 | /* If they're not ready for input, stop listening to this file | ||
| 1777 | * descriptor. We'll start again once they add an input buffer. */ | ||
| 1778 | if (head == dev->vq->vring.num) | ||
| 1779 | return false; | ||
| 1780 | |||
| 1781 | if (out_num) | 1543 | if (out_num) |
| 1782 | errx(1, "Output buffers in rng?"); | 1544 | errx(1, "Output buffers in rng?"); |
| 1783 | 1545 | ||
| @@ -1785,7 +1547,7 @@ static bool handle_rng_input(int fd, struct device *dev) | |||
| 1785 | * it reads straight into the Guest's buffer. We loop to make sure we | 1547 | * it reads straight into the Guest's buffer. We loop to make sure we |
| 1786 | * fill it. */ | 1548 | * fill it. */ |
| 1787 | while (!iov_empty(iov, in_num)) { | 1549 | while (!iov_empty(iov, in_num)) { |
| 1788 | len = readv(dev->fd, iov, in_num); | 1550 | len = readv(rng_info->rfd, iov, in_num); |
| 1789 | if (len <= 0) | 1551 | if (len <= 0) |
| 1790 | err(1, "Read from /dev/random gave %i", len); | 1552 | err(1, "Read from /dev/random gave %i", len); |
| 1791 | iov_consume(iov, in_num, len); | 1553 | iov_consume(iov, in_num, len); |
| @@ -1793,25 +1555,23 @@ static bool handle_rng_input(int fd, struct device *dev) | |||
| 1793 | } | 1555 | } |
| 1794 | 1556 | ||
| 1795 | /* Tell the Guest about the new input. */ | 1557 | /* Tell the Guest about the new input. */ |
| 1796 | add_used_and_trigger(fd, dev->vq, head, totlen); | 1558 | add_used(vq, head, totlen); |
| 1797 | |||
| 1798 | /* Everything went OK! */ | ||
| 1799 | return true; | ||
| 1800 | } | 1559 | } |
| 1801 | 1560 | ||
| 1802 | /* And this creates a "hardware" random number device for the Guest. */ | 1561 | /* And this creates a "hardware" random number device for the Guest. */ |
| 1803 | static void setup_rng(void) | 1562 | static void setup_rng(void) |
| 1804 | { | 1563 | { |
| 1805 | struct device *dev; | 1564 | struct device *dev; |
| 1806 | int fd; | 1565 | struct rng_info *rng_info = malloc(sizeof(*rng_info)); |
| 1807 | 1566 | ||
| 1808 | fd = open_or_die("/dev/random", O_RDONLY); | 1567 | rng_info->rfd = open_or_die("/dev/random", O_RDONLY); |
| 1809 | 1568 | ||
| 1810 | /* The device responds to return from I/O thread. */ | 1569 | /* The device responds to return from I/O thread. */ |
| 1811 | dev = new_device("rng", VIRTIO_ID_RNG, fd, handle_rng_input); | 1570 | dev = new_device("rng", VIRTIO_ID_RNG); |
| 1571 | dev->priv = rng_info; | ||
| 1812 | 1572 | ||
| 1813 | /* The device has one virtqueue, where the Guest places inbufs. */ | 1573 | /* The device has one virtqueue, where the Guest places inbufs. */ |
| 1814 | add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd); | 1574 | add_virtqueue(dev, VIRTQUEUE_NUM, rng_input); |
| 1815 | 1575 | ||
| 1816 | verbose("device %u: rng\n", devices.device_num++); | 1576 | verbose("device %u: rng\n", devices.device_num++); |
| 1817 | } | 1577 | } |
| @@ -1827,17 +1587,18 @@ static void __attribute__((noreturn)) restart_guest(void) | |||
| 1827 | for (i = 3; i < FD_SETSIZE; i++) | 1587 | for (i = 3; i < FD_SETSIZE; i++) |
| 1828 | close(i); | 1588 | close(i); |
| 1829 | 1589 | ||
| 1830 | /* The exec automatically gets rid of the I/O and Waker threads. */ | 1590 | /* Reset all the devices (kills all threads). */ |
| 1591 | cleanup_devices(); | ||
| 1592 | |||
| 1831 | execv(main_args[0], main_args); | 1593 | execv(main_args[0], main_args); |
| 1832 | err(1, "Could not exec %s", main_args[0]); | 1594 | err(1, "Could not exec %s", main_args[0]); |
| 1833 | } | 1595 | } |
| 1834 | 1596 | ||
| 1835 | /*L:220 Finally we reach the core of the Launcher which runs the Guest, serves | 1597 | /*L:220 Finally we reach the core of the Launcher which runs the Guest, serves |
| 1836 | * its input and output, and finally, lays it to rest. */ | 1598 | * its input and output, and finally, lays it to rest. */ |
| 1837 | static void __attribute__((noreturn)) run_guest(int lguest_fd) | 1599 | static void __attribute__((noreturn)) run_guest(void) |
| 1838 | { | 1600 | { |
| 1839 | for (;;) { | 1601 | for (;;) { |
| 1840 | unsigned long args[] = { LHREQ_BREAK, 0 }; | ||
| 1841 | unsigned long notify_addr; | 1602 | unsigned long notify_addr; |
| 1842 | int readval; | 1603 | int readval; |
| 1843 | 1604 | ||
| @@ -1848,8 +1609,7 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd) | |||
| 1848 | /* One unsigned long means the Guest did HCALL_NOTIFY */ | 1609 | /* One unsigned long means the Guest did HCALL_NOTIFY */ |
| 1849 | if (readval == sizeof(notify_addr)) { | 1610 | if (readval == sizeof(notify_addr)) { |
| 1850 | verbose("Notify on address %#lx\n", notify_addr); | 1611 | verbose("Notify on address %#lx\n", notify_addr); |
| 1851 | handle_output(lguest_fd, notify_addr); | 1612 | handle_output(notify_addr); |
| 1852 | continue; | ||
| 1853 | /* ENOENT means the Guest died. Reading tells us why. */ | 1613 | /* ENOENT means the Guest died. Reading tells us why. */ |
| 1854 | } else if (errno == ENOENT) { | 1614 | } else if (errno == ENOENT) { |
| 1855 | char reason[1024] = { 0 }; | 1615 | char reason[1024] = { 0 }; |
| @@ -1858,19 +1618,9 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd) | |||
| 1858 | /* ERESTART means that we need to reboot the guest */ | 1618 | /* ERESTART means that we need to reboot the guest */ |
| 1859 | } else if (errno == ERESTART) { | 1619 | } else if (errno == ERESTART) { |
| 1860 | restart_guest(); | 1620 | restart_guest(); |
| 1861 | /* EAGAIN means a signal (timeout). | 1621 | /* Anything else means a bug or incompatible change. */ |
| 1862 | * Anything else means a bug or incompatible change. */ | 1622 | } else |
| 1863 | } else if (errno != EAGAIN) | ||
| 1864 | err(1, "Running guest failed"); | 1623 | err(1, "Running guest failed"); |
| 1865 | |||
| 1866 | /* Only service input on thread for CPU 0. */ | ||
| 1867 | if (cpu_id != 0) | ||
| 1868 | continue; | ||
| 1869 | |||
| 1870 | /* Service input, then unset the BREAK to release the Waker. */ | ||
| 1871 | handle_input(lguest_fd); | ||
| 1872 | if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0) | ||
| 1873 | err(1, "Resetting break"); | ||
| 1874 | } | 1624 | } |
| 1875 | } | 1625 | } |
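For context, the read that fills readval above is unchanged apart from lguest_fd becoming a file-scope variable rather than a parameter; it is roughly:

    /* We read from the /dev/lguest device to run the Guest; the offset
     * selects which virtual CPU to run. */
    readval = pread(lguest_fd, &notify_addr, sizeof(notify_addr), cpu_id);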
| 1876 | /*L:240 | 1626 | /*L:240 |
| @@ -1904,8 +1654,8 @@ int main(int argc, char *argv[]) | |||
| 1904 | /* Memory, top-level pagetable, code startpoint and size of the | 1654 | /* Memory, top-level pagetable, code startpoint and size of the |
| 1905 | * (optional) initrd. */ | 1655 | * (optional) initrd. */ |
| 1906 | unsigned long mem = 0, start, initrd_size = 0; | 1656 | unsigned long mem = 0, start, initrd_size = 0; |
| 1907 | /* Two temporaries and the /dev/lguest file descriptor. */ | 1657 | /* Two temporaries. */ |
| 1908 | int i, c, lguest_fd; | 1658 | int i, c; |
| 1909 | /* The boot information for the Guest. */ | 1659 | /* The boot information for the Guest. */ |
| 1910 | struct boot_params *boot; | 1660 | struct boot_params *boot; |
| 1911 | /* If they specify an initrd file to load. */ | 1661 | /* If they specify an initrd file to load. */ |
| @@ -1913,18 +1663,10 @@ int main(int argc, char *argv[]) | |||
| 1913 | 1663 | ||
| 1914 | /* Save the args: we "reboot" by execing ourselves again. */ | 1664 | /* Save the args: we "reboot" by execing ourselves again. */ |
| 1915 | main_args = argv; | 1665 | main_args = argv; |
| 1916 | /* We don't "wait" for the children, so prevent them from becoming | ||
| 1917 | * zombies. */ | ||
| 1918 | signal(SIGCHLD, SIG_IGN); | ||
| 1919 | 1666 | ||
| 1920 | /* First we initialize the device list. Since console and network | 1667 | /* First we initialize the device list. We keep a pointer to the last |
| 1921 | * device receive input from a file descriptor, we keep an fdset | 1668 | * device, and the next interrupt number to use for devices (1: |
| 1922 | * (infds) and the maximum fd number (max_infd) with the head of the | 1669 | * remember that 0 is used by the timer). */ |
| 1923 | * list. We also keep a pointer to the last device. Finally, we keep | ||
| 1924 | * the next interrupt number to use for devices (1: remember that 0 is | ||
| 1925 | * used by the timer). */ | ||
| 1926 | FD_ZERO(&devices.infds); | ||
| 1927 | devices.max_infd = -1; | ||
| 1928 | devices.lastdev = NULL; | 1670 | devices.lastdev = NULL; |
| 1929 | devices.next_irq = 1; | 1671 | devices.next_irq = 1; |
| 1930 | 1672 | ||
| @@ -1982,9 +1724,6 @@ int main(int argc, char *argv[]) | |||
| 1982 | /* We always have a console device */ | 1724 | /* We always have a console device */ |
| 1983 | setup_console(); | 1725 | setup_console(); |
| 1984 | 1726 | ||
| 1985 | /* We can timeout waiting for Guest network transmit. */ | ||
| 1986 | setup_timeout(); | ||
| 1987 | |||
| 1988 | /* Now we load the kernel */ | 1727 | /* Now we load the kernel */ |
| 1989 | start = load_kernel(open_or_die(argv[optind+1], O_RDONLY)); | 1728 | start = load_kernel(open_or_die(argv[optind+1], O_RDONLY)); |
| 1990 | 1729 | ||
| @@ -2023,15 +1762,16 @@ int main(int argc, char *argv[]) | |||
| 2023 | 1762 | ||
| 2024 | /* We tell the kernel to initialize the Guest: this returns the open | 1763 | /* We tell the kernel to initialize the Guest: this returns the open |
| 2025 | * /dev/lguest file descriptor. */ | 1764 | * /dev/lguest file descriptor. */ |
| 2026 | lguest_fd = tell_kernel(start); | 1765 | tell_kernel(start); |
| 1766 | |||
| 1767 | /* Ensure that we terminate if a child dies. */ | ||
| 1768 | signal(SIGCHLD, kill_launcher); | ||
| 2027 | 1769 | ||
| 2028 | /* We clone off a thread, which wakes the Launcher whenever one of the | 1770 | /* If we exit via err(), this kills all the threads, restores tty. */ |
| 2029 | * input file descriptors needs attention. We call this the Waker, and | 1771 | atexit(cleanup_devices); |
| 2030 | * we'll cover it in a moment. */ | ||
| 2031 | setup_waker(lguest_fd); | ||
| 2032 | 1772 | ||
| 2033 | /* Finally, run the Guest. This doesn't return. */ | 1773 | /* Finally, run the Guest. This doesn't return. */ |
| 2034 | run_guest(lguest_fd); | 1774 | run_guest(); |
| 2035 | } | 1775 | } |
| 2036 | /*:*/ | 1776 | /*:*/ |
| 2037 | 1777 | ||
diff --git a/Documentation/lguest/lguest.txt b/Documentation/lguest/lguest.txt index 28c747362f95..efb3a6a045a2 100644 --- a/Documentation/lguest/lguest.txt +++ b/Documentation/lguest/lguest.txt | |||
| @@ -37,7 +37,6 @@ Running Lguest: | |||
| 37 | "Paravirtualized guest support" = Y | 37 | "Paravirtualized guest support" = Y |
| 38 | "Lguest guest support" = Y | 38 | "Lguest guest support" = Y |
| 39 | "High Memory Support" = off/4GB | 39 | "High Memory Support" = off/4GB |
| 40 | "PAE (Physical Address Extension) Support" = N | ||
| 41 | "Alignment value to which kernel should be aligned" = 0x100000 | 40 | "Alignment value to which kernel should be aligned" = 0x100000 |
| 42 | (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and | 41 | (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and |
| 43 | CONFIG_PHYSICAL_ALIGN=0x100000) | 42 | CONFIG_PHYSICAL_ALIGN=0x100000) |
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h index 1caf57628b9c..313389cd50d2 100644 --- a/arch/x86/include/asm/lguest.h +++ b/arch/x86/include/asm/lguest.h | |||
| @@ -17,8 +17,13 @@ | |||
| 17 | /* Pages for switcher itself, then two pages per cpu */ | 17 | /* Pages for switcher itself, then two pages per cpu */ |
| 18 | #define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids) | 18 | #define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids) |
| 19 | 19 | ||
| 20 | /* We map at -4M for ease of mapping into the guest (one PTE page). */ | 20 | /* We map at -4M (-2M when PAE is activated) for ease of mapping |
| 21 | * into the guest (one PTE page). */ | ||
| 22 | #ifdef CONFIG_X86_PAE | ||
| 23 | #define SWITCHER_ADDR 0xFFE00000 | ||
| 24 | #else | ||
| 21 | #define SWITCHER_ADDR 0xFFC00000 | 25 | #define SWITCHER_ADDR 0xFFC00000 |
| 26 | #endif | ||
| 22 | 27 | ||
| 23 | /* Found in switcher.S */ | 28 | /* Found in switcher.S */ |
| 24 | extern unsigned long default_idt_entries[]; | 29 | extern unsigned long default_idt_entries[]; |
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index faae1996487b..d31c4a684078 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h | |||
| @@ -12,11 +12,13 @@ | |||
| 12 | #define LHCALL_TS 8 | 12 | #define LHCALL_TS 8 |
| 13 | #define LHCALL_SET_CLOCKEVENT 9 | 13 | #define LHCALL_SET_CLOCKEVENT 9 |
| 14 | #define LHCALL_HALT 10 | 14 | #define LHCALL_HALT 10 |
| 15 | #define LHCALL_SET_PMD 13 | ||
| 15 | #define LHCALL_SET_PTE 14 | 16 | #define LHCALL_SET_PTE 14 |
| 16 | #define LHCALL_SET_PMD 15 | 17 | #define LHCALL_SET_PGD 15 |
| 17 | #define LHCALL_LOAD_TLS 16 | 18 | #define LHCALL_LOAD_TLS 16 |
| 18 | #define LHCALL_NOTIFY 17 | 19 | #define LHCALL_NOTIFY 17 |
| 19 | #define LHCALL_LOAD_GDT_ENTRY 18 | 20 | #define LHCALL_LOAD_GDT_ENTRY 18 |
| 21 | #define LHCALL_SEND_INTERRUPTS 19 | ||
| 20 | 22 | ||
| 21 | #define LGUEST_TRAP_ENTRY 0x1F | 23 | #define LGUEST_TRAP_ENTRY 0x1F |
| 22 | 24 | ||
| @@ -32,10 +34,10 @@ | |||
| 32 | * operations? There are two ways: the direct way is to make a "hypercall", | 34 | * operations? There are two ways: the direct way is to make a "hypercall", |
| 33 | * to make requests of the Host Itself. | 35 | * to make requests of the Host Itself. |
| 34 | * | 36 | * |
| 35 | * We use the KVM hypercall mechanism. Eighteen hypercalls are | 37 | * We use the KVM hypercall mechanism. Seventeen hypercalls are |
| 36 | * available: the hypercall number is put in the %eax register, and the | 38 | * available: the hypercall number is put in the %eax register, and the |
| 37 | * arguments (when required) are placed in %ebx, %ecx and %edx. If a return | 39 | * arguments (when required) are placed in %ebx, %ecx, %edx and %esi. |
| 38 | * value makes sense, it's returned in %eax. | 40 | * If a return value makes sense, it's returned in %eax. |
| 39 | * | 41 | * |
| 40 | * Grossly invalid calls result in Sudden Death at the hands of the vengeful | 42 | * Grossly invalid calls result in Sudden Death at the hands of the vengeful |
| 41 | * Host, rather than returning failure. This reflects Winston Churchill's | 43 | * Host, rather than returning failure. This reflects Winston Churchill's |
| @@ -47,8 +49,9 @@ | |||
| 47 | 49 | ||
| 48 | #define LHCALL_RING_SIZE 64 | 50 | #define LHCALL_RING_SIZE 64 |
| 49 | struct hcall_args { | 51 | struct hcall_args { |
| 50 | /* These map directly onto eax, ebx, ecx, edx in struct lguest_regs */ | 52 | /* These map directly onto eax, ebx, ecx, edx and esi |
| 51 | unsigned long arg0, arg1, arg2, arg3; | 53 | * in struct lguest_regs */ |
| 54 | unsigned long arg0, arg1, arg2, arg3, arg4; | ||
| 52 | }; | 55 | }; |
| 53 | 56 | ||
| 54 | #endif /* !__ASSEMBLY__ */ | 57 | #endif /* !__ASSEMBLY__ */ |
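Concretely, the register convention described above is what the kvm_hypercall helpers implement; the four-argument variant needed by the new PAE code looks roughly like this (shown for orientation, paraphrased from asm/kvm_para.h):

    static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
                                      unsigned long p2, unsigned long p3,
                                      unsigned long p4)
    {
            long ret;
            asm volatile(KVM_HYPERCALL
                         : "=a"(ret)
                         : "a"(nr), "b"(p1), "c"(p2), "d"(p3), "S"(p4)
                         : "memory");
            return ret;
    }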
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 1a830cbd7015..dfdbf6403895 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c | |||
| @@ -126,6 +126,7 @@ void foo(void) | |||
| 126 | #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) | 126 | #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) |
| 127 | BLANK(); | 127 | BLANK(); |
| 128 | OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); | 128 | OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); |
| 129 | OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending); | ||
| 129 | OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir); | 130 | OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir); |
| 130 | 131 | ||
| 131 | BLANK(); | 132 | BLANK(); |
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig index 8dab8f7844d3..38718041efc3 100644 --- a/arch/x86/lguest/Kconfig +++ b/arch/x86/lguest/Kconfig | |||
| @@ -2,7 +2,6 @@ config LGUEST_GUEST | |||
| 2 | bool "Lguest guest support" | 2 | bool "Lguest guest support" |
| 3 | select PARAVIRT | 3 | select PARAVIRT |
| 4 | depends on X86_32 | 4 | depends on X86_32 |
| 5 | depends on !X86_PAE | ||
| 6 | select VIRTIO | 5 | select VIRTIO |
| 7 | select VIRTIO_RING | 6 | select VIRTIO_RING |
| 8 | select VIRTIO_CONSOLE | 7 | select VIRTIO_CONSOLE |
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 4e0c26559395..7bc65f0f62c4 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c | |||
| @@ -87,7 +87,7 @@ struct lguest_data lguest_data = { | |||
| 87 | 87 | ||
| 88 | /*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a | 88 | /*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a |
| 89 | * ring buffer of stored hypercalls which the Host will run through next time we | 89 | * ring buffer of stored hypercalls which the Host will run through next time we |
| 90 | * do a normal hypercall. Each entry in the ring has 4 slots for the hypercall | 90 | * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall |
| 91 | * arguments, and a "hcall_status" word which is 0 if the call is ready to go, | 91 | * arguments, and a "hcall_status" word which is 0 if the call is ready to go, |
| 92 | * and 255 once the Host has finished with it. | 92 | * and 255 once the Host has finished with it. |
| 93 | * | 93 | * |
| @@ -96,7 +96,8 @@ struct lguest_data lguest_data = { | |||
| 96 | * effect of causing the Host to run all the stored calls in the ring buffer | 96 | * effect of causing the Host to run all the stored calls in the ring buffer |
| 97 | * which empties it for next time! */ | 97 | * which empties it for next time! */ |
| 98 | static void async_hcall(unsigned long call, unsigned long arg1, | 98 | static void async_hcall(unsigned long call, unsigned long arg1, |
| 99 | unsigned long arg2, unsigned long arg3) | 99 | unsigned long arg2, unsigned long arg3, |
| 100 | unsigned long arg4) | ||
| 100 | { | 101 | { |
| 101 | /* Note: This code assumes we're uniprocessor. */ | 102 | /* Note: This code assumes we're uniprocessor. */ |
| 102 | static unsigned int next_call; | 103 | static unsigned int next_call; |
| @@ -108,12 +109,13 @@ static void async_hcall(unsigned long call, unsigned long arg1, | |||
| 108 | local_irq_save(flags); | 109 | local_irq_save(flags); |
| 109 | if (lguest_data.hcall_status[next_call] != 0xFF) { | 110 | if (lguest_data.hcall_status[next_call] != 0xFF) { |
| 110 | /* Table full, so do normal hcall which will flush table. */ | 111 | /* Table full, so do normal hcall which will flush table. */ |
| 111 | kvm_hypercall3(call, arg1, arg2, arg3); | 112 | kvm_hypercall4(call, arg1, arg2, arg3, arg4); |
| 112 | } else { | 113 | } else { |
| 113 | lguest_data.hcalls[next_call].arg0 = call; | 114 | lguest_data.hcalls[next_call].arg0 = call; |
| 114 | lguest_data.hcalls[next_call].arg1 = arg1; | 115 | lguest_data.hcalls[next_call].arg1 = arg1; |
| 115 | lguest_data.hcalls[next_call].arg2 = arg2; | 116 | lguest_data.hcalls[next_call].arg2 = arg2; |
| 116 | lguest_data.hcalls[next_call].arg3 = arg3; | 117 | lguest_data.hcalls[next_call].arg3 = arg3; |
| 118 | lguest_data.hcalls[next_call].arg4 = arg4; | ||
| 117 | /* Arguments must all be written before we mark it to go */ | 119 | /* Arguments must all be written before we mark it to go */ |
| 118 | wmb(); | 120 | wmb(); |
| 119 | lguest_data.hcall_status[next_call] = 0; | 121 | lguest_data.hcall_status[next_call] = 0; |
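The other half of this protocol lives in drivers/lguest/hypercalls.c: when the Guest next makes a real hypercall, the Host walks the ring and runs every entry whose status byte is 0, writing 0xFF back as each one completes. A simplified sketch, with error paths trimmed:

    /* Inside do_async_hcalls(), roughly: st[] is a copy of hcall_status[]. */
    for (i = 0; i < LHCALL_RING_SIZE; i++) {
            unsigned int n = cpu->next_hcall;

            /* 0xFF means there's no call queued in this slot (yet). */
            if (st[n] == 0xFF)
                    break;

            if (++cpu->next_hcall == LHCALL_RING_SIZE)
                    cpu->next_hcall = 0;

            if (copy_from_user(&args, &cpu->lg->lguest_data->hcalls[n],
                               sizeof(args))) {
                    kill_guest(cpu, "Fetching async hypercalls");
                    break;
            }
            do_hcall(cpu, &args);

            /* Mark it done so the Guest can reuse the slot. */
            if (put_user(0xFF, &cpu->lg->lguest_data->hcall_status[n]))
                    kill_guest(cpu, "Writing result for async hypercall");
    }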
| @@ -141,7 +143,7 @@ static void lazy_hcall1(unsigned long call, | |||
| 141 | if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) | 143 | if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) |
| 142 | kvm_hypercall1(call, arg1); | 144 | kvm_hypercall1(call, arg1); |
| 143 | else | 145 | else |
| 144 | async_hcall(call, arg1, 0, 0); | 146 | async_hcall(call, arg1, 0, 0, 0); |
| 145 | } | 147 | } |
| 146 | 148 | ||
| 147 | static void lazy_hcall2(unsigned long call, | 149 | static void lazy_hcall2(unsigned long call, |
| @@ -151,7 +153,7 @@ static void lazy_hcall2(unsigned long call, | |||
| 151 | if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) | 153 | if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) |
| 152 | kvm_hypercall2(call, arg1, arg2); | 154 | kvm_hypercall2(call, arg1, arg2); |
| 153 | else | 155 | else |
| 154 | async_hcall(call, arg1, arg2, 0); | 156 | async_hcall(call, arg1, arg2, 0, 0); |
| 155 | } | 157 | } |
| 156 | 158 | ||
| 157 | static void lazy_hcall3(unsigned long call, | 159 | static void lazy_hcall3(unsigned long call, |
| @@ -162,9 +164,23 @@ static void lazy_hcall3(unsigned long call, | |||
| 162 | if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) | 164 | if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) |
| 163 | kvm_hypercall3(call, arg1, arg2, arg3); | 165 | kvm_hypercall3(call, arg1, arg2, arg3); |
| 164 | else | 166 | else |
| 165 | async_hcall(call, arg1, arg2, arg3); | 167 | async_hcall(call, arg1, arg2, arg3, 0); |
| 166 | } | 168 | } |
| 167 | 169 | ||
| 170 | #ifdef CONFIG_X86_PAE | ||
| 171 | static void lazy_hcall4(unsigned long call, | ||
| 172 | unsigned long arg1, | ||
| 173 | unsigned long arg2, | ||
| 174 | unsigned long arg3, | ||
| 175 | unsigned long arg4) | ||
| 176 | { | ||
| 177 | if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) | ||
| 178 | kvm_hypercall4(call, arg1, arg2, arg3, arg4); | ||
| 179 | else | ||
| 180 | async_hcall(call, arg1, arg2, arg3, arg4); | ||
| 181 | } | ||
| 182 | #endif | ||
| 183 | |||
| 168 | /* When lazy mode is turned off reset the per-cpu lazy mode variable and then | 184 | /* When lazy mode is turned off reset the per-cpu lazy mode variable and then |
| 169 | * issue the do-nothing hypercall to flush any stored calls. */ | 185 | * issue the do-nothing hypercall to flush any stored calls. */ |
| 170 | static void lguest_leave_lazy_mmu_mode(void) | 186 | static void lguest_leave_lazy_mmu_mode(void) |
| @@ -179,7 +195,7 @@ static void lguest_end_context_switch(struct task_struct *next) | |||
| 179 | paravirt_end_context_switch(next); | 195 | paravirt_end_context_switch(next); |
| 180 | } | 196 | } |
| 181 | 197 | ||
| 182 | /*G:033 | 198 | /*G:032 |
| 183 | * After that diversion we return to our first native-instruction | 199 | * After that diversion we return to our first native-instruction |
| 184 | * replacements: four functions for interrupt control. | 200 | * replacements: four functions for interrupt control. |
| 185 | * | 201 | * |
| @@ -199,30 +215,28 @@ static unsigned long save_fl(void) | |||
| 199 | { | 215 | { |
| 200 | return lguest_data.irq_enabled; | 216 | return lguest_data.irq_enabled; |
| 201 | } | 217 | } |
| 202 | PV_CALLEE_SAVE_REGS_THUNK(save_fl); | ||
| 203 | |||
| 204 | /* restore_flags() just sets the flags back to the value given. */ | ||
| 205 | static void restore_fl(unsigned long flags) | ||
| 206 | { | ||
| 207 | lguest_data.irq_enabled = flags; | ||
| 208 | } | ||
| 209 | PV_CALLEE_SAVE_REGS_THUNK(restore_fl); | ||
| 210 | 218 | ||
| 211 | /* Interrupts go off... */ | 219 | /* Interrupts go off... */ |
| 212 | static void irq_disable(void) | 220 | static void irq_disable(void) |
| 213 | { | 221 | { |
| 214 | lguest_data.irq_enabled = 0; | 222 | lguest_data.irq_enabled = 0; |
| 215 | } | 223 | } |
| 224 | |||
| 225 | /* Let's pause a moment. Remember how I said these are called so often? | ||
| 226 | * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to | ||
| 227 | * break some rules. In particular, these functions are assumed to save their | ||
| 228 | * own registers if they need to: normal C functions assume they can trash the | ||
| 229 | * eax register. To use normal C functions, we use | ||
| 230 | * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the | ||
| 231 | * C function, then restores it. */ | ||
| 232 | PV_CALLEE_SAVE_REGS_THUNK(save_fl); | ||
| 216 | PV_CALLEE_SAVE_REGS_THUNK(irq_disable); | 233 | PV_CALLEE_SAVE_REGS_THUNK(irq_disable); |
| 234 | /*:*/ | ||
| 217 | 235 | ||
| 218 | /* Interrupts go on... */ | 236 | /* These are in i386_head.S */ |
| 219 | static void irq_enable(void) | 237 | extern void lg_irq_enable(void); |
| 220 | { | 238 | extern void lg_restore_fl(unsigned long flags); |
| 221 | lguest_data.irq_enabled = X86_EFLAGS_IF; | ||
| 222 | } | ||
| 223 | PV_CALLEE_SAVE_REGS_THUNK(irq_enable); | ||
| 224 | 239 | ||
| 225 | /*:*/ | ||
| 226 | /*M:003 Note that we don't check for outstanding interrupts when we re-enable | 240 | /*M:003 Note that we don't check for outstanding interrupts when we re-enable |
| 227 | * them (or when we unmask an interrupt). This seems to work for the moment, | 241 | * them (or when we unmask an interrupt). This seems to work for the moment, |
| 228 | * since interrupts are rare and we'll just get the interrupt on the next timer | 242 | * since interrupts are rare and we'll just get the interrupt on the next timer |
| @@ -368,8 +382,8 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, | |||
| 368 | case 1: /* Basic feature request. */ | 382 | case 1: /* Basic feature request. */ |
| 369 | /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ | 383 | /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ |
| 370 | *cx &= 0x00002201; | 384 | *cx &= 0x00002201; |
| 371 | /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU. */ | 385 | /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU, PAE. */ |
| 372 | *dx &= 0x07808111; | 386 | *dx &= 0x07808151; |
| 373 | /* The Host can do a nice optimization if it knows that the | 387 | /* The Host can do a nice optimization if it knows that the |
| 374 | * kernel mappings (addresses above 0xC0000000 or whatever | 388 | * kernel mappings (addresses above 0xC0000000 or whatever |
| 375 | * PAGE_OFFSET is set to) haven't changed. But Linux calls | 389 | * PAGE_OFFSET is set to) haven't changed. But Linux calls |
| @@ -388,6 +402,11 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, | |||
| 388 | if (*ax > 0x80000008) | 402 | if (*ax > 0x80000008) |
| 389 | *ax = 0x80000008; | 403 | *ax = 0x80000008; |
| 390 | break; | 404 | break; |
| 405 | case 0x80000001: | ||
| 406 | /* Here we should fix the NX cap depending on the host. */ | ||
| 407 | /* For this version of PAE, we just clear the NX bit. */ | ||
| 408 | *dx &= ~(1 << 20); | ||
| 409 | break; | ||
| 391 | } | 410 | } |
| 392 | } | 411 | } |
| 393 | 412 | ||
| @@ -521,25 +540,52 @@ static void lguest_write_cr4(unsigned long val) | |||
| 521 | static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, | 540 | static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, |
| 522 | pte_t *ptep) | 541 | pte_t *ptep) |
| 523 | { | 542 | { |
| 543 | #ifdef CONFIG_X86_PAE | ||
| 544 | lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr, | ||
| 545 | ptep->pte_low, ptep->pte_high); | ||
| 546 | #else | ||
| 524 | lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low); | 547 | lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low); |
| 548 | #endif | ||
| 525 | } | 549 | } |
| 526 | 550 | ||
| 527 | static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, | 551 | static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, |
| 528 | pte_t *ptep, pte_t pteval) | 552 | pte_t *ptep, pte_t pteval) |
| 529 | { | 553 | { |
| 530 | *ptep = pteval; | 554 | native_set_pte(ptep, pteval); |
| 531 | lguest_pte_update(mm, addr, ptep); | 555 | lguest_pte_update(mm, addr, ptep); |
| 532 | } | 556 | } |
| 533 | 557 | ||
| 534 | /* The Guest calls this to set a top-level entry. Again, we set the entry then | 558 | /* The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd |
| 535 | * tell the Host which top-level page we changed, and the index of the entry we | 559 | * to set a middle-level entry when PAE is activated. |
| 536 | * changed. */ | 560 | * Again, we set the entry then tell the Host which page we changed, |
| 561 | * and the index of the entry we changed. */ | ||
| 562 | #ifdef CONFIG_X86_PAE | ||
| 563 | static void lguest_set_pud(pud_t *pudp, pud_t pudval) | ||
| 564 | { | ||
| 565 | native_set_pud(pudp, pudval); | ||
| 566 | |||
| 567 | /* 32 bytes aligned pdpt address and the index. */ | ||
| 568 | lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0, | ||
| 569 | (__pa(pudp) & 0x1F) / sizeof(pud_t)); | ||
| 570 | } | ||
| 571 | |||
| 537 | static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) | 572 | static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) |
| 538 | { | 573 | { |
| 539 | *pmdp = pmdval; | 574 | native_set_pmd(pmdp, pmdval); |
| 540 | lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK, | 575 | lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK, |
| 541 | (__pa(pmdp) & (PAGE_SIZE - 1)) / 4); | 576 | (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t)); |
| 542 | } | 577 | } |
| 578 | #else | ||
| 579 | |||
| 580 | /* The Guest calls lguest_set_pmd to set a top-level entry when PAE is not | ||
| 581 | * activated. */ | ||
| 582 | static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) | ||
| 583 | { | ||
| 584 | native_set_pmd(pmdp, pmdval); | ||
| 585 | lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK, | ||
| 586 | (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t)); | ||
| 587 | } | ||
| 588 | #endif | ||
| 543 | 589 | ||
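The masking in lguest_set_pud() above follows from the PAE layout: the top level (the page-directory-pointer table) holds only four 8-byte entries, so it is 32 bytes long and 32-byte aligned. A worked example with a made-up physical address:

    /* Say __pa(pudp) == 0x01234568, i.e. the second entry of a pdpt that
     * starts at 0x01234560: */
    table = 0x01234568 & 0xFFFFFFE0;                /* -> 0x01234560 */
    index = (0x01234568 & 0x1F) / sizeof(pud_t);    /* -> 0x08 / 8 == 1 */
    /* So LHCALL_SET_PGD tells the Host "pdpt at 0x01234560, entry 1". */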
| 544 | /* There are a couple of legacy places where the kernel sets a PTE, but we | 590 | /* There are a couple of legacy places where the kernel sets a PTE, but we |
| 545 | * don't know the top level any more. This is useless for us, since we don't | 591 | * don't know the top level any more. This is useless for us, since we don't |
| @@ -552,11 +598,31 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) | |||
| 552 | * which brings boot back to 0.25 seconds. */ | 598 | * which brings boot back to 0.25 seconds. */ |
| 553 | static void lguest_set_pte(pte_t *ptep, pte_t pteval) | 599 | static void lguest_set_pte(pte_t *ptep, pte_t pteval) |
| 554 | { | 600 | { |
| 555 | *ptep = pteval; | 601 | native_set_pte(ptep, pteval); |
| 602 | if (cr3_changed) | ||
| 603 | lazy_hcall1(LHCALL_FLUSH_TLB, 1); | ||
| 604 | } | ||
| 605 | |||
| 606 | #ifdef CONFIG_X86_PAE | ||
| 607 | static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) | ||
| 608 | { | ||
| 609 | native_set_pte_atomic(ptep, pte); | ||
| 556 | if (cr3_changed) | 610 | if (cr3_changed) |
| 557 | lazy_hcall1(LHCALL_FLUSH_TLB, 1); | 611 | lazy_hcall1(LHCALL_FLUSH_TLB, 1); |
| 558 | } | 612 | } |
| 559 | 613 | ||
| 614 | void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) | ||
| 615 | { | ||
| 616 | native_pte_clear(mm, addr, ptep); | ||
| 617 | lguest_pte_update(mm, addr, ptep); | ||
| 618 | } | ||
| 619 | |||
| 620 | void lguest_pmd_clear(pmd_t *pmdp) | ||
| 621 | { | ||
| 622 | lguest_set_pmd(pmdp, __pmd(0)); | ||
| 623 | } | ||
| 624 | #endif | ||
| 625 | |||
| 560 | /* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on | 626 | /* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on |
| 561 | * native page table operations. On native hardware you can set a new page | 627 | * native page table operations. On native hardware you can set a new page |
| 562 | * table entry whenever you want, but if you want to remove one you have to do | 628 | * table entry whenever you want, but if you want to remove one you have to do |
| @@ -628,13 +694,12 @@ static void __init lguest_init_IRQ(void) | |||
| 628 | { | 694 | { |
| 629 | unsigned int i; | 695 | unsigned int i; |
| 630 | 696 | ||
| 631 | for (i = 0; i < LGUEST_IRQS; i++) { | 697 | for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { |
| 632 | int vector = FIRST_EXTERNAL_VECTOR + i; | ||
| 633 | /* Some systems map "vectors" to interrupts weirdly. Lguest has | 698 | /* Some systems map "vectors" to interrupts weirdly. Lguest has |
| 634 | * a straightforward 1 to 1 mapping, so force that here. */ | 699 | * a straightforward 1 to 1 mapping, so force that here. */ |
| 635 | __get_cpu_var(vector_irq)[vector] = i; | 700 | __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR; |
| 636 | if (vector != SYSCALL_VECTOR) | 701 | if (i != SYSCALL_VECTOR) |
| 637 | set_intr_gate(vector, interrupt[i]); | 702 | set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); |
| 638 | } | 703 | } |
| 639 | /* This call is required to set up for 4k stacks, where we have | 704 | /* This call is required to set up for 4k stacks, where we have |
| 640 | * separate stacks for hard and soft interrupts. */ | 705 | * separate stacks for hard and soft interrupts. */ |
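Since FIRST_EXTERNAL_VECTOR is 0x20 on 32-bit x86, the loop above gives the promised straightforward mapping:

    /* vector 0x20 -> irq 0, vector 0x21 -> irq 1, ... (0x80, SYSCALL_VECTOR,
     * is skipped), so the Host can deliver Guest irq N as trap 0x20 + N. */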
| @@ -973,10 +1038,10 @@ static void lguest_restart(char *reason) | |||
| 973 | * | 1038 | * |
| 974 | * Our current solution is to allow the paravirt back end to optionally patch | 1039 | * Our current solution is to allow the paravirt back end to optionally patch |
| 975 | * over the indirect calls to replace them with something more efficient. We | 1040 | * over the indirect calls to replace them with something more efficient. We |
| 976 | * patch the four most commonly called functions: disable interrupts, enable | 1041 | * patch two of the simplest of the most commonly called functions: disable |
| 977 | * interrupts, restore interrupts and save interrupts. We usually have 6 or 10 | 1042 | * interrupts and save interrupts. We usually have 6 or 10 bytes to patch |
| 978 | * bytes to patch into: the Guest versions of these operations are small enough | 1043 | * into: the Guest versions of these operations are small enough that we can |
| 979 | * that we can fit comfortably. | 1044 | * fit comfortably. |
| 980 | * | 1045 | * |
| 981 | * First we need assembly templates of each of the patchable Guest operations, | 1046 | * First we need assembly templates of each of the patchable Guest operations, |
| 982 | * and these are in i386_head.S. */ | 1047 | * and these are in i386_head.S. */ |
| @@ -987,8 +1052,6 @@ static const struct lguest_insns | |||
| 987 | const char *start, *end; | 1052 | const char *start, *end; |
| 988 | } lguest_insns[] = { | 1053 | } lguest_insns[] = { |
| 989 | [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli }, | 1054 | [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli }, |
| 990 | [PARAVIRT_PATCH(pv_irq_ops.irq_enable)] = { lgstart_sti, lgend_sti }, | ||
| 991 | [PARAVIRT_PATCH(pv_irq_ops.restore_fl)] = { lgstart_popf, lgend_popf }, | ||
| 992 | [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, | 1055 | [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, |
| 993 | }; | 1056 | }; |
| 994 | 1057 | ||
| @@ -1026,6 +1089,7 @@ __init void lguest_init(void) | |||
| 1026 | pv_info.name = "lguest"; | 1089 | pv_info.name = "lguest"; |
| 1027 | pv_info.paravirt_enabled = 1; | 1090 | pv_info.paravirt_enabled = 1; |
| 1028 | pv_info.kernel_rpl = 1; | 1091 | pv_info.kernel_rpl = 1; |
| 1092 | pv_info.shared_kernel_pmd = 1; | ||
| 1029 | 1093 | ||
| 1030 | /* We set up all the lguest overrides for sensitive operations. These | 1094 | /* We set up all the lguest overrides for sensitive operations. These |
| 1031 | * are detailed with the operations themselves. */ | 1095 | * are detailed with the operations themselves. */ |
| @@ -1033,9 +1097,9 @@ __init void lguest_init(void) | |||
| 1033 | /* interrupt-related operations */ | 1097 | /* interrupt-related operations */ |
| 1034 | pv_irq_ops.init_IRQ = lguest_init_IRQ; | 1098 | pv_irq_ops.init_IRQ = lguest_init_IRQ; |
| 1035 | pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); | 1099 | pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); |
| 1036 | pv_irq_ops.restore_fl = PV_CALLEE_SAVE(restore_fl); | 1100 | pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); |
| 1037 | pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable); | 1101 | pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable); |
| 1038 | pv_irq_ops.irq_enable = PV_CALLEE_SAVE(irq_enable); | 1102 | pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable); |
| 1039 | pv_irq_ops.safe_halt = lguest_safe_halt; | 1103 | pv_irq_ops.safe_halt = lguest_safe_halt; |
| 1040 | 1104 | ||
| 1041 | /* init-time operations */ | 1105 | /* init-time operations */ |
| @@ -1071,6 +1135,12 @@ __init void lguest_init(void) | |||
| 1071 | pv_mmu_ops.set_pte = lguest_set_pte; | 1135 | pv_mmu_ops.set_pte = lguest_set_pte; |
| 1072 | pv_mmu_ops.set_pte_at = lguest_set_pte_at; | 1136 | pv_mmu_ops.set_pte_at = lguest_set_pte_at; |
| 1073 | pv_mmu_ops.set_pmd = lguest_set_pmd; | 1137 | pv_mmu_ops.set_pmd = lguest_set_pmd; |
| 1138 | #ifdef CONFIG_X86_PAE | ||
| 1139 | pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic; | ||
| 1140 | pv_mmu_ops.pte_clear = lguest_pte_clear; | ||
| 1141 | pv_mmu_ops.pmd_clear = lguest_pmd_clear; | ||
| 1142 | pv_mmu_ops.set_pud = lguest_set_pud; | ||
| 1143 | #endif | ||
| 1074 | pv_mmu_ops.read_cr2 = lguest_read_cr2; | 1144 | pv_mmu_ops.read_cr2 = lguest_read_cr2; |
| 1075 | pv_mmu_ops.read_cr3 = lguest_read_cr3; | 1145 | pv_mmu_ops.read_cr3 = lguest_read_cr3; |
| 1076 | pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu; | 1146 | pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu; |
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index f79541989471..a9c8cfe61cd4 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S | |||
| @@ -46,10 +46,64 @@ ENTRY(lguest_entry) | |||
| 46 | .globl lgstart_##name; .globl lgend_##name | 46 | .globl lgstart_##name; .globl lgend_##name |
| 47 | 47 | ||
| 48 | LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) | 48 | LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) |
| 49 | LGUEST_PATCH(sti, movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled) | ||
| 50 | LGUEST_PATCH(popf, movl %eax, lguest_data+LGUEST_DATA_irq_enabled) | ||
| 51 | LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) | 49 | LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) |
| 52 | /*:*/ | 50 | |
| 51 | /*G:033 But using those wrappers is inefficient (we'll see why that doesn't | ||
| 52 | * matter for save_fl and irq_disable later). If we write our routines | ||
| 53 | * carefully in assembler, we can avoid clobbering any registers and avoid | ||
| 54 | * jumping through the wrapper functions. | ||
| 55 | * | ||
| 56 | * I skipped over our first piece of assembler, but this one is worth studying | ||
| 57 | * in a bit more detail so I'll describe it in easy stages. First, the routine | ||
| 58 | * to enable interrupts: */ | ||
| 59 | ENTRY(lg_irq_enable) | ||
| 60 | /* The reverse of irq_disable, this sets lguest_data.irq_enabled to | ||
| 61 | * X86_EFLAGS_IF (ie. "Interrupts enabled"). */ | ||
| 62 | movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled | ||
| 63 | /* But now we need to check if the Host wants to know: there might have | ||
| 64 | * been interrupts waiting to be delivered, in which case it will have | ||
| 65 | * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we | ||
| 66 | * jump to send_interrupts, otherwise we're done. */ | ||
| 67 | testl $0xFFFFFFFF, lguest_data+LGUEST_DATA_irq_pending | ||
| 68 | jnz send_interrupts | ||
| 69 | /* One cool thing about x86 is that you can do many things without using | ||
| 70 | * a register. In this case, the normal path hasn't needed to save or | ||
| 71 | * restore any registers at all! */ | ||
| 72 | ret | ||
| 73 | send_interrupts: | ||
| 74 | /* OK, now we need a register: eax is used for the hypercall number, | ||
| 75 | * which is LHCALL_SEND_INTERRUPTS. | ||
| 76 | * | ||
| 77 | * We used not to bother with this pending detection at all, which was | ||
| 78 | * much simpler. Sooner or later the Host would realize it had to | ||
| 79 | * send us an interrupt. But that turns out to make performance 7 | ||
| 80 | * times worse on a simple tcp benchmark. So now we do this the hard | ||
| 81 | * way. */ | ||
| 82 | pushl %eax | ||
| 83 | movl $LHCALL_SEND_INTERRUPTS, %eax | ||
| 84 | /* This is a vmcall instruction (same thing that KVM uses). Older | ||
| 85 | * assembler versions might not know the "vmcall" instruction, so we | ||
| 86 | * create one manually here. */ | ||
| 87 | .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ | ||
| 88 | popl %eax | ||
| 89 | ret | ||
| 90 | |||
| 91 | /* Finally, the "popf" or "restore flags" routine. The %eax register holds the | ||
| 92 | * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're | ||
| 93 | * enabling interrupts again, if it's 0 we're leaving them off. */ | ||
| 94 | ENTRY(lg_restore_fl) | ||
| 95 | /* This is just "lguest_data.irq_enabled = flags;" */ | ||
| 96 | movl %eax, lguest_data+LGUEST_DATA_irq_enabled | ||
| 97 | /* Now, if the %eax value has enabled interrupts and | ||
| 98 | * lguest_data.irq_pending is set, we want to tell the Host so it can | ||
| 99 | * deliver any outstanding interrupts. Fortunately, both values will | ||
| 100 | * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl" | ||
| 101 | * instruction will AND them together for us. If both are set, we | ||
| 102 | * jump to send_interrupts. */ | ||
| 103 | testl lguest_data+LGUEST_DATA_irq_pending, %eax | ||
| 104 | jnz send_interrupts | ||
| 105 | /* Again, the normal path has used no extra registers. Clever, huh? */ | ||
| 106 | ret | ||
| 53 | 107 | ||
| 54 | /* These demark the EIP range where host should never deliver interrupts. */ | 108 | /* These demark the EIP range where host should never deliver interrupts. */ |
| 55 | .global lguest_noirq_start | 109 | .global lguest_noirq_start |
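Expressed in C, the two routines above boil down to the following (only a sketch: the whole point of hand-written assembler is that the common path clobbers no registers, which a compiler will not promise):

    void lg_irq_enable_in_c(void)                  /* illustrative name */
    {
            lguest_data.irq_enabled = X86_EFLAGS_IF;
            if (lguest_data.irq_pending)
                    kvm_hypercall0(LHCALL_SEND_INTERRUPTS);
    }

    void lg_restore_fl_in_c(unsigned long flags)   /* illustrative name */
    {
            lguest_data.irq_enabled = flags;
            if (flags & lguest_data.irq_pending)
                    kvm_hypercall0(LHCALL_SEND_INTERRUPTS);
    }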
diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig index a3d3cbab359a..0aaa0597a622 100644 --- a/drivers/lguest/Kconfig +++ b/drivers/lguest/Kconfig | |||
| @@ -1,6 +1,6 @@ | |||
| 1 | config LGUEST | 1 | config LGUEST |
| 2 | tristate "Linux hypervisor example code" | 2 | tristate "Linux hypervisor example code" |
| 3 | depends on X86_32 && EXPERIMENTAL && !X86_PAE && FUTEX | 3 | depends on X86_32 && EXPERIMENTAL && EVENTFD |
| 4 | select HVC_DRIVER | 4 | select HVC_DRIVER |
| 5 | ---help--- | 5 | ---help--- |
| 6 | This is a very simple module which allows you to run | 6 | This is a very simple module which allows you to run |
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index 4845fb3cf74b..a6974e9b8ebf 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c | |||
| @@ -95,7 +95,7 @@ static __init int map_switcher(void) | |||
| 95 | * array of struct pages. It increments that pointer, but we don't | 95 | * array of struct pages. It increments that pointer, but we don't |
| 96 | * care. */ | 96 | * care. */ |
| 97 | pagep = switcher_page; | 97 | pagep = switcher_page; |
| 98 | err = map_vm_area(switcher_vma, PAGE_KERNEL, &pagep); | 98 | err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep); |
| 99 | if (err) { | 99 | if (err) { |
| 100 | printk("lguest: map_vm_area failed: %i\n", err); | 100 | printk("lguest: map_vm_area failed: %i\n", err); |
| 101 | goto free_vma; | 101 | goto free_vma; |
| @@ -188,6 +188,9 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) | |||
| 188 | { | 188 | { |
| 189 | /* We stop running once the Guest is dead. */ | 189 | /* We stop running once the Guest is dead. */ |
| 190 | while (!cpu->lg->dead) { | 190 | while (!cpu->lg->dead) { |
| 191 | unsigned int irq; | ||
| 192 | bool more; | ||
| 193 | |||
| 191 | /* First we run any hypercalls the Guest wants done. */ | 194 | /* First we run any hypercalls the Guest wants done. */ |
| 192 | if (cpu->hcall) | 195 | if (cpu->hcall) |
| 193 | do_hypercalls(cpu); | 196 | do_hypercalls(cpu); |
| @@ -195,23 +198,23 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) | |||
| 195 | /* It's possible the Guest did a NOTIFY hypercall to the | 198 | /* It's possible the Guest did a NOTIFY hypercall to the |
| 196 | * Launcher, in which case we return from the read() now. */ | 199 | * Launcher, in which case we return from the read() now. */ |
| 197 | if (cpu->pending_notify) { | 200 | if (cpu->pending_notify) { |
| 198 | if (put_user(cpu->pending_notify, user)) | 201 | if (!send_notify_to_eventfd(cpu)) { |
| 199 | return -EFAULT; | 202 | if (put_user(cpu->pending_notify, user)) |
| 200 | return sizeof(cpu->pending_notify); | 203 | return -EFAULT; |
| 204 | return sizeof(cpu->pending_notify); | ||
| 205 | } | ||
| 201 | } | 206 | } |
| 202 | 207 | ||
| 203 | /* Check for signals */ | 208 | /* Check for signals */ |
| 204 | if (signal_pending(current)) | 209 | if (signal_pending(current)) |
| 205 | return -ERESTARTSYS; | 210 | return -ERESTARTSYS; |
| 206 | 211 | ||
| 207 | /* If Waker set break_out, return to Launcher. */ | ||
| 208 | if (cpu->break_out) | ||
| 209 | return -EAGAIN; | ||
| 210 | |||
| 211 | /* Check if there are any interrupts which can be delivered now: | 212 | /* Check if there are any interrupts which can be delivered now: |
| 212 | * if so, this sets up the handler to be executed when we next | 213 | * if so, this sets up the handler to be executed when we next |
| 213 | * run the Guest. */ | 214 | * run the Guest. */ |
| 214 | maybe_do_interrupt(cpu); | 215 | irq = interrupt_pending(cpu, &more); |
| 216 | if (irq < LGUEST_IRQS) | ||
| 217 | try_deliver_interrupt(cpu, irq, more); | ||
| 215 | 218 | ||
| 216 | /* All long-lived kernel loops need to check with this horrible | 219 | /* All long-lived kernel loops need to check with this horrible |
| 217 | * thing called the freezer. If the Host is trying to suspend, | 220 | * thing called the freezer. If the Host is trying to suspend, |
| @@ -224,10 +227,15 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) | |||
| 224 | break; | 227 | break; |
| 225 | 228 | ||
| 226 | /* If the Guest asked to be stopped, we sleep. The Guest's | 229 | /* If the Guest asked to be stopped, we sleep. The Guest's |
| 227 | * clock timer or LHREQ_BREAK from the Waker will wake us. */ | 230 | * clock timer will wake us. */ |
| 228 | if (cpu->halted) { | 231 | if (cpu->halted) { |
| 229 | set_current_state(TASK_INTERRUPTIBLE); | 232 | set_current_state(TASK_INTERRUPTIBLE); |
| 230 | schedule(); | 233 | /* Just before we sleep, make sure no interrupt snuck in |
| 234 | * which we should be delivering. */ | ||
| 235 | if (interrupt_pending(cpu, &more) < LGUEST_IRQS) | ||
| 236 | set_current_state(TASK_RUNNING); | ||
| 237 | else | ||
| 238 | schedule(); | ||
| 231 | continue; | 239 | continue; |
| 232 | } | 240 | } |
| 233 | 241 | ||
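As a rough illustration of the protocol run_guest() implements for the Launcher, here is a hypothetical userspace read loop; the helper name and error handling are invented for the sketch, and the real Launcher in Documentation/lguest/lguest.c does rather more:

#include <unistd.h>
#include <stdio.h>

/* lguest_fd is assumed to be an open /dev/lguest descriptor on which
 * LHREQ_INITIALIZE has already been written. */
static void launcher_read_loop_sketch(int lguest_fd)
{
	unsigned long notify_addr;
	ssize_t r;

	for (;;) {
		/* This blocks while the Guest runs; it returns when the Guest
		 * hits a NOTIFY with no eventfd attached (or on error). */
		r = read(lguest_fd, &notify_addr, sizeof(notify_addr));
		if (r == sizeof(notify_addr)) {
			printf("NOTIFY at guest address %#lx\n", notify_addr);
			continue;
		}
		if (r < 0)
			break;	/* e.g. a signal, or the Guest died */
	}
}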
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c index 54d66f05fefa..c29ffa19cb74 100644 --- a/drivers/lguest/hypercalls.c +++ b/drivers/lguest/hypercalls.c | |||
| @@ -37,6 +37,10 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) | |||
| 37 | /* This call does nothing, except by breaking out of the Guest | 37 | /* This call does nothing, except by breaking out of the Guest |
| 38 | * it makes us process all the asynchronous hypercalls. */ | 38 | * it makes us process all the asynchronous hypercalls. */ |
| 39 | break; | 39 | break; |
| 40 | case LHCALL_SEND_INTERRUPTS: | ||
| 41 | /* This call does nothing too, but by breaking out of the Guest | ||
| 42 | * it makes us process any pending interrupts. */ | ||
| 43 | break; | ||
| 40 | case LHCALL_LGUEST_INIT: | 44 | case LHCALL_LGUEST_INIT: |
| 41 | /* You can't get here unless you're already initialized. Don't | 45 | /* You can't get here unless you're already initialized. Don't |
| 42 | * do that. */ | 46 | * do that. */ |
| @@ -73,11 +77,21 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) | |||
| 73 | guest_set_stack(cpu, args->arg1, args->arg2, args->arg3); | 77 | guest_set_stack(cpu, args->arg1, args->arg2, args->arg3); |
| 74 | break; | 78 | break; |
| 75 | case LHCALL_SET_PTE: | 79 | case LHCALL_SET_PTE: |
| 80 | #ifdef CONFIG_X86_PAE | ||
| 81 | guest_set_pte(cpu, args->arg1, args->arg2, | ||
| 82 | __pte(args->arg3 | (u64)args->arg4 << 32)); | ||
| 83 | #else | ||
| 76 | guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3)); | 84 | guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3)); |
| 85 | #endif | ||
| 86 | break; | ||
| 87 | case LHCALL_SET_PGD: | ||
| 88 | guest_set_pgd(cpu->lg, args->arg1, args->arg2); | ||
| 77 | break; | 89 | break; |
| 90 | #ifdef CONFIG_X86_PAE | ||
| 78 | case LHCALL_SET_PMD: | 91 | case LHCALL_SET_PMD: |
| 79 | guest_set_pmd(cpu->lg, args->arg1, args->arg2); | 92 | guest_set_pmd(cpu->lg, args->arg1, args->arg2); |
| 80 | break; | 93 | break; |
| 94 | #endif | ||
| 81 | case LHCALL_SET_CLOCKEVENT: | 95 | case LHCALL_SET_CLOCKEVENT: |
| 82 | guest_set_clockevent(cpu, args->arg1); | 96 | guest_set_clockevent(cpu, args->arg1); |
| 83 | break; | 97 | break; |
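The PAE case above rebuilds a 64-bit pte from two 32-bit hypercall arguments (arg3 is the low word, arg4 the high word). A standalone sketch of the split and recombine, with a made-up pte value:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t pte = 0x8000000012345067ULL;	/* made-up PAE pte: NX + pfn + flags */

	/* Guest side: split the 64-bit value into two 32-bit hypercall args. */
	uint32_t arg3 = (uint32_t)pte;		/* low word */
	uint32_t arg4 = (uint32_t)(pte >> 32);	/* high word */

	/* Host side: recombine, as in __pte(args->arg3 | (u64)args->arg4 << 32). */
	uint64_t rebuilt = (uint64_t)arg3 | ((uint64_t)arg4 << 32);

	printf("round trip %s\n", rebuilt == pte ? "ok" : "broken");
	return 0;
}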
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c index 6e99adbe1946..0e9067b0d507 100644 --- a/drivers/lguest/interrupts_and_traps.c +++ b/drivers/lguest/interrupts_and_traps.c | |||
| @@ -128,30 +128,39 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, | |||
| 128 | /*H:205 | 128 | /*H:205 |
| 129 | * Virtual Interrupts. | 129 | * Virtual Interrupts. |
| 130 | * | 130 | * |
| 131 | * maybe_do_interrupt() gets called before every entry to the Guest, to see if | 131 | * interrupt_pending() returns the first pending interrupt which isn't blocked |
| 132 | * we should divert the Guest to running an interrupt handler. */ | 132 | * by the Guest. It is called before every entry to the Guest, and just before |
| 133 | void maybe_do_interrupt(struct lg_cpu *cpu) | 133 | * we go to sleep when the Guest has halted itself. */ |
| 134 | unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more) | ||
| 134 | { | 135 | { |
| 135 | unsigned int irq; | 136 | unsigned int irq; |
| 136 | DECLARE_BITMAP(blk, LGUEST_IRQS); | 137 | DECLARE_BITMAP(blk, LGUEST_IRQS); |
| 137 | struct desc_struct *idt; | ||
| 138 | 138 | ||
| 139 | /* If the Guest hasn't even initialized yet, we can do nothing. */ | 139 | /* If the Guest hasn't even initialized yet, we can do nothing. */ |
| 140 | if (!cpu->lg->lguest_data) | 140 | if (!cpu->lg->lguest_data) |
| 141 | return; | 141 | return LGUEST_IRQS; |
| 142 | 142 | ||
| 143 | /* Take our "irqs_pending" array and remove any interrupts the Guest | 143 | /* Take our "irqs_pending" array and remove any interrupts the Guest |
| 144 | * wants blocked: the result ends up in "blk". */ | 144 | * wants blocked: the result ends up in "blk". */ |
| 145 | if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts, | 145 | if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts, |
| 146 | sizeof(blk))) | 146 | sizeof(blk))) |
| 147 | return; | 147 | return LGUEST_IRQS; |
| 148 | bitmap_andnot(blk, cpu->irqs_pending, blk, LGUEST_IRQS); | 148 | bitmap_andnot(blk, cpu->irqs_pending, blk, LGUEST_IRQS); |
| 149 | 149 | ||
| 150 | /* Find the first interrupt. */ | 150 | /* Find the first interrupt. */ |
| 151 | irq = find_first_bit(blk, LGUEST_IRQS); | 151 | irq = find_first_bit(blk, LGUEST_IRQS); |
| 152 | /* None? Nothing to do */ | 152 | *more = find_next_bit(blk, LGUEST_IRQS, irq+1); |
| 153 | if (irq >= LGUEST_IRQS) | 153 | |
| 154 | return; | 154 | return irq; |
| 155 | } | ||
| 156 | |||
| 157 | /* This actually diverts the Guest to running an interrupt handler, once an | ||
| 158 | * interrupt has been identified by interrupt_pending(). */ | ||
| 159 | void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more) | ||
| 160 | { | ||
| 161 | struct desc_struct *idt; | ||
| 162 | |||
| 163 | BUG_ON(irq >= LGUEST_IRQS); | ||
| 155 | 164 | ||
| 156 | /* They may be in the middle of an iret, where they asked us never to | 165 | /* They may be in the middle of an iret, where they asked us never to |
| 157 | * deliver interrupts. */ | 166 | * deliver interrupts. */ |
| @@ -170,8 +179,12 @@ void maybe_do_interrupt(struct lg_cpu *cpu) | |||
| 170 | u32 irq_enabled; | 179 | u32 irq_enabled; |
| 171 | if (get_user(irq_enabled, &cpu->lg->lguest_data->irq_enabled)) | 180 | if (get_user(irq_enabled, &cpu->lg->lguest_data->irq_enabled)) |
| 172 | irq_enabled = 0; | 181 | irq_enabled = 0; |
| 173 | if (!irq_enabled) | 182 | if (!irq_enabled) { |
| 183 | /* Make sure they know an IRQ is pending. */ | ||
| 184 | put_user(X86_EFLAGS_IF, | ||
| 185 | &cpu->lg->lguest_data->irq_pending); | ||
| 174 | return; | 186 | return; |
| 187 | } | ||
| 175 | } | 188 | } |
| 176 | 189 | ||
| 177 | /* Look at the IDT entry the Guest gave us for this interrupt. The | 190 | /* Look at the IDT entry the Guest gave us for this interrupt. The |
| @@ -194,6 +207,25 @@ void maybe_do_interrupt(struct lg_cpu *cpu) | |||
| 194 | * here is a compromise which means at least it gets updated every | 207 | * here is a compromise which means at least it gets updated every |
| 195 | * timer interrupt. */ | 208 | * timer interrupt. */ |
| 196 | write_timestamp(cpu); | 209 | write_timestamp(cpu); |
| 210 | |||
| 211 | /* If there are no other interrupts we want to deliver, clear | ||
| 212 | * the pending flag. */ | ||
| 213 | if (!more) | ||
| 214 | put_user(0, &cpu->lg->lguest_data->irq_pending); | ||
| 215 | } | ||
| 216 | |||
| 217 | /* And this is the routine we use when we want to set an interrupt for the Guest. */ | ||
| 218 | void set_interrupt(struct lg_cpu *cpu, unsigned int irq) | ||
| 219 | { | ||
| 220 | /* Next time the Guest runs, the core code will see if it can deliver | ||
| 221 | * this interrupt. */ | ||
| 222 | set_bit(irq, cpu->irqs_pending); | ||
| 223 | |||
| 224 | /* Make sure it sees it; it might be asleep (eg. halted), or | ||
| 225 | * running the Guest right now, in which case kick_process() | ||
| 226 | * will knock it out. */ | ||
| 227 | if (!wake_up_process(cpu->tsk)) | ||
| 228 | kick_process(cpu->tsk); | ||
| 197 | } | 229 | } |
| 198 | /*:*/ | 230 | /*:*/ |
| 199 | 231 | ||
| @@ -510,10 +542,7 @@ static enum hrtimer_restart clockdev_fn(struct hrtimer *timer) | |||
| 510 | struct lg_cpu *cpu = container_of(timer, struct lg_cpu, hrt); | 542 | struct lg_cpu *cpu = container_of(timer, struct lg_cpu, hrt); |
| 511 | 543 | ||
| 512 | /* Remember the first interrupt is the timer interrupt. */ | 544 | /* Remember the first interrupt is the timer interrupt. */ |
| 513 | set_bit(0, cpu->irqs_pending); | 545 | set_interrupt(cpu, 0); |
| 514 | /* If the Guest is actually stopped, we need to wake it up. */ | ||
| 515 | if (cpu->halted) | ||
| 516 | wake_up_process(cpu->tsk); | ||
| 517 | return HRTIMER_NORESTART; | 546 | return HRTIMER_NORESTART; |
| 518 | } | 547 | } |
| 519 | 548 | ||
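The core of interrupt_pending() is "pending minus blocked, then the first set bit and whether another follows". Here is a standalone sketch of that bitmap arithmetic using one plain word instead of the kernel's bitmap_andnot()/find_first_bit()/find_next_bit(); the interrupt numbers are made up:

#include <stdio.h>

#define LGUEST_IRQS 32	/* one word is enough for this sketch */

int main(void)
{
	unsigned long pending = (1UL << 1) | (1UL << 4) | (1UL << 9);
	unsigned long blocked = (1UL << 1);
	unsigned long blk = pending & ~blocked;	/* bitmap_andnot() */
	unsigned int irq, first = LGUEST_IRQS, more = 0;

	for (irq = 0; irq < LGUEST_IRQS; irq++) {
		if (!(blk & (1UL << irq)))
			continue;
		if (first == LGUEST_IRQS)
			first = irq;		/* find_first_bit() */
		else
			more = 1;		/* find_next_bit() found another */
	}

	printf("deliver irq %u, more pending: %u\n", first, more);
	return 0;
}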
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index af92a176697f..d4e8979735cb 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h | |||
| @@ -49,7 +49,7 @@ struct lg_cpu { | |||
| 49 | u32 cr2; | 49 | u32 cr2; |
| 50 | int ts; | 50 | int ts; |
| 51 | u32 esp1; | 51 | u32 esp1; |
| 52 | u8 ss1; | 52 | u16 ss1; |
| 53 | 53 | ||
| 54 | /* Bitmap of what has changed: see CHANGED_* above. */ | 54 | /* Bitmap of what has changed: see CHANGED_* above. */ |
| 55 | int changed; | 55 | int changed; |
| @@ -71,9 +71,7 @@ struct lg_cpu { | |||
| 71 | /* Virtual clock device */ | 71 | /* Virtual clock device */ |
| 72 | struct hrtimer hrt; | 72 | struct hrtimer hrt; |
| 73 | 73 | ||
| 74 | /* Do we need to stop what we're doing and return to userspace? */ | 74 | /* Did the Guest tell us to halt? */ |
| 75 | int break_out; | ||
| 76 | wait_queue_head_t break_wq; | ||
| 77 | int halted; | 75 | int halted; |
| 78 | 76 | ||
| 79 | /* Pending virtual interrupts */ | 77 | /* Pending virtual interrupts */ |
| @@ -82,6 +80,16 @@ struct lg_cpu { | |||
| 82 | struct lg_cpu_arch arch; | 80 | struct lg_cpu_arch arch; |
| 83 | }; | 81 | }; |
| 84 | 82 | ||
| 83 | struct lg_eventfd { | ||
| 84 | unsigned long addr; | ||
| 85 | struct file *event; | ||
| 86 | }; | ||
| 87 | |||
| 88 | struct lg_eventfd_map { | ||
| 89 | unsigned int num; | ||
| 90 | struct lg_eventfd map[]; | ||
| 91 | }; | ||
| 92 | |||
| 85 | /* The private info the thread maintains about the guest. */ | 93 | /* The private info the thread maintains about the guest. */ |
| 86 | struct lguest | 94 | struct lguest |
| 87 | { | 95 | { |
| @@ -102,6 +110,8 @@ struct lguest | |||
| 102 | unsigned int stack_pages; | 110 | unsigned int stack_pages; |
| 103 | u32 tsc_khz; | 111 | u32 tsc_khz; |
| 104 | 112 | ||
| 113 | struct lg_eventfd_map *eventfds; | ||
| 114 | |||
| 105 | /* Dead? */ | 115 | /* Dead? */ |
| 106 | const char *dead; | 116 | const char *dead; |
| 107 | }; | 117 | }; |
| @@ -137,9 +147,13 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user); | |||
| 137 | * in the kernel. */ | 147 | * in the kernel. */ |
| 138 | #define pgd_flags(x) (pgd_val(x) & ~PAGE_MASK) | 148 | #define pgd_flags(x) (pgd_val(x) & ~PAGE_MASK) |
| 139 | #define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT) | 149 | #define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT) |
| 150 | #define pmd_flags(x) (pmd_val(x) & ~PAGE_MASK) | ||
| 151 | #define pmd_pfn(x) (pmd_val(x) >> PAGE_SHIFT) | ||
| 140 | 152 | ||
| 141 | /* interrupts_and_traps.c: */ | 153 | /* interrupts_and_traps.c: */ |
| 142 | void maybe_do_interrupt(struct lg_cpu *cpu); | 154 | unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more); |
| 155 | void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more); | ||
| 156 | void set_interrupt(struct lg_cpu *cpu, unsigned int irq); | ||
| 143 | bool deliver_trap(struct lg_cpu *cpu, unsigned int num); | 157 | bool deliver_trap(struct lg_cpu *cpu, unsigned int num); |
| 144 | void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i, | 158 | void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i, |
| 145 | u32 low, u32 hi); | 159 | u32 low, u32 hi); |
| @@ -150,6 +164,7 @@ void setup_default_idt_entries(struct lguest_ro_state *state, | |||
| 150 | void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, | 164 | void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, |
| 151 | const unsigned long *def); | 165 | const unsigned long *def); |
| 152 | void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta); | 166 | void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta); |
| 167 | bool send_notify_to_eventfd(struct lg_cpu *cpu); | ||
| 153 | void init_clockdev(struct lg_cpu *cpu); | 168 | void init_clockdev(struct lg_cpu *cpu); |
| 154 | bool check_syscall_vector(struct lguest *lg); | 169 | bool check_syscall_vector(struct lguest *lg); |
| 155 | int init_interrupts(void); | 170 | int init_interrupts(void); |
| @@ -168,7 +183,10 @@ void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt); | |||
| 168 | int init_guest_pagetable(struct lguest *lg); | 183 | int init_guest_pagetable(struct lguest *lg); |
| 169 | void free_guest_pagetable(struct lguest *lg); | 184 | void free_guest_pagetable(struct lguest *lg); |
| 170 | void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable); | 185 | void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable); |
| 186 | void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 i); | ||
| 187 | #ifdef CONFIG_X86_PAE | ||
| 171 | void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i); | 188 | void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i); |
| 189 | #endif | ||
| 172 | void guest_pagetable_clear_all(struct lg_cpu *cpu); | 190 | void guest_pagetable_clear_all(struct lg_cpu *cpu); |
| 173 | void guest_pagetable_flush_user(struct lg_cpu *cpu); | 191 | void guest_pagetable_flush_user(struct lg_cpu *cpu); |
| 174 | void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir, | 192 | void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir, |
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index b8ee103eed5f..32e297121058 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c | |||
| @@ -7,32 +7,83 @@ | |||
| 7 | #include <linux/miscdevice.h> | 7 | #include <linux/miscdevice.h> |
| 8 | #include <linux/fs.h> | 8 | #include <linux/fs.h> |
| 9 | #include <linux/sched.h> | 9 | #include <linux/sched.h> |
| 10 | #include <linux/eventfd.h> | ||
| 11 | #include <linux/file.h> | ||
| 10 | #include "lg.h" | 12 | #include "lg.h" |
| 11 | 13 | ||
| 12 | /*L:055 When something happens, the Waker process needs a way to stop the | 14 | bool send_notify_to_eventfd(struct lg_cpu *cpu) |
| 13 | * kernel running the Guest and return to the Launcher. So the Waker writes | ||
| 14 | * LHREQ_BREAK and the value "1" to /dev/lguest to do this. Once the Launcher | ||
| 15 | * has done whatever needs attention, it writes LHREQ_BREAK and "0" to release | ||
| 16 | * the Waker. */ | ||
| 17 | static int break_guest_out(struct lg_cpu *cpu, const unsigned long __user*input) | ||
| 18 | { | 15 | { |
| 19 | unsigned long on; | 16 | unsigned int i; |
| 17 | struct lg_eventfd_map *map; | ||
| 18 | |||
| 19 | /* lg->eventfds is RCU-protected */ | ||
| 20 | rcu_read_lock(); | ||
| 21 | map = rcu_dereference(cpu->lg->eventfds); | ||
| 22 | for (i = 0; i < map->num; i++) { | ||
| 23 | if (map->map[i].addr == cpu->pending_notify) { | ||
| 24 | eventfd_signal(map->map[i].event, 1); | ||
| 25 | cpu->pending_notify = 0; | ||
| 26 | break; | ||
| 27 | } | ||
| 28 | } | ||
| 29 | rcu_read_unlock(); | ||
| 30 | return cpu->pending_notify == 0; | ||
| 31 | } | ||
| 20 | 32 | ||
| 21 | /* Fetch whether they're turning break on or off. */ | 33 | static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) |
| 22 | if (get_user(on, input) != 0) | 34 | { |
| 23 | return -EFAULT; | 35 | struct lg_eventfd_map *new, *old = lg->eventfds; |
| 24 | 36 | ||
| 25 | if (on) { | 37 | if (!addr) |
| 26 | cpu->break_out = 1; | 38 | return -EINVAL; |
| 27 | /* Pop it out of the Guest (may be running on different CPU) */ | 39 | |
| 28 | wake_up_process(cpu->tsk); | 40 | /* Replace the old array with the new one, carefully: others can |
| 29 | /* Wait for them to reset it */ | 41 | * be accessing it at the same time */ |
| 30 | return wait_event_interruptible(cpu->break_wq, !cpu->break_out); | 42 | new = kmalloc(sizeof(*new) + sizeof(new->map[0]) * (old->num + 1), |
| 31 | } else { | 43 | GFP_KERNEL); |
| 32 | cpu->break_out = 0; | 44 | if (!new) |
| 33 | wake_up(&cpu->break_wq); | 45 | return -ENOMEM; |
| 34 | return 0; | 46 | |
| 47 | /* First make identical copy. */ | ||
| 48 | memcpy(new->map, old->map, sizeof(old->map[0]) * old->num); | ||
| 49 | new->num = old->num; | ||
| 50 | |||
| 51 | /* Now append new entry. */ | ||
| 52 | new->map[new->num].addr = addr; | ||
| 53 | new->map[new->num].event = eventfd_fget(fd); | ||
| 54 | if (IS_ERR(new->map[new->num].event)) { | ||
| 55 | kfree(new); | ||
| 56 | return PTR_ERR(new->map[new->num].event); | ||
| 35 | } | 57 | } |
| 58 | new->num++; | ||
| 59 | |||
| 60 | /* Now put new one in place. */ | ||
| 61 | rcu_assign_pointer(lg->eventfds, new); | ||
| 62 | |||
| 63 | /* We're not in a big hurry. Wait until no one's looking at the old | ||
| 64 | * version, then delete it. */ | ||
| 65 | synchronize_rcu(); | ||
| 66 | kfree(old); | ||
| 67 | |||
| 68 | return 0; | ||
| 69 | } | ||
| 70 | |||
| 71 | static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) | ||
| 72 | { | ||
| 73 | unsigned long addr, fd; | ||
| 74 | int err; | ||
| 75 | |||
| 76 | if (get_user(addr, input) != 0) | ||
| 77 | return -EFAULT; | ||
| 78 | input++; | ||
| 79 | if (get_user(fd, input) != 0) | ||
| 80 | return -EFAULT; | ||
| 81 | |||
| 82 | mutex_lock(&lguest_lock); | ||
| 83 | err = add_eventfd(lg, addr, fd); | ||
| 84 | mutex_unlock(&lguest_lock); | ||
| 85 | |||
| 86 | return err; | ||
| 36 | } | 87 | } |
| 37 | 88 | ||
| 38 | /*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt | 89 | /*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt |
| @@ -45,9 +96,8 @@ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) | |||
| 45 | return -EFAULT; | 96 | return -EFAULT; |
| 46 | if (irq >= LGUEST_IRQS) | 97 | if (irq >= LGUEST_IRQS) |
| 47 | return -EINVAL; | 98 | return -EINVAL; |
| 48 | /* Next time the Guest runs, the core code will see if it can deliver | 99 | |
| 49 | * this interrupt. */ | 100 | set_interrupt(cpu, irq); |
| 50 | set_bit(irq, cpu->irqs_pending); | ||
| 51 | return 0; | 101 | return 0; |
| 52 | } | 102 | } |
| 53 | 103 | ||
| @@ -126,9 +176,6 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) | |||
| 126 | * address. */ | 176 | * address. */ |
| 127 | lguest_arch_setup_regs(cpu, start_ip); | 177 | lguest_arch_setup_regs(cpu, start_ip); |
| 128 | 178 | ||
| 129 | /* Initialize the queue for the Waker to wait on */ | ||
| 130 | init_waitqueue_head(&cpu->break_wq); | ||
| 131 | |||
| 132 | /* We keep a pointer to the Launcher task (ie. current task) for when | 179 | /* We keep a pointer to the Launcher task (ie. current task) for when |
| 133 | * other Guests want to wake this one (eg. console input). */ | 180 | * other Guests want to wake this one (eg. console input). */ |
| 134 | cpu->tsk = current; | 181 | cpu->tsk = current; |
| @@ -185,6 +232,13 @@ static int initialize(struct file *file, const unsigned long __user *input) | |||
| 185 | goto unlock; | 232 | goto unlock; |
| 186 | } | 233 | } |
| 187 | 234 | ||
| 235 | lg->eventfds = kmalloc(sizeof(*lg->eventfds), GFP_KERNEL); | ||
| 236 | if (!lg->eventfds) { | ||
| 237 | err = -ENOMEM; | ||
| 238 | goto free_lg; | ||
| 239 | } | ||
| 240 | lg->eventfds->num = 0; | ||
| 241 | |||
| 188 | /* Populate the easy fields of our "struct lguest" */ | 242 | /* Populate the easy fields of our "struct lguest" */ |
| 189 | lg->mem_base = (void __user *)args[0]; | 243 | lg->mem_base = (void __user *)args[0]; |
| 190 | lg->pfn_limit = args[1]; | 244 | lg->pfn_limit = args[1]; |
| @@ -192,7 +246,7 @@ static int initialize(struct file *file, const unsigned long __user *input) | |||
| 192 | /* This is the first cpu (cpu 0) and it will start booting at args[2] */ | 246 | /* This is the first cpu (cpu 0) and it will start booting at args[2] */ |
| 193 | err = lg_cpu_start(&lg->cpus[0], 0, args[2]); | 247 | err = lg_cpu_start(&lg->cpus[0], 0, args[2]); |
| 194 | if (err) | 248 | if (err) |
| 195 | goto release_guest; | 249 | goto free_eventfds; |
| 196 | 250 | ||
| 197 | /* Initialize the Guest's shadow page tables, using the toplevel | 251 | /* Initialize the Guest's shadow page tables, using the toplevel |
| 198 | * address the Launcher gave us. This allocates memory, so can fail. */ | 252 | * address the Launcher gave us. This allocates memory, so can fail. */ |
| @@ -211,7 +265,9 @@ static int initialize(struct file *file, const unsigned long __user *input) | |||
| 211 | free_regs: | 265 | free_regs: |
| 212 | /* FIXME: This should be in free_vcpu */ | 266 | /* FIXME: This should be in free_vcpu */ |
| 213 | free_page(lg->cpus[0].regs_page); | 267 | free_page(lg->cpus[0].regs_page); |
| 214 | release_guest: | 268 | free_eventfds: |
| 269 | kfree(lg->eventfds); | ||
| 270 | free_lg: | ||
| 215 | kfree(lg); | 271 | kfree(lg); |
| 216 | unlock: | 272 | unlock: |
| 217 | mutex_unlock(&lguest_lock); | 273 | mutex_unlock(&lguest_lock); |
| @@ -252,11 +308,6 @@ static ssize_t write(struct file *file, const char __user *in, | |||
| 252 | /* Once the Guest is dead, you can only read() why it died. */ | 308 | /* Once the Guest is dead, you can only read() why it died. */ |
| 253 | if (lg->dead) | 309 | if (lg->dead) |
| 254 | return -ENOENT; | 310 | return -ENOENT; |
| 255 | |||
| 256 | /* If you're not the task which owns the Guest, all you can do | ||
| 257 | * is break the Launcher out of running the Guest. */ | ||
| 258 | if (current != cpu->tsk && req != LHREQ_BREAK) | ||
| 259 | return -EPERM; | ||
| 260 | } | 311 | } |
| 261 | 312 | ||
| 262 | switch (req) { | 313 | switch (req) { |
| @@ -264,8 +315,8 @@ static ssize_t write(struct file *file, const char __user *in, | |||
| 264 | return initialize(file, input); | 315 | return initialize(file, input); |
| 265 | case LHREQ_IRQ: | 316 | case LHREQ_IRQ: |
| 266 | return user_send_irq(cpu, input); | 317 | return user_send_irq(cpu, input); |
| 267 | case LHREQ_BREAK: | 318 | case LHREQ_EVENTFD: |
| 268 | return break_guest_out(cpu, input); | 319 | return attach_eventfd(lg, input); |
| 269 | default: | 320 | default: |
| 270 | return -EINVAL; | 321 | return -EINVAL; |
| 271 | } | 322 | } |
| @@ -303,6 +354,12 @@ static int close(struct inode *inode, struct file *file) | |||
| 303 | * the Launcher's memory management structure. */ | 354 | * the Launcher's memory management structure. */ |
| 304 | mmput(lg->cpus[i].mm); | 355 | mmput(lg->cpus[i].mm); |
| 305 | } | 356 | } |
| 357 | |||
| 358 | /* Release any eventfds they registered. */ | ||
| 359 | for (i = 0; i < lg->eventfds->num; i++) | ||
| 360 | fput(lg->eventfds->map[i].event); | ||
| 361 | kfree(lg->eventfds); | ||
| 362 | |||
| 306 | /* If lg->dead doesn't contain an error code it will be NULL or a | 363 | /* If lg->dead doesn't contain an error code it will be NULL or a |
| 307 | * kmalloc()ed string, either of which is ok to hand to kfree(). */ | 364 | * kmalloc()ed string, either of which is ok to hand to kfree(). */ |
| 308 | if (!IS_ERR(lg->dead)) | 365 | if (!IS_ERR(lg->dead)) |
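From the Launcher's point of view, attaching an eventfd is just another write() to /dev/lguest. A hypothetical userspace sketch (the helper name is invented; LHREQ_EVENTFD and the three-word format mirror attach_eventfd() above):

#include <sys/eventfd.h>
#include <linux/lguest_launcher.h>
#include <unistd.h>

/* Returns the eventfd to poll for Guest notifications, or -1 on failure. */
static int attach_notify_eventfd(int lguest_fd, unsigned long addr)
{
	unsigned long args[3];
	int fd = eventfd(0, 0);

	if (fd < 0)
		return -1;

	/* Mirrors attach_eventfd() above: request word, address, then fd. */
	args[0] = LHREQ_EVENTFD;
	args[1] = addr;
	args[2] = fd;
	if (write(lguest_fd, args, sizeof(args)) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}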
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index a059cf9980f7..a6fe1abda240 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c | |||
| @@ -53,6 +53,17 @@ | |||
| 53 | * page. */ | 53 | * page. */ |
| 54 | #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) | 54 | #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) |
| 55 | 55 | ||
| 56 | /* For PAE we need the PMD index as well. We use the last 2MB, so we | ||
| 57 | * will need the last pmd entry of the last pmd page. */ | ||
| 58 | #ifdef CONFIG_X86_PAE | ||
| 59 | #define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1) | ||
| 60 | #define RESERVE_MEM 2U | ||
| 61 | #define CHECK_GPGD_MASK _PAGE_PRESENT | ||
| 62 | #else | ||
| 63 | #define RESERVE_MEM 4U | ||
| 64 | #define CHECK_GPGD_MASK _PAGE_TABLE | ||
| 65 | #endif | ||
| 66 | |||
| 56 | /* We actually need a separate PTE page for each CPU. Remember that after the | 67 | /* We actually need a separate PTE page for each CPU. Remember that after the |
| 57 | * Switcher code itself comes two pages for each CPU, and we don't want this | 68 | * Switcher code itself comes two pages for each CPU, and we don't want this |
| 58 | * CPU's guest to see the pages of any other CPU. */ | 69 | * CPU's guest to see the pages of any other CPU. */ |
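As a quick sanity check on RESERVE_MEM: without PAE the Switcher costs the Guest the top PGD entry (1024 PTEs, 4MB of virtual address space), with PAE only the top PMD entry (512 PTEs, 2MB). A tiny standalone sketch of that arithmetic:

#include <stdio.h>

int main(void)
{
	unsigned long page = 4096;
	unsigned long nonpae_span = 1024 * page;	/* PTEs under one PGD entry */
	unsigned long pae_span = 512 * page;		/* PTEs under one PMD entry */

	printf("non-PAE reservation: %luMB\n", nonpae_span >> 20);	/* 4 */
	printf("PAE reservation:     %luMB\n", pae_span >> 20);		/* 2 */
	return 0;
}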
| @@ -73,24 +84,59 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) | |||
| 73 | { | 84 | { |
| 74 | unsigned int index = pgd_index(vaddr); | 85 | unsigned int index = pgd_index(vaddr); |
| 75 | 86 | ||
| 87 | #ifndef CONFIG_X86_PAE | ||
| 76 | /* We kill any Guest trying to touch the Switcher addresses. */ | 88 | /* We kill any Guest trying to touch the Switcher addresses. */ |
| 77 | if (index >= SWITCHER_PGD_INDEX) { | 89 | if (index >= SWITCHER_PGD_INDEX) { |
| 78 | kill_guest(cpu, "attempt to access switcher pages"); | 90 | kill_guest(cpu, "attempt to access switcher pages"); |
| 79 | index = 0; | 91 | index = 0; |
| 80 | } | 92 | } |
| 93 | #endif | ||
| 81 | /* Return a pointer index'th pgd entry for the i'th page table. */ | 94 | /* Return a pointer index'th pgd entry for the i'th page table. */ |
| 82 | return &cpu->lg->pgdirs[i].pgdir[index]; | 95 | return &cpu->lg->pgdirs[i].pgdir[index]; |
| 83 | } | 96 | } |
| 84 | 97 | ||
| 98 | #ifdef CONFIG_X86_PAE | ||
| 99 | /* This routine then takes the PGD entry given above, which contains the | ||
| 100 | * address of the PMD page. It then returns a pointer to the PMD entry for the | ||
| 101 | * given address. */ | ||
| 102 | static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) | ||
| 103 | { | ||
| 104 | unsigned int index = pmd_index(vaddr); | ||
| 105 | pmd_t *page; | ||
| 106 | |||
| 107 | /* We kill any Guest trying to touch the Switcher addresses. */ | ||
| 108 | if (pgd_index(vaddr) == SWITCHER_PGD_INDEX && | ||
| 109 | index >= SWITCHER_PMD_INDEX) { | ||
| 110 | kill_guest(cpu, "attempt to access switcher pages"); | ||
| 111 | index = 0; | ||
| 112 | } | ||
| 113 | |||
| 114 | /* You should never call this if the PGD entry wasn't valid */ | ||
| 115 | BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); | ||
| 116 | page = __va(pgd_pfn(spgd) << PAGE_SHIFT); | ||
| 117 | |||
| 118 | return &page[index]; | ||
| 119 | } | ||
| 120 | #endif | ||
| 121 | |||
| 85 | /* This routine then takes the page directory entry returned above, which | 122 | /* This routine then takes the page directory entry returned above, which |
| 86 | * contains the address of the page table entry (PTE) page. It then returns a | 123 | * contains the address of the page table entry (PTE) page. It then returns a |
| 87 | * pointer to the PTE entry for the given address. */ | 124 | * pointer to the PTE entry for the given address. */ |
| 88 | static pte_t *spte_addr(pgd_t spgd, unsigned long vaddr) | 125 | static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) |
| 89 | { | 126 | { |
| 127 | #ifdef CONFIG_X86_PAE | ||
| 128 | pmd_t *pmd = spmd_addr(cpu, spgd, vaddr); | ||
| 129 | pte_t *page = __va(pmd_pfn(*pmd) << PAGE_SHIFT); | ||
| 130 | |||
| 131 | /* You should never call this if the PMD entry wasn't valid */ | ||
| 132 | BUG_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT)); | ||
| 133 | #else | ||
| 90 | pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT); | 134 | pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT); |
| 91 | /* You should never call this if the PGD entry wasn't valid */ | 135 | /* You should never call this if the PGD entry wasn't valid */ |
| 92 | BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); | 136 | BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); |
| 93 | return &page[(vaddr >> PAGE_SHIFT) % PTRS_PER_PTE]; | 137 | #endif |
| 138 | |||
| 139 | return &page[pte_index(vaddr)]; | ||
| 94 | } | 140 | } |
| 95 | 141 | ||
| 96 | /* These two functions are just like the above two, except they access the Guest | 142 | /* These two functions are just like the above two, except they access the Guest |
| @@ -101,12 +147,32 @@ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) | |||
| 101 | return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + index * sizeof(pgd_t); | 147 | return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + index * sizeof(pgd_t); |
| 102 | } | 148 | } |
| 103 | 149 | ||
| 104 | static unsigned long gpte_addr(pgd_t gpgd, unsigned long vaddr) | 150 | #ifdef CONFIG_X86_PAE |
| 151 | static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) | ||
| 152 | { | ||
| 153 | unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; | ||
| 154 | BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); | ||
| 155 | return gpage + pmd_index(vaddr) * sizeof(pmd_t); | ||
| 156 | } | ||
| 157 | |||
| 158 | static unsigned long gpte_addr(struct lg_cpu *cpu, | ||
| 159 | pmd_t gpmd, unsigned long vaddr) | ||
| 160 | { | ||
| 161 | unsigned long gpage = pmd_pfn(gpmd) << PAGE_SHIFT; | ||
| 162 | |||
| 163 | BUG_ON(!(pmd_flags(gpmd) & _PAGE_PRESENT)); | ||
| 164 | return gpage + pte_index(vaddr) * sizeof(pte_t); | ||
| 165 | } | ||
| 166 | #else | ||
| 167 | static unsigned long gpte_addr(struct lg_cpu *cpu, | ||
| 168 | pgd_t gpgd, unsigned long vaddr) | ||
| 105 | { | 169 | { |
| 106 | unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; | 170 | unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; |
| 171 | |||
| 107 | BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); | 172 | BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); |
| 108 | return gpage + ((vaddr>>PAGE_SHIFT) % PTRS_PER_PTE) * sizeof(pte_t); | 173 | return gpage + pte_index(vaddr) * sizeof(pte_t); |
| 109 | } | 174 | } |
| 175 | #endif | ||
| 110 | /*:*/ | 176 | /*:*/ |
| 111 | 177 | ||
| 112 | /*M:014 get_pfn is slow: we could probably try to grab batches of pages here as | 178 | /*M:014 get_pfn is slow: we could probably try to grab batches of pages here as |
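The pgd_index()/pmd_index()/pte_index() helpers used above are just shifts and masks. A standalone sketch with the standard 32-bit PAE split (2 + 9 + 9 + 12 bits); the constants are written out here for illustration rather than taken from kernel headers:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PMD_SHIFT	21	/* each PMD entry covers 2MB under PAE */
#define PGDIR_SHIFT	30	/* each PGD entry covers 1GB under PAE */
#define PTRS_PER_PMD	512
#define PTRS_PER_PTE	512

int main(void)
{
	unsigned long vaddr = 0xc0123456UL;	/* arbitrary example address */

	printf("pgd_index = %lu\n", vaddr >> PGDIR_SHIFT);
	printf("pmd_index = %lu\n", (vaddr >> PMD_SHIFT) & (PTRS_PER_PMD - 1));
	printf("pte_index = %lu\n", (vaddr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1));
	printf("offset    = %lu\n", vaddr & ((1UL << PAGE_SHIFT) - 1));
	return 0;
}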
| @@ -171,7 +237,7 @@ static void release_pte(pte_t pte) | |||
| 171 | /* Remember that get_user_pages_fast() took a reference to the page, in | 237 | /* Remember that get_user_pages_fast() took a reference to the page, in |
| 172 | * get_pfn()? We have to put it back now. */ | 238 | * get_pfn()? We have to put it back now. */ |
| 173 | if (pte_flags(pte) & _PAGE_PRESENT) | 239 | if (pte_flags(pte) & _PAGE_PRESENT) |
| 174 | put_page(pfn_to_page(pte_pfn(pte))); | 240 | put_page(pte_page(pte)); |
| 175 | } | 241 | } |
| 176 | /*:*/ | 242 | /*:*/ |
| 177 | 243 | ||
| @@ -184,11 +250,20 @@ static void check_gpte(struct lg_cpu *cpu, pte_t gpte) | |||
| 184 | 250 | ||
| 185 | static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) | 251 | static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) |
| 186 | { | 252 | { |
| 187 | if ((pgd_flags(gpgd) & ~_PAGE_TABLE) || | 253 | if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) || |
| 188 | (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) | 254 | (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) |
| 189 | kill_guest(cpu, "bad page directory entry"); | 255 | kill_guest(cpu, "bad page directory entry"); |
| 190 | } | 256 | } |
| 191 | 257 | ||
| 258 | #ifdef CONFIG_X86_PAE | ||
| 259 | static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) | ||
| 260 | { | ||
| 261 | if ((pmd_flags(gpmd) & ~_PAGE_TABLE) || | ||
| 262 | (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) | ||
| 263 | kill_guest(cpu, "bad page middle directory entry"); | ||
| 264 | } | ||
| 265 | #endif | ||
| 266 | |||
| 192 | /*H:330 | 267 | /*H:330 |
| 193 | * (i) Looking up a page table entry when the Guest faults. | 268 | * (i) Looking up a page table entry when the Guest faults. |
| 194 | * | 269 | * |
| @@ -207,6 +282,11 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | |||
| 207 | pte_t gpte; | 282 | pte_t gpte; |
| 208 | pte_t *spte; | 283 | pte_t *spte; |
| 209 | 284 | ||
| 285 | #ifdef CONFIG_X86_PAE | ||
| 286 | pmd_t *spmd; | ||
| 287 | pmd_t gpmd; | ||
| 288 | #endif | ||
| 289 | |||
| 210 | /* First step: get the top-level Guest page table entry. */ | 290 | /* First step: get the top-level Guest page table entry. */ |
| 211 | gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); | 291 | gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); |
| 212 | /* Toplevel not present? We can't map it in. */ | 292 | /* Toplevel not present? We can't map it in. */ |
| @@ -228,12 +308,45 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | |||
| 228 | check_gpgd(cpu, gpgd); | 308 | check_gpgd(cpu, gpgd); |
| 229 | /* And we copy the flags to the shadow PGD entry. The page | 309 | /* And we copy the flags to the shadow PGD entry. The page |
| 230 | * number in the shadow PGD is the page we just allocated. */ | 310 | * number in the shadow PGD is the page we just allocated. */ |
| 231 | *spgd = __pgd(__pa(ptepage) | pgd_flags(gpgd)); | 311 | set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd))); |
| 232 | } | 312 | } |
| 233 | 313 | ||
| 314 | #ifdef CONFIG_X86_PAE | ||
| 315 | gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); | ||
| 316 | /* middle level not present? We can't map it in. */ | ||
| 317 | if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) | ||
| 318 | return false; | ||
| 319 | |||
| 320 | /* Now look at the matching shadow entry. */ | ||
| 321 | spmd = spmd_addr(cpu, *spgd, vaddr); | ||
| 322 | |||
| 323 | if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) { | ||
| 324 | /* No shadow entry: allocate a new shadow PTE page. */ | ||
| 325 | unsigned long ptepage = get_zeroed_page(GFP_KERNEL); | ||
| 326 | |||
| 327 | /* This is not really the Guest's fault, but killing it is | ||
| 328 | * simple for this corner case. */ | ||
| 329 | if (!ptepage) { | ||
| 330 | kill_guest(cpu, "out of memory allocating pte page"); | ||
| 331 | return false; | ||
| 332 | } | ||
| 333 | |||
| 334 | /* We check that the Guest pmd is OK. */ | ||
| 335 | check_gpmd(cpu, gpmd); | ||
| 336 | |||
| 337 | /* And we copy the flags to the shadow PMD entry. The page | ||
| 338 | * number in the shadow PMD is the page we just allocated. */ | ||
| 339 | native_set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd))); | ||
| 340 | } | ||
| 341 | |||
| 342 | /* OK, now we look at the lower level in the Guest page table: keep its | ||
| 343 | * address, because we might update it later. */ | ||
| 344 | gpte_ptr = gpte_addr(cpu, gpmd, vaddr); | ||
| 345 | #else | ||
| 234 | /* OK, now we look at the lower level in the Guest page table: keep its | 346 | /* OK, now we look at the lower level in the Guest page table: keep its |
| 235 | * address, because we might update it later. */ | 347 | * address, because we might update it later. */ |
| 236 | gpte_ptr = gpte_addr(gpgd, vaddr); | 348 | gpte_ptr = gpte_addr(cpu, gpgd, vaddr); |
| 349 | #endif | ||
| 237 | gpte = lgread(cpu, gpte_ptr, pte_t); | 350 | gpte = lgread(cpu, gpte_ptr, pte_t); |
| 238 | 351 | ||
| 239 | /* If this page isn't in the Guest page tables, we can't page it in. */ | 352 | /* If this page isn't in the Guest page tables, we can't page it in. */ |
| @@ -259,7 +372,7 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | |||
| 259 | gpte = pte_mkdirty(gpte); | 372 | gpte = pte_mkdirty(gpte); |
| 260 | 373 | ||
| 261 | /* Get the pointer to the shadow PTE entry we're going to set. */ | 374 | /* Get the pointer to the shadow PTE entry we're going to set. */ |
| 262 | spte = spte_addr(*spgd, vaddr); | 375 | spte = spte_addr(cpu, *spgd, vaddr); |
| 263 | /* If there was a valid shadow PTE entry here before, we release it. | 376 | /* If there was a valid shadow PTE entry here before, we release it. |
| 264 | * This can happen with a write to a previously read-only entry. */ | 377 | * This can happen with a write to a previously read-only entry. */ |
| 265 | release_pte(*spte); | 378 | release_pte(*spte); |
| @@ -273,7 +386,7 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | |||
| 273 | * table entry, even if the Guest says it's writable. That way | 386 | * table entry, even if the Guest says it's writable. That way |
| 274 | * we will come back here when a write does actually occur, so | 387 | * we will come back here when a write does actually occur, so |
| 275 | * we can update the Guest's _PAGE_DIRTY flag. */ | 388 | * we can update the Guest's _PAGE_DIRTY flag. */ |
| 276 | *spte = gpte_to_spte(cpu, pte_wrprotect(gpte), 0); | 389 | native_set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0)); |
| 277 | 390 | ||
| 278 | /* Finally, we write the Guest PTE entry back: we've set the | 391 | /* Finally, we write the Guest PTE entry back: we've set the |
| 279 | * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ | 392 | * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ |
| @@ -301,14 +414,23 @@ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) | |||
| 301 | pgd_t *spgd; | 414 | pgd_t *spgd; |
| 302 | unsigned long flags; | 415 | unsigned long flags; |
| 303 | 416 | ||
| 417 | #ifdef CONFIG_X86_PAE | ||
| 418 | pmd_t *spmd; | ||
| 419 | #endif | ||
| 304 | /* Look at the current top level entry: is it present? */ | 420 | /* Look at the current top level entry: is it present? */ |
| 305 | spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); | 421 | spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); |
| 306 | if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) | 422 | if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) |
| 307 | return false; | 423 | return false; |
| 308 | 424 | ||
| 425 | #ifdef CONFIG_X86_PAE | ||
| 426 | spmd = spmd_addr(cpu, *spgd, vaddr); | ||
| 427 | if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) | ||
| 428 | return false; | ||
| 429 | #endif | ||
| 430 | |||
| 309 | /* Check the flags on the pte entry itself: it must be present and | 431 | /* Check the flags on the pte entry itself: it must be present and |
| 310 | * writable. */ | 432 | * writable. */ |
| 311 | flags = pte_flags(*(spte_addr(*spgd, vaddr))); | 433 | flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr))); |
| 312 | 434 | ||
| 313 | return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); | 435 | return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); |
| 314 | } | 436 | } |
| @@ -322,8 +444,43 @@ void pin_page(struct lg_cpu *cpu, unsigned long vaddr) | |||
| 322 | kill_guest(cpu, "bad stack page %#lx", vaddr); | 444 | kill_guest(cpu, "bad stack page %#lx", vaddr); |
| 323 | } | 445 | } |
| 324 | 446 | ||
| 447 | #ifdef CONFIG_X86_PAE | ||
| 448 | static void release_pmd(pmd_t *spmd) | ||
| 449 | { | ||
| 450 | /* If the entry's not present, there's nothing to release. */ | ||
| 451 | if (pmd_flags(*spmd) & _PAGE_PRESENT) { | ||
| 452 | unsigned int i; | ||
| 453 | pte_t *ptepage = __va(pmd_pfn(*spmd) << PAGE_SHIFT); | ||
| 454 | /* For each entry in the page, we might need to release it. */ | ||
| 455 | for (i = 0; i < PTRS_PER_PTE; i++) | ||
| 456 | release_pte(ptepage[i]); | ||
| 457 | /* Now we can free the page of PTEs */ | ||
| 458 | free_page((long)ptepage); | ||
| 459 | /* And zero out the PMD entry so we never release it twice. */ | ||
| 460 | native_set_pmd(spmd, __pmd(0)); | ||
| 461 | } | ||
| 462 | } | ||
| 463 | |||
| 464 | static void release_pgd(pgd_t *spgd) | ||
| 465 | { | ||
| 466 | /* If the entry's not present, there's nothing to release. */ | ||
| 467 | if (pgd_flags(*spgd) & _PAGE_PRESENT) { | ||
| 468 | unsigned int i; | ||
| 469 | pmd_t *pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); | ||
| 470 | |||
| 471 | for (i = 0; i < PTRS_PER_PMD; i++) | ||
| 472 | release_pmd(&pmdpage[i]); | ||
| 473 | |||
| 474 | /* Now we can free the page of PMDs */ | ||
| 475 | free_page((long)pmdpage); | ||
| 476 | /* And zero out the PGD entry so we never release it twice. */ | ||
| 477 | set_pgd(spgd, __pgd(0)); | ||
| 478 | } | ||
| 479 | } | ||
| 480 | |||
| 481 | #else /* !CONFIG_X86_PAE */ | ||
| 325 | /*H:450 If we chase down the release_pgd() code, it looks like this: */ | 482 | /*H:450 If we chase down the release_pgd() code, it looks like this: */ |
| 326 | static void release_pgd(struct lguest *lg, pgd_t *spgd) | 483 | static void release_pgd(pgd_t *spgd) |
| 327 | { | 484 | { |
| 328 | /* If the entry's not present, there's nothing to release. */ | 485 | /* If the entry's not present, there's nothing to release. */ |
| 329 | if (pgd_flags(*spgd) & _PAGE_PRESENT) { | 486 | if (pgd_flags(*spgd) & _PAGE_PRESENT) { |
| @@ -341,7 +498,7 @@ static void release_pgd(struct lguest *lg, pgd_t *spgd) | |||
| 341 | *spgd = __pgd(0); | 498 | *spgd = __pgd(0); |
| 342 | } | 499 | } |
| 343 | } | 500 | } |
| 344 | 501 | #endif | |
| 345 | /*H:445 We saw flush_user_mappings() twice: once from the flush_user_mappings() | 502 | /*H:445 We saw flush_user_mappings() twice: once from the flush_user_mappings() |
| 346 | * hypercall and once in new_pgdir() when we re-used a top-level pgdir page. | 503 | * hypercall and once in new_pgdir() when we re-used a top-level pgdir page. |
| 347 | * It simply releases every PTE page from 0 up to the Guest's kernel address. */ | 504 | * It simply releases every PTE page from 0 up to the Guest's kernel address. */ |
| @@ -350,7 +507,7 @@ static void flush_user_mappings(struct lguest *lg, int idx) | |||
| 350 | unsigned int i; | 507 | unsigned int i; |
| 351 | /* Release every pgd entry up to the kernel's address. */ | 508 | /* Release every pgd entry up to the kernel's address. */ |
| 352 | for (i = 0; i < pgd_index(lg->kernel_address); i++) | 509 | for (i = 0; i < pgd_index(lg->kernel_address); i++) |
| 353 | release_pgd(lg, lg->pgdirs[idx].pgdir + i); | 510 | release_pgd(lg->pgdirs[idx].pgdir + i); |
| 354 | } | 511 | } |
| 355 | 512 | ||
| 356 | /*H:440 (v) Flushing (throwing away) page tables, | 513 | /*H:440 (v) Flushing (throwing away) page tables, |
| @@ -369,7 +526,9 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) | |||
| 369 | { | 526 | { |
| 370 | pgd_t gpgd; | 527 | pgd_t gpgd; |
| 371 | pte_t gpte; | 528 | pte_t gpte; |
| 372 | 529 | #ifdef CONFIG_X86_PAE | |
| 530 | pmd_t gpmd; | ||
| 531 | #endif | ||
| 373 | /* First step: get the top-level Guest page table entry. */ | 532 | /* First step: get the top-level Guest page table entry. */ |
| 374 | gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); | 533 | gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); |
| 375 | /* Toplevel not present? We can't map it in. */ | 534 | /* Toplevel not present? We can't map it in. */ |
| @@ -378,7 +537,14 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) | |||
| 378 | return -1UL; | 537 | return -1UL; |
| 379 | } | 538 | } |
| 380 | 539 | ||
| 381 | gpte = lgread(cpu, gpte_addr(gpgd, vaddr), pte_t); | 540 | #ifdef CONFIG_X86_PAE |
| 541 | gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); | ||
| 542 | if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) | ||
| 543 | kill_guest(cpu, "Bad address %#lx", vaddr); | ||
| 544 | gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t); | ||
| 545 | #else | ||
| 546 | gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t); | ||
| 547 | #endif | ||
| 382 | if (!(pte_flags(gpte) & _PAGE_PRESENT)) | 548 | if (!(pte_flags(gpte) & _PAGE_PRESENT)) |
| 383 | kill_guest(cpu, "Bad address %#lx", vaddr); | 549 | kill_guest(cpu, "Bad address %#lx", vaddr); |
| 384 | 550 | ||
| @@ -405,6 +571,9 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, | |||
| 405 | int *blank_pgdir) | 571 | int *blank_pgdir) |
| 406 | { | 572 | { |
| 407 | unsigned int next; | 573 | unsigned int next; |
| 574 | #ifdef CONFIG_X86_PAE | ||
| 575 | pmd_t *pmd_table; | ||
| 576 | #endif | ||
| 408 | 577 | ||
| 409 | /* We pick one entry at random to throw out. Choosing the Least | 578 | /* We pick one entry at random to throw out. Choosing the Least |
| 410 | * Recently Used might be better, but this is easy. */ | 579 | * Recently Used might be better, but this is easy. */ |
| @@ -416,10 +585,27 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, | |||
| 416 | /* If the allocation fails, just keep using the one we have */ | 585 | /* If the allocation fails, just keep using the one we have */ |
| 417 | if (!cpu->lg->pgdirs[next].pgdir) | 586 | if (!cpu->lg->pgdirs[next].pgdir) |
| 418 | next = cpu->cpu_pgd; | 587 | next = cpu->cpu_pgd; |
| 419 | else | 588 | else { |
| 420 | /* This is a blank page, so there are no kernel | 589 | #ifdef CONFIG_X86_PAE |
| 421 | * mappings: caller must map the stack! */ | 590 | /* In PAE mode, allocate a pmd page and populate the |
| 591 | * last pgd entry. */ | ||
| 592 | pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL); | ||
| 593 | if (!pmd_table) { | ||
| 594 | free_page((long)cpu->lg->pgdirs[next].pgdir); | ||
| 595 | set_pgd(cpu->lg->pgdirs[next].pgdir, __pgd(0)); | ||
| 596 | next = cpu->cpu_pgd; | ||
| 597 | } else { | ||
| 598 | set_pgd(cpu->lg->pgdirs[next].pgdir + | ||
| 599 | SWITCHER_PGD_INDEX, | ||
| 600 | __pgd(__pa(pmd_table) | _PAGE_PRESENT)); | ||
| 601 | /* This is a blank page, so there are no kernel | ||
| 602 | * mappings: caller must map the stack! */ | ||
| 603 | *blank_pgdir = 1; | ||
| 604 | } | ||
| 605 | #else | ||
| 422 | *blank_pgdir = 1; | 606 | *blank_pgdir = 1; |
| 607 | #endif | ||
| 608 | } | ||
| 423 | } | 609 | } |
| 424 | /* Record which Guest toplevel this shadows. */ | 610 | /* Record which Guest toplevel this shadows. */ |
| 425 | cpu->lg->pgdirs[next].gpgdir = gpgdir; | 611 | cpu->lg->pgdirs[next].gpgdir = gpgdir; |
| @@ -431,7 +617,7 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, | |||
| 431 | 617 | ||
| 432 | /*H:430 (iv) Switching page tables | 618 | /*H:430 (iv) Switching page tables |
| 433 | * | 619 | * |
| 434 | * Now we've seen all the page table setting and manipulation, let's see what | 620 | * Now we've seen all the page table setting and manipulation, let's see |
| 435 | * what happens when the Guest changes page tables (ie. changes the top-level | 621 | * what happens when the Guest changes page tables (ie. changes the top-level |
| 436 | * pgdir). This occurs on almost every context switch. */ | 622 | * pgdir). This occurs on almost every context switch. */ |
| 437 | void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) | 623 | void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) |
| @@ -460,10 +646,25 @@ static void release_all_pagetables(struct lguest *lg) | |||
| 460 | 646 | ||
| 461 | /* Every shadow pagetable this Guest has */ | 647 | /* Every shadow pagetable this Guest has */ |
| 462 | for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) | 648 | for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) |
| 463 | if (lg->pgdirs[i].pgdir) | 649 | if (lg->pgdirs[i].pgdir) { |
| 650 | #ifdef CONFIG_X86_PAE | ||
| 651 | pgd_t *spgd; | ||
| 652 | pmd_t *pmdpage; | ||
| 653 | unsigned int k; | ||
| 654 | |||
| 655 | /* Get the last pmd page. */ | ||
| 656 | spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX; | ||
| 657 | pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); | ||
| 658 | |||
| 659 | /* And release the pmd entries of that pmd page, | ||
| 660 | * except for the switcher pmd. */ | ||
| 661 | for (k = 0; k < SWITCHER_PMD_INDEX; k++) | ||
| 662 | release_pmd(&pmdpage[k]); | ||
| 663 | #endif | ||
| 464 | /* Every PGD entry except the Switcher at the top */ | 664 | /* Every PGD entry except the Switcher at the top */ |
| 465 | for (j = 0; j < SWITCHER_PGD_INDEX; j++) | 665 | for (j = 0; j < SWITCHER_PGD_INDEX; j++) |
| 466 | release_pgd(lg, lg->pgdirs[i].pgdir + j); | 666 | release_pgd(lg->pgdirs[i].pgdir + j); |
| 667 | } | ||
| 467 | } | 668 | } |
| 468 | 669 | ||
| 469 | /* We also throw away everything when a Guest tells us it's changed a kernel | 670 | /* We also throw away everything when a Guest tells us it's changed a kernel |
| @@ -504,24 +705,37 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, | |||
| 504 | { | 705 | { |
| 505 | /* Look up the matching shadow page directory entry. */ | 706 | /* Look up the matching shadow page directory entry. */ |
| 506 | pgd_t *spgd = spgd_addr(cpu, idx, vaddr); | 707 | pgd_t *spgd = spgd_addr(cpu, idx, vaddr); |
| 708 | #ifdef CONFIG_X86_PAE | ||
| 709 | pmd_t *spmd; | ||
| 710 | #endif | ||
| 507 | 711 | ||
| 508 | /* If the top level isn't present, there's no entry to update. */ | 712 | /* If the top level isn't present, there's no entry to update. */ |
| 509 | if (pgd_flags(*spgd) & _PAGE_PRESENT) { | 713 | if (pgd_flags(*spgd) & _PAGE_PRESENT) { |
| 510 | /* Otherwise, we start by releasing the existing entry. */ | 714 | #ifdef CONFIG_X86_PAE |
| 511 | pte_t *spte = spte_addr(*spgd, vaddr); | 715 | spmd = spmd_addr(cpu, *spgd, vaddr); |
| 512 | release_pte(*spte); | 716 | if (pmd_flags(*spmd) & _PAGE_PRESENT) { |
| 513 | 717 | #endif | |
| 514 | /* If they're setting this entry as dirty or accessed, we might | 718 | /* Otherwise, we start by releasing |
| 515 | * as well put that entry they've given us in now. This shaves | 719 | * the existing entry. */ |
| 516 | * 10% off a copy-on-write micro-benchmark. */ | 720 | pte_t *spte = spte_addr(cpu, *spgd, vaddr); |
| 517 | if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { | 721 | release_pte(*spte); |
| 518 | check_gpte(cpu, gpte); | 722 | |
| 519 | *spte = gpte_to_spte(cpu, gpte, | 723 | /* If they're setting this entry as dirty or accessed, |
| 520 | pte_flags(gpte) & _PAGE_DIRTY); | 724 | * we might as well put that entry they've given us |
| 521 | } else | 725 | * in now. This shaves 10% off a |
| 522 | /* Otherwise kill it and we can demand_page() it in | 726 | * copy-on-write micro-benchmark. */ |
| 523 | * later. */ | 727 | if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { |
| 524 | *spte = __pte(0); | 728 | check_gpte(cpu, gpte); |
| 729 | native_set_pte(spte, | ||
| 730 | gpte_to_spte(cpu, gpte, | ||
| 731 | pte_flags(gpte) & _PAGE_DIRTY)); | ||
| 732 | } else | ||
| 733 | /* Otherwise kill it and we can demand_page() | ||
| 734 | * it in later. */ | ||
| 735 | native_set_pte(spte, __pte(0)); | ||
| 736 | #ifdef CONFIG_X86_PAE | ||
| 737 | } | ||
| 738 | #endif | ||
| 525 | } | 739 | } |
| 526 | } | 740 | } |
| 527 | 741 | ||
| @@ -568,12 +782,10 @@ void guest_set_pte(struct lg_cpu *cpu, | |||
| 568 | * | 782 | * |
| 569 | * So with that in mind here's our code to update a (top-level) PGD entry: | 783 | * So with that in mind here's our code to update a (top-level) PGD entry: |
| 570 | */ | 784 | */ |
| 571 | void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 idx) | 785 | void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx) |
| 572 | { | 786 | { |
| 573 | int pgdir; | 787 | int pgdir; |
| 574 | 788 | ||
| 575 | /* The kernel seems to try to initialize this early on: we ignore its | ||
| 576 | * attempts to map over the Switcher. */ | ||
| 577 | if (idx >= SWITCHER_PGD_INDEX) | 789 | if (idx >= SWITCHER_PGD_INDEX) |
| 578 | return; | 790 | return; |
| 579 | 791 | ||
| @@ -581,8 +793,14 @@ void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 idx) | |||
| 581 | pgdir = find_pgdir(lg, gpgdir); | 793 | pgdir = find_pgdir(lg, gpgdir); |
| 582 | if (pgdir < ARRAY_SIZE(lg->pgdirs)) | 794 | if (pgdir < ARRAY_SIZE(lg->pgdirs)) |
| 583 | /* ... throw it away. */ | 795 | /* ... throw it away. */ |
| 584 | release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx); | 796 | release_pgd(lg->pgdirs[pgdir].pgdir + idx); |
| 585 | } | 797 | } |
| 798 | #ifdef CONFIG_X86_PAE | ||
| 799 | void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) | ||
| 800 | { | ||
| 801 | guest_pagetable_clear_all(&lg->cpus[0]); | ||
| 802 | } | ||
| 803 | #endif | ||
| 586 | 804 | ||
| 587 | /* Once we know how much memory we have we can construct simple identity | 805 | /* Once we know how much memory we have we can construct simple identity |
| 588 | * (which set virtual == physical) and linear mappings | 806 | * (which set virtual == physical) and linear mappings |
| @@ -596,8 +814,16 @@ static unsigned long setup_pagetables(struct lguest *lg, | |||
| 596 | { | 814 | { |
| 597 | pgd_t __user *pgdir; | 815 | pgd_t __user *pgdir; |
| 598 | pte_t __user *linear; | 816 | pte_t __user *linear; |
| 599 | unsigned int mapped_pages, i, linear_pages, phys_linear; | ||
| 600 | unsigned long mem_base = (unsigned long)lg->mem_base; | 817 | unsigned long mem_base = (unsigned long)lg->mem_base; |
| 818 | unsigned int mapped_pages, i, linear_pages; | ||
| 819 | #ifdef CONFIG_X86_PAE | ||
| 820 | pmd_t __user *pmds; | ||
| 821 | unsigned int j; | ||
| 822 | pgd_t pgd; | ||
| 823 | pmd_t pmd; | ||
| 824 | #else | ||
| 825 | unsigned int phys_linear; | ||
| 826 | #endif | ||
| 601 | 827 | ||
| 602 | /* We have mapped_pages frames to map, so we need | 828 | /* We have mapped_pages frames to map, so we need |
| 603 | * linear_pages page tables to map them. */ | 829 | * linear_pages page tables to map them. */ |
| @@ -610,6 +836,9 @@ static unsigned long setup_pagetables(struct lguest *lg, | |||
| 610 | /* Now we use the next linear_pages pages as pte pages */ | 836 | /* Now we use the next linear_pages pages as pte pages */ |
| 611 | linear = (void *)pgdir - linear_pages * PAGE_SIZE; | 837 | linear = (void *)pgdir - linear_pages * PAGE_SIZE; |
| 612 | 838 | ||
| 839 | #ifdef CONFIG_X86_PAE | ||
| 840 | pmds = (void *)linear - PAGE_SIZE; | ||
| 841 | #endif | ||
| 613 | /* Linear mapping is easy: put every page's address into the | 842 | /* Linear mapping is easy: put every page's address into the |
| 614 | * mapping in order. */ | 843 | * mapping in order. */ |
| 615 | for (i = 0; i < mapped_pages; i++) { | 844 | for (i = 0; i < mapped_pages; i++) { |
| @@ -621,6 +850,22 @@ static unsigned long setup_pagetables(struct lguest *lg, | |||
| 621 | 850 | ||
| 622 | /* The top level points to the linear page table pages above. | 851 | /* The top level points to the linear page table pages above. |
| 623 | * We setup the identity and linear mappings here. */ | 852 | * We setup the identity and linear mappings here. */ |
| 853 | #ifdef CONFIG_X86_PAE | ||
| 854 | for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD; | ||
| 855 | i += PTRS_PER_PTE, j++) { | ||
| 856 | native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i) | ||
| 857 | - mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); | ||
| 858 | |||
| 859 | if (copy_to_user(&pmds[j], &pmd, sizeof(pmd)) != 0) | ||
| 860 | return -EFAULT; | ||
| 861 | } | ||
| 862 | |||
| 863 | set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT)); | ||
| 864 | if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0) | ||
| 865 | return -EFAULT; | ||
| 866 | if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0) | ||
| 867 | return -EFAULT; | ||
| 868 | #else | ||
| 624 | phys_linear = (unsigned long)linear - mem_base; | 869 | phys_linear = (unsigned long)linear - mem_base; |
| 625 | for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) { | 870 | for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) { |
| 626 | pgd_t pgd; | 871 | pgd_t pgd; |
| @@ -633,6 +878,7 @@ static unsigned long setup_pagetables(struct lguest *lg, | |||
| 633 | &pgd, sizeof(pgd))) | 878 | &pgd, sizeof(pgd))) |
| 634 | return -EFAULT; | 879 | return -EFAULT; |
| 635 | } | 880 | } |
| 881 | #endif | ||
| 636 | 882 | ||
| 637 | /* We return the top level (guest-physical) address: remember where | 883 | /* We return the top level (guest-physical) address: remember where |
| 638 | * this is. */ | 884 | * this is. */ |
| @@ -648,7 +894,10 @@ int init_guest_pagetable(struct lguest *lg) | |||
| 648 | u64 mem; | 894 | u64 mem; |
| 649 | u32 initrd_size; | 895 | u32 initrd_size; |
| 650 | struct boot_params __user *boot = (struct boot_params *)lg->mem_base; | 896 | struct boot_params __user *boot = (struct boot_params *)lg->mem_base; |
| 651 | 897 | #ifdef CONFIG_X86_PAE | |
| 898 | pgd_t *pgd; | ||
| 899 | pmd_t *pmd_table; | ||
| 900 | #endif | ||
| 652 | /* Get the Guest memory size and the ramdisk size from the boot header | 901 | /* Get the Guest memory size and the ramdisk size from the boot header |
| 653 | * located at lg->mem_base (Guest address 0). */ | 902 | * located at lg->mem_base (Guest address 0). */ |
| 654 | if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem)) | 903 | if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem)) |
| @@ -663,6 +912,15 @@ int init_guest_pagetable(struct lguest *lg) | |||
| 663 | lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); | 912 | lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); |
| 664 | if (!lg->pgdirs[0].pgdir) | 913 | if (!lg->pgdirs[0].pgdir) |
| 665 | return -ENOMEM; | 914 | return -ENOMEM; |
| 915 | #ifdef CONFIG_X86_PAE | ||
| 916 | pgd = lg->pgdirs[0].pgdir; | ||
| 917 | pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL); | ||
| 918 | if (!pmd_table) | ||
| 919 | return -ENOMEM; | ||
| 920 | |||
| 921 | set_pgd(pgd + SWITCHER_PGD_INDEX, | ||
| 922 | __pgd(__pa(pmd_table) | _PAGE_PRESENT)); | ||
| 923 | #endif | ||
| 666 | lg->cpus[0].cpu_pgd = 0; | 924 | lg->cpus[0].cpu_pgd = 0; |
| 667 | return 0; | 925 | return 0; |
| 668 | } | 926 | } |
| @@ -672,17 +930,24 @@ void page_table_guest_data_init(struct lg_cpu *cpu) | |||
| 672 | { | 930 | { |
| 673 | /* We get the kernel address: above this is all kernel memory. */ | 931 | /* We get the kernel address: above this is all kernel memory. */ |
| 674 | if (get_user(cpu->lg->kernel_address, | 932 | if (get_user(cpu->lg->kernel_address, |
| 675 | &cpu->lg->lguest_data->kernel_address) | 933 | &cpu->lg->lguest_data->kernel_address) |
| 676 | /* We tell the Guest that it can't use the top 4MB of virtual | 934 | /* We tell the Guest that it can't use the top 2 or 4 MB |
| 677 | * addresses used by the Switcher. */ | 935 | * of virtual addresses used by the Switcher. */ |
| 678 | || put_user(4U*1024*1024, &cpu->lg->lguest_data->reserve_mem) | 936 | || put_user(RESERVE_MEM * 1024 * 1024, |
| 679 | || put_user(cpu->lg->pgdirs[0].gpgdir, &cpu->lg->lguest_data->pgdir)) | 937 | &cpu->lg->lguest_data->reserve_mem) |
| 938 | || put_user(cpu->lg->pgdirs[0].gpgdir, | ||
| 939 | &cpu->lg->lguest_data->pgdir)) | ||
| 680 | kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); | 940 | kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); |
| 681 | 941 | ||
| 682 | /* In flush_user_mappings() we loop from 0 to | 942 | /* In flush_user_mappings() we loop from 0 to |
| 683 | * "pgd_index(lg->kernel_address)". This assumes it won't hit the | 943 | * "pgd_index(lg->kernel_address)". This assumes it won't hit the |
| 684 | * Switcher mappings, so check that now. */ | 944 | * Switcher mappings, so check that now. */ |
| 945 | #ifdef CONFIG_X86_PAE | ||
| 946 | if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX && | ||
| 947 | pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX) | ||
| 948 | #else | ||
| 685 | if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX) | 949 | if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX) |
| 950 | #endif | ||
| 686 | kill_guest(cpu, "bad kernel address %#lx", | 951 | kill_guest(cpu, "bad kernel address %#lx", |
| 687 | cpu->lg->kernel_address); | 952 | cpu->lg->kernel_address); |
| 688 | } | 953 | } |
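The "top 2 or 4 MB" comment above follows directly from the index arithmetic: without PAE one PGD slot covers 4 MB and the Switcher owns the last of 1024 slots; with PAE a PGD slot covers 1 GB and the Switcher owns only the last 2 MB PMD slot of the last PGD slot, which is why the PAE check needs both index comparisons. A small illustration (shift values assumed from standard i386 paging, not taken from the patch):

#include <stdio.h>

int main(void)
{
    unsigned long top = 0xFFFFFFFFUL;   /* an address in the Switcher region */

    /* Non-PAE: 1024 PGD slots of 4 MB each; the Switcher owns the last one. */
    printf("non-PAE: top address is in pgd slot %lu of 1024\n", top >> 22);

    /* PAE: 4 PGD slots of 1 GB, each split into 512 PMD slots of 2 MB;
     * the Switcher owns only the last PMD slot of the last PGD slot. */
    printf("PAE:     top address is in pgd slot %lu, pmd slot %lu of 512\n",
           top >> 30, (top >> 21) & 511);
    return 0;
}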
| @@ -708,16 +973,30 @@ void free_guest_pagetable(struct lguest *lg) | |||
| 708 | void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) | 973 | void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) |
| 709 | { | 974 | { |
| 710 | pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); | 975 | pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); |
| 711 | pgd_t switcher_pgd; | ||
| 712 | pte_t regs_pte; | 976 | pte_t regs_pte; |
| 713 | unsigned long pfn; | 977 | unsigned long pfn; |
| 714 | 978 | ||
| 979 | #ifdef CONFIG_X86_PAE | ||
| 980 | pmd_t switcher_pmd; | ||
| 981 | pmd_t *pmd_table; | ||
| 982 | |||
| 983 | native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >> | ||
| 984 | PAGE_SHIFT, PAGE_KERNEL_EXEC)); | ||
| 985 | |||
| 986 | pmd_table = __va(pgd_pfn(cpu->lg-> | ||
| 987 | pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX]) | ||
| 988 | << PAGE_SHIFT); | ||
| 989 | native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd); | ||
| 990 | #else | ||
| 991 | pgd_t switcher_pgd; | ||
| 992 | |||
| 715 | /* Make the last PGD entry for this Guest point to the Switcher's PTE | 993 | /* Make the last PGD entry for this Guest point to the Switcher's PTE |
| 716 | * page for this CPU (with appropriate flags). */ | 994 | * page for this CPU (with appropriate flags). */ |
| 717 | switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL); | 995 | switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC); |
| 718 | 996 | ||
| 719 | cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; | 997 | cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; |
| 720 | 998 | ||
| 999 | #endif | ||
| 721 | /* We also change the Switcher PTE page. When we're running the Guest, | 1000 | /* We also change the Switcher PTE page. When we're running the Guest, |
| 722 | * we want the Guest's "regs" page to appear where the first Switcher | 1001 | * we want the Guest's "regs" page to appear where the first Switcher |
| 723 | * page for this CPU is. This is an optimization: when the Switcher | 1002 | * page for this CPU is. This is an optimization: when the Switcher |
| @@ -726,8 +1005,9 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) | |||
| 726 | * page is already mapped there, we don't have to copy them out | 1005 | * page is already mapped there, we don't have to copy them out |
| 727 | * again. */ | 1006 | * again. */ |
| 728 | pfn = __pa(cpu->regs_page) >> PAGE_SHIFT; | 1007 | pfn = __pa(cpu->regs_page) >> PAGE_SHIFT; |
| 729 | regs_pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL)); | 1008 | native_set_pte(®s_pte, pfn_pte(pfn, PAGE_KERNEL)); |
| 730 | switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTRS_PER_PTE] = regs_pte; | 1009 | native_set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], |
| 1010 | regs_pte); | ||
| 731 | } | 1011 | } |
| 732 | /*:*/ | 1012 | /*:*/ |
| 733 | 1013 | ||
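The switch from the open-coded index to pte_index() in the hunk above is behaviour-preserving: both pick the PTE slot for a virtual address within one page table. A quick user-space check (PAGE_SHIFT and PTRS_PER_PTE values assumed, and the address is made up):

#include <stdio.h>

#define PAGE_SHIFT    12
#define PAGE_SIZE     (1UL << PAGE_SHIFT)
#define PTRS_PER_PTE  512UL   /* PAE; 1024 for non-PAE */

static unsigned long pte_index(unsigned long addr)
{
    return (addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
}

int main(void)
{
    unsigned long pages = 0xFFC01000UL;   /* stand-in Switcher-area address */

    printf("open-coded: %lu\n", pages / PAGE_SIZE % PTRS_PER_PTE);
    printf("pte_index:  %lu\n", pte_index(pages));
    return 0;
}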
| @@ -752,21 +1032,21 @@ static __init void populate_switcher_pte_page(unsigned int cpu, | |||
| 752 | 1032 | ||
| 753 | /* The first entries are easy: they map the Switcher code. */ | 1033 | /* The first entries are easy: they map the Switcher code. */ |
| 754 | for (i = 0; i < pages; i++) { | 1034 | for (i = 0; i < pages; i++) { |
| 755 | pte[i] = mk_pte(switcher_page[i], | 1035 | native_set_pte(&pte[i], mk_pte(switcher_page[i], |
| 756 | __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)); | 1036 | __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); |
| 757 | } | 1037 | } |
| 758 | 1038 | ||
| 759 | /* The only other thing we map is this CPU's pair of pages. */ | 1039 | /* The only other thing we map is this CPU's pair of pages. */ |
| 760 | i = pages + cpu*2; | 1040 | i = pages + cpu*2; |
| 761 | 1041 | ||
| 762 | /* First page (Guest registers) is writable from the Guest */ | 1042 | /* First page (Guest registers) is writable from the Guest */ |
| 763 | pte[i] = pfn_pte(page_to_pfn(switcher_page[i]), | 1043 | native_set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]), |
| 764 | __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW)); | 1044 | __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW))); |
| 765 | 1045 | ||
| 766 | /* The second page contains the "struct lguest_ro_state", and is | 1046 | /* The second page contains the "struct lguest_ro_state", and is |
| 767 | * read-only. */ | 1047 | * read-only. */ |
| 768 | pte[i+1] = pfn_pte(page_to_pfn(switcher_page[i+1]), | 1048 | native_set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]), |
| 769 | __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)); | 1049 | __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); |
| 770 | } | 1050 | } |
| 771 | 1051 | ||
| 772 | /* We've made it through the page table code. Perhaps our tired brains are | 1052 | /* We've made it through the page table code. Perhaps our tired brains are |
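For reference, the per-CPU slot arithmetic in populate_switcher_pte_page() above: after the shared Switcher code pages, each CPU owns a consecutive pair of slots, a writable registers page followed by a read-only state page. A sketch of the indexing, with the number of code pages made up:

#include <stdio.h>

int main(void)
{
    unsigned int code_pages = 2;     /* assumed size of the Switcher code */
    unsigned int cpu;

    for (cpu = 0; cpu < 4; cpu++) {
        unsigned int i = code_pages + cpu * 2;
        printf("cpu %u: regs page -> pte[%u], ro_state -> pte[%u]\n",
               cpu, i, i + 1);
    }
    return 0;
}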
diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c index 7ede64ffeef9..482ed5a18750 100644 --- a/drivers/lguest/segments.c +++ b/drivers/lguest/segments.c | |||
| @@ -150,7 +150,7 @@ void load_guest_gdt_entry(struct lg_cpu *cpu, u32 num, u32 lo, u32 hi) | |||
| 150 | { | 150 | { |
| 151 | /* We assume the Guest has the same number of GDT entries as the | 151 | /* We assume the Guest has the same number of GDT entries as the |
| 152 | * Host, otherwise we'd have to dynamically allocate the Guest GDT. */ | 152 | * Host, otherwise we'd have to dynamically allocate the Guest GDT. */ |
| 153 | if (num > ARRAY_SIZE(cpu->arch.gdt)) | 153 | if (num >= ARRAY_SIZE(cpu->arch.gdt)) |
| 154 | kill_guest(cpu, "too many gdt entries %i", num); | 154 | kill_guest(cpu, "too many gdt entries %i", num); |
| 155 | 155 | ||
| 156 | /* Set it up, then fix it. */ | 156 | /* Set it up, then fix it. */ |
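The segments.c hunk is a plain off-by-one fix: an array of N entries has valid indices 0..N-1, so index N itself must be rejected as well. A minimal illustration of the difference (GDT_ENTRIES is a stand-in for ARRAY_SIZE(cpu->arch.gdt)):

#include <stdio.h>

#define GDT_ENTRIES 32

static int valid_old(unsigned int num) { return !(num >  GDT_ENTRIES); }
static int valid_new(unsigned int num) { return !(num >= GDT_ENTRIES); }

int main(void)
{
    unsigned int num = GDT_ENTRIES;   /* one past the last valid index */

    printf("old check lets %u through: %d\n", num, valid_old(num));
    printf("new check lets %u through: %d\n", num, valid_new(num));
    return 0;
}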
diff --git a/fs/eventfd.c b/fs/eventfd.c index 2a701d593d35..3f0e1974abdc 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c | |||
| @@ -16,6 +16,7 @@ | |||
| 16 | #include <linux/anon_inodes.h> | 16 | #include <linux/anon_inodes.h> |
| 17 | #include <linux/eventfd.h> | 17 | #include <linux/eventfd.h> |
| 18 | #include <linux/syscalls.h> | 18 | #include <linux/syscalls.h> |
| 19 | #include <linux/module.h> | ||
| 19 | 20 | ||
| 20 | struct eventfd_ctx { | 21 | struct eventfd_ctx { |
| 21 | wait_queue_head_t wqh; | 22 | wait_queue_head_t wqh; |
| @@ -56,6 +57,7 @@ int eventfd_signal(struct file *file, int n) | |||
| 56 | 57 | ||
| 57 | return n; | 58 | return n; |
| 58 | } | 59 | } |
| 60 | EXPORT_SYMBOL_GPL(eventfd_signal); | ||
| 59 | 61 | ||
| 60 | static int eventfd_release(struct inode *inode, struct file *file) | 62 | static int eventfd_release(struct inode *inode, struct file *file) |
| 61 | { | 63 | { |
| @@ -197,6 +199,7 @@ struct file *eventfd_fget(int fd) | |||
| 197 | 199 | ||
| 198 | return file; | 200 | return file; |
| 199 | } | 201 | } |
| 202 | EXPORT_SYMBOL_GPL(eventfd_fget); | ||
| 200 | 203 | ||
| 201 | SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) | 204 | SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) |
| 202 | { | 205 | { |
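Exporting eventfd_fget() and eventfd_signal() lets a module (here the lguest host) hold an eventfd handed in from user space and poke it from kernel context. A hedged sketch of the module-side pattern, using only the two exported calls plus fput(); error handling is trimmed and the surrounding lguest structures are omitted:

#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/err.h>

/* Resolve an eventfd file descriptor passed in from the Launcher. */
static struct file *grab_notifier(int fd)
{
    struct file *eventfd = eventfd_fget(fd);   /* takes a reference */

    if (IS_ERR(eventfd))
        return NULL;
    return eventfd;
}

/* Later, from the host side: bump the counter so the Launcher's
 * read()/poll() on the eventfd wakes up. */
static void notify(struct file *eventfd)
{
    eventfd_signal(eventfd, 1);
}

/* When the Guest goes away, drop the reference again. */
static void release_notifier(struct file *eventfd)
{
    fput(eventfd);
}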
diff --git a/include/linux/lguest.h b/include/linux/lguest.h index 175e63f4a8c0..7bc1440fc473 100644 --- a/include/linux/lguest.h +++ b/include/linux/lguest.h | |||
| @@ -30,6 +30,10 @@ struct lguest_data | |||
| 30 | /* Wallclock time set by the Host. */ | 30 | /* Wallclock time set by the Host. */ |
| 31 | struct timespec time; | 31 | struct timespec time; |
| 32 | 32 | ||
| 33 | /* Interrupt pending set by the Host. The Guest should do a hypercall | ||
| 34 | * if it re-enables interrupts and sees this set (to X86_EFLAGS_IF). */ | ||
| 35 | int irq_pending; | ||
| 36 | |||
| 33 | /* Async hypercall ring. Instead of directly making hypercalls, we can | 37 | /* Async hypercall ring. Instead of directly making hypercalls, we can |
| 34 | * place them in here for processing the next time the Host wants. | 38 | * place them in here for processing the next time the Host wants. |
| 35 | * This batching can be quite efficient. */ | 39 | * This batching can be quite efficient. */ |
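The new irq_pending field implies a matching change on the Guest side: when the Guest re-enables interrupts and finds the field set, it should trap to the Host with a hypercall. That code is not part of this hunk, so the following is only an assumed sketch; lguest_data, hcall() and the LHCALL_SEND_INTERRUPTS name are taken on faith rather than from this patch:

/* Assumed guest-side helper: re-enable interrupts, then check whether
 * the Host flagged a pending interrupt while they were off. */
static void lguest_irq_enable_sketch(void)
{
    lguest_data.irq_enabled = X86_EFLAGS_IF;

    /* Make sure the Host sees the enable before we look at the flag. */
    barrier();

    if (lguest_data.irq_pending)
        hcall(LHCALL_SEND_INTERRUPTS, 0, 0, 0);  /* assumed hypercall */
}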
diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h index a53407a4165c..bfefbdf7498a 100644 --- a/include/linux/lguest_launcher.h +++ b/include/linux/lguest_launcher.h | |||
| @@ -57,7 +57,8 @@ enum lguest_req | |||
| 57 | LHREQ_INITIALIZE, /* + base, pfnlimit, start */ | 57 | LHREQ_INITIALIZE, /* + base, pfnlimit, start */ |
| 58 | LHREQ_GETDMA, /* No longer used */ | 58 | LHREQ_GETDMA, /* No longer used */ |
| 59 | LHREQ_IRQ, /* + irq */ | 59 | LHREQ_IRQ, /* + irq */ |
| 60 | LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */ | 60 | LHREQ_BREAK, /* No longer used */ |
| 61 | LHREQ_EVENTFD, /* + address, fd. */ | ||
| 61 | }; | 62 | }; |
| 62 | 63 | ||
| 63 | /* The alignment to use between consumer and producer parts of vring. | 64 | /* The alignment to use between consumer and producer parts of vring. |
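On the Launcher side, LHREQ_EVENTFD is driven like the other requests: a write() of unsigned longs to the /dev/lguest descriptor, here carrying the guest address to watch and the eventfd to signal. A hedged user-space sketch; the three-word layout follows the "+ address, fd" comment above and the kernel include path is assumed, so this is not lifted verbatim from lguest.c:

#include <linux/lguest_launcher.h>   /* LHREQ_EVENTFD; include path assumed */
#include <unistd.h>
#include <stdio.h>

/* Ask the Host to signal eventfd 'fd' whenever the Guest notifies 'addr'. */
static int attach_eventfd(int lguest_fd, unsigned long addr, int fd)
{
    unsigned long args[] = { LHREQ_EVENTFD, addr, (unsigned long)fd };

    if (write(lguest_fd, args, sizeof(args)) < 0) {
        perror("LHREQ_EVENTFD");
        return -1;
    }
    return 0;
}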
diff --git a/kernel/sched.c b/kernel/sched.c index f04aa9664504..8ec9d13140be 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
| @@ -2192,6 +2192,7 @@ void kick_process(struct task_struct *p) | |||
| 2192 | smp_send_reschedule(cpu); | 2192 | smp_send_reschedule(cpu); |
| 2193 | preempt_enable(); | 2193 | preempt_enable(); |
| 2194 | } | 2194 | } |
| 2195 | EXPORT_SYMBOL_GPL(kick_process); | ||
| 2195 | 2196 | ||
| 2196 | /* | 2197 | /* |
| 2197 | * Return a low guess at the load of a migration-source cpu weighted | 2198 | * Return a low guess at the load of a migration-source cpu weighted |
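kick_process() sends a reschedule IPI to whatever CPU the task is currently running on; exporting it lets a module nudge a Guest's vCPU thread so it drops out of the Guest and notices new work promptly. A hedged sketch of the intended calling pattern; the per-vCPU flag is made up for illustration:

#include <linux/sched.h>

/* Tell a vCPU thread there is work waiting, then make sure it looks soon. */
static void poke_vcpu(struct task_struct *tsk, int *pending_flag)
{
    *pending_flag = 1;     /* assumed "please leave the Guest" flag */
    wmb();                 /* publish the flag before kicking */
    kick_process(tsk);     /* IPI the CPU the task is running on, if any */
}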
