author    Linus Torvalds <torvalds@linux-foundation.org>    2009-06-12 12:32:26 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>    2009-06-12 12:32:26 -0400
commit    7f3591cfacf2d79c4f42238e46c7d053da8e020d
tree      f2e9ed7b6b0bc176facaa49846734790023a6b16
parent    16ffc3eeaa00d513b0076b7b2b96419f28acc912
parent    d1f0132e76a11b05167313c606a853953f416081
Merge git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-lguest
* git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-lguest: (31 commits)
  lguest: add support for indirect ring entries
  lguest: suppress notifications in example Launcher
  lguest: try to batch interrupts on network receive
  lguest: avoid sending interrupts to Guest when no activity occurs.
  lguest: implement deferred interrupts in example Launcher
  lguest: remove obsolete LHREQ_BREAK call
  lguest: have example Launcher service all devices in separate threads
  lguest: use eventfds for device notification
  eventfd: export eventfd_signal and eventfd_fget for lguest
  lguest: allow any process to send interrupts
  lguest: PAE fixes
  lguest: PAE support
  lguest: Add support for kvm_hypercall4()
  lguest: replace hypercall name LHCALL_SET_PMD with LHCALL_SET_PGD
  lguest: use native_set_* macros, which properly handle 64-bit entries when PAE is activated
  lguest: map switcher with executable page table entries
  lguest: fix writev returning short on console output
  lguest: clean up length-used value in example launcher
  lguest: Segment selectors are 16-bit long. Fix lg_cpu.ss1 definition.
  lguest: beyond ARRAY_SIZE of cpu->arch.gdt
  ...
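The theme of this series is how the example Launcher waits for Guest activity: the select()-based Waker and the LHREQ_BREAK call are removed, and each virtqueue instead gets its own service thread, woken through an eventfd that is attached with LHREQ_EVENTFD and signalled when the Guest does an LHCALL_NOTIFY for that queue. What follows is a minimal sketch of that pattern only, with hypothetical names (struct vq, vq_thread, start_vq_thread); the Launcher's real create_thread()/do_thread() code is in the lguest.c diff further down.

#define _GNU_SOURCE
#include <err.h>
#include <sched.h>
#include <signal.h>
#include <stdint.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/eventfd.h>

/* Hypothetical stand-in for the Launcher's per-virtqueue state. */
struct vq {
	int eventfd;                     /* signalled when the Guest notifies */
	void (*service)(struct vq *vq);  /* device-specific work for this queue */
};

/* Each virtqueue gets its own thread: block on the eventfd, then service. */
static int vq_thread(void *_vq)
{
	struct vq *vq = _vq;
	uint64_t event;

	for (;;) {
		if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event))
			err(1, "eventfd read");
		vq->service(vq);
	}
	return 0;
}

static void start_vq_thread(struct vq *vq)
{
	/* clone() needs a stack; it grows downwards on x86, so pass the top. */
	char *stack = malloc(32768);

	vq->eventfd = eventfd(0, 0);
	if (vq->eventfd < 0)
		err(1, "creating eventfd");

	/* CLONE_VM: the thread shares the Launcher's (and Guest's) memory. */
	if (clone(vq_thread, stack + 32768, CLONE_VM | SIGCHLD, vq) == -1)
		err(1, "creating service thread");
}

Compared with the old Waker, this keeps each device's blocking read out of a central select() loop entirely, at the cost of one clone()d thread per virtqueue.
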
-rw-r--r--  Documentation/lguest/Makefile          |    3
-rw-r--r--  Documentation/lguest/lguest.c          | 1008
-rw-r--r--  Documentation/lguest/lguest.txt        |    1
-rw-r--r--  arch/x86/include/asm/lguest.h          |    7
-rw-r--r--  arch/x86/include/asm/lguest_hcall.h    |   15
-rw-r--r--  arch/x86/kernel/asm-offsets_32.c       |    1
-rw-r--r--  arch/x86/lguest/Kconfig                |    1
-rw-r--r--  arch/x86/lguest/boot.c                 |  158
-rw-r--r--  arch/x86/lguest/i386_head.S            |   60
-rw-r--r--  drivers/lguest/Kconfig                 |    2
-rw-r--r--  drivers/lguest/core.c                  |   30
-rw-r--r--  drivers/lguest/hypercalls.c            |   14
-rw-r--r--  drivers/lguest/interrupts_and_traps.c  |   57
-rw-r--r--  drivers/lguest/lg.h                    |   28
-rw-r--r--  drivers/lguest/lguest_user.c           |  127
-rw-r--r--  drivers/lguest/page_tables.c           |  396
-rw-r--r--  drivers/lguest/segments.c              |    2
-rw-r--r--  fs/eventfd.c                           |    3
-rw-r--r--  include/linux/lguest.h                 |    4
-rw-r--r--  include/linux/lguest_launcher.h        |    3
-rw-r--r--  kernel/sched.c                         |    1
21 files changed, 1103 insertions, 818 deletions
diff --git a/Documentation/lguest/Makefile b/Documentation/lguest/Makefile
index 1f4f9e888bd1..28c8cdfcafd8 100644
--- a/Documentation/lguest/Makefile
+++ b/Documentation/lguest/Makefile
@@ -1,6 +1,5 @@
1# This creates the demonstration utility "lguest" which runs a Linux guest. 1# This creates the demonstration utility "lguest" which runs a Linux guest.
2CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -I../../include -I../../arch/x86/include -U_FORTIFY_SOURCE 2CFLAGS:=-m32 -Wall -Wmissing-declarations -Wmissing-prototypes -O3 -I../../include -I../../arch/x86/include -U_FORTIFY_SOURCE
3LDLIBS:=-lz
4 3
5all: lguest 4all: lguest
6 5
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index d36fcc0f2715..9ebcd6ef361b 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -16,6 +16,7 @@
16#include <sys/types.h> 16#include <sys/types.h>
17#include <sys/stat.h> 17#include <sys/stat.h>
18#include <sys/wait.h> 18#include <sys/wait.h>
19#include <sys/eventfd.h>
19#include <fcntl.h> 20#include <fcntl.h>
20#include <stdbool.h> 21#include <stdbool.h>
21#include <errno.h> 22#include <errno.h>
@@ -59,7 +60,6 @@ typedef uint8_t u8;
59/*:*/ 60/*:*/
60 61
61#define PAGE_PRESENT 0x7 /* Present, RW, Execute */ 62#define PAGE_PRESENT 0x7 /* Present, RW, Execute */
62#define NET_PEERNUM 1
63#define BRIDGE_PFX "bridge:" 63#define BRIDGE_PFX "bridge:"
64#ifndef SIOCBRADDIF 64#ifndef SIOCBRADDIF
65#define SIOCBRADDIF 0x89a2 /* add interface to bridge */ 65#define SIOCBRADDIF 0x89a2 /* add interface to bridge */
@@ -76,19 +76,12 @@ static bool verbose;
76 do { if (verbose) printf(args); } while(0) 76 do { if (verbose) printf(args); } while(0)
77/*:*/ 77/*:*/
78 78
79/* File descriptors for the Waker. */
80struct {
81 int pipe[2];
82 int lguest_fd;
83} waker_fds;
84
85/* The pointer to the start of guest memory. */ 79/* The pointer to the start of guest memory. */
86static void *guest_base; 80static void *guest_base;
87/* The maximum guest physical address allowed, and maximum possible. */ 81/* The maximum guest physical address allowed, and maximum possible. */
88static unsigned long guest_limit, guest_max; 82static unsigned long guest_limit, guest_max;
89/* The pipe for signal hander to write to. */ 83/* The /dev/lguest file descriptor. */
90static int timeoutpipe[2]; 84static int lguest_fd;
91static unsigned int timeout_usec = 500;
92 85
93/* a per-cpu variable indicating whose vcpu is currently running */ 86/* a per-cpu variable indicating whose vcpu is currently running */
94static unsigned int __thread cpu_id; 87static unsigned int __thread cpu_id;
@@ -96,11 +89,6 @@ static unsigned int __thread cpu_id;
96/* This is our list of devices. */ 89/* This is our list of devices. */
97struct device_list 90struct device_list
98{ 91{
99 /* Summary information about the devices in our list: ready to pass to
100 * select() to ask which need servicing.*/
101 fd_set infds;
102 int max_infd;
103
104 /* Counter to assign interrupt numbers. */ 92 /* Counter to assign interrupt numbers. */
105 unsigned int next_irq; 93 unsigned int next_irq;
106 94
@@ -126,22 +114,21 @@ struct device
126 /* The linked-list pointer. */ 114 /* The linked-list pointer. */
127 struct device *next; 115 struct device *next;
128 116
129 /* The this device's descriptor, as mapped into the Guest. */ 117 /* The device's descriptor, as mapped into the Guest. */
130 struct lguest_device_desc *desc; 118 struct lguest_device_desc *desc;
131 119
120 /* We can't trust desc values once Guest has booted: we use these. */
121 unsigned int feature_len;
122 unsigned int num_vq;
123
132 /* The name of this device, for --verbose. */ 124 /* The name of this device, for --verbose. */
133 const char *name; 125 const char *name;
134 126
135 /* If handle_input is set, it wants to be called when this file
136 * descriptor is ready. */
137 int fd;
138 bool (*handle_input)(int fd, struct device *me);
139
140 /* Any queues attached to this device */ 127 /* Any queues attached to this device */
141 struct virtqueue *vq; 128 struct virtqueue *vq;
142 129
143 /* Handle status being finalized (ie. feature bits stable). */ 130 /* Is it operational */
144 void (*ready)(struct device *me); 131 bool running;
145 132
146 /* Device-specific data. */ 133 /* Device-specific data. */
147 void *priv; 134 void *priv;
@@ -164,22 +151,28 @@ struct virtqueue
164 /* Last available index we saw. */ 151 /* Last available index we saw. */
165 u16 last_avail_idx; 152 u16 last_avail_idx;
166 153
167 /* The routine to call when the Guest pings us, or timeout. */ 154 /* How many are used since we sent last irq? */
168 void (*handle_output)(int fd, struct virtqueue *me, bool timeout); 155 unsigned int pending_used;
169 156
170 /* Outstanding buffers */ 157 /* Eventfd where Guest notifications arrive. */
171 unsigned int inflight; 158 int eventfd;
172 159
173 /* Is this blocked awaiting a timer? */ 160 /* Function for the thread which is servicing this virtqueue. */
174 bool blocked; 161 void (*service)(struct virtqueue *vq);
162 pid_t thread;
175}; 163};
176 164
177/* Remember the arguments to the program so we can "reboot" */ 165/* Remember the arguments to the program so we can "reboot" */
178static char **main_args; 166static char **main_args;
179 167
180/* Since guest is UP and we don't run at the same time, we don't need barriers. 168/* The original tty settings to restore on exit. */
181 * But I include them in the code in case others copy it. */ 169static struct termios orig_term;
182#define wmb() 170
171/* We have to be careful with barriers: our devices are all run in separate
172 * threads and so we need to make sure that changes visible to the Guest happen
173 * in precise order. */
174#define wmb() __asm__ __volatile__("" : : : "memory")
175#define mb() __asm__ __volatile__("" : : : "memory")
183 176
184/* Convert an iovec element to the given type. 177/* Convert an iovec element to the given type.
185 * 178 *
@@ -245,7 +238,7 @@ static void iov_consume(struct iovec iov[], unsigned num_iov, unsigned len)
245static u8 *get_feature_bits(struct device *dev) 238static u8 *get_feature_bits(struct device *dev)
246{ 239{
247 return (u8 *)(dev->desc + 1) 240 return (u8 *)(dev->desc + 1)
248 + dev->desc->num_vq * sizeof(struct lguest_vqconfig); 241 + dev->num_vq * sizeof(struct lguest_vqconfig);
249} 242}
250 243
251/*L:100 The Launcher code itself takes us out into userspace, that scary place 244/*L:100 The Launcher code itself takes us out into userspace, that scary place
@@ -505,99 +498,19 @@ static void concat(char *dst, char *args[])
505 * saw the arguments it expects when we looked at initialize() in lguest_user.c: 498 * saw the arguments it expects when we looked at initialize() in lguest_user.c:
506 * the base of Guest "physical" memory, the top physical page to allow and the 499 * the base of Guest "physical" memory, the top physical page to allow and the
507 * entry point for the Guest. */ 500 * entry point for the Guest. */
508static int tell_kernel(unsigned long start) 501static void tell_kernel(unsigned long start)
509{ 502{
510 unsigned long args[] = { LHREQ_INITIALIZE, 503 unsigned long args[] = { LHREQ_INITIALIZE,
511 (unsigned long)guest_base, 504 (unsigned long)guest_base,
512 guest_limit / getpagesize(), start }; 505 guest_limit / getpagesize(), start };
513 int fd;
514
515 verbose("Guest: %p - %p (%#lx)\n", 506 verbose("Guest: %p - %p (%#lx)\n",
516 guest_base, guest_base + guest_limit, guest_limit); 507 guest_base, guest_base + guest_limit, guest_limit);
517 fd = open_or_die("/dev/lguest", O_RDWR); 508 lguest_fd = open_or_die("/dev/lguest", O_RDWR);
518 if (write(fd, args, sizeof(args)) < 0) 509 if (write(lguest_fd, args, sizeof(args)) < 0)
519 err(1, "Writing to /dev/lguest"); 510 err(1, "Writing to /dev/lguest");
520
521 /* We return the /dev/lguest file descriptor to control this Guest */
522 return fd;
523} 511}
524/*:*/ 512/*:*/
525 513
526static void add_device_fd(int fd)
527{
528 FD_SET(fd, &devices.infds);
529 if (fd > devices.max_infd)
530 devices.max_infd = fd;
531}
532
533/*L:200
534 * The Waker.
535 *
536 * With console, block and network devices, we can have lots of input which we
537 * need to process. We could try to tell the kernel what file descriptors to
538 * watch, but handing a file descriptor mask through to the kernel is fairly
539 * icky.
540 *
541 * Instead, we clone off a thread which watches the file descriptors and writes
542 * the LHREQ_BREAK command to the /dev/lguest file descriptor to tell the Host
543 * stop running the Guest. This causes the Launcher to return from the
544 * /dev/lguest read with -EAGAIN, where it will write to /dev/lguest to reset
545 * the LHREQ_BREAK and wake us up again.
546 *
547 * This, of course, is merely a different *kind* of icky.
548 *
549 * Given my well-known antipathy to threads, I'd prefer to use processes. But
550 * it's easier to share Guest memory with threads, and trivial to share the
551 * devices.infds as the Launcher changes it.
552 */
553static int waker(void *unused)
554{
555 /* Close the write end of the pipe: only the Launcher has it open. */
556 close(waker_fds.pipe[1]);
557
558 for (;;) {
559 fd_set rfds = devices.infds;
560 unsigned long args[] = { LHREQ_BREAK, 1 };
561 unsigned int maxfd = devices.max_infd;
562
563 /* We also listen to the pipe from the Launcher. */
564 FD_SET(waker_fds.pipe[0], &rfds);
565 if (waker_fds.pipe[0] > maxfd)
566 maxfd = waker_fds.pipe[0];
567
568 /* Wait until input is ready from one of the devices. */
569 select(maxfd+1, &rfds, NULL, NULL, NULL);
570
571 /* Message from Launcher? */
572 if (FD_ISSET(waker_fds.pipe[0], &rfds)) {
573 char c;
574 /* If this fails, then assume Launcher has exited.
575 * Don't do anything on exit: we're just a thread! */
576 if (read(waker_fds.pipe[0], &c, 1) != 1)
577 _exit(0);
578 continue;
579 }
580
581 /* Send LHREQ_BREAK command to snap the Launcher out of it. */
582 pwrite(waker_fds.lguest_fd, args, sizeof(args), cpu_id);
583 }
584 return 0;
585}
586
587/* This routine just sets up a pipe to the Waker process. */
588static void setup_waker(int lguest_fd)
589{
590 /* This pipe is closed when Launcher dies, telling Waker. */
591 if (pipe(waker_fds.pipe) != 0)
592 err(1, "Creating pipe for Waker");
593
594 /* Waker also needs to know the lguest fd */
595 waker_fds.lguest_fd = lguest_fd;
596
597 if (clone(waker, malloc(4096) + 4096, CLONE_VM | SIGCHLD, NULL) == -1)
598 err(1, "Creating Waker");
599}
600
601/* 514/*
602 * Device Handling. 515 * Device Handling.
603 * 516 *
@@ -623,49 +536,90 @@ static void *_check_pointer(unsigned long addr, unsigned int size,
623/* Each buffer in the virtqueues is actually a chain of descriptors. This 536/* Each buffer in the virtqueues is actually a chain of descriptors. This
624 * function returns the next descriptor in the chain, or vq->vring.num if we're 537 * function returns the next descriptor in the chain, or vq->vring.num if we're
625 * at the end. */ 538 * at the end. */
626static unsigned next_desc(struct virtqueue *vq, unsigned int i) 539static unsigned next_desc(struct vring_desc *desc,
540 unsigned int i, unsigned int max)
627{ 541{
628 unsigned int next; 542 unsigned int next;
629 543
630 /* If this descriptor says it doesn't chain, we're done. */ 544 /* If this descriptor says it doesn't chain, we're done. */
631 if (!(vq->vring.desc[i].flags & VRING_DESC_F_NEXT)) 545 if (!(desc[i].flags & VRING_DESC_F_NEXT))
632 return vq->vring.num; 546 return max;
633 547
634 /* Check they're not leading us off end of descriptors. */ 548 /* Check they're not leading us off end of descriptors. */
635 next = vq->vring.desc[i].next; 549 next = desc[i].next;
636 /* Make sure compiler knows to grab that: we don't want it changing! */ 550 /* Make sure compiler knows to grab that: we don't want it changing! */
637 wmb(); 551 wmb();
638 552
639 if (next >= vq->vring.num) 553 if (next >= max)
640 errx(1, "Desc next is %u", next); 554 errx(1, "Desc next is %u", next);
641 555
642 return next; 556 return next;
643} 557}
644 558
559/* This actually sends the interrupt for this virtqueue */
560static void trigger_irq(struct virtqueue *vq)
561{
562 unsigned long buf[] = { LHREQ_IRQ, vq->config.irq };
563
564 /* Don't inform them if nothing used. */
565 if (!vq->pending_used)
566 return;
567 vq->pending_used = 0;
568
569 /* If they don't want an interrupt, don't send one, unless empty. */
570 if ((vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
571 && lg_last_avail(vq) != vq->vring.avail->idx)
572 return;
573
574 /* Send the Guest an interrupt tell them we used something up. */
575 if (write(lguest_fd, buf, sizeof(buf)) != 0)
576 err(1, "Triggering irq %i", vq->config.irq);
577}
578
645/* This looks in the virtqueue and for the first available buffer, and converts 579/* This looks in the virtqueue and for the first available buffer, and converts
646 * it to an iovec for convenient access. Since descriptors consist of some 580 * it to an iovec for convenient access. Since descriptors consist of some
647 * number of output then some number of input descriptors, it's actually two 581 * number of output then some number of input descriptors, it's actually two
648 * iovecs, but we pack them into one and note how many of each there were. 582 * iovecs, but we pack them into one and note how many of each there were.
649 * 583 *
650 * This function returns the descriptor number found, or vq->vring.num (which 584 * This function returns the descriptor number found. */
651 * is never a valid descriptor number) if none was found. */ 585static unsigned wait_for_vq_desc(struct virtqueue *vq,
652static unsigned get_vq_desc(struct virtqueue *vq, 586 struct iovec iov[],
653 struct iovec iov[], 587 unsigned int *out_num, unsigned int *in_num)
654 unsigned int *out_num, unsigned int *in_num)
655{ 588{
656 unsigned int i, head; 589 unsigned int i, head, max;
657 u16 last_avail; 590 struct vring_desc *desc;
591 u16 last_avail = lg_last_avail(vq);
592
593 while (last_avail == vq->vring.avail->idx) {
594 u64 event;
595
596 /* OK, tell Guest about progress up to now. */
597 trigger_irq(vq);
598
599 /* OK, now we need to know about added descriptors. */
600 vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
601
602 /* They could have slipped one in as we were doing that: make
603 * sure it's written, then check again. */
604 mb();
605 if (last_avail != vq->vring.avail->idx) {
606 vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
607 break;
608 }
609
610 /* Nothing new? Wait for eventfd to tell us they refilled. */
611 if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event))
612 errx(1, "Event read failed?");
613
614 /* We don't need to be notified again. */
615 vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
616 }
658 617
659 /* Check it isn't doing very strange things with descriptor numbers. */ 618 /* Check it isn't doing very strange things with descriptor numbers. */
660 last_avail = lg_last_avail(vq);
661 if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num) 619 if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num)
662 errx(1, "Guest moved used index from %u to %u", 620 errx(1, "Guest moved used index from %u to %u",
663 last_avail, vq->vring.avail->idx); 621 last_avail, vq->vring.avail->idx);
664 622
665 /* If there's nothing new since last we looked, return invalid. */
666 if (vq->vring.avail->idx == last_avail)
667 return vq->vring.num;
668
669 /* Grab the next descriptor number they're advertising, and increment 623 /* Grab the next descriptor number they're advertising, and increment
670 * the index we've seen. */ 624 * the index we've seen. */
671 head = vq->vring.avail->ring[last_avail % vq->vring.num]; 625 head = vq->vring.avail->ring[last_avail % vq->vring.num];
@@ -678,15 +632,28 @@ static unsigned get_vq_desc(struct virtqueue *vq,
678 /* When we start there are none of either input nor output. */ 632 /* When we start there are none of either input nor output. */
679 *out_num = *in_num = 0; 633 *out_num = *in_num = 0;
680 634
635 max = vq->vring.num;
636 desc = vq->vring.desc;
681 i = head; 637 i = head;
638
639 /* If this is an indirect entry, then this buffer contains a descriptor
640 * table which we handle as if it's any normal descriptor chain. */
641 if (desc[i].flags & VRING_DESC_F_INDIRECT) {
642 if (desc[i].len % sizeof(struct vring_desc))
643 errx(1, "Invalid size for indirect buffer table");
644
645 max = desc[i].len / sizeof(struct vring_desc);
646 desc = check_pointer(desc[i].addr, desc[i].len);
647 i = 0;
648 }
649
682 do { 650 do {
683 /* Grab the first descriptor, and check it's OK. */ 651 /* Grab the first descriptor, and check it's OK. */
684 iov[*out_num + *in_num].iov_len = vq->vring.desc[i].len; 652 iov[*out_num + *in_num].iov_len = desc[i].len;
685 iov[*out_num + *in_num].iov_base 653 iov[*out_num + *in_num].iov_base
686 = check_pointer(vq->vring.desc[i].addr, 654 = check_pointer(desc[i].addr, desc[i].len);
687 vq->vring.desc[i].len);
688 /* If this is an input descriptor, increment that count. */ 655 /* If this is an input descriptor, increment that count. */
689 if (vq->vring.desc[i].flags & VRING_DESC_F_WRITE) 656 if (desc[i].flags & VRING_DESC_F_WRITE)
690 (*in_num)++; 657 (*in_num)++;
691 else { 658 else {
692 /* If it's an output descriptor, they're all supposed 659 /* If it's an output descriptor, they're all supposed
@@ -697,11 +664,10 @@ static unsigned get_vq_desc(struct virtqueue *vq,
697 } 664 }
698 665
699 /* If we've got too many, that implies a descriptor loop. */ 666 /* If we've got too many, that implies a descriptor loop. */
700 if (*out_num + *in_num > vq->vring.num) 667 if (*out_num + *in_num > max)
701 errx(1, "Looped descriptor"); 668 errx(1, "Looped descriptor");
702 } while ((i = next_desc(vq, i)) != vq->vring.num); 669 } while ((i = next_desc(desc, i, max)) != max);
703 670
704 vq->inflight++;
705 return head; 671 return head;
706} 672}
707 673
@@ -719,44 +685,20 @@ static void add_used(struct virtqueue *vq, unsigned int head, int len)
719 /* Make sure buffer is written before we update index. */ 685 /* Make sure buffer is written before we update index. */
720 wmb(); 686 wmb();
721 vq->vring.used->idx++; 687 vq->vring.used->idx++;
722 vq->inflight--; 688 vq->pending_used++;
723}
724
725/* This actually sends the interrupt for this virtqueue */
726static void trigger_irq(int fd, struct virtqueue *vq)
727{
728 unsigned long buf[] = { LHREQ_IRQ, vq->config.irq };
729
730 /* If they don't want an interrupt, don't send one, unless empty. */
731 if ((vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
732 && vq->inflight)
733 return;
734
735 /* Send the Guest an interrupt tell them we used something up. */
736 if (write(fd, buf, sizeof(buf)) != 0)
737 err(1, "Triggering irq %i", vq->config.irq);
738} 689}
739 690
740/* And here's the combo meal deal. Supersize me! */ 691/* And here's the combo meal deal. Supersize me! */
741static void add_used_and_trigger(int fd, struct virtqueue *vq, 692static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len)
742 unsigned int head, int len)
743{ 693{
744 add_used(vq, head, len); 694 add_used(vq, head, len);
745 trigger_irq(fd, vq); 695 trigger_irq(vq);
746} 696}
747 697
748/* 698/*
749 * The Console 699 * The Console
750 * 700 *
751 * Here is the input terminal setting we save, and the routine to restore them 701 * We associate some data with the console for our exit hack. */
752 * on exit so the user gets their terminal back. */
753static struct termios orig_term;
754static void restore_term(void)
755{
756 tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
757}
758
759/* We associate some data with the console for our exit hack. */
760struct console_abort 702struct console_abort
761{ 703{
762 /* How many times have they hit ^C? */ 704 /* How many times have they hit ^C? */
@@ -766,276 +708,275 @@ struct console_abort
766}; 708};
767 709
768/* This is the routine which handles console input (ie. stdin). */ 710/* This is the routine which handles console input (ie. stdin). */
769static bool handle_console_input(int fd, struct device *dev) 711static void console_input(struct virtqueue *vq)
770{ 712{
771 int len; 713 int len;
772 unsigned int head, in_num, out_num; 714 unsigned int head, in_num, out_num;
773 struct iovec iov[dev->vq->vring.num]; 715 struct console_abort *abort = vq->dev->priv;
774 struct console_abort *abort = dev->priv; 716 struct iovec iov[vq->vring.num];
775
776 /* First we need a console buffer from the Guests's input virtqueue. */
777 head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
778
779 /* If they're not ready for input, stop listening to this file
780 * descriptor. We'll start again once they add an input buffer. */
781 if (head == dev->vq->vring.num)
782 return false;
783 717
718 /* Make sure there's a descriptor waiting. */
719 head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
784 if (out_num) 720 if (out_num)
785 errx(1, "Output buffers in console in queue?"); 721 errx(1, "Output buffers in console in queue?");
786 722
787 /* This is why we convert to iovecs: the readv() call uses them, and so 723 /* Read it in. */
788 * it reads straight into the Guest's buffer. */ 724 len = readv(STDIN_FILENO, iov, in_num);
789 len = readv(dev->fd, iov, in_num);
790 if (len <= 0) { 725 if (len <= 0) {
791 /* This implies that the console is closed, is /dev/null, or 726 /* Ran out of input? */
792 * something went terribly wrong. */
793 warnx("Failed to get console input, ignoring console."); 727 warnx("Failed to get console input, ignoring console.");
794 /* Put the input terminal back. */ 728 /* For simplicity, dying threads kill the whole Launcher. So
795 restore_term(); 729 * just nap here. */
796 /* Remove callback from input vq, so it doesn't restart us. */ 730 for (;;)
797 dev->vq->handle_output = NULL; 731 pause();
798 /* Stop listening to this fd: don't call us again. */
799 return false;
800 } 732 }
801 733
802 /* Tell the Guest about the new input. */ 734 add_used_and_trigger(vq, head, len);
803 add_used_and_trigger(fd, dev->vq, head, len);
804 735
805 /* Three ^C within one second? Exit. 736 /* Three ^C within one second? Exit.
806 * 737 *
807 * This is such a hack, but works surprisingly well. Each ^C has to be 738 * This is such a hack, but works surprisingly well. Each ^C has to
808 * in a buffer by itself, so they can't be too fast. But we check that 739 * be in a buffer by itself, so they can't be too fast. But we check
809 * we get three within about a second, so they can't be too slow. */ 740 * that we get three within about a second, so they can't be too
810 if (len == 1 && ((char *)iov[0].iov_base)[0] == 3) { 741 * slow. */
811 if (!abort->count++) 742 if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) {
812 gettimeofday(&abort->start, NULL);
813 else if (abort->count == 3) {
814 struct timeval now;
815 gettimeofday(&now, NULL);
816 if (now.tv_sec <= abort->start.tv_sec+1) {
817 unsigned long args[] = { LHREQ_BREAK, 0 };
818 /* Close the fd so Waker will know it has to
819 * exit. */
820 close(waker_fds.pipe[1]);
821 /* Just in case Waker is blocked in BREAK, send
822 * unbreak now. */
823 write(fd, args, sizeof(args));
824 exit(2);
825 }
826 abort->count = 0;
827 }
828 } else
829 /* Any other key resets the abort counter. */
830 abort->count = 0; 743 abort->count = 0;
744 return;
745 }
831 746
832 /* Everything went OK! */ 747 abort->count++;
833 return true; 748 if (abort->count == 1)
749 gettimeofday(&abort->start, NULL);
750 else if (abort->count == 3) {
751 struct timeval now;
752 gettimeofday(&now, NULL);
753 /* Kill all Launcher processes with SIGINT, like normal ^C */
754 if (now.tv_sec <= abort->start.tv_sec+1)
755 kill(0, SIGINT);
756 abort->count = 0;
757 }
834} 758}
835 759
836/* Handling output for console is simple: we just get all the output buffers 760/* This is the routine which handles console output (ie. stdout). */
837 * and write them to stdout. */ 761static void console_output(struct virtqueue *vq)
838static void handle_console_output(int fd, struct virtqueue *vq, bool timeout)
839{ 762{
840 unsigned int head, out, in; 763 unsigned int head, out, in;
841 int len;
842 struct iovec iov[vq->vring.num]; 764 struct iovec iov[vq->vring.num];
843 765
844 /* Keep getting output buffers from the Guest until we run out. */ 766 head = wait_for_vq_desc(vq, iov, &out, &in);
845 while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) { 767 if (in)
846 if (in) 768 errx(1, "Input buffers in console output queue?");
847 errx(1, "Input buffers in output queue?"); 769 while (!iov_empty(iov, out)) {
848 len = writev(STDOUT_FILENO, iov, out); 770 int len = writev(STDOUT_FILENO, iov, out);
849 add_used_and_trigger(fd, vq, head, len); 771 if (len <= 0)
772 err(1, "Write to stdout gave %i", len);
773 iov_consume(iov, out, len);
850 } 774 }
851} 775 add_used(vq, head, 0);
852
853/* This is called when we no longer want to hear about Guest changes to a
854 * virtqueue. This is more efficient in high-traffic cases, but it means we
855 * have to set a timer to check if any more changes have occurred. */
856static void block_vq(struct virtqueue *vq)
857{
858 struct itimerval itm;
859
860 vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
861 vq->blocked = true;
862
863 itm.it_interval.tv_sec = 0;
864 itm.it_interval.tv_usec = 0;
865 itm.it_value.tv_sec = 0;
866 itm.it_value.tv_usec = timeout_usec;
867
868 setitimer(ITIMER_REAL, &itm, NULL);
869} 776}
870 777
871/* 778/*
872 * The Network 779 * The Network
873 * 780 *
874 * Handling output for network is also simple: we get all the output buffers 781 * Handling output for network is also simple: we get all the output buffers
875 * and write them (ignoring the first element) to this device's file descriptor 782 * and write them to /dev/net/tun.
876 * (/dev/net/tun).
877 */ 783 */
878static void handle_net_output(int fd, struct virtqueue *vq, bool timeout) 784struct net_info {
785 int tunfd;
786};
787
788static void net_output(struct virtqueue *vq)
879{ 789{
880 unsigned int head, out, in, num = 0; 790 struct net_info *net_info = vq->dev->priv;
881 int len; 791 unsigned int head, out, in;
882 struct iovec iov[vq->vring.num]; 792 struct iovec iov[vq->vring.num];
883 static int last_timeout_num;
884
885 /* Keep getting output buffers from the Guest until we run out. */
886 while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) {
887 if (in)
888 errx(1, "Input buffers in output queue?");
889 len = writev(vq->dev->fd, iov, out);
890 if (len < 0)
891 err(1, "Writing network packet to tun");
892 add_used_and_trigger(fd, vq, head, len);
893 num++;
894 }
895 793
896 /* Block further kicks and set up a timer if we saw anything. */ 794 head = wait_for_vq_desc(vq, iov, &out, &in);
897 if (!timeout && num) 795 if (in)
898 block_vq(vq); 796 errx(1, "Input buffers in net output queue?");
899 797 if (writev(net_info->tunfd, iov, out) < 0)
900 /* We never quite know how long should we wait before we check the 798 errx(1, "Write to tun failed?");
901 * queue again for more packets. We start at 500 microseconds, and if 799 add_used(vq, head, 0);
902 * we get fewer packets than last time, we assume we made the timeout 800}
903 * too small and increase it by 10 microseconds. Otherwise, we drop it 801
904 * by one microsecond every time. It seems to work well enough. */ 802/* Will reading from this file descriptor block? */
905 if (timeout) { 803static bool will_block(int fd)
906 if (num < last_timeout_num) 804{
907 timeout_usec += 10; 805 fd_set fdset;
908 else if (timeout_usec > 1) 806 struct timeval zero = { 0, 0 };
909 timeout_usec--; 807 FD_ZERO(&fdset);
910 last_timeout_num = num; 808 FD_SET(fd, &fdset);
911 } 809 return select(fd+1, &fdset, NULL, NULL, &zero) != 1;
912} 810}
913 811
914/* This is where we handle a packet coming in from the tun device to our 812/* This is where we handle packets coming in from the tun device to our
915 * Guest. */ 813 * Guest. */
916static bool handle_tun_input(int fd, struct device *dev) 814static void net_input(struct virtqueue *vq)
917{ 815{
918 unsigned int head, in_num, out_num;
919 int len; 816 int len;
920 struct iovec iov[dev->vq->vring.num]; 817 unsigned int head, out, in;
921 818 struct iovec iov[vq->vring.num];
922 /* First we need a network buffer from the Guests's recv virtqueue. */ 819 struct net_info *net_info = vq->dev->priv;
923 head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
924 if (head == dev->vq->vring.num) {
925 /* Now, it's expected that if we try to send a packet too
926 * early, the Guest won't be ready yet. Wait until the device
927 * status says it's ready. */
928 /* FIXME: Actually want DRIVER_ACTIVE here. */
929
930 /* Now tell it we want to know if new things appear. */
931 dev->vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
932 wmb();
933
934 /* We'll turn this back on if input buffers are registered. */
935 return false;
936 } else if (out_num)
937 errx(1, "Output buffers in network recv queue?");
938
939 /* Read the packet from the device directly into the Guest's buffer. */
940 len = readv(dev->fd, iov, in_num);
941 if (len <= 0)
942 err(1, "reading network");
943 820
944 /* Tell the Guest about the new packet. */ 821 head = wait_for_vq_desc(vq, iov, &out, &in);
945 add_used_and_trigger(fd, dev->vq, head, len); 822 if (out)
823 errx(1, "Output buffers in net input queue?");
946 824
947 verbose("tun input packet len %i [%02x %02x] (%s)\n", len, 825 /* Deliver interrupt now, since we're about to sleep. */
948 ((u8 *)iov[1].iov_base)[0], ((u8 *)iov[1].iov_base)[1], 826 if (vq->pending_used && will_block(net_info->tunfd))
949 head != dev->vq->vring.num ? "sent" : "discarded"); 827 trigger_irq(vq);
950 828
951 /* All good. */ 829 len = readv(net_info->tunfd, iov, in);
952 return true; 830 if (len <= 0)
831 err(1, "Failed to read from tun.");
832 add_used(vq, head, len);
953} 833}
954 834
955/*L:215 This is the callback attached to the network and console input 835/* This is the helper to create threads. */
956 * virtqueues: it ensures we try again, in case we stopped console or net 836static int do_thread(void *_vq)
957 * delivery because Guest didn't have any buffers. */
958static void enable_fd(int fd, struct virtqueue *vq, bool timeout)
959{ 837{
960 add_device_fd(vq->dev->fd); 838 struct virtqueue *vq = _vq;
961 /* Snap the Waker out of its select loop. */ 839
962 write(waker_fds.pipe[1], "", 1); 840 for (;;)
841 vq->service(vq);
842 return 0;
963} 843}
964 844
965static void net_enable_fd(int fd, struct virtqueue *vq, bool timeout) 845/* When a child dies, we kill our entire process group with SIGTERM. This
846 * also has the side effect that the shell restores the console for us! */
847static void kill_launcher(int signal)
966{ 848{
967 /* We don't need to know again when Guest refills receive buffer. */ 849 kill(0, SIGTERM);
968 vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
969 enable_fd(fd, vq, timeout);
970} 850}
971 851
972/* When the Guest tells us they updated the status field, we handle it. */ 852static void reset_device(struct device *dev)
973static void update_device_status(struct device *dev)
974{ 853{
975 struct virtqueue *vq; 854 struct virtqueue *vq;
976 855
977 /* This is a reset. */ 856 verbose("Resetting device %s\n", dev->name);
978 if (dev->desc->status == 0) {
979 verbose("Resetting device %s\n", dev->name);
980 857
981 /* Clear any features they've acked. */ 858 /* Clear any features they've acked. */
982 memset(get_feature_bits(dev) + dev->desc->feature_len, 0, 859 memset(get_feature_bits(dev) + dev->feature_len, 0, dev->feature_len);
983 dev->desc->feature_len);
984 860
985 /* Zero out the virtqueues. */ 861 /* We're going to be explicitly killing threads, so ignore them. */
986 for (vq = dev->vq; vq; vq = vq->next) { 862 signal(SIGCHLD, SIG_IGN);
987 memset(vq->vring.desc, 0, 863
988 vring_size(vq->config.num, LGUEST_VRING_ALIGN)); 864 /* Zero out the virtqueues, get rid of their threads */
989 lg_last_avail(vq) = 0; 865 for (vq = dev->vq; vq; vq = vq->next) {
866 if (vq->thread != (pid_t)-1) {
867 kill(vq->thread, SIGTERM);
868 waitpid(vq->thread, NULL, 0);
869 vq->thread = (pid_t)-1;
990 } 870 }
991 } else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) { 871 memset(vq->vring.desc, 0,
872 vring_size(vq->config.num, LGUEST_VRING_ALIGN));
873 lg_last_avail(vq) = 0;
874 }
875 dev->running = false;
876
877 /* Now we care if threads die. */
878 signal(SIGCHLD, (void *)kill_launcher);
879}
880
881static void create_thread(struct virtqueue *vq)
882{
883 /* Create stack for thread and run it. Since stack grows
884 * upwards, we point the stack pointer to the end of this
885 * region. */
886 char *stack = malloc(32768);
887 unsigned long args[] = { LHREQ_EVENTFD,
888 vq->config.pfn*getpagesize(), 0 };
889
890 /* Create a zero-initialized eventfd. */
891 vq->eventfd = eventfd(0, 0);
892 if (vq->eventfd < 0)
893 err(1, "Creating eventfd");
894 args[2] = vq->eventfd;
895
896 /* Attach an eventfd to this virtqueue: it will go off
897 * when the Guest does an LHCALL_NOTIFY for this vq. */
898 if (write(lguest_fd, &args, sizeof(args)) != 0)
899 err(1, "Attaching eventfd");
900
901 /* CLONE_VM: because it has to access the Guest memory, and
902 * SIGCHLD so we get a signal if it dies. */
903 vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq);
904 if (vq->thread == (pid_t)-1)
905 err(1, "Creating clone");
906 /* We close our local copy, now the child has it. */
907 close(vq->eventfd);
908}
909
910static void start_device(struct device *dev)
911{
912 unsigned int i;
913 struct virtqueue *vq;
914
915 verbose("Device %s OK: offered", dev->name);
916 for (i = 0; i < dev->feature_len; i++)
917 verbose(" %02x", get_feature_bits(dev)[i]);
918 verbose(", accepted");
919 for (i = 0; i < dev->feature_len; i++)
920 verbose(" %02x", get_feature_bits(dev)
921 [dev->feature_len+i]);
922
923 for (vq = dev->vq; vq; vq = vq->next) {
924 if (vq->service)
925 create_thread(vq);
926 }
927 dev->running = true;
928}
929
930static void cleanup_devices(void)
931{
932 struct device *dev;
933
934 for (dev = devices.dev; dev; dev = dev->next)
935 reset_device(dev);
936
937 /* If we saved off the original terminal settings, restore them now. */
938 if (orig_term.c_lflag & (ISIG|ICANON|ECHO))
939 tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
940}
941
942/* When the Guest tells us they updated the status field, we handle it. */
943static void update_device_status(struct device *dev)
944{
945 /* A zero status is a reset, otherwise it's a set of flags. */
946 if (dev->desc->status == 0)
947 reset_device(dev);
948 else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) {
992 warnx("Device %s configuration FAILED", dev->name); 949 warnx("Device %s configuration FAILED", dev->name);
950 if (dev->running)
951 reset_device(dev);
993 } else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) { 952 } else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) {
994 unsigned int i; 953 if (!dev->running)
995 954 start_device(dev);
996 verbose("Device %s OK: offered", dev->name);
997 for (i = 0; i < dev->desc->feature_len; i++)
998 verbose(" %02x", get_feature_bits(dev)[i]);
999 verbose(", accepted");
1000 for (i = 0; i < dev->desc->feature_len; i++)
1001 verbose(" %02x", get_feature_bits(dev)
1002 [dev->desc->feature_len+i]);
1003
1004 if (dev->ready)
1005 dev->ready(dev);
1006 } 955 }
1007} 956}
1008 957
1009/* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */ 958/* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */
1010static void handle_output(int fd, unsigned long addr) 959static void handle_output(unsigned long addr)
1011{ 960{
1012 struct device *i; 961 struct device *i;
1013 struct virtqueue *vq;
1014 962
1015 /* Check each device and virtqueue. */ 963 /* Check each device. */
1016 for (i = devices.dev; i; i = i->next) { 964 for (i = devices.dev; i; i = i->next) {
965 struct virtqueue *vq;
966
1017 /* Notifications to device descriptors update device status. */ 967 /* Notifications to device descriptors update device status. */
1018 if (from_guest_phys(addr) == i->desc) { 968 if (from_guest_phys(addr) == i->desc) {
1019 update_device_status(i); 969 update_device_status(i);
1020 return; 970 return;
1021 } 971 }
1022 972
1023 /* Notifications to virtqueues mean output has occurred. */ 973 /* Devices *can* be used before status is set to DRIVER_OK. */
1024 for (vq = i->vq; vq; vq = vq->next) { 974 for (vq = i->vq; vq; vq = vq->next) {
1025 if (vq->config.pfn != addr/getpagesize()) 975 if (addr != vq->config.pfn*getpagesize())
1026 continue; 976 continue;
1027 977 if (i->running)
1028 /* Guest should acknowledge (and set features!) before 978 errx(1, "Notification on running %s", i->name);
1029 * using the device. */ 979 start_device(i);
1030 if (i->desc->status == 0) {
1031 warnx("%s gave early output", i->name);
1032 return;
1033 }
1034
1035 if (strcmp(vq->dev->name, "console") != 0)
1036 verbose("Output to %s\n", vq->dev->name);
1037 if (vq->handle_output)
1038 vq->handle_output(fd, vq, false);
1039 return; 980 return;
1040 } 981 }
1041 } 982 }
@@ -1049,71 +990,6 @@ static void handle_output(int fd, unsigned long addr)
1049 strnlen(from_guest_phys(addr), guest_limit - addr)); 990 strnlen(from_guest_phys(addr), guest_limit - addr));
1050} 991}
1051 992
1052static void handle_timeout(int fd)
1053{
1054 char buf[32];
1055 struct device *i;
1056 struct virtqueue *vq;
1057
1058 /* Clear the pipe */
1059 read(timeoutpipe[0], buf, sizeof(buf));
1060
1061 /* Check each device and virtqueue: flush blocked ones. */
1062 for (i = devices.dev; i; i = i->next) {
1063 for (vq = i->vq; vq; vq = vq->next) {
1064 if (!vq->blocked)
1065 continue;
1066
1067 vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
1068 vq->blocked = false;
1069 if (vq->handle_output)
1070 vq->handle_output(fd, vq, true);
1071 }
1072 }
1073}
1074
1075/* This is called when the Waker wakes us up: check for incoming file
1076 * descriptors. */
1077static void handle_input(int fd)
1078{
1079 /* select() wants a zeroed timeval to mean "don't wait". */
1080 struct timeval poll = { .tv_sec = 0, .tv_usec = 0 };
1081
1082 for (;;) {
1083 struct device *i;
1084 fd_set fds = devices.infds;
1085 int num;
1086
1087 num = select(devices.max_infd+1, &fds, NULL, NULL, &poll);
1088 /* Could get interrupted */
1089 if (num < 0)
1090 continue;
1091 /* If nothing is ready, we're done. */
1092 if (num == 0)
1093 break;
1094
1095 /* Otherwise, call the device(s) which have readable file
1096 * descriptors and a method of handling them. */
1097 for (i = devices.dev; i; i = i->next) {
1098 if (i->handle_input && FD_ISSET(i->fd, &fds)) {
1099 if (i->handle_input(fd, i))
1100 continue;
1101
1102 /* If handle_input() returns false, it means we
1103 * should no longer service it. Networking and
1104 * console do this when there's no input
1105 * buffers to deliver into. Console also uses
1106 * it when it discovers that stdin is closed. */
1107 FD_CLR(i->fd, &devices.infds);
1108 }
1109 }
1110
1111 /* Is this the timeout fd? */
1112 if (FD_ISSET(timeoutpipe[0], &fds))
1113 handle_timeout(fd);
1114 }
1115}
1116
1117/*L:190 993/*L:190
1118 * Device Setup 994 * Device Setup
1119 * 995 *
@@ -1129,8 +1005,8 @@ static void handle_input(int fd)
1129static u8 *device_config(const struct device *dev) 1005static u8 *device_config(const struct device *dev)
1130{ 1006{
1131 return (void *)(dev->desc + 1) 1007 return (void *)(dev->desc + 1)
1132 + dev->desc->num_vq * sizeof(struct lguest_vqconfig) 1008 + dev->num_vq * sizeof(struct lguest_vqconfig)
1133 + dev->desc->feature_len * 2; 1009 + dev->feature_len * 2;
1134} 1010}
1135 1011
1136/* This routine allocates a new "struct lguest_device_desc" from descriptor 1012/* This routine allocates a new "struct lguest_device_desc" from descriptor
@@ -1159,7 +1035,7 @@ static struct lguest_device_desc *new_dev_desc(u16 type)
1159/* Each device descriptor is followed by the description of its virtqueues. We 1035/* Each device descriptor is followed by the description of its virtqueues. We
1160 * specify how many descriptors the virtqueue is to have. */ 1036 * specify how many descriptors the virtqueue is to have. */
1161static void add_virtqueue(struct device *dev, unsigned int num_descs, 1037static void add_virtqueue(struct device *dev, unsigned int num_descs,
1162 void (*handle_output)(int, struct virtqueue *, bool)) 1038 void (*service)(struct virtqueue *))
1163{ 1039{
1164 unsigned int pages; 1040 unsigned int pages;
1165 struct virtqueue **i, *vq = malloc(sizeof(*vq)); 1041 struct virtqueue **i, *vq = malloc(sizeof(*vq));
@@ -1174,8 +1050,8 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
1174 vq->next = NULL; 1050 vq->next = NULL;
1175 vq->last_avail_idx = 0; 1051 vq->last_avail_idx = 0;
1176 vq->dev = dev; 1052 vq->dev = dev;
1177 vq->inflight = 0; 1053 vq->service = service;
1178 vq->blocked = false; 1054 vq->thread = (pid_t)-1;
1179 1055
1180 /* Initialize the configuration. */ 1056 /* Initialize the configuration. */
1181 vq->config.num = num_descs; 1057 vq->config.num = num_descs;
@@ -1191,6 +1067,7 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
1191 * yet, otherwise we'd be overwriting them. */ 1067 * yet, otherwise we'd be overwriting them. */
1192 assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0); 1068 assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0);
1193 memcpy(device_config(dev), &vq->config, sizeof(vq->config)); 1069 memcpy(device_config(dev), &vq->config, sizeof(vq->config));
1070 dev->num_vq++;
1194 dev->desc->num_vq++; 1071 dev->desc->num_vq++;
1195 1072
1196 verbose("Virtqueue page %#lx\n", to_guest_phys(p)); 1073 verbose("Virtqueue page %#lx\n", to_guest_phys(p));
@@ -1199,15 +1076,6 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
1199 * second. */ 1076 * second. */
1200 for (i = &dev->vq; *i; i = &(*i)->next); 1077 for (i = &dev->vq; *i; i = &(*i)->next);
1201 *i = vq; 1078 *i = vq;
1202
1203 /* Set the routine to call when the Guest does something to this
1204 * virtqueue. */
1205 vq->handle_output = handle_output;
1206
1207 /* As an optimization, set the advisory "Don't Notify Me" flag if we
1208 * don't have a handler */
1209 if (!handle_output)
1210 vq->vring.used->flags = VRING_USED_F_NO_NOTIFY;
1211} 1079}
1212 1080
1213/* The first half of the feature bitmask is for us to advertise features. The 1081/* The first half of the feature bitmask is for us to advertise features. The
@@ -1219,7 +1087,7 @@ static void add_feature(struct device *dev, unsigned bit)
1219 /* We can't extend the feature bits once we've added config bytes */ 1087 /* We can't extend the feature bits once we've added config bytes */
1220 if (dev->desc->feature_len <= bit / CHAR_BIT) { 1088 if (dev->desc->feature_len <= bit / CHAR_BIT) {
1221 assert(dev->desc->config_len == 0); 1089 assert(dev->desc->config_len == 0);
1222 dev->desc->feature_len = (bit / CHAR_BIT) + 1; 1090 dev->feature_len = dev->desc->feature_len = (bit/CHAR_BIT) + 1;
1223 } 1091 }
1224 1092
1225 features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT)); 1093 features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT));
@@ -1243,22 +1111,17 @@ static void set_config(struct device *dev, unsigned len, const void *conf)
1243 * calling new_dev_desc() to allocate the descriptor and device memory. 1111 * calling new_dev_desc() to allocate the descriptor and device memory.
1244 * 1112 *
1245 * See what I mean about userspace being boring? */ 1113 * See what I mean about userspace being boring? */
1246static struct device *new_device(const char *name, u16 type, int fd, 1114static struct device *new_device(const char *name, u16 type)
1247 bool (*handle_input)(int, struct device *))
1248{ 1115{
1249 struct device *dev = malloc(sizeof(*dev)); 1116 struct device *dev = malloc(sizeof(*dev));
1250 1117
1251 /* Now we populate the fields one at a time. */ 1118 /* Now we populate the fields one at a time. */
1252 dev->fd = fd;
1253 /* If we have an input handler for this file descriptor, then we add it
1254 * to the device_list's fdset and maxfd. */
1255 if (handle_input)
1256 add_device_fd(dev->fd);
1257 dev->desc = new_dev_desc(type); 1119 dev->desc = new_dev_desc(type);
1258 dev->handle_input = handle_input;
1259 dev->name = name; 1120 dev->name = name;
1260 dev->vq = NULL; 1121 dev->vq = NULL;
1261 dev->ready = NULL; 1122 dev->feature_len = 0;
1123 dev->num_vq = 0;
1124 dev->running = false;
1262 1125
1263 /* Append to device list. Prepending to a single-linked list is 1126 /* Append to device list. Prepending to a single-linked list is
1264 * easier, but the user expects the devices to be arranged on the bus 1127 * easier, but the user expects the devices to be arranged on the bus
@@ -1286,13 +1149,10 @@ static void setup_console(void)
1286 * raw input stream to the Guest. */ 1149 * raw input stream to the Guest. */
1287 term.c_lflag &= ~(ISIG|ICANON|ECHO); 1150 term.c_lflag &= ~(ISIG|ICANON|ECHO);
1288 tcsetattr(STDIN_FILENO, TCSANOW, &term); 1151 tcsetattr(STDIN_FILENO, TCSANOW, &term);
1289 /* If we exit gracefully, the original settings will be
1290 * restored so the user can see what they're typing. */
1291 atexit(restore_term);
1292 } 1152 }
1293 1153
1294 dev = new_device("console", VIRTIO_ID_CONSOLE, 1154 dev = new_device("console", VIRTIO_ID_CONSOLE);
1295 STDIN_FILENO, handle_console_input); 1155
1296 /* We store the console state in dev->priv, and initialize it. */ 1156 /* We store the console state in dev->priv, and initialize it. */
1297 dev->priv = malloc(sizeof(struct console_abort)); 1157 dev->priv = malloc(sizeof(struct console_abort));
1298 ((struct console_abort *)dev->priv)->count = 0; 1158 ((struct console_abort *)dev->priv)->count = 0;
@@ -1301,31 +1161,13 @@ static void setup_console(void)
1301 * they put something the input queue, we make sure we're listening to 1161 * they put something the input queue, we make sure we're listening to
1302 * stdin. When they put something in the output queue, we write it to 1162 * stdin. When they put something in the output queue, we write it to
1303 * stdout. */ 1163 * stdout. */
1304 add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd); 1164 add_virtqueue(dev, VIRTQUEUE_NUM, console_input);
1305 add_virtqueue(dev, VIRTQUEUE_NUM, handle_console_output); 1165 add_virtqueue(dev, VIRTQUEUE_NUM, console_output);
1306 1166
1307 verbose("device %u: console\n", devices.device_num++); 1167 verbose("device %u: console\n", ++devices.device_num);
1308} 1168}
1309/*:*/ 1169/*:*/
1310 1170
1311static void timeout_alarm(int sig)
1312{
1313 write(timeoutpipe[1], "", 1);
1314}
1315
1316static void setup_timeout(void)
1317{
1318 if (pipe(timeoutpipe) != 0)
1319 err(1, "Creating timeout pipe");
1320
1321 if (fcntl(timeoutpipe[1], F_SETFL,
1322 fcntl(timeoutpipe[1], F_GETFL) | O_NONBLOCK) != 0)
1323 err(1, "Making timeout pipe nonblocking");
1324
1325 add_device_fd(timeoutpipe[0]);
1326 signal(SIGALRM, timeout_alarm);
1327}
1328
1329/*M:010 Inter-guest networking is an interesting area. Simplest is to have a 1171/*M:010 Inter-guest networking is an interesting area. Simplest is to have a
1330 * --sharenet=<name> option which opens or creates a named pipe. This can be 1172 * --sharenet=<name> option which opens or creates a named pipe. This can be
1331 * used to send packets to another guest in a 1:1 manner. 1173 * used to send packets to another guest in a 1:1 manner.
@@ -1447,21 +1289,23 @@ static int get_tun_device(char tapif[IFNAMSIZ])
1447static void setup_tun_net(char *arg) 1289static void setup_tun_net(char *arg)
1448{ 1290{
1449 struct device *dev; 1291 struct device *dev;
1450 int netfd, ipfd; 1292 struct net_info *net_info = malloc(sizeof(*net_info));
1293 int ipfd;
1451 u32 ip = INADDR_ANY; 1294 u32 ip = INADDR_ANY;
1452 bool bridging = false; 1295 bool bridging = false;
1453 char tapif[IFNAMSIZ], *p; 1296 char tapif[IFNAMSIZ], *p;
1454 struct virtio_net_config conf; 1297 struct virtio_net_config conf;
1455 1298
1456 netfd = get_tun_device(tapif); 1299 net_info->tunfd = get_tun_device(tapif);
1457 1300
1458 /* First we create a new network device. */ 1301 /* First we create a new network device. */
1459 dev = new_device("net", VIRTIO_ID_NET, netfd, handle_tun_input); 1302 dev = new_device("net", VIRTIO_ID_NET);
1303 dev->priv = net_info;
1460 1304
1461 /* Network devices need a receive and a send queue, just like 1305 /* Network devices need a receive and a send queue, just like
1462 * console. */ 1306 * console. */
1463 add_virtqueue(dev, VIRTQUEUE_NUM, net_enable_fd); 1307 add_virtqueue(dev, VIRTQUEUE_NUM, net_input);
1464 add_virtqueue(dev, VIRTQUEUE_NUM, handle_net_output); 1308 add_virtqueue(dev, VIRTQUEUE_NUM, net_output);
1465 1309
1466 /* We need a socket to perform the magic network ioctls to bring up the 1310 /* We need a socket to perform the magic network ioctls to bring up the
1467 * tap interface, connect to the bridge etc. Any socket will do! */ 1311 * tap interface, connect to the bridge etc. Any socket will do! */
@@ -1502,6 +1346,8 @@ static void setup_tun_net(char *arg)
1502 add_feature(dev, VIRTIO_NET_F_HOST_TSO4); 1346 add_feature(dev, VIRTIO_NET_F_HOST_TSO4);
1503 add_feature(dev, VIRTIO_NET_F_HOST_TSO6); 1347 add_feature(dev, VIRTIO_NET_F_HOST_TSO6);
1504 add_feature(dev, VIRTIO_NET_F_HOST_ECN); 1348 add_feature(dev, VIRTIO_NET_F_HOST_ECN);
1349 /* We handle indirect ring entries */
1350 add_feature(dev, VIRTIO_RING_F_INDIRECT_DESC);
1505 set_config(dev, sizeof(conf), &conf); 1351 set_config(dev, sizeof(conf), &conf);
1506 1352
1507 /* We don't need the socket any more; setup is done. */ 1353 /* We don't need the socket any more; setup is done. */
@@ -1550,20 +1396,18 @@ struct vblk_info
1550 * Remember that the block device is handled by a separate I/O thread. We head 1396 * Remember that the block device is handled by a separate I/O thread. We head
1551 * straight into the core of that thread here: 1397 * straight into the core of that thread here:
1552 */ 1398 */
1553static bool service_io(struct device *dev) 1399static void blk_request(struct virtqueue *vq)
1554{ 1400{
1555 struct vblk_info *vblk = dev->priv; 1401 struct vblk_info *vblk = vq->dev->priv;
1556 unsigned int head, out_num, in_num, wlen; 1402 unsigned int head, out_num, in_num, wlen;
1557 int ret; 1403 int ret;
1558 u8 *in; 1404 u8 *in;
1559 struct virtio_blk_outhdr *out; 1405 struct virtio_blk_outhdr *out;
1560 struct iovec iov[dev->vq->vring.num]; 1406 struct iovec iov[vq->vring.num];
1561 off64_t off; 1407 off64_t off;
1562 1408
1563 /* See if there's a request waiting. If not, nothing to do. */ 1409 /* Get the next request. */
1564 head = get_vq_desc(dev->vq, iov, &out_num, &in_num); 1410 head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
1565 if (head == dev->vq->vring.num)
1566 return false;
1567 1411
1568 /* Every block request should contain at least one output buffer 1412 /* Every block request should contain at least one output buffer
1569 * (detailing the location on disk and the type of request) and one 1413 * (detailing the location on disk and the type of request) and one
@@ -1637,83 +1481,21 @@ static bool service_io(struct device *dev)
1637 if (out->type & VIRTIO_BLK_T_BARRIER) 1481 if (out->type & VIRTIO_BLK_T_BARRIER)
1638 fdatasync(vblk->fd); 1482 fdatasync(vblk->fd);
1639 1483
1640 /* We can't trigger an IRQ, because we're not the Launcher. It does 1484 add_used(vq, head, wlen);
1641 * that when we tell it we're done. */
1642 add_used(dev->vq, head, wlen);
1643 return true;
1644}
1645
1646/* This is the thread which actually services the I/O. */
1647static int io_thread(void *_dev)
1648{
1649 struct device *dev = _dev;
1650 struct vblk_info *vblk = dev->priv;
1651 char c;
1652
1653 /* Close other side of workpipe so we get 0 read when main dies. */
1654 close(vblk->workpipe[1]);
1655 /* Close the other side of the done_fd pipe. */
1656 close(dev->fd);
1657
1658 /* When this read fails, it means Launcher died, so we follow. */
1659 while (read(vblk->workpipe[0], &c, 1) == 1) {
1660 /* We acknowledge each request immediately to reduce latency,
1661 * rather than waiting until we've done them all. I haven't
1662 * measured to see if it makes any difference.
1663 *
1664 * That would be an interesting test, wouldn't it? You could
1665 * also try having more than one I/O thread. */
1666 while (service_io(dev))
1667 write(vblk->done_fd, &c, 1);
1668 }
1669 return 0;
1670}
1671
1672/* Now we've seen the I/O thread, we return to the Launcher to see what happens
1673 * when that thread tells us it's completed some I/O. */
1674static bool handle_io_finish(int fd, struct device *dev)
1675{
1676 char c;
1677
1678 /* If the I/O thread died, presumably it printed the error, so we
1679 * simply exit. */
1680 if (read(dev->fd, &c, 1) != 1)
1681 exit(1);
1682
1683 /* It did some work, so trigger the irq. */
1684 trigger_irq(fd, dev->vq);
1685 return true;
1686}
1687
1688/* When the Guest submits some I/O, we just need to wake the I/O thread. */
1689static void handle_virtblk_output(int fd, struct virtqueue *vq, bool timeout)
1690{
1691 struct vblk_info *vblk = vq->dev->priv;
1692 char c = 0;
1693
1694 /* Wake up I/O thread and tell it to go to work! */
1695 if (write(vblk->workpipe[1], &c, 1) != 1)
1696 /* Presumably it indicated why it died. */
1697 exit(1);
1698} 1485}
1699 1486
1700/*L:198 This actually sets up a virtual block device. */ 1487/*L:198 This actually sets up a virtual block device. */
1701static void setup_block_file(const char *filename) 1488static void setup_block_file(const char *filename)
1702{ 1489{
1703 int p[2];
1704 struct device *dev; 1490 struct device *dev;
1705 struct vblk_info *vblk; 1491 struct vblk_info *vblk;
1706 void *stack;
1707 struct virtio_blk_config conf; 1492 struct virtio_blk_config conf;
1708 1493
1709 /* This is the pipe the I/O thread will use to tell us I/O is done. */
1710 pipe(p);
1711
1712 /* The device responds to return from I/O thread. */ 1494 /* The device responds to return from I/O thread. */
1713 dev = new_device("block", VIRTIO_ID_BLOCK, p[0], handle_io_finish); 1495 dev = new_device("block", VIRTIO_ID_BLOCK);
1714 1496
1715 /* The device has one virtqueue, where the Guest places requests. */ 1497 /* The device has one virtqueue, where the Guest places requests. */
1716 add_virtqueue(dev, VIRTQUEUE_NUM, handle_virtblk_output); 1498 add_virtqueue(dev, VIRTQUEUE_NUM, blk_request);
1717 1499
1718 /* Allocate the room for our own bookkeeping */ 1500 /* Allocate the room for our own bookkeeping */
1719 vblk = dev->priv = malloc(sizeof(*vblk)); 1501 vblk = dev->priv = malloc(sizeof(*vblk));
@@ -1735,49 +1517,29 @@ static void setup_block_file(const char *filename)
1735 1517
1736 set_config(dev, sizeof(conf), &conf); 1518 set_config(dev, sizeof(conf), &conf);
1737 1519
1738 /* The I/O thread writes to this end of the pipe when done. */
1739 vblk->done_fd = p[1];
1740
1741 /* This is the second pipe, which is how we tell the I/O thread about
1742 * more work. */
1743 pipe(vblk->workpipe);
1744
1745 /* Create stack for thread and run it. Since stack grows upwards, we
1746 * point the stack pointer to the end of this region. */
1747 stack = malloc(32768);
1748 /* SIGCHLD - We dont "wait" for our cloned thread, so prevent it from
1749 * becoming a zombie. */
1750 if (clone(io_thread, stack + 32768, CLONE_VM | SIGCHLD, dev) == -1)
1751 err(1, "Creating clone");
1752
1753 /* We don't need to keep the I/O thread's end of the pipes open. */
1754 close(vblk->done_fd);
1755 close(vblk->workpipe[0]);
1756
1757 verbose("device %u: virtblock %llu sectors\n", 1520 verbose("device %u: virtblock %llu sectors\n",
1758 devices.device_num, le64_to_cpu(conf.capacity)); 1521 ++devices.device_num, le64_to_cpu(conf.capacity));
1759} 1522}
1760 1523
1524struct rng_info {
1525 int rfd;
1526};
1527
1761/* Our random number generator device reads from /dev/random into the Guest's 1528/* Our random number generator device reads from /dev/random into the Guest's
1762 * input buffers. The usual case is that the Guest doesn't want random numbers 1529 * input buffers. The usual case is that the Guest doesn't want random numbers
1763 * and so has no buffers although /dev/random is still readable, whereas 1530 * and so has no buffers although /dev/random is still readable, whereas
1764 * console is the reverse. 1531 * console is the reverse.
1765 * 1532 *
1766 * The same logic applies, however. */ 1533 * The same logic applies, however. */
1767static bool handle_rng_input(int fd, struct device *dev) 1534static void rng_input(struct virtqueue *vq)
1768{ 1535{
1769 int len; 1536 int len;
1770 unsigned int head, in_num, out_num, totlen = 0; 1537 unsigned int head, in_num, out_num, totlen = 0;
1771 struct iovec iov[dev->vq->vring.num]; 1538 struct rng_info *rng_info = vq->dev->priv;
1539 struct iovec iov[vq->vring.num];
1772 1540
1773 /* First we need a buffer from the Guest's virtqueue. */ 1541
1774 head = get_vq_desc(dev->vq, iov, &out_num, &in_num); 1542 head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
1775
1776 /* If they're not ready for input, stop listening to this file
1777 * descriptor. We'll start again once they add an input buffer. */
1778 if (head == dev->vq->vring.num)
1779 return false;
1780
1781 if (out_num) 1543 if (out_num)
1782 errx(1, "Output buffers in rng?"); 1544 errx(1, "Output buffers in rng?");
1783 1545
@@ -1785,7 +1547,7 @@ static bool handle_rng_input(int fd, struct device *dev)
1785 * it reads straight into the Guest's buffer. We loop to make sure we 1547 * it reads straight into the Guest's buffer. We loop to make sure we
1786 * fill it. */ 1548 * fill it. */
1787 while (!iov_empty(iov, in_num)) { 1549 while (!iov_empty(iov, in_num)) {
1788 len = readv(dev->fd, iov, in_num); 1550 len = readv(rng_info->rfd, iov, in_num);
1789 if (len <= 0) 1551 if (len <= 0)
1790 err(1, "Read from /dev/random gave %i", len); 1552 err(1, "Read from /dev/random gave %i", len);
1791 iov_consume(iov, in_num, len); 1553 iov_consume(iov, in_num, len);
@@ -1793,25 +1555,23 @@ static bool handle_rng_input(int fd, struct device *dev)
1793 } 1555 }
1794 1556
1795 /* Tell the Guest about the new input. */ 1557 /* Tell the Guest about the new input. */
1796 add_used_and_trigger(fd, dev->vq, head, totlen); 1558 add_used(vq, head, totlen);
1797
1798 /* Everything went OK! */
1799 return true;
1800} 1559}
1801 1560
1802/* And this creates a "hardware" random number device for the Guest. */ 1561/* And this creates a "hardware" random number device for the Guest. */
1803static void setup_rng(void) 1562static void setup_rng(void)
1804{ 1563{
1805 struct device *dev; 1564 struct device *dev;
1806 int fd; 1565 struct rng_info *rng_info = malloc(sizeof(*rng_info));
1807 1566
1808 fd = open_or_die("/dev/random", O_RDONLY); 1567 rng_info->rfd = open_or_die("/dev/random", O_RDONLY);
1809 1568
1810 /* The device responds to return from I/O thread. */ 1569 /* The device responds to return from I/O thread. */
1811 dev = new_device("rng", VIRTIO_ID_RNG, fd, handle_rng_input); 1570 dev = new_device("rng", VIRTIO_ID_RNG);
1571 dev->priv = rng_info;
1812 1572
1813 /* The device has one virtqueue, where the Guest places inbufs. */ 1573 /* The device has one virtqueue, where the Guest places inbufs. */
1814 add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd); 1574 add_virtqueue(dev, VIRTQUEUE_NUM, rng_input);
1815 1575
1816 verbose("device %u: rng\n", devices.device_num++); 1576 verbose("device %u: rng\n", devices.device_num++);
1817} 1577}
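Note that blk_request() and rng_input() now block inside wait_for_vq_desc() instead of returning when the ring is empty; that only works because this series has the Launcher service each virtqueue from its own thread. A hypothetical per-virtqueue thread body (the vq->service callback member is an assumption, not taken from this hunk):

static int do_thread_sketch(void *_vq)
{
	struct virtqueue *vq = _vq;

	/* Just run the queue's service routine (blk_request, rng_input, ...)
	 * forever; each call blocks in wait_for_vq_desc() until the Guest
	 * posts a buffer. */
	for (;;)
		vq->service(vq);
	return 0;
}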
@@ -1827,17 +1587,18 @@ static void __attribute__((noreturn)) restart_guest(void)
1827 for (i = 3; i < FD_SETSIZE; i++) 1587 for (i = 3; i < FD_SETSIZE; i++)
1828 close(i); 1588 close(i);
1829 1589
1830 /* The exec automatically gets rid of the I/O and Waker threads. */ 1590 /* Reset all the devices (kills all threads). */
1591 cleanup_devices();
1592
1831 execv(main_args[0], main_args); 1593 execv(main_args[0], main_args);
1832 err(1, "Could not exec %s", main_args[0]); 1594 err(1, "Could not exec %s", main_args[0]);
1833} 1595}
1834 1596
1835/*L:220 Finally we reach the core of the Launcher which runs the Guest, serves 1597/*L:220 Finally we reach the core of the Launcher which runs the Guest, serves
1836 * its input and output, and finally, lays it to rest. */ 1598 * its input and output, and finally, lays it to rest. */
1837static void __attribute__((noreturn)) run_guest(int lguest_fd) 1599static void __attribute__((noreturn)) run_guest(void)
1838{ 1600{
1839 for (;;) { 1601 for (;;) {
1840 unsigned long args[] = { LHREQ_BREAK, 0 };
1841 unsigned long notify_addr; 1602 unsigned long notify_addr;
1842 int readval; 1603 int readval;
1843 1604
@@ -1848,8 +1609,7 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd)
1848 /* One unsigned long means the Guest did HCALL_NOTIFY */ 1609 /* One unsigned long means the Guest did HCALL_NOTIFY */
1849 if (readval == sizeof(notify_addr)) { 1610 if (readval == sizeof(notify_addr)) {
1850 verbose("Notify on address %#lx\n", notify_addr); 1611 verbose("Notify on address %#lx\n", notify_addr);
1851 handle_output(lguest_fd, notify_addr); 1612 handle_output(notify_addr);
1852 continue;
1853 /* ENOENT means the Guest died. Reading tells us why. */ 1613 /* ENOENT means the Guest died. Reading tells us why. */
1854 } else if (errno == ENOENT) { 1614 } else if (errno == ENOENT) {
1855 char reason[1024] = { 0 }; 1615 char reason[1024] = { 0 };
@@ -1858,19 +1618,9 @@ static void __attribute__((noreturn)) run_guest(int lguest_fd)
1858 /* ERESTART means that we need to reboot the guest */ 1618 /* ERESTART means that we need to reboot the guest */
1859 } else if (errno == ERESTART) { 1619 } else if (errno == ERESTART) {
1860 restart_guest(); 1620 restart_guest();
1861 /* EAGAIN means a signal (timeout). 1621 /* Anything else means a bug or incompatible change. */
1862 * Anything else means a bug or incompatible change. */ 1622 } else
1863 } else if (errno != EAGAIN)
1864 err(1, "Running guest failed"); 1623 err(1, "Running guest failed");
1865
1866 /* Only service input on thread for CPU 0. */
1867 if (cpu_id != 0)
1868 continue;
1869
1870 /* Service input, then unset the BREAK to release the Waker. */
1871 handle_input(lguest_fd);
1872 if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
1873 err(1, "Resetting break");
1874 } 1624 }
1875} 1625}
1876/*L:240 1626/*L:240
@@ -1904,8 +1654,8 @@ int main(int argc, char *argv[])
1904 /* Memory, top-level pagetable, code startpoint and size of the 1654 /* Memory, top-level pagetable, code startpoint and size of the
1905 * (optional) initrd. */ 1655 * (optional) initrd. */
1906 unsigned long mem = 0, start, initrd_size = 0; 1656 unsigned long mem = 0, start, initrd_size = 0;
1907 /* Two temporaries and the /dev/lguest file descriptor. */ 1657 /* Two temporaries. */
1908 int i, c, lguest_fd; 1658 int i, c;
1909 /* The boot information for the Guest. */ 1659 /* The boot information for the Guest. */
1910 struct boot_params *boot; 1660 struct boot_params *boot;
1911 /* If they specify an initrd file to load. */ 1661 /* If they specify an initrd file to load. */
@@ -1913,18 +1663,10 @@ int main(int argc, char *argv[])
1913 1663
1914 /* Save the args: we "reboot" by execing ourselves again. */ 1664 /* Save the args: we "reboot" by execing ourselves again. */
1915 main_args = argv; 1665 main_args = argv;
1916 /* We don't "wait" for the children, so prevent them from becoming
1917 * zombies. */
1918 signal(SIGCHLD, SIG_IGN);
1919 1666
1920 /* First we initialize the device list. Since console and network 1667 /* First we initialize the device list. We keep a pointer to the last
1921 * device receive input from a file descriptor, we keep an fdset 1668 * device, and the next interrupt number to use for devices (1:
1922 * (infds) and the maximum fd number (max_infd) with the head of the 1669 * remember that 0 is used by the timer). */
1923 * list. We also keep a pointer to the last device. Finally, we keep
1924 * the next interrupt number to use for devices (1: remember that 0 is
1925 * used by the timer). */
1926 FD_ZERO(&devices.infds);
1927 devices.max_infd = -1;
1928 devices.lastdev = NULL; 1670 devices.lastdev = NULL;
1929 devices.next_irq = 1; 1671 devices.next_irq = 1;
1930 1672
@@ -1982,9 +1724,6 @@ int main(int argc, char *argv[])
1982 /* We always have a console device */ 1724 /* We always have a console device */
1983 setup_console(); 1725 setup_console();
1984 1726
1985 /* We can timeout waiting for Guest network transmit. */
1986 setup_timeout();
1987
1988 /* Now we load the kernel */ 1727 /* Now we load the kernel */
1989 start = load_kernel(open_or_die(argv[optind+1], O_RDONLY)); 1728 start = load_kernel(open_or_die(argv[optind+1], O_RDONLY));
1990 1729
@@ -2023,15 +1762,16 @@ int main(int argc, char *argv[])
2023 1762
2024 /* We tell the kernel to initialize the Guest: this returns the open 1763 /* We tell the kernel to initialize the Guest: this returns the open
2025 * /dev/lguest file descriptor. */ 1764 * /dev/lguest file descriptor. */
2026 lguest_fd = tell_kernel(start); 1765 tell_kernel(start);
1766
1767 /* Ensure that we terminate if a child dies. */
1768 signal(SIGCHLD, kill_launcher);
2027 1769
2028 /* We clone off a thread, which wakes the Launcher whenever one of the 1770 /* If we exit via err(), this kills all the threads, restores tty. */
2029 * input file descriptors needs attention. We call this the Waker, and 1771 atexit(cleanup_devices);
2030 * we'll cover it in a moment. */
2031 setup_waker(lguest_fd);
2032 1772
2033 /* Finally, run the Guest. This doesn't return. */ 1773 /* Finally, run the Guest. This doesn't return. */
2034 run_guest(lguest_fd); 1774 run_guest();
2035} 1775}
2036/*:*/ 1776/*:*/
2037 1777
diff --git a/Documentation/lguest/lguest.txt b/Documentation/lguest/lguest.txt
index 28c747362f95..efb3a6a045a2 100644
--- a/Documentation/lguest/lguest.txt
+++ b/Documentation/lguest/lguest.txt
@@ -37,7 +37,6 @@ Running Lguest:
37 "Paravirtualized guest support" = Y 37 "Paravirtualized guest support" = Y
38 "Lguest guest support" = Y 38 "Lguest guest support" = Y
39 "High Memory Support" = off/4GB 39 "High Memory Support" = off/4GB
40 "PAE (Physical Address Extension) Support" = N
41 "Alignment value to which kernel should be aligned" = 0x100000 40 "Alignment value to which kernel should be aligned" = 0x100000
42 (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and 41 (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and
43 CONFIG_PHYSICAL_ALIGN=0x100000) 42 CONFIG_PHYSICAL_ALIGN=0x100000)
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
index 1caf57628b9c..313389cd50d2 100644
--- a/arch/x86/include/asm/lguest.h
+++ b/arch/x86/include/asm/lguest.h
@@ -17,8 +17,13 @@
17/* Pages for switcher itself, then two pages per cpu */ 17/* Pages for switcher itself, then two pages per cpu */
18#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids) 18#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids)
19 19
20/* We map at -4M for ease of mapping into the guest (one PTE page). */ 20/* We map at -4M (-2M when PAE is activated) for ease of mapping
21 * into the guest (one PTE page). */
22#ifdef CONFIG_X86_PAE
23#define SWITCHER_ADDR 0xFFE00000
24#else
21#define SWITCHER_ADDR 0xFFC00000 25#define SWITCHER_ADDR 0xFFC00000
26#endif
22 27
23/* Found in switcher.S */ 28/* Found in switcher.S */
24extern unsigned long default_idt_entries[]; 29extern unsigned long default_idt_entries[];
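The move from -4M to -2M is just arithmetic: PAE page-table entries are 8 bytes, so a single 4 KB PTE page spans half as much virtual address space. A back-of-envelope check, not code from the patch:

/* One PTE page maps (4096 / sizeof(pte)) entries of 4 KB each:
 *   !PAE: (4096 / 4) * 4 KB = 4 MB  -> SWITCHER_ADDR 0xFFC00000 (-4 MB)
 *    PAE: (4096 / 8) * 4 KB = 2 MB  -> SWITCHER_ADDR 0xFFE00000 (-2 MB)
 */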
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index faae1996487b..d31c4a684078 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -12,11 +12,13 @@
12#define LHCALL_TS 8 12#define LHCALL_TS 8
13#define LHCALL_SET_CLOCKEVENT 9 13#define LHCALL_SET_CLOCKEVENT 9
14#define LHCALL_HALT 10 14#define LHCALL_HALT 10
15#define LHCALL_SET_PMD 13
15#define LHCALL_SET_PTE 14 16#define LHCALL_SET_PTE 14
16#define LHCALL_SET_PMD 15 17#define LHCALL_SET_PGD 15
17#define LHCALL_LOAD_TLS 16 18#define LHCALL_LOAD_TLS 16
18#define LHCALL_NOTIFY 17 19#define LHCALL_NOTIFY 17
19#define LHCALL_LOAD_GDT_ENTRY 18 20#define LHCALL_LOAD_GDT_ENTRY 18
21#define LHCALL_SEND_INTERRUPTS 19
20 22
21#define LGUEST_TRAP_ENTRY 0x1F 23#define LGUEST_TRAP_ENTRY 0x1F
22 24
@@ -32,10 +34,10 @@
32 * operations? There are two ways: the direct way is to make a "hypercall", 34 * operations? There are two ways: the direct way is to make a "hypercall",
33 * to make requests of the Host Itself. 35 * to make requests of the Host Itself.
34 * 36 *
35 * We use the KVM hypercall mechanism. Eighteen hypercalls are 37 * We use the KVM hypercall mechanism. Seventeen hypercalls are
36 * available: the hypercall number is put in the %eax register, and the 38 * available: the hypercall number is put in the %eax register, and the
37 * arguments (when required) are placed in %ebx, %ecx and %edx. If a return 39 * arguments (when required) are placed in %ebx, %ecx, %edx and %esi.
38 * value makes sense, it's returned in %eax. 40 * If a return value makes sense, it's returned in %eax.
39 * 41 *
40 * Grossly invalid calls result in Sudden Death at the hands of the vengeful 42 * Grossly invalid calls result in Sudden Death at the hands of the vengeful
41 * Host, rather than returning failure. This reflects Winston Churchill's 43 * Host, rather than returning failure. This reflects Winston Churchill's
@@ -47,8 +49,9 @@
47 49
48#define LHCALL_RING_SIZE 64 50#define LHCALL_RING_SIZE 64
49struct hcall_args { 51struct hcall_args {
50 /* These map directly onto eax, ebx, ecx, edx in struct lguest_regs */ 52 /* These map directly onto eax, ebx, ecx, edx and esi
51 unsigned long arg0, arg1, arg2, arg3; 53 * in struct lguest_regs */
54 unsigned long arg0, arg1, arg2, arg3, arg4;
52}; 55};
53 56
54#endif /* !__ASSEMBLY__ */ 57#endif /* !__ASSEMBLY__ */
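That register convention (number in %eax; arguments in %ebx, %ecx, %edx and %esi; result back in %eax) is all a four-argument hypercall amounts to. A minimal sketch of a kvm_hypercall4()-style wrapper, not the kernel's actual implementation:

static inline unsigned long
hcall4(unsigned long call, unsigned long a1, unsigned long a2,
       unsigned long a3, unsigned long a4)
{
	unsigned long ret;

	/* The .byte sequence is the vmcall instruction (KVM_HYPERCALL),
	 * spelled out for assemblers that don't know the mnemonic. */
	asm volatile(".byte 0x0f,0x01,0xc1"
		     : "=a"(ret)
		     : "a"(call), "b"(a1), "c"(a2), "d"(a3), "S"(a4)
		     : "memory");
	return ret;
}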
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 1a830cbd7015..dfdbf6403895 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -126,6 +126,7 @@ void foo(void)
126#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) 126#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
127 BLANK(); 127 BLANK();
128 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); 128 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
129 OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
129 OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir); 130 OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
130 131
131 BLANK(); 132 BLANK();
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
index 8dab8f7844d3..38718041efc3 100644
--- a/arch/x86/lguest/Kconfig
+++ b/arch/x86/lguest/Kconfig
@@ -2,7 +2,6 @@ config LGUEST_GUEST
2 bool "Lguest guest support" 2 bool "Lguest guest support"
3 select PARAVIRT 3 select PARAVIRT
4 depends on X86_32 4 depends on X86_32
5 depends on !X86_PAE
6 select VIRTIO 5 select VIRTIO
7 select VIRTIO_RING 6 select VIRTIO_RING
8 select VIRTIO_CONSOLE 7 select VIRTIO_CONSOLE
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 4e0c26559395..7bc65f0f62c4 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -87,7 +87,7 @@ struct lguest_data lguest_data = {
87 87
88/*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a 88/*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a
 89 * ring buffer of stored hypercalls which the Host will run through next time we 89
90 * do a normal hypercall. Each entry in the ring has 4 slots for the hypercall 90 * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall
91 * arguments, and a "hcall_status" word which is 0 if the call is ready to go, 91 * arguments, and a "hcall_status" word which is 0 if the call is ready to go,
92 * and 255 once the Host has finished with it. 92 * and 255 once the Host has finished with it.
93 * 93 *
@@ -96,7 +96,8 @@ struct lguest_data lguest_data = {
96 * effect of causing the Host to run all the stored calls in the ring buffer 96 * effect of causing the Host to run all the stored calls in the ring buffer
97 * which empties it for next time! */ 97 * which empties it for next time! */
98static void async_hcall(unsigned long call, unsigned long arg1, 98static void async_hcall(unsigned long call, unsigned long arg1,
99 unsigned long arg2, unsigned long arg3) 99 unsigned long arg2, unsigned long arg3,
100 unsigned long arg4)
100{ 101{
101 /* Note: This code assumes we're uniprocessor. */ 102 /* Note: This code assumes we're uniprocessor. */
102 static unsigned int next_call; 103 static unsigned int next_call;
@@ -108,12 +109,13 @@ static void async_hcall(unsigned long call, unsigned long arg1,
108 local_irq_save(flags); 109 local_irq_save(flags);
109 if (lguest_data.hcall_status[next_call] != 0xFF) { 110 if (lguest_data.hcall_status[next_call] != 0xFF) {
110 /* Table full, so do normal hcall which will flush table. */ 111 /* Table full, so do normal hcall which will flush table. */
111 kvm_hypercall3(call, arg1, arg2, arg3); 112 kvm_hypercall4(call, arg1, arg2, arg3, arg4);
112 } else { 113 } else {
113 lguest_data.hcalls[next_call].arg0 = call; 114 lguest_data.hcalls[next_call].arg0 = call;
114 lguest_data.hcalls[next_call].arg1 = arg1; 115 lguest_data.hcalls[next_call].arg1 = arg1;
115 lguest_data.hcalls[next_call].arg2 = arg2; 116 lguest_data.hcalls[next_call].arg2 = arg2;
116 lguest_data.hcalls[next_call].arg3 = arg3; 117 lguest_data.hcalls[next_call].arg3 = arg3;
118 lguest_data.hcalls[next_call].arg4 = arg4;
117 /* Arguments must all be written before we mark it to go */ 119 /* Arguments must all be written before we mark it to go */
118 wmb(); 120 wmb();
119 lguest_data.hcall_status[next_call] = 0; 121 lguest_data.hcall_status[next_call] = 0;
@@ -141,7 +143,7 @@ static void lazy_hcall1(unsigned long call,
141 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) 143 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
142 kvm_hypercall1(call, arg1); 144 kvm_hypercall1(call, arg1);
143 else 145 else
144 async_hcall(call, arg1, 0, 0); 146 async_hcall(call, arg1, 0, 0, 0);
145} 147}
146 148
147static void lazy_hcall2(unsigned long call, 149static void lazy_hcall2(unsigned long call,
@@ -151,7 +153,7 @@ static void lazy_hcall2(unsigned long call,
151 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) 153 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
152 kvm_hypercall2(call, arg1, arg2); 154 kvm_hypercall2(call, arg1, arg2);
153 else 155 else
154 async_hcall(call, arg1, arg2, 0); 156 async_hcall(call, arg1, arg2, 0, 0);
155} 157}
156 158
157static void lazy_hcall3(unsigned long call, 159static void lazy_hcall3(unsigned long call,
@@ -162,9 +164,23 @@ static void lazy_hcall3(unsigned long call,
162 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) 164 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
163 kvm_hypercall3(call, arg1, arg2, arg3); 165 kvm_hypercall3(call, arg1, arg2, arg3);
164 else 166 else
165 async_hcall(call, arg1, arg2, arg3); 167 async_hcall(call, arg1, arg2, arg3, 0);
166} 168}
167 169
170#ifdef CONFIG_X86_PAE
171static void lazy_hcall4(unsigned long call,
172 unsigned long arg1,
173 unsigned long arg2,
174 unsigned long arg3,
175 unsigned long arg4)
176{
177 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
178 kvm_hypercall4(call, arg1, arg2, arg3, arg4);
179 else
180 async_hcall(call, arg1, arg2, arg3, arg4);
181}
182#endif
183
168/* When lazy mode is turned off reset the per-cpu lazy mode variable and then 184/* When lazy mode is turned off reset the per-cpu lazy mode variable and then
169 * issue the do-nothing hypercall to flush any stored calls. */ 185 * issue the do-nothing hypercall to flush any stored calls. */
170static void lguest_leave_lazy_mmu_mode(void) 186static void lguest_leave_lazy_mmu_mode(void)
@@ -179,7 +195,7 @@ static void lguest_end_context_switch(struct task_struct *next)
179 paravirt_end_context_switch(next); 195 paravirt_end_context_switch(next);
180} 196}
181 197
182/*G:033 198/*G:032
183 * After that diversion we return to our first native-instruction 199 * After that diversion we return to our first native-instruction
184 * replacements: four functions for interrupt control. 200 * replacements: four functions for interrupt control.
185 * 201 *
@@ -199,30 +215,28 @@ static unsigned long save_fl(void)
199{ 215{
200 return lguest_data.irq_enabled; 216 return lguest_data.irq_enabled;
201} 217}
202PV_CALLEE_SAVE_REGS_THUNK(save_fl);
203
204/* restore_flags() just sets the flags back to the value given. */
205static void restore_fl(unsigned long flags)
206{
207 lguest_data.irq_enabled = flags;
208}
209PV_CALLEE_SAVE_REGS_THUNK(restore_fl);
210 218
211/* Interrupts go off... */ 219/* Interrupts go off... */
212static void irq_disable(void) 220static void irq_disable(void)
213{ 221{
214 lguest_data.irq_enabled = 0; 222 lguest_data.irq_enabled = 0;
215} 223}
224
225/* Let's pause a moment. Remember how I said these are called so often?
226 * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to
227 * break some rules. In particular, these functions are assumed to save their
228 * own registers if they need to: normal C functions assume they can trash the
229 * eax register. To use normal C functions, we use
230 * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the
231 * C function, then restores it. */
232PV_CALLEE_SAVE_REGS_THUNK(save_fl);
216PV_CALLEE_SAVE_REGS_THUNK(irq_disable); 233PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
234/*:*/
217 235
218/* Interrupts go on... */ 236/* These are in i386_head.S */
219static void irq_enable(void) 237extern void lg_irq_enable(void);
220{ 238extern void lg_restore_fl(unsigned long flags);
221 lguest_data.irq_enabled = X86_EFLAGS_IF;
222}
223PV_CALLEE_SAVE_REGS_THUNK(irq_enable);
224 239
225/*:*/
226/*M:003 Note that we don't check for outstanding interrupts when we re-enable 240/*M:003 Note that we don't check for outstanding interrupts when we re-enable
227 * them (or when we unmask an interrupt). This seems to work for the moment, 241 * them (or when we unmask an interrupt). This seems to work for the moment,
228 * since interrupts are rare and we'll just get the interrupt on the next timer 242 * since interrupts are rare and we'll just get the interrupt on the next timer
@@ -368,8 +382,8 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
368 case 1: /* Basic feature request. */ 382 case 1: /* Basic feature request. */
369 /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ 383 /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */
370 *cx &= 0x00002201; 384 *cx &= 0x00002201;
371 /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU. */ 385 /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU, PAE. */
372 *dx &= 0x07808111; 386 *dx &= 0x07808151;
373 /* The Host can do a nice optimization if it knows that the 387 /* The Host can do a nice optimization if it knows that the
374 * kernel mappings (addresses above 0xC0000000 or whatever 388 * kernel mappings (addresses above 0xC0000000 or whatever
375 * PAGE_OFFSET is set to) haven't changed. But Linux calls 389 * PAGE_OFFSET is set to) haven't changed. But Linux calls
@@ -388,6 +402,11 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
388 if (*ax > 0x80000008) 402 if (*ax > 0x80000008)
389 *ax = 0x80000008; 403 *ax = 0x80000008;
390 break; 404 break;
405 case 0x80000001:
406 /* Here we should fix nx cap depending on host. */
407 /* For this version of PAE, we just clear NX bit. */
408 *dx &= ~(1 << 20);
409 break;
391 } 410 }
392} 411}
393 412
@@ -521,25 +540,52 @@ static void lguest_write_cr4(unsigned long val)
521static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, 540static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
522 pte_t *ptep) 541 pte_t *ptep)
523{ 542{
543#ifdef CONFIG_X86_PAE
544 lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr,
545 ptep->pte_low, ptep->pte_high);
546#else
524 lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low); 547 lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low);
548#endif
525} 549}
526 550
527static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, 551static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
528 pte_t *ptep, pte_t pteval) 552 pte_t *ptep, pte_t pteval)
529{ 553{
530 *ptep = pteval; 554 native_set_pte(ptep, pteval);
531 lguest_pte_update(mm, addr, ptep); 555 lguest_pte_update(mm, addr, ptep);
532} 556}
533 557
534/* The Guest calls this to set a top-level entry. Again, we set the entry then 558/* The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd
535 * tell the Host which top-level page we changed, and the index of the entry we 559 * to set a middle-level entry when PAE is activated.
536 * changed. */ 560 * Again, we set the entry then tell the Host which page we changed,
561 * and the index of the entry we changed. */
562#ifdef CONFIG_X86_PAE
563static void lguest_set_pud(pud_t *pudp, pud_t pudval)
564{
565 native_set_pud(pudp, pudval);
566
567 /* 32 bytes aligned pdpt address and the index. */
568 lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0,
569 (__pa(pudp) & 0x1F) / sizeof(pud_t));
570}
571
537static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) 572static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
538{ 573{
539 *pmdp = pmdval; 574 native_set_pmd(pmdp, pmdval);
540 lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK, 575 lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK,
541 (__pa(pmdp) & (PAGE_SIZE - 1)) / 4); 576 (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
542} 577}
578#else
579
580/* The Guest calls lguest_set_pmd to set a top-level entry when PAE is not
581 * activated. */
582static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
583{
584 native_set_pmd(pmdp, pmdval);
585 lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK,
586 (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
587}
588#endif
543 589
544/* There are a couple of legacy places where the kernel sets a PTE, but we 590/* There are a couple of legacy places where the kernel sets a PTE, but we
545 * don't know the top level any more. This is useless for us, since we don't 591 * don't know the top level any more. This is useless for us, since we don't
@@ -552,11 +598,31 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
552 * which brings boot back to 0.25 seconds. */ 598 * which brings boot back to 0.25 seconds. */
553static void lguest_set_pte(pte_t *ptep, pte_t pteval) 599static void lguest_set_pte(pte_t *ptep, pte_t pteval)
554{ 600{
555 *ptep = pteval; 601 native_set_pte(ptep, pteval);
602 if (cr3_changed)
603 lazy_hcall1(LHCALL_FLUSH_TLB, 1);
604}
605
606#ifdef CONFIG_X86_PAE
607static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte)
608{
609 native_set_pte_atomic(ptep, pte);
556 if (cr3_changed) 610 if (cr3_changed)
557 lazy_hcall1(LHCALL_FLUSH_TLB, 1); 611 lazy_hcall1(LHCALL_FLUSH_TLB, 1);
558} 612}
559 613
614void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
615{
616 native_pte_clear(mm, addr, ptep);
617 lguest_pte_update(mm, addr, ptep);
618}
619
620void lguest_pmd_clear(pmd_t *pmdp)
621{
622 lguest_set_pmd(pmdp, __pmd(0));
623}
624#endif
625
560/* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on 626/* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
561 * native page table operations. On native hardware you can set a new page 627 * native page table operations. On native hardware you can set a new page
562 * table entry whenever you want, but if you want to remove one you have to do 628 * table entry whenever you want, but if you want to remove one you have to do
@@ -628,13 +694,12 @@ static void __init lguest_init_IRQ(void)
628{ 694{
629 unsigned int i; 695 unsigned int i;
630 696
631 for (i = 0; i < LGUEST_IRQS; i++) { 697 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
632 int vector = FIRST_EXTERNAL_VECTOR + i;
633 /* Some systems map "vectors" to interrupts weirdly. Lguest has 698 /* Some systems map "vectors" to interrupts weirdly. Lguest has
634 * a straightforward 1 to 1 mapping, so force that here. */ 699 * a straightforward 1 to 1 mapping, so force that here. */
635 __get_cpu_var(vector_irq)[vector] = i; 700 __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR;
636 if (vector != SYSCALL_VECTOR) 701 if (i != SYSCALL_VECTOR)
637 set_intr_gate(vector, interrupt[i]); 702 set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
638 } 703 }
639 /* This call is required to set up for 4k stacks, where we have 704 /* This call is required to set up for 4k stacks, where we have
640 * separate stacks for hard and soft interrupts. */ 705 * separate stacks for hard and soft interrupts. */
@@ -973,10 +1038,10 @@ static void lguest_restart(char *reason)
973 * 1038 *
974 * Our current solution is to allow the paravirt back end to optionally patch 1039 * Our current solution is to allow the paravirt back end to optionally patch
975 * over the indirect calls to replace them with something more efficient. We 1040 * over the indirect calls to replace them with something more efficient. We
976 * patch the four most commonly called functions: disable interrupts, enable 1041 * patch two of the simplest of the most commonly called functions: disable
977 * interrupts, restore interrupts and save interrupts. We usually have 6 or 10 1042 * interrupts and save interrupts. We usually have 6 or 10 bytes to patch
978 * bytes to patch into: the Guest versions of these operations are small enough 1043 * into: the Guest versions of these operations are small enough that we can
979 * that we can fit comfortably. 1044 * fit comfortably.
980 * 1045 *
981 * First we need assembly templates of each of the patchable Guest operations, 1046 * First we need assembly templates of each of the patchable Guest operations,
982 * and these are in i386_head.S. */ 1047 * and these are in i386_head.S. */
@@ -987,8 +1052,6 @@ static const struct lguest_insns
987 const char *start, *end; 1052 const char *start, *end;
988} lguest_insns[] = { 1053} lguest_insns[] = {
989 [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli }, 1054 [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli },
990 [PARAVIRT_PATCH(pv_irq_ops.irq_enable)] = { lgstart_sti, lgend_sti },
991 [PARAVIRT_PATCH(pv_irq_ops.restore_fl)] = { lgstart_popf, lgend_popf },
992 [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, 1055 [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf },
993}; 1056};
994 1057
@@ -1026,6 +1089,7 @@ __init void lguest_init(void)
1026 pv_info.name = "lguest"; 1089 pv_info.name = "lguest";
1027 pv_info.paravirt_enabled = 1; 1090 pv_info.paravirt_enabled = 1;
1028 pv_info.kernel_rpl = 1; 1091 pv_info.kernel_rpl = 1;
1092 pv_info.shared_kernel_pmd = 1;
1029 1093
1030 /* We set up all the lguest overrides for sensitive operations. These 1094 /* We set up all the lguest overrides for sensitive operations. These
1031 * are detailed with the operations themselves. */ 1095 * are detailed with the operations themselves. */
@@ -1033,9 +1097,9 @@ __init void lguest_init(void)
1033 /* interrupt-related operations */ 1097 /* interrupt-related operations */
1034 pv_irq_ops.init_IRQ = lguest_init_IRQ; 1098 pv_irq_ops.init_IRQ = lguest_init_IRQ;
1035 pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); 1099 pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl);
1036 pv_irq_ops.restore_fl = PV_CALLEE_SAVE(restore_fl); 1100 pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl);
1037 pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable); 1101 pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable);
1038 pv_irq_ops.irq_enable = PV_CALLEE_SAVE(irq_enable); 1102 pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable);
1039 pv_irq_ops.safe_halt = lguest_safe_halt; 1103 pv_irq_ops.safe_halt = lguest_safe_halt;
1040 1104
1041 /* init-time operations */ 1105 /* init-time operations */
@@ -1071,6 +1135,12 @@ __init void lguest_init(void)
1071 pv_mmu_ops.set_pte = lguest_set_pte; 1135 pv_mmu_ops.set_pte = lguest_set_pte;
1072 pv_mmu_ops.set_pte_at = lguest_set_pte_at; 1136 pv_mmu_ops.set_pte_at = lguest_set_pte_at;
1073 pv_mmu_ops.set_pmd = lguest_set_pmd; 1137 pv_mmu_ops.set_pmd = lguest_set_pmd;
1138#ifdef CONFIG_X86_PAE
1139 pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic;
1140 pv_mmu_ops.pte_clear = lguest_pte_clear;
1141 pv_mmu_ops.pmd_clear = lguest_pmd_clear;
1142 pv_mmu_ops.set_pud = lguest_set_pud;
1143#endif
1074 pv_mmu_ops.read_cr2 = lguest_read_cr2; 1144 pv_mmu_ops.read_cr2 = lguest_read_cr2;
1075 pv_mmu_ops.read_cr3 = lguest_read_cr3; 1145 pv_mmu_ops.read_cr3 = lguest_read_cr3;
1076 pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu; 1146 pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
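One detail from lguest_set_pud() above worth spelling out: under PAE the top level (the pdpt) is four 8-byte entries, 32 bytes in total and 32-byte aligned, which is where the 0xFFFFFFE0 and 0x1F masks come from. Illustration only, not code from the patch:

/* Split a physical pointer into the pdpt into the 32-byte-aligned table
 * address and the entry index (0..3), as LHCALL_SET_PGD expects them. */
static void pdpt_split(unsigned long pa_pudp,
		       unsigned long *table, unsigned int *index)
{
	*table = pa_pudp & 0xFFFFFFE0;	/* pdpt base: 32-byte aligned */
	*index = (pa_pudp & 0x1F) / 8;	/* sizeof(pud_t) is 8 under PAE */
}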
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index f79541989471..a9c8cfe61cd4 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -46,10 +46,64 @@ ENTRY(lguest_entry)
46 .globl lgstart_##name; .globl lgend_##name 46 .globl lgstart_##name; .globl lgend_##name
47 47
48LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) 48LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
49LGUEST_PATCH(sti, movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled)
50LGUEST_PATCH(popf, movl %eax, lguest_data+LGUEST_DATA_irq_enabled)
51LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) 49LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
52/*:*/ 50
51/*G:033 But using those wrappers is inefficient (we'll see why that doesn't
52 * matter for save_fl and irq_disable later). If we write our routines
53 * carefully in assembler, we can avoid clobbering any registers and avoid
54 * jumping through the wrapper functions.
55 *
56 * I skipped over our first piece of assembler, but this one is worth studying
57 * in a bit more detail so I'll describe in easy stages. First, the routine
58 * to enable interrupts: */
59ENTRY(lg_irq_enable)
60 /* The reverse of irq_disable, this sets lguest_data.irq_enabled to
61 * X86_EFLAGS_IF (ie. "Interrupts enabled"). */
62 movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled
63 /* But now we need to check if the Host wants to know: there might have
64 * been interrupts waiting to be delivered, in which case it will have
65 * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we
66 * jump to send_interrupts, otherwise we're done. */
67 testl $0, lguest_data+LGUEST_DATA_irq_pending
68 jnz send_interrupts
69 /* One cool thing about x86 is that you can do many things without using
70 * a register. In this case, the normal path hasn't needed to save or
71 * restore any registers at all! */
72 ret
73send_interrupts:
74 /* OK, now we need a register: eax is used for the hypercall number,
75 * which is LHCALL_SEND_INTERRUPTS.
76 *
77 * We used not to bother with this pending detection at all, which was
78 * much simpler. Sooner or later the Host would realize it had to
79 * send us an interrupt. But that turns out to make performance 7
80 * times worse on a simple tcp benchmark. So now we do this the hard
81 * way. */
82 pushl %eax
83 movl $LHCALL_SEND_INTERRUPTS, %eax
84 /* This is a vmcall instruction (same thing that KVM uses). Older
85 * assembler versions might not know the "vmcall" instruction, so we
86 * create one manually here. */
87 .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
88 popl %eax
89 ret
90
91/* Finally, the "popf" or "restore flags" routine. The %eax register holds the
92 * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're
93 * enabling interrupts again, if it's 0 we're leaving them off. */
94ENTRY(lg_restore_fl)
95 /* This is just "lguest_data.irq_enabled = flags;" */
96 movl %eax, lguest_data+LGUEST_DATA_irq_enabled
97 /* Now, if the %eax value has enabled interrupts and
98 * lguest_data.irq_pending is set, we want to tell the Host so it can
99 * deliver any outstanding interrupts. Fortunately, both values will
100 * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl"
101 * instruction will AND them together for us. If both are set, we
102 * jump to send_interrupts. */
103 testl lguest_data+LGUEST_DATA_irq_pending, %eax
104 jnz send_interrupts
105 /* Again, the normal path has used no extra registers. Clever, huh? */
106 ret
53 107
54/* These demark the EIP range where host should never deliver interrupts. */ 108/* These demark the EIP range where host should never deliver interrupts. */
55.global lguest_noirq_start 109.global lguest_noirq_start
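In C, the two assembler routines above are meant to do no more than this (a readability sketch only; the real versions stay in assembler so the common path clobbers no registers):

void lg_irq_enable_in_c(void)
{
	lguest_data.irq_enabled = X86_EFLAGS_IF;
	/* If the Host queued interrupts while we were off, ask for them. */
	if (lguest_data.irq_pending)
		kvm_hypercall0(LHCALL_SEND_INTERRUPTS);
}

void lg_restore_fl_in_c(unsigned long flags)
{
	lguest_data.irq_enabled = flags;
	/* Only bother the Host if we just turned interrupts back on. */
	if (flags & lguest_data.irq_pending)
		kvm_hypercall0(LHCALL_SEND_INTERRUPTS);
}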
diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig
index a3d3cbab359a..0aaa0597a622 100644
--- a/drivers/lguest/Kconfig
+++ b/drivers/lguest/Kconfig
@@ -1,6 +1,6 @@
1config LGUEST 1config LGUEST
2 tristate "Linux hypervisor example code" 2 tristate "Linux hypervisor example code"
3 depends on X86_32 && EXPERIMENTAL && !X86_PAE && FUTEX 3 depends on X86_32 && EXPERIMENTAL && EVENTFD
4 select HVC_DRIVER 4 select HVC_DRIVER
5 ---help--- 5 ---help---
6 This is a very simple module which allows you to run 6 This is a very simple module which allows you to run
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index 4845fb3cf74b..a6974e9b8ebf 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -95,7 +95,7 @@ static __init int map_switcher(void)
95 * array of struct pages. It increments that pointer, but we don't 95 * array of struct pages. It increments that pointer, but we don't
96 * care. */ 96 * care. */
97 pagep = switcher_page; 97 pagep = switcher_page;
98 err = map_vm_area(switcher_vma, PAGE_KERNEL, &pagep); 98 err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep);
99 if (err) { 99 if (err) {
100 printk("lguest: map_vm_area failed: %i\n", err); 100 printk("lguest: map_vm_area failed: %i\n", err);
101 goto free_vma; 101 goto free_vma;
@@ -188,6 +188,9 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
188{ 188{
189 /* We stop running once the Guest is dead. */ 189 /* We stop running once the Guest is dead. */
190 while (!cpu->lg->dead) { 190 while (!cpu->lg->dead) {
191 unsigned int irq;
192 bool more;
193
191 /* First we run any hypercalls the Guest wants done. */ 194 /* First we run any hypercalls the Guest wants done. */
192 if (cpu->hcall) 195 if (cpu->hcall)
193 do_hypercalls(cpu); 196 do_hypercalls(cpu);
@@ -195,23 +198,23 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
195 /* It's possible the Guest did a NOTIFY hypercall to the 198 /* It's possible the Guest did a NOTIFY hypercall to the
196 * Launcher, in which case we return from the read() now. */ 199 * Launcher, in which case we return from the read() now. */
197 if (cpu->pending_notify) { 200 if (cpu->pending_notify) {
198 if (put_user(cpu->pending_notify, user)) 201 if (!send_notify_to_eventfd(cpu)) {
199 return -EFAULT; 202 if (put_user(cpu->pending_notify, user))
200 return sizeof(cpu->pending_notify); 203 return -EFAULT;
204 return sizeof(cpu->pending_notify);
205 }
201 } 206 }
202 207
203 /* Check for signals */ 208 /* Check for signals */
204 if (signal_pending(current)) 209 if (signal_pending(current))
205 return -ERESTARTSYS; 210 return -ERESTARTSYS;
206 211
207 /* If Waker set break_out, return to Launcher. */
208 if (cpu->break_out)
209 return -EAGAIN;
210
211 /* Check if there are any interrupts which can be delivered now: 212 /* Check if there are any interrupts which can be delivered now:
 212 * if so, this sets up the handler to be executed when we next 213
213 * run the Guest. */ 214 * run the Guest. */
214 maybe_do_interrupt(cpu); 215 irq = interrupt_pending(cpu, &more);
216 if (irq < LGUEST_IRQS)
217 try_deliver_interrupt(cpu, irq, more);
215 218
216 /* All long-lived kernel loops need to check with this horrible 219 /* All long-lived kernel loops need to check with this horrible
217 * thing called the freezer. If the Host is trying to suspend, 220 * thing called the freezer. If the Host is trying to suspend,
@@ -224,10 +227,15 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
224 break; 227 break;
225 228
226 /* If the Guest asked to be stopped, we sleep. The Guest's 229 /* If the Guest asked to be stopped, we sleep. The Guest's
227 * clock timer or LHREQ_BREAK from the Waker will wake us. */ 230 * clock timer will wake us. */
228 if (cpu->halted) { 231 if (cpu->halted) {
229 set_current_state(TASK_INTERRUPTIBLE); 232 set_current_state(TASK_INTERRUPTIBLE);
230 schedule(); 233 /* Just before we sleep, make sure no interrupt snuck in
234 * which we should be doing. */
235 if (interrupt_pending(cpu, &more) < LGUEST_IRQS)
236 set_current_state(TASK_RUNNING);
237 else
238 schedule();
231 continue; 239 continue;
232 } 240 }
233 241
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
index 54d66f05fefa..c29ffa19cb74 100644
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -37,6 +37,10 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
37 /* This call does nothing, except by breaking out of the Guest 37 /* This call does nothing, except by breaking out of the Guest
38 * it makes us process all the asynchronous hypercalls. */ 38 * it makes us process all the asynchronous hypercalls. */
39 break; 39 break;
40 case LHCALL_SEND_INTERRUPTS:
41 /* This call does nothing too, but by breaking out of the Guest
42 * it makes us process any pending interrupts. */
43 break;
40 case LHCALL_LGUEST_INIT: 44 case LHCALL_LGUEST_INIT:
41 /* You can't get here unless you're already initialized. Don't 45 /* You can't get here unless you're already initialized. Don't
42 * do that. */ 46 * do that. */
@@ -73,11 +77,21 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
73 guest_set_stack(cpu, args->arg1, args->arg2, args->arg3); 77 guest_set_stack(cpu, args->arg1, args->arg2, args->arg3);
74 break; 78 break;
75 case LHCALL_SET_PTE: 79 case LHCALL_SET_PTE:
80#ifdef CONFIG_X86_PAE
81 guest_set_pte(cpu, args->arg1, args->arg2,
82 __pte(args->arg3 | (u64)args->arg4 << 32));
83#else
76 guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3)); 84 guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3));
85#endif
86 break;
87 case LHCALL_SET_PGD:
88 guest_set_pgd(cpu->lg, args->arg1, args->arg2);
77 break; 89 break;
90#ifdef CONFIG_X86_PAE
78 case LHCALL_SET_PMD: 91 case LHCALL_SET_PMD:
79 guest_set_pmd(cpu->lg, args->arg1, args->arg2); 92 guest_set_pmd(cpu->lg, args->arg1, args->arg2);
80 break; 93 break;
94#endif
81 case LHCALL_SET_CLOCKEVENT: 95 case LHCALL_SET_CLOCKEVENT:
82 guest_set_clockevent(cpu, args->arg1); 96 guest_set_clockevent(cpu, args->arg1);
83 break; 97 break;
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c
index 6e99adbe1946..0e9067b0d507 100644
--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -128,30 +128,39 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi,
128/*H:205 128/*H:205
129 * Virtual Interrupts. 129 * Virtual Interrupts.
130 * 130 *
131 * maybe_do_interrupt() gets called before every entry to the Guest, to see if 131 * interrupt_pending() returns the first pending interrupt which isn't blocked
132 * we should divert the Guest to running an interrupt handler. */ 132 * by the Guest. It is called before every entry to the Guest, and just before
133void maybe_do_interrupt(struct lg_cpu *cpu) 133 * we go to sleep when the Guest has halted itself. */
134unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more)
134{ 135{
135 unsigned int irq; 136 unsigned int irq;
136 DECLARE_BITMAP(blk, LGUEST_IRQS); 137 DECLARE_BITMAP(blk, LGUEST_IRQS);
137 struct desc_struct *idt;
138 138
139 /* If the Guest hasn't even initialized yet, we can do nothing. */ 139 /* If the Guest hasn't even initialized yet, we can do nothing. */
140 if (!cpu->lg->lguest_data) 140 if (!cpu->lg->lguest_data)
141 return; 141 return LGUEST_IRQS;
142 142
143 /* Take our "irqs_pending" array and remove any interrupts the Guest 143 /* Take our "irqs_pending" array and remove any interrupts the Guest
144 * wants blocked: the result ends up in "blk". */ 144 * wants blocked: the result ends up in "blk". */
145 if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts, 145 if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts,
146 sizeof(blk))) 146 sizeof(blk)))
147 return; 147 return LGUEST_IRQS;
148 bitmap_andnot(blk, cpu->irqs_pending, blk, LGUEST_IRQS); 148 bitmap_andnot(blk, cpu->irqs_pending, blk, LGUEST_IRQS);
149 149
150 /* Find the first interrupt. */ 150 /* Find the first interrupt. */
151 irq = find_first_bit(blk, LGUEST_IRQS); 151 irq = find_first_bit(blk, LGUEST_IRQS);
152 /* None? Nothing to do */ 152 *more = find_next_bit(blk, LGUEST_IRQS, irq+1);
153 if (irq >= LGUEST_IRQS) 153
154 return; 154 return irq;
155}
156
157/* This actually diverts the Guest to running an interrupt handler, once an
158 * interrupt has been identified by interrupt_pending(). */
159void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more)
160{
161 struct desc_struct *idt;
162
163 BUG_ON(irq >= LGUEST_IRQS);
155 164
156 /* They may be in the middle of an iret, where they asked us never to 165 /* They may be in the middle of an iret, where they asked us never to
157 * deliver interrupts. */ 166 * deliver interrupts. */
@@ -170,8 +179,12 @@ void maybe_do_interrupt(struct lg_cpu *cpu)
170 u32 irq_enabled; 179 u32 irq_enabled;
171 if (get_user(irq_enabled, &cpu->lg->lguest_data->irq_enabled)) 180 if (get_user(irq_enabled, &cpu->lg->lguest_data->irq_enabled))
172 irq_enabled = 0; 181 irq_enabled = 0;
173 if (!irq_enabled) 182 if (!irq_enabled) {
183 /* Make sure they know an IRQ is pending. */
184 put_user(X86_EFLAGS_IF,
185 &cpu->lg->lguest_data->irq_pending);
174 return; 186 return;
187 }
175 } 188 }
176 189
177 /* Look at the IDT entry the Guest gave us for this interrupt. The 190 /* Look at the IDT entry the Guest gave us for this interrupt. The
@@ -194,6 +207,25 @@ void maybe_do_interrupt(struct lg_cpu *cpu)
194 * here is a compromise which means at least it gets updated every 207 * here is a compromise which means at least it gets updated every
195 * timer interrupt. */ 208 * timer interrupt. */
196 write_timestamp(cpu); 209 write_timestamp(cpu);
210
211 /* If there are no other interrupts we want to deliver, clear
212 * the pending flag. */
213 if (!more)
214 put_user(0, &cpu->lg->lguest_data->irq_pending);
215}
216
217/* And this is the routine when we want to set an interrupt for the Guest. */
218void set_interrupt(struct lg_cpu *cpu, unsigned int irq)
219{
220 /* Next time the Guest runs, the core code will see if it can deliver
221 * this interrupt. */
222 set_bit(irq, cpu->irqs_pending);
223
224 /* Make sure it sees it; it might be asleep (eg. halted), or
225 * running the Guest right now, in which case kick_process()
226 * will knock it out. */
227 if (!wake_up_process(cpu->tsk))
228 kick_process(cpu->tsk);
197} 229}
198/*:*/ 230/*:*/
199 231
@@ -510,10 +542,7 @@ static enum hrtimer_restart clockdev_fn(struct hrtimer *timer)
510 struct lg_cpu *cpu = container_of(timer, struct lg_cpu, hrt); 542 struct lg_cpu *cpu = container_of(timer, struct lg_cpu, hrt);
511 543
512 /* Remember the first interrupt is the timer interrupt. */ 544 /* Remember the first interrupt is the timer interrupt. */
513 set_bit(0, cpu->irqs_pending); 545 set_interrupt(cpu, 0);
514 /* If the Guest is actually stopped, we need to wake it up. */
515 if (cpu->halted)
516 wake_up_process(cpu->tsk);
517 return HRTIMER_NORESTART; 546 return HRTIMER_NORESTART;
518} 547}
519 548
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index af92a176697f..d4e8979735cb 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -49,7 +49,7 @@ struct lg_cpu {
49 u32 cr2; 49 u32 cr2;
50 int ts; 50 int ts;
51 u32 esp1; 51 u32 esp1;
52 u8 ss1; 52 u16 ss1;
53 53
54 /* Bitmap of what has changed: see CHANGED_* above. */ 54 /* Bitmap of what has changed: see CHANGED_* above. */
55 int changed; 55 int changed;
@@ -71,9 +71,7 @@ struct lg_cpu {
71 /* Virtual clock device */ 71 /* Virtual clock device */
72 struct hrtimer hrt; 72 struct hrtimer hrt;
73 73
74 /* Do we need to stop what we're doing and return to userspace? */ 74 /* Did the Guest tell us to halt? */
75 int break_out;
76 wait_queue_head_t break_wq;
77 int halted; 75 int halted;
78 76
79 /* Pending virtual interrupts */ 77 /* Pending virtual interrupts */
@@ -82,6 +80,16 @@ struct lg_cpu {
82 struct lg_cpu_arch arch; 80 struct lg_cpu_arch arch;
83}; 81};
84 82
83struct lg_eventfd {
84 unsigned long addr;
85 struct file *event;
86};
87
88struct lg_eventfd_map {
89 unsigned int num;
90 struct lg_eventfd map[];
91};
92
85/* The private info the thread maintains about the guest. */ 93/* The private info the thread maintains about the guest. */
86struct lguest 94struct lguest
87{ 95{
@@ -102,6 +110,8 @@ struct lguest
102 unsigned int stack_pages; 110 unsigned int stack_pages;
103 u32 tsc_khz; 111 u32 tsc_khz;
104 112
113 struct lg_eventfd_map *eventfds;
114
105 /* Dead? */ 115 /* Dead? */
106 const char *dead; 116 const char *dead;
107}; 117};
@@ -137,9 +147,13 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user);
137 * in the kernel. */ 147 * in the kernel. */
138#define pgd_flags(x) (pgd_val(x) & ~PAGE_MASK) 148#define pgd_flags(x) (pgd_val(x) & ~PAGE_MASK)
139#define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT) 149#define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT)
150#define pmd_flags(x) (pmd_val(x) & ~PAGE_MASK)
151#define pmd_pfn(x) (pmd_val(x) >> PAGE_SHIFT)
140 152
141/* interrupts_and_traps.c: */ 153/* interrupts_and_traps.c: */
142void maybe_do_interrupt(struct lg_cpu *cpu); 154unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more);
155void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more);
156void set_interrupt(struct lg_cpu *cpu, unsigned int irq);
143bool deliver_trap(struct lg_cpu *cpu, unsigned int num); 157bool deliver_trap(struct lg_cpu *cpu, unsigned int num);
144void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i, 158void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i,
145 u32 low, u32 hi); 159 u32 low, u32 hi);
@@ -150,6 +164,7 @@ void setup_default_idt_entries(struct lguest_ro_state *state,
150void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, 164void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt,
151 const unsigned long *def); 165 const unsigned long *def);
152void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta); 166void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta);
167bool send_notify_to_eventfd(struct lg_cpu *cpu);
153void init_clockdev(struct lg_cpu *cpu); 168void init_clockdev(struct lg_cpu *cpu);
154bool check_syscall_vector(struct lguest *lg); 169bool check_syscall_vector(struct lguest *lg);
155int init_interrupts(void); 170int init_interrupts(void);
@@ -168,7 +183,10 @@ void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt);
168int init_guest_pagetable(struct lguest *lg); 183int init_guest_pagetable(struct lguest *lg);
169void free_guest_pagetable(struct lguest *lg); 184void free_guest_pagetable(struct lguest *lg);
170void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable); 185void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable);
186void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 i);
187#ifdef CONFIG_X86_PAE
171void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i); 188void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i);
189#endif
172void guest_pagetable_clear_all(struct lg_cpu *cpu); 190void guest_pagetable_clear_all(struct lg_cpu *cpu);
173void guest_pagetable_flush_user(struct lg_cpu *cpu); 191void guest_pagetable_flush_user(struct lg_cpu *cpu);
174void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir, 192void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir,
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index b8ee103eed5f..32e297121058 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -7,32 +7,83 @@
7#include <linux/miscdevice.h> 7#include <linux/miscdevice.h>
8#include <linux/fs.h> 8#include <linux/fs.h>
9#include <linux/sched.h> 9#include <linux/sched.h>
10#include <linux/eventfd.h>
11#include <linux/file.h>
10#include "lg.h" 12#include "lg.h"
11 13
12/*L:055 When something happens, the Waker process needs a way to stop the 14bool send_notify_to_eventfd(struct lg_cpu *cpu)
13 * kernel running the Guest and return to the Launcher. So the Waker writes
14 * LHREQ_BREAK and the value "1" to /dev/lguest to do this. Once the Launcher
15 * has done whatever needs attention, it writes LHREQ_BREAK and "0" to release
16 * the Waker. */
17static int break_guest_out(struct lg_cpu *cpu, const unsigned long __user*input)
18{ 15{
19 unsigned long on; 16 unsigned int i;
17 struct lg_eventfd_map *map;
18
19 /* lg->eventfds is RCU-protected */
20 rcu_read_lock();
21 map = rcu_dereference(cpu->lg->eventfds);
22 for (i = 0; i < map->num; i++) {
23 if (map->map[i].addr == cpu->pending_notify) {
24 eventfd_signal(map->map[i].event, 1);
25 cpu->pending_notify = 0;
26 break;
27 }
28 }
29 rcu_read_unlock();
30 return cpu->pending_notify == 0;
31}
20 32
21 /* Fetch whether they're turning break on or off. */ 33static int add_eventfd(struct lguest *lg, unsigned long addr, int fd)
22 if (get_user(on, input) != 0) 34{
23 return -EFAULT; 35 struct lg_eventfd_map *new, *old = lg->eventfds;
24 36
25 if (on) { 37 if (!addr)
26 cpu->break_out = 1; 38 return -EINVAL;
27 /* Pop it out of the Guest (may be running on different CPU) */ 39
28 wake_up_process(cpu->tsk); 40 /* Replace the old array with the new one, carefully: others can
29 /* Wait for them to reset it */ 41 * be accessing it at the same time */
30 return wait_event_interruptible(cpu->break_wq, !cpu->break_out); 42 new = kmalloc(sizeof(*new) + sizeof(new->map[0]) * (old->num + 1),
31 } else { 43 GFP_KERNEL);
32 cpu->break_out = 0; 44 if (!new)
33 wake_up(&cpu->break_wq); 45 return -ENOMEM;
34 return 0; 46
47 /* First make identical copy. */
48 memcpy(new->map, old->map, sizeof(old->map[0]) * old->num);
49 new->num = old->num;
50
51 /* Now append new entry. */
52 new->map[new->num].addr = addr;
53 new->map[new->num].event = eventfd_fget(fd);
54 if (IS_ERR(new->map[new->num].event)) {
55 kfree(new);
56 return PTR_ERR(new->map[new->num].event);
35 } 57 }
58 new->num++;
59
60 /* Now put new one in place. */
61 rcu_assign_pointer(lg->eventfds, new);
62
 63 /* We're not in a big hurry. Wait until no one's looking at old
64 * version, then delete it. */
65 synchronize_rcu();
66 kfree(old);
67
68 return 0;
69}
70
71static int attach_eventfd(struct lguest *lg, const unsigned long __user *input)
72{
73 unsigned long addr, fd;
74 int err;
75
76 if (get_user(addr, input) != 0)
77 return -EFAULT;
78 input++;
79 if (get_user(fd, input) != 0)
80 return -EFAULT;
81
82 mutex_lock(&lguest_lock);
83 err = add_eventfd(lg, addr, fd);
84 mutex_unlock(&lguest_lock);
85
 86	return err;
36} 87}
37 88
38/*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt 89/*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt
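
For orientation, the map that send_notify_to_eventfd() walks and add_eventfd() rebuilds has roughly this shape (the real definition is added to drivers/lguest/lg.h elsewhere in this series; shown here only as a sketch):

	struct lg_eventfd {
		unsigned long addr;		/* Guest notification address */
		struct file *event;		/* the eventfd's struct file */
	};

	struct lg_eventfd_map {
		unsigned int num;		/* number of entries in map[] */
		struct lg_eventfd map[];	/* grown by copy-and-replace */
	};

Because updates replace the whole array and synchronize_rcu() before freeing the old one, the notification path above can walk it under nothing heavier than rcu_read_lock().
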
@@ -45,9 +96,8 @@ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input)
45 return -EFAULT; 96 return -EFAULT;
46 if (irq >= LGUEST_IRQS) 97 if (irq >= LGUEST_IRQS)
47 return -EINVAL; 98 return -EINVAL;
48 /* Next time the Guest runs, the core code will see if it can deliver 99
49 * this interrupt. */ 100 set_interrupt(cpu, irq);
50 set_bit(irq, cpu->irqs_pending);
51 return 0; 101 return 0;
52} 102}
53 103
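
From the Launcher's side, queueing an interrupt like this is just a two-word write to /dev/lguest. A minimal sketch (descriptor name and error handling are illustrative; the example Launcher in Documentation/lguest/lguest.c does essentially the same thing):

	#include <err.h>
	#include <unistd.h>
	#include <linux/lguest_launcher.h>

	static void tell_guest_irq(int lguest_fd, unsigned long irq)
	{
		unsigned long buf[] = { LHREQ_IRQ, irq };

		if (write(lguest_fd, buf, sizeof(buf)) < 0)
			err(1, "Sending interrupt %lu", irq);
	}
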
@@ -126,9 +176,6 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
126 * address. */ 176 * address. */
127 lguest_arch_setup_regs(cpu, start_ip); 177 lguest_arch_setup_regs(cpu, start_ip);
128 178
129 /* Initialize the queue for the Waker to wait on */
130 init_waitqueue_head(&cpu->break_wq);
131
132 /* We keep a pointer to the Launcher task (ie. current task) for when 179 /* We keep a pointer to the Launcher task (ie. current task) for when
133 * other Guests want to wake this one (eg. console input). */ 180 * other Guests want to wake this one (eg. console input). */
134 cpu->tsk = current; 181 cpu->tsk = current;
@@ -185,6 +232,13 @@ static int initialize(struct file *file, const unsigned long __user *input)
185 goto unlock; 232 goto unlock;
186 } 233 }
187 234
235 lg->eventfds = kmalloc(sizeof(*lg->eventfds), GFP_KERNEL);
236 if (!lg->eventfds) {
237 err = -ENOMEM;
238 goto free_lg;
239 }
240 lg->eventfds->num = 0;
241
188 /* Populate the easy fields of our "struct lguest" */ 242 /* Populate the easy fields of our "struct lguest" */
189 lg->mem_base = (void __user *)args[0]; 243 lg->mem_base = (void __user *)args[0];
190 lg->pfn_limit = args[1]; 244 lg->pfn_limit = args[1];
@@ -192,7 +246,7 @@ static int initialize(struct file *file, const unsigned long __user *input)
192 /* This is the first cpu (cpu 0) and it will start booting at args[2] */ 246 /* This is the first cpu (cpu 0) and it will start booting at args[2] */
193 err = lg_cpu_start(&lg->cpus[0], 0, args[2]); 247 err = lg_cpu_start(&lg->cpus[0], 0, args[2]);
194 if (err) 248 if (err)
195 goto release_guest; 249 goto free_eventfds;
196 250
197 /* Initialize the Guest's shadow page tables, using the toplevel 251 /* Initialize the Guest's shadow page tables, using the toplevel
198 * address the Launcher gave us. This allocates memory, so can fail. */ 252 * address the Launcher gave us. This allocates memory, so can fail. */
@@ -211,7 +265,9 @@ static int initialize(struct file *file, const unsigned long __user *input)
211free_regs: 265free_regs:
212 /* FIXME: This should be in free_vcpu */ 266 /* FIXME: This should be in free_vcpu */
213 free_page(lg->cpus[0].regs_page); 267 free_page(lg->cpus[0].regs_page);
214release_guest: 268free_eventfds:
269 kfree(lg->eventfds);
270free_lg:
215 kfree(lg); 271 kfree(lg);
216unlock: 272unlock:
217 mutex_unlock(&lguest_lock); 273 mutex_unlock(&lguest_lock);
@@ -252,11 +308,6 @@ static ssize_t write(struct file *file, const char __user *in,
252 /* Once the Guest is dead, you can only read() why it died. */ 308 /* Once the Guest is dead, you can only read() why it died. */
253 if (lg->dead) 309 if (lg->dead)
254 return -ENOENT; 310 return -ENOENT;
255
256 /* If you're not the task which owns the Guest, all you can do
257 * is break the Launcher out of running the Guest. */
258 if (current != cpu->tsk && req != LHREQ_BREAK)
259 return -EPERM;
260 } 311 }
261 312
262 switch (req) { 313 switch (req) {
@@ -264,8 +315,8 @@ static ssize_t write(struct file *file, const char __user *in,
264 return initialize(file, input); 315 return initialize(file, input);
265 case LHREQ_IRQ: 316 case LHREQ_IRQ:
266 return user_send_irq(cpu, input); 317 return user_send_irq(cpu, input);
267 case LHREQ_BREAK: 318 case LHREQ_EVENTFD:
268 return break_guest_out(cpu, input); 319 return attach_eventfd(lg, input);
269 default: 320 default:
270 return -EINVAL; 321 return -EINVAL;
271 } 322 }
@@ -303,6 +354,12 @@ static int close(struct inode *inode, struct file *file)
303 * the Launcher's memory management structure. */ 354 * the Launcher's memory management structure. */
304 mmput(lg->cpus[i].mm); 355 mmput(lg->cpus[i].mm);
305 } 356 }
357
358 /* Release any eventfds they registered. */
359 for (i = 0; i < lg->eventfds->num; i++)
360 fput(lg->eventfds->map[i].event);
361 kfree(lg->eventfds);
362
306 /* If lg->dead doesn't contain an error code it will be NULL or a 363 /* If lg->dead doesn't contain an error code it will be NULL or a
307 * kmalloc()ed string, either of which is ok to hand to kfree(). */ 364 * kmalloc()ed string, either of which is ok to hand to kfree(). */
308 if (!IS_ERR(lg->dead)) 365 if (!IS_ERR(lg->dead))
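
Putting the new LHREQ_EVENTFD request together with the eventfd exports further down, a Launcher device thread can register a notification address once and then simply block on the eventfd. A hedged userspace sketch (descriptor and function names are illustrative, not taken verbatim from the example Launcher):

	#include <err.h>
	#include <stdint.h>
	#include <unistd.h>
	#include <sys/eventfd.h>
	#include <linux/lguest_launcher.h>

	/* Ask the Host to signal "efd" whenever the Guest notifies on "addr". */
	static int attach_notify_eventfd(int lguest_fd, unsigned long addr)
	{
		int efd = eventfd(0, 0);
		unsigned long args[] = { LHREQ_EVENTFD, addr, efd };

		if (efd < 0 || write(lguest_fd, args, sizeof(args)) < 0)
			err(1, "Attaching eventfd for %#lx", addr);
		return efd;
	}

	/* A per-device service thread then just sleeps here until the Guest
	 * rings the doorbell. */
	static void wait_for_notify(int efd)
	{
		uint64_t count;

		if (read(efd, &count, sizeof(count)) != sizeof(count))
			err(1, "Reading eventfd");
	}
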
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index a059cf9980f7..a6fe1abda240 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -53,6 +53,17 @@
53 * page. */ 53 * page. */
54#define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) 54#define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1)
55 55
56/* For PAE we need the PMD index as well. We use the last 2MB, so we
57 * will need the last pmd entry of the last pmd page. */
58#ifdef CONFIG_X86_PAE
59#define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1)
60#define RESERVE_MEM 2U
61#define CHECK_GPGD_MASK _PAGE_PRESENT
62#else
63#define RESERVE_MEM 4U
64#define CHECK_GPGD_MASK _PAGE_TABLE
65#endif
66
56/* We actually need a separate PTE page for each CPU. Remember that after the 67/* We actually need a separate PTE page for each CPU. Remember that after the
57 * Switcher code itself comes two pages for each CPU, and we don't want this 68 * Switcher code itself comes two pages for each CPU, and we don't want this
58 * CPU's guest to see the pages of any other CPU. */ 69 * CPU's guest to see the pages of any other CPU. */
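
A quick sanity check of those constants, assuming the standard 32-bit geometry: with PAE a page of PTEs holds PTRS_PER_PTE = 512 entries, so one PMD entry covers 512 * 4 KB = 2 MB, and reserving the last PMD entry for the Switcher costs the Guest the top 2 MB (RESERVE_MEM = 2). Without PAE a single PGD entry covers 1024 * 4 KB = 4 MB, hence RESERVE_MEM = 4. (The CHECK_GPGD_MASK difference is the same story: a PAE top-level entry is only allowed the Present bit, while a non-PAE PGD entry carries the usual _PAGE_TABLE flags.)
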
@@ -73,24 +84,59 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr)
73{ 84{
74 unsigned int index = pgd_index(vaddr); 85 unsigned int index = pgd_index(vaddr);
75 86
87#ifndef CONFIG_X86_PAE
76 /* We kill any Guest trying to touch the Switcher addresses. */ 88 /* We kill any Guest trying to touch the Switcher addresses. */
77 if (index >= SWITCHER_PGD_INDEX) { 89 if (index >= SWITCHER_PGD_INDEX) {
78 kill_guest(cpu, "attempt to access switcher pages"); 90 kill_guest(cpu, "attempt to access switcher pages");
79 index = 0; 91 index = 0;
80 } 92 }
93#endif
81 /* Return a pointer index'th pgd entry for the i'th page table. */ 94 /* Return a pointer index'th pgd entry for the i'th page table. */
82 return &cpu->lg->pgdirs[i].pgdir[index]; 95 return &cpu->lg->pgdirs[i].pgdir[index];
83} 96}
84 97
98#ifdef CONFIG_X86_PAE
99/* This routine then takes the PGD entry given above, which contains the
100 * address of the PMD page. It then returns a pointer to the PMD entry for the
101 * given address. */
102static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
103{
104 unsigned int index = pmd_index(vaddr);
105 pmd_t *page;
106
107 /* We kill any Guest trying to touch the Switcher addresses. */
108 if (pgd_index(vaddr) == SWITCHER_PGD_INDEX &&
109 index >= SWITCHER_PMD_INDEX) {
110 kill_guest(cpu, "attempt to access switcher pages");
111 index = 0;
112 }
113
114 /* You should never call this if the PGD entry wasn't valid */
115 BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
116 page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
117
118 return &page[index];
119}
120#endif
121
85/* This routine then takes the page directory entry returned above, which 122/* This routine then takes the page directory entry returned above, which
86 * contains the address of the page table entry (PTE) page. It then returns a 123 * contains the address of the page table entry (PTE) page. It then returns a
87 * pointer to the PTE entry for the given address. */ 124 * pointer to the PTE entry for the given address. */
88static pte_t *spte_addr(pgd_t spgd, unsigned long vaddr) 125static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
89{ 126{
127#ifdef CONFIG_X86_PAE
128 pmd_t *pmd = spmd_addr(cpu, spgd, vaddr);
129 pte_t *page = __va(pmd_pfn(*pmd) << PAGE_SHIFT);
130
131 /* You should never call this if the PMD entry wasn't valid */
132 BUG_ON(!(pmd_flags(*pmd) & _PAGE_PRESENT));
133#else
90 pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT); 134 pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
91 /* You should never call this if the PGD entry wasn't valid */ 135 /* You should never call this if the PGD entry wasn't valid */
92 BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); 136 BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
93 return &page[(vaddr >> PAGE_SHIFT) % PTRS_PER_PTE]; 137#endif
138
139 return &page[pte_index(vaddr)];
94} 140}
95 141
96/* These two functions just like the above two, except they access the Guest 142/* These two functions just like the above two, except they access the Guest
@@ -101,12 +147,32 @@ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr)
101 return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + index * sizeof(pgd_t); 147 return cpu->lg->pgdirs[cpu->cpu_pgd].gpgdir + index * sizeof(pgd_t);
102} 148}
103 149
104static unsigned long gpte_addr(pgd_t gpgd, unsigned long vaddr) 150#ifdef CONFIG_X86_PAE
151static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr)
152{
153 unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
154 BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
155 return gpage + pmd_index(vaddr) * sizeof(pmd_t);
156}
157
158static unsigned long gpte_addr(struct lg_cpu *cpu,
159 pmd_t gpmd, unsigned long vaddr)
160{
161 unsigned long gpage = pmd_pfn(gpmd) << PAGE_SHIFT;
162
163 BUG_ON(!(pmd_flags(gpmd) & _PAGE_PRESENT));
164 return gpage + pte_index(vaddr) * sizeof(pte_t);
165}
166#else
167static unsigned long gpte_addr(struct lg_cpu *cpu,
168 pgd_t gpgd, unsigned long vaddr)
105{ 169{
106 unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; 170 unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
171
107 BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); 172 BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT));
108 return gpage + ((vaddr>>PAGE_SHIFT) % PTRS_PER_PTE) * sizeof(pte_t); 173 return gpage + pte_index(vaddr) * sizeof(pte_t);
109} 174}
175#endif
110/*:*/ 176/*:*/
111 177
112/*M:014 get_pfn is slow: we could probably try to grab batches of pages here as 178/*M:014 get_pfn is slow: we could probably try to grab batches of pages here as
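
As a concrete illustration of the index helpers used above, this is how a 32-bit virtual address splits up under PAE (2 + 9 + 9 + 12 bits; without PAE it is 10 + 10 + 12). The constants are the standard PAE geometry, shown only for orientation:

	unsigned long vaddr = 0xc0123456UL;
	unsigned int pgd_i = vaddr >> 30;		/* pgd_index(): top 2 bits */
	unsigned int pmd_i = (vaddr >> 21) & 0x1ff;	/* pmd_index(): next 9 bits */
	unsigned int pte_i = (vaddr >> 12) & 0x1ff;	/* pte_index(): next 9 bits */
	unsigned int off   = vaddr & 0xfff;		/* byte offset in the page */

gpgd_addr(), gpmd_addr() and gpte_addr() just scale those indices by the entry size and add them to the page frame taken from the level above.
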
@@ -171,7 +237,7 @@ static void release_pte(pte_t pte)
171 /* Remember that get_user_pages_fast() took a reference to the page, in 237 /* Remember that get_user_pages_fast() took a reference to the page, in
172 * get_pfn()? We have to put it back now. */ 238 * get_pfn()? We have to put it back now. */
173 if (pte_flags(pte) & _PAGE_PRESENT) 239 if (pte_flags(pte) & _PAGE_PRESENT)
174 put_page(pfn_to_page(pte_pfn(pte))); 240 put_page(pte_page(pte));
175} 241}
176/*:*/ 242/*:*/
177 243
@@ -184,11 +250,20 @@ static void check_gpte(struct lg_cpu *cpu, pte_t gpte)
184 250
185static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd) 251static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)
186{ 252{
187 if ((pgd_flags(gpgd) & ~_PAGE_TABLE) || 253 if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) ||
188 (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) 254 (pgd_pfn(gpgd) >= cpu->lg->pfn_limit))
189 kill_guest(cpu, "bad page directory entry"); 255 kill_guest(cpu, "bad page directory entry");
190} 256}
191 257
258#ifdef CONFIG_X86_PAE
259static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd)
260{
261 if ((pmd_flags(gpmd) & ~_PAGE_TABLE) ||
262 (pmd_pfn(gpmd) >= cpu->lg->pfn_limit))
263 kill_guest(cpu, "bad page middle directory entry");
264}
265#endif
266
192/*H:330 267/*H:330
193 * (i) Looking up a page table entry when the Guest faults. 268 * (i) Looking up a page table entry when the Guest faults.
194 * 269 *
@@ -207,6 +282,11 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
207 pte_t gpte; 282 pte_t gpte;
208 pte_t *spte; 283 pte_t *spte;
209 284
285#ifdef CONFIG_X86_PAE
286 pmd_t *spmd;
287 pmd_t gpmd;
288#endif
289
210 /* First step: get the top-level Guest page table entry. */ 290 /* First step: get the top-level Guest page table entry. */
211 gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); 291 gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
212 /* Toplevel not present? We can't map it in. */ 292 /* Toplevel not present? We can't map it in. */
@@ -228,12 +308,45 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
228 check_gpgd(cpu, gpgd); 308 check_gpgd(cpu, gpgd);
229 /* And we copy the flags to the shadow PGD entry. The page 309 /* And we copy the flags to the shadow PGD entry. The page
230 * number in the shadow PGD is the page we just allocated. */ 310 * number in the shadow PGD is the page we just allocated. */
231 *spgd = __pgd(__pa(ptepage) | pgd_flags(gpgd)); 311 set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd)));
232 } 312 }
233 313
314#ifdef CONFIG_X86_PAE
315 gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
316 /* middle level not present? We can't map it in. */
317 if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
318 return false;
319
320 /* Now look at the matching shadow entry. */
321 spmd = spmd_addr(cpu, *spgd, vaddr);
322
323 if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) {
324 /* No shadow entry: allocate a new shadow PTE page. */
325 unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
326
327 /* This is not really the Guest's fault, but killing it is
328 * simple for this corner case. */
329 if (!ptepage) {
330 kill_guest(cpu, "out of memory allocating pte page");
331 return false;
332 }
333
334 /* We check that the Guest pmd is OK. */
335 check_gpmd(cpu, gpmd);
336
337 /* And we copy the flags to the shadow PMD entry. The page
338 * number in the shadow PMD is the page we just allocated. */
339 native_set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd)));
340 }
341
342 /* OK, now we look at the lower level in the Guest page table: keep its
343 * address, because we might update it later. */
344 gpte_ptr = gpte_addr(cpu, gpmd, vaddr);
345#else
234 /* OK, now we look at the lower level in the Guest page table: keep its 346 /* OK, now we look at the lower level in the Guest page table: keep its
235 * address, because we might update it later. */ 347 * address, because we might update it later. */
236 gpte_ptr = gpte_addr(gpgd, vaddr); 348 gpte_ptr = gpte_addr(cpu, gpgd, vaddr);
349#endif
237 gpte = lgread(cpu, gpte_ptr, pte_t); 350 gpte = lgread(cpu, gpte_ptr, pte_t);
238 351
239 /* If this page isn't in the Guest page tables, we can't page it in. */ 352 /* If this page isn't in the Guest page tables, we can't page it in. */
@@ -259,7 +372,7 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
259 gpte = pte_mkdirty(gpte); 372 gpte = pte_mkdirty(gpte);
260 373
261 /* Get the pointer to the shadow PTE entry we're going to set. */ 374 /* Get the pointer to the shadow PTE entry we're going to set. */
262 spte = spte_addr(*spgd, vaddr); 375 spte = spte_addr(cpu, *spgd, vaddr);
263 /* If there was a valid shadow PTE entry here before, we release it. 376 /* If there was a valid shadow PTE entry here before, we release it.
264 * This can happen with a write to a previously read-only entry. */ 377 * This can happen with a write to a previously read-only entry. */
265 release_pte(*spte); 378 release_pte(*spte);
@@ -273,7 +386,7 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
273 * table entry, even if the Guest says it's writable. That way 386 * table entry, even if the Guest says it's writable. That way
274 * we will come back here when a write does actually occur, so 387 * we will come back here when a write does actually occur, so
275 * we can update the Guest's _PAGE_DIRTY flag. */ 388 * we can update the Guest's _PAGE_DIRTY flag. */
276 *spte = gpte_to_spte(cpu, pte_wrprotect(gpte), 0); 389 native_set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0));
277 390
278 /* Finally, we write the Guest PTE entry back: we've set the 391 /* Finally, we write the Guest PTE entry back: we've set the
279 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ 392 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */
@@ -301,14 +414,23 @@ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr)
301 pgd_t *spgd; 414 pgd_t *spgd;
302 unsigned long flags; 415 unsigned long flags;
303 416
417#ifdef CONFIG_X86_PAE
418 pmd_t *spmd;
419#endif
304 /* Look at the current top level entry: is it present? */ 420 /* Look at the current top level entry: is it present? */
305 spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr); 421 spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
306 if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) 422 if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
307 return false; 423 return false;
308 424
425#ifdef CONFIG_X86_PAE
426 spmd = spmd_addr(cpu, *spgd, vaddr);
427 if (!(pmd_flags(*spmd) & _PAGE_PRESENT))
428 return false;
429#endif
430
309 /* Check the flags on the pte entry itself: it must be present and 431 /* Check the flags on the pte entry itself: it must be present and
310 * writable. */ 432 * writable. */
311 flags = pte_flags(*(spte_addr(*spgd, vaddr))); 433 flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr)));
312 434
313 return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); 435 return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
314} 436}
@@ -322,8 +444,43 @@ void pin_page(struct lg_cpu *cpu, unsigned long vaddr)
322 kill_guest(cpu, "bad stack page %#lx", vaddr); 444 kill_guest(cpu, "bad stack page %#lx", vaddr);
323} 445}
324 446
447#ifdef CONFIG_X86_PAE
448static void release_pmd(pmd_t *spmd)
449{
450 /* If the entry's not present, there's nothing to release. */
451 if (pmd_flags(*spmd) & _PAGE_PRESENT) {
452 unsigned int i;
453 pte_t *ptepage = __va(pmd_pfn(*spmd) << PAGE_SHIFT);
454 /* For each entry in the page, we might need to release it. */
455 for (i = 0; i < PTRS_PER_PTE; i++)
456 release_pte(ptepage[i]);
457 /* Now we can free the page of PTEs */
458 free_page((long)ptepage);
459 /* And zero out the PMD entry so we never release it twice. */
460 native_set_pmd(spmd, __pmd(0));
461 }
462}
463
464static void release_pgd(pgd_t *spgd)
465{
466 /* If the entry's not present, there's nothing to release. */
467 if (pgd_flags(*spgd) & _PAGE_PRESENT) {
468 unsigned int i;
469 pmd_t *pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
470
471 for (i = 0; i < PTRS_PER_PMD; i++)
472 release_pmd(&pmdpage[i]);
473
474 /* Now we can free the page of PMDs */
475 free_page((long)pmdpage);
476 /* And zero out the PGD entry so we never release it twice. */
477 set_pgd(spgd, __pgd(0));
478 }
479}
480
481#else /* !CONFIG_X86_PAE */
325/*H:450 If we chase down the release_pgd() code, it looks like this: */ 482/*H:450 If we chase down the release_pgd() code, it looks like this: */
326static void release_pgd(struct lguest *lg, pgd_t *spgd) 483static void release_pgd(pgd_t *spgd)
327{ 484{
328 /* If the entry's not present, there's nothing to release. */ 485 /* If the entry's not present, there's nothing to release. */
329 if (pgd_flags(*spgd) & _PAGE_PRESENT) { 486 if (pgd_flags(*spgd) & _PAGE_PRESENT) {
@@ -341,7 +498,7 @@ static void release_pgd(struct lguest *lg, pgd_t *spgd)
341 *spgd = __pgd(0); 498 *spgd = __pgd(0);
342 } 499 }
343} 500}
344 501#endif
345/*H:445 We saw flush_user_mappings() twice: once from the flush_user_mappings() 502/*H:445 We saw flush_user_mappings() twice: once from the flush_user_mappings()
346 * hypercall and once in new_pgdir() when we re-used a top-level pgdir page. 503 * hypercall and once in new_pgdir() when we re-used a top-level pgdir page.
347 * It simply releases every PTE page from 0 up to the Guest's kernel address. */ 504 * It simply releases every PTE page from 0 up to the Guest's kernel address. */
@@ -350,7 +507,7 @@ static void flush_user_mappings(struct lguest *lg, int idx)
350 unsigned int i; 507 unsigned int i;
351 /* Release every pgd entry up to the kernel's address. */ 508 /* Release every pgd entry up to the kernel's address. */
352 for (i = 0; i < pgd_index(lg->kernel_address); i++) 509 for (i = 0; i < pgd_index(lg->kernel_address); i++)
353 release_pgd(lg, lg->pgdirs[idx].pgdir + i); 510 release_pgd(lg->pgdirs[idx].pgdir + i);
354} 511}
355 512
356/*H:440 (v) Flushing (throwing away) page tables, 513/*H:440 (v) Flushing (throwing away) page tables,
@@ -369,7 +526,9 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
369{ 526{
370 pgd_t gpgd; 527 pgd_t gpgd;
371 pte_t gpte; 528 pte_t gpte;
372 529#ifdef CONFIG_X86_PAE
530 pmd_t gpmd;
531#endif
373 /* First step: get the top-level Guest page table entry. */ 532 /* First step: get the top-level Guest page table entry. */
374 gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); 533 gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
375 /* Toplevel not present? We can't map it in. */ 534 /* Toplevel not present? We can't map it in. */
@@ -378,7 +537,14 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
378 return -1UL; 537 return -1UL;
379 } 538 }
380 539
381 gpte = lgread(cpu, gpte_addr(gpgd, vaddr), pte_t); 540#ifdef CONFIG_X86_PAE
541 gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
542 if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
543 kill_guest(cpu, "Bad address %#lx", vaddr);
544 gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t);
545#else
546 gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t);
547#endif
382 if (!(pte_flags(gpte) & _PAGE_PRESENT)) 548 if (!(pte_flags(gpte) & _PAGE_PRESENT))
383 kill_guest(cpu, "Bad address %#lx", vaddr); 549 kill_guest(cpu, "Bad address %#lx", vaddr);
384 550
@@ -405,6 +571,9 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
405 int *blank_pgdir) 571 int *blank_pgdir)
406{ 572{
407 unsigned int next; 573 unsigned int next;
574#ifdef CONFIG_X86_PAE
575 pmd_t *pmd_table;
576#endif
408 577
409 /* We pick one entry at random to throw out. Choosing the Least 578 /* We pick one entry at random to throw out. Choosing the Least
410 * Recently Used might be better, but this is easy. */ 579 * Recently Used might be better, but this is easy. */
@@ -416,10 +585,27 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
416 /* If the allocation fails, just keep using the one we have */ 585 /* If the allocation fails, just keep using the one we have */
417 if (!cpu->lg->pgdirs[next].pgdir) 586 if (!cpu->lg->pgdirs[next].pgdir)
418 next = cpu->cpu_pgd; 587 next = cpu->cpu_pgd;
419 else 588 else {
420 /* This is a blank page, so there are no kernel 589#ifdef CONFIG_X86_PAE
421 * mappings: caller must map the stack! */ 590 /* In PAE mode, allocate a pmd page and populate the
591 * last pgd entry. */
592 pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL);
593 if (!pmd_table) {
594 free_page((long)cpu->lg->pgdirs[next].pgdir);
595 set_pgd(cpu->lg->pgdirs[next].pgdir, __pgd(0));
596 next = cpu->cpu_pgd;
597 } else {
598 set_pgd(cpu->lg->pgdirs[next].pgdir +
599 SWITCHER_PGD_INDEX,
600 __pgd(__pa(pmd_table) | _PAGE_PRESENT));
601 /* This is a blank page, so there are no kernel
602 * mappings: caller must map the stack! */
603 *blank_pgdir = 1;
604 }
605#else
422 *blank_pgdir = 1; 606 *blank_pgdir = 1;
607#endif
608 }
423 } 609 }
424 /* Record which Guest toplevel this shadows. */ 610 /* Record which Guest toplevel this shadows. */
425 cpu->lg->pgdirs[next].gpgdir = gpgdir; 611 cpu->lg->pgdirs[next].gpgdir = gpgdir;
@@ -431,7 +617,7 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
431 617
432/*H:430 (iv) Switching page tables 618/*H:430 (iv) Switching page tables
433 * 619 *
434 * Now we've seen all the page table setting and manipulation, let's see what 620 * Now we've seen all the page table setting and manipulation, let's see
435 * what happens when the Guest changes page tables (ie. changes the top-level 621 * what happens when the Guest changes page tables (ie. changes the top-level
436 * pgdir). This occurs on almost every context switch. */ 622 * pgdir). This occurs on almost every context switch. */
437void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) 623void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
@@ -460,10 +646,25 @@ static void release_all_pagetables(struct lguest *lg)
460 646
461 /* Every shadow pagetable this Guest has */ 647 /* Every shadow pagetable this Guest has */
462 for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) 648 for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
463 if (lg->pgdirs[i].pgdir) 649 if (lg->pgdirs[i].pgdir) {
650#ifdef CONFIG_X86_PAE
651 pgd_t *spgd;
652 pmd_t *pmdpage;
653 unsigned int k;
654
655 /* Get the last pmd page. */
656 spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX;
657 pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
658
659 /* And release the pmd entries of that pmd page,
660 * except for the switcher pmd. */
661 for (k = 0; k < SWITCHER_PMD_INDEX; k++)
662 release_pmd(&pmdpage[k]);
663#endif
464 /* Every PGD entry except the Switcher at the top */ 664 /* Every PGD entry except the Switcher at the top */
465 for (j = 0; j < SWITCHER_PGD_INDEX; j++) 665 for (j = 0; j < SWITCHER_PGD_INDEX; j++)
466 release_pgd(lg, lg->pgdirs[i].pgdir + j); 666 release_pgd(lg->pgdirs[i].pgdir + j);
667 }
467} 668}
468 669
469/* We also throw away everything when a Guest tells us it's changed a kernel 670/* We also throw away everything when a Guest tells us it's changed a kernel
@@ -504,24 +705,37 @@ static void do_set_pte(struct lg_cpu *cpu, int idx,
504{ 705{
505 /* Look up the matching shadow page directory entry. */ 706 /* Look up the matching shadow page directory entry. */
506 pgd_t *spgd = spgd_addr(cpu, idx, vaddr); 707 pgd_t *spgd = spgd_addr(cpu, idx, vaddr);
708#ifdef CONFIG_X86_PAE
709 pmd_t *spmd;
710#endif
507 711
508 /* If the top level isn't present, there's no entry to update. */ 712 /* If the top level isn't present, there's no entry to update. */
509 if (pgd_flags(*spgd) & _PAGE_PRESENT) { 713 if (pgd_flags(*spgd) & _PAGE_PRESENT) {
510 /* Otherwise, we start by releasing the existing entry. */ 714#ifdef CONFIG_X86_PAE
511 pte_t *spte = spte_addr(*spgd, vaddr); 715 spmd = spmd_addr(cpu, *spgd, vaddr);
512 release_pte(*spte); 716 if (pmd_flags(*spmd) & _PAGE_PRESENT) {
513 717#endif
514 /* If they're setting this entry as dirty or accessed, we might 718 /* Otherwise, we start by releasing
515 * as well put that entry they've given us in now. This shaves 719 * the existing entry. */
516 * 10% off a copy-on-write micro-benchmark. */ 720 pte_t *spte = spte_addr(cpu, *spgd, vaddr);
517 if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { 721 release_pte(*spte);
518 check_gpte(cpu, gpte); 722
519 *spte = gpte_to_spte(cpu, gpte, 723 /* If they're setting this entry as dirty or accessed,
520 pte_flags(gpte) & _PAGE_DIRTY); 724 * we might as well put that entry they've given us
521 } else 725 * in now. This shaves 10% off a
522 /* Otherwise kill it and we can demand_page() it in 726 * copy-on-write micro-benchmark. */
523 * later. */ 727 if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
524 *spte = __pte(0); 728 check_gpte(cpu, gpte);
729 native_set_pte(spte,
730 gpte_to_spte(cpu, gpte,
731 pte_flags(gpte) & _PAGE_DIRTY));
732 } else
733 /* Otherwise kill it and we can demand_page()
734 * it in later. */
735 native_set_pte(spte, __pte(0));
736#ifdef CONFIG_X86_PAE
737 }
738#endif
525 } 739 }
526} 740}
527 741
@@ -568,12 +782,10 @@ void guest_set_pte(struct lg_cpu *cpu,
568 * 782 *
569 * So with that in mind here's our code to to update a (top-level) PGD entry: 783 * So with that in mind here's our code to to update a (top-level) PGD entry:
570 */ 784 */
571void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 idx) 785void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx)
572{ 786{
573 int pgdir; 787 int pgdir;
574 788
575 /* The kernel seems to try to initialize this early on: we ignore its
576 * attempts to map over the Switcher. */
577 if (idx >= SWITCHER_PGD_INDEX) 789 if (idx >= SWITCHER_PGD_INDEX)
578 return; 790 return;
579 791
@@ -581,8 +793,14 @@ void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 idx)
581 pgdir = find_pgdir(lg, gpgdir); 793 pgdir = find_pgdir(lg, gpgdir);
582 if (pgdir < ARRAY_SIZE(lg->pgdirs)) 794 if (pgdir < ARRAY_SIZE(lg->pgdirs))
583 /* ... throw it away. */ 795 /* ... throw it away. */
584 release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx); 796 release_pgd(lg->pgdirs[pgdir].pgdir + idx);
585} 797}
798#ifdef CONFIG_X86_PAE
799void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)
800{
801 guest_pagetable_clear_all(&lg->cpus[0]);
802}
803#endif
586 804
587/* Once we know how much memory we have we can construct simple identity 805/* Once we know how much memory we have we can construct simple identity
588 * (which set virtual == physical) and linear mappings 806 * (which set virtual == physical) and linear mappings
@@ -596,8 +814,16 @@ static unsigned long setup_pagetables(struct lguest *lg,
596{ 814{
597 pgd_t __user *pgdir; 815 pgd_t __user *pgdir;
598 pte_t __user *linear; 816 pte_t __user *linear;
599 unsigned int mapped_pages, i, linear_pages, phys_linear;
600 unsigned long mem_base = (unsigned long)lg->mem_base; 817 unsigned long mem_base = (unsigned long)lg->mem_base;
818 unsigned int mapped_pages, i, linear_pages;
819#ifdef CONFIG_X86_PAE
820 pmd_t __user *pmds;
821 unsigned int j;
822 pgd_t pgd;
823 pmd_t pmd;
824#else
825 unsigned int phys_linear;
826#endif
601 827
602 /* We have mapped_pages frames to map, so we need 828 /* We have mapped_pages frames to map, so we need
603 * linear_pages page tables to map them. */ 829 * linear_pages page tables to map them. */
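
To get a feel for the sizes involved: a 64 MB Guest has mapped_pages = 64 MB / 4 KB = 16384 frames, so the linear mapping needs 16384 / 512 = 32 PTE pages under PAE (16384 / 1024 = 16 without). Those sit just below the single pgdir page at the top of Guest memory, and in the PAE case one further page of PMDs sits below them, which is what pgdir[0] and pgdir[3] get pointed at further down.
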
@@ -610,6 +836,9 @@ static unsigned long setup_pagetables(struct lguest *lg,
610 /* Now we use the next linear_pages pages as pte pages */ 836 /* Now we use the next linear_pages pages as pte pages */
611 linear = (void *)pgdir - linear_pages * PAGE_SIZE; 837 linear = (void *)pgdir - linear_pages * PAGE_SIZE;
612 838
839#ifdef CONFIG_X86_PAE
840 pmds = (void *)linear - PAGE_SIZE;
841#endif
613 /* Linear mapping is easy: put every page's address into the 842 /* Linear mapping is easy: put every page's address into the
614 * mapping in order. */ 843 * mapping in order. */
615 for (i = 0; i < mapped_pages; i++) { 844 for (i = 0; i < mapped_pages; i++) {
@@ -621,6 +850,22 @@ static unsigned long setup_pagetables(struct lguest *lg,
621 850
622 /* The top level points to the linear page table pages above. 851 /* The top level points to the linear page table pages above.
623 * We setup the identity and linear mappings here. */ 852 * We setup the identity and linear mappings here. */
853#ifdef CONFIG_X86_PAE
854 for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD;
855 i += PTRS_PER_PTE, j++) {
856 native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i)
857 - mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER));
858
859 if (copy_to_user(&pmds[j], &pmd, sizeof(pmd)) != 0)
860 return -EFAULT;
861 }
862
863 set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT));
864 if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0)
865 return -EFAULT;
866 if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0)
867 return -EFAULT;
868#else
624 phys_linear = (unsigned long)linear - mem_base; 869 phys_linear = (unsigned long)linear - mem_base;
625 for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) { 870 for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) {
626 pgd_t pgd; 871 pgd_t pgd;
@@ -633,6 +878,7 @@ static unsigned long setup_pagetables(struct lguest *lg,
633 &pgd, sizeof(pgd))) 878 &pgd, sizeof(pgd)))
634 return -EFAULT; 879 return -EFAULT;
635 } 880 }
881#endif
636 882
637 /* We return the top level (guest-physical) address: remember where 883 /* We return the top level (guest-physical) address: remember where
638 * this is. */ 884 * this is. */
@@ -648,7 +894,10 @@ int init_guest_pagetable(struct lguest *lg)
648 u64 mem; 894 u64 mem;
649 u32 initrd_size; 895 u32 initrd_size;
650 struct boot_params __user *boot = (struct boot_params *)lg->mem_base; 896 struct boot_params __user *boot = (struct boot_params *)lg->mem_base;
651 897#ifdef CONFIG_X86_PAE
898 pgd_t *pgd;
899 pmd_t *pmd_table;
900#endif
652 /* Get the Guest memory size and the ramdisk size from the boot header 901 /* Get the Guest memory size and the ramdisk size from the boot header
653 * located at lg->mem_base (Guest address 0). */ 902 * located at lg->mem_base (Guest address 0). */
654 if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem)) 903 if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem))
@@ -663,6 +912,15 @@ int init_guest_pagetable(struct lguest *lg)
663 lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); 912 lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
664 if (!lg->pgdirs[0].pgdir) 913 if (!lg->pgdirs[0].pgdir)
665 return -ENOMEM; 914 return -ENOMEM;
915#ifdef CONFIG_X86_PAE
916 pgd = lg->pgdirs[0].pgdir;
917 pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL);
918 if (!pmd_table)
919 return -ENOMEM;
920
921 set_pgd(pgd + SWITCHER_PGD_INDEX,
922 __pgd(__pa(pmd_table) | _PAGE_PRESENT));
923#endif
666 lg->cpus[0].cpu_pgd = 0; 924 lg->cpus[0].cpu_pgd = 0;
667 return 0; 925 return 0;
668} 926}
@@ -672,17 +930,24 @@ void page_table_guest_data_init(struct lg_cpu *cpu)
672{ 930{
673 /* We get the kernel address: above this is all kernel memory. */ 931 /* We get the kernel address: above this is all kernel memory. */
674 if (get_user(cpu->lg->kernel_address, 932 if (get_user(cpu->lg->kernel_address,
675 &cpu->lg->lguest_data->kernel_address) 933 &cpu->lg->lguest_data->kernel_address)
676 /* We tell the Guest that it can't use the top 4MB of virtual 934 /* We tell the Guest that it can't use the top 2 or 4 MB
677 * addresses used by the Switcher. */ 935 * of virtual addresses used by the Switcher. */
678 || put_user(4U*1024*1024, &cpu->lg->lguest_data->reserve_mem) 936 || put_user(RESERVE_MEM * 1024 * 1024,
679 || put_user(cpu->lg->pgdirs[0].gpgdir, &cpu->lg->lguest_data->pgdir)) 937 &cpu->lg->lguest_data->reserve_mem)
938 || put_user(cpu->lg->pgdirs[0].gpgdir,
939 &cpu->lg->lguest_data->pgdir))
680 kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); 940 kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
681 941
682 /* In flush_user_mappings() we loop from 0 to 942 /* In flush_user_mappings() we loop from 0 to
683 * "pgd_index(lg->kernel_address)". This assumes it won't hit the 943 * "pgd_index(lg->kernel_address)". This assumes it won't hit the
684 * Switcher mappings, so check that now. */ 944 * Switcher mappings, so check that now. */
945#ifdef CONFIG_X86_PAE
946 if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX &&
947 pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX)
948#else
685 if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX) 949 if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX)
950#endif
686 kill_guest(cpu, "bad kernel address %#lx", 951 kill_guest(cpu, "bad kernel address %#lx",
687 cpu->lg->kernel_address); 952 cpu->lg->kernel_address);
688} 953}
@@ -708,16 +973,30 @@ void free_guest_pagetable(struct lguest *lg)
708void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) 973void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
709{ 974{
710 pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); 975 pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages);
711 pgd_t switcher_pgd;
712 pte_t regs_pte; 976 pte_t regs_pte;
713 unsigned long pfn; 977 unsigned long pfn;
714 978
979#ifdef CONFIG_X86_PAE
980 pmd_t switcher_pmd;
981 pmd_t *pmd_table;
982
983 native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >>
984 PAGE_SHIFT, PAGE_KERNEL_EXEC));
985
986 pmd_table = __va(pgd_pfn(cpu->lg->
987 pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX])
988 << PAGE_SHIFT);
989 native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd);
990#else
991 pgd_t switcher_pgd;
992
715 /* Make the last PGD entry for this Guest point to the Switcher's PTE 993 /* Make the last PGD entry for this Guest point to the Switcher's PTE
716 * page for this CPU (with appropriate flags). */ 994 * page for this CPU (with appropriate flags). */
717 switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL); 995 switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC);
718 996
719 cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; 997 cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;
720 998
999#endif
721 /* We also change the Switcher PTE page. When we're running the Guest, 1000 /* We also change the Switcher PTE page. When we're running the Guest,
722 * we want the Guest's "regs" page to appear where the first Switcher 1001 * we want the Guest's "regs" page to appear where the first Switcher
723 * page for this CPU is. This is an optimization: when the Switcher 1002 * page for this CPU is. This is an optimization: when the Switcher
@@ -726,8 +1005,9 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
726 * page is already mapped there, we don't have to copy them out 1005 * page is already mapped there, we don't have to copy them out
727 * again. */ 1006 * again. */
728 pfn = __pa(cpu->regs_page) >> PAGE_SHIFT; 1007 pfn = __pa(cpu->regs_page) >> PAGE_SHIFT;
729 regs_pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL)); 1008 native_set_pte(&regs_pte, pfn_pte(pfn, PAGE_KERNEL));
730 switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTRS_PER_PTE] = regs_pte; 1009 native_set_pte(&switcher_pte_page[pte_index((unsigned long)pages)],
1010 regs_pte);
731} 1011}
732/*:*/ 1012/*:*/
733 1013
@@ -752,21 +1032,21 @@ static __init void populate_switcher_pte_page(unsigned int cpu,
752 1032
753 /* The first entries are easy: they map the Switcher code. */ 1033 /* The first entries are easy: they map the Switcher code. */
754 for (i = 0; i < pages; i++) { 1034 for (i = 0; i < pages; i++) {
755 pte[i] = mk_pte(switcher_page[i], 1035 native_set_pte(&pte[i], mk_pte(switcher_page[i],
756 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)); 1036 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)));
757 } 1037 }
758 1038
759 /* The only other thing we map is this CPU's pair of pages. */ 1039 /* The only other thing we map is this CPU's pair of pages. */
760 i = pages + cpu*2; 1040 i = pages + cpu*2;
761 1041
762 /* First page (Guest registers) is writable from the Guest */ 1042 /* First page (Guest registers) is writable from the Guest */
763 pte[i] = pfn_pte(page_to_pfn(switcher_page[i]), 1043 native_set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]),
764 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW)); 1044 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW)));
765 1045
766 /* The second page contains the "struct lguest_ro_state", and is 1046 /* The second page contains the "struct lguest_ro_state", and is
767 * read-only. */ 1047 * read-only. */
768 pte[i+1] = pfn_pte(page_to_pfn(switcher_page[i+1]), 1048 native_set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]),
769 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)); 1049 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)));
770} 1050}
771 1051
772/* We've made it through the page table code. Perhaps our tired brains are 1052/* We've made it through the page table code. Perhaps our tired brains are
diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c
index 7ede64ffeef9..482ed5a18750 100644
--- a/drivers/lguest/segments.c
+++ b/drivers/lguest/segments.c
@@ -150,7 +150,7 @@ void load_guest_gdt_entry(struct lg_cpu *cpu, u32 num, u32 lo, u32 hi)
150{ 150{
151 /* We assume the Guest has the same number of GDT entries as the 151 /* We assume the Guest has the same number of GDT entries as the
152 * Host, otherwise we'd have to dynamically allocate the Guest GDT. */ 152 * Host, otherwise we'd have to dynamically allocate the Guest GDT. */
153 if (num > ARRAY_SIZE(cpu->arch.gdt)) 153 if (num >= ARRAY_SIZE(cpu->arch.gdt))
154 kill_guest(cpu, "too many gdt entries %i", num); 154 kill_guest(cpu, "too many gdt entries %i", num);
155 155
156 /* Set it up, then fix it. */ 156 /* Set it up, then fix it. */
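
The segments.c change is a plain off-by-one fix: for an array of N descriptors the valid indices are 0..N-1, so an index equal to ARRAY_SIZE() is already out of bounds and has to be rejected with ">=" rather than ">". Schematically (array size illustrative):

	/* With 32 slots the valid indices are 0..31, so 32 itself is bad;
	 * the old ">" test would have let num == 32 through. */
	u32 gdt[32];
	if (num >= ARRAY_SIZE(gdt))
		kill_guest(cpu, "too many gdt entries %i", num);
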
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 2a701d593d35..3f0e1974abdc 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -16,6 +16,7 @@
16#include <linux/anon_inodes.h> 16#include <linux/anon_inodes.h>
17#include <linux/eventfd.h> 17#include <linux/eventfd.h>
18#include <linux/syscalls.h> 18#include <linux/syscalls.h>
19#include <linux/module.h>
19 20
20struct eventfd_ctx { 21struct eventfd_ctx {
21 wait_queue_head_t wqh; 22 wait_queue_head_t wqh;
@@ -56,6 +57,7 @@ int eventfd_signal(struct file *file, int n)
56 57
57 return n; 58 return n;
58} 59}
60EXPORT_SYMBOL_GPL(eventfd_signal);
59 61
60static int eventfd_release(struct inode *inode, struct file *file) 62static int eventfd_release(struct inode *inode, struct file *file)
61{ 63{
@@ -197,6 +199,7 @@ struct file *eventfd_fget(int fd)
197 199
198 return file; 200 return file;
199} 201}
202EXPORT_SYMBOL_GPL(eventfd_fget);
200 203
201SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) 204SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
202{ 205{
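
The two exports exist so a module can hold a reference to an eventfd and signal it, which is exactly the pattern the lguest changes above rely on. A minimal sketch of that pattern (error handling trimmed):

	struct file *ev = eventfd_fget(fd);	/* take a reference to the eventfd */
	if (!IS_ERR(ev)) {
		eventfd_signal(ev, 1);		/* add 1 to the count, wake any readers */
		fput(ev);			/* drop the reference when finished */
	}

lguest keeps the struct file around in its eventfd map instead of signalling immediately, and drops the references in close() as shown earlier.
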
diff --git a/include/linux/lguest.h b/include/linux/lguest.h
index 175e63f4a8c0..7bc1440fc473 100644
--- a/include/linux/lguest.h
+++ b/include/linux/lguest.h
@@ -30,6 +30,10 @@ struct lguest_data
30 /* Wallclock time set by the Host. */ 30 /* Wallclock time set by the Host. */
31 struct timespec time; 31 struct timespec time;
32 32
33 /* Interrupt pending set by the Host. The Guest should do a hypercall
34 * if it re-enables interrupts and sees this set (to X86_EFLAGS_IF). */
35 int irq_pending;
36
33 /* Async hypercall ring. Instead of directly making hypercalls, we can 37 /* Async hypercall ring. Instead of directly making hypercalls, we can
34 * place them in here for processing the next time the Host wants. 38 * place them in here for processing the next time the Host wants.
35 * This batching can be quite efficient. */ 39 * This batching can be quite efficient. */
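
Roughly the Guest-side contract this field creates (the actual code lives in arch/x86/lguest/boot.c and i386_head.S in this series; the hypercall name below comes from the same series and should be read as an assumption of this sketch): when the Guest re-enables interrupts it first looks at irq_pending, and only if the Host set it does it pay for a hypercall to have the interrupt delivered:

	if (lguest_data.irq_pending)
		kvm_hypercall0(LHCALL_SEND_INTERRUPTS);
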
diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h
index a53407a4165c..bfefbdf7498a 100644
--- a/include/linux/lguest_launcher.h
+++ b/include/linux/lguest_launcher.h
@@ -57,7 +57,8 @@ enum lguest_req
57 LHREQ_INITIALIZE, /* + base, pfnlimit, start */ 57 LHREQ_INITIALIZE, /* + base, pfnlimit, start */
58 LHREQ_GETDMA, /* No longer used */ 58 LHREQ_GETDMA, /* No longer used */
59 LHREQ_IRQ, /* + irq */ 59 LHREQ_IRQ, /* + irq */
60 LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */ 60 LHREQ_BREAK, /* No longer used */
61 LHREQ_EVENTFD, /* + address, fd. */
61}; 62};
62 63
63/* The alignment to use between consumer and producer parts of vring. 64/* The alignment to use between consumer and producer parts of vring.
diff --git a/kernel/sched.c b/kernel/sched.c
index f04aa9664504..8ec9d13140be 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2192,6 +2192,7 @@ void kick_process(struct task_struct *p)
2192 smp_send_reschedule(cpu); 2192 smp_send_reschedule(cpu);
2193 preempt_enable(); 2193 preempt_enable();
2194} 2194}
2195EXPORT_SYMBOL_GPL(kick_process);
2195 2196
2196/* 2197/*
2197 * Return a low guess at the load of a migration-source cpu weighted 2198 * Return a low guess at the load of a migration-source cpu weighted
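
The kick_process() export is for the new set_interrupt() path used by user_send_irq() above: if another process queues an interrupt while the task running the Guest is already executing guest code on a different CPU, waking it is not enough; it has to be preempted so it re-enters the Host and notices the pending interrupt. A hedged sketch of the shape of that caller (the real one is in drivers/lguest/interrupts_and_traps.c in this series):

	void set_interrupt(struct lg_cpu *cpu, unsigned int irq)
	{
		/* Remember which interrupt to deliver next time we run the Guest... */
		set_bit(irq, cpu->irqs_pending);

		/* ...and make sure the task running this Guest notices promptly:
		 * wake it if it was sleeping, otherwise kick it off its CPU. */
		if (!wake_up_process(cpu->tsk))
			kick_process(cpu->tsk);
	}
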