author    Rusty Russell <rusty@rustcorp.com.au>  2009-07-30 18:03:45 -0400
committer Rusty Russell <rusty@rustcorp.com.au>  2009-07-30 02:33:46 -0400
commit    a91d74a3c4de8115295ee87350c13a329164aaaf (patch)
tree      02c862fccc9abedf7fc354061e69c4b5fbcce06d /drivers/lguest
parent    2e04ef76916d1e29a077ea9d0f2003c8fd86724d (diff)
lguest: update commentry
Every so often, after code shuffles, I need to go through and unbitrot the
Lguest Journey (see drivers/lguest/README). Since we now use RCU in a simple
form in one place, I took the opportunity to expand that explanation.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Diffstat (limited to 'drivers/lguest')
-rw-r--r--  drivers/lguest/core.c              7
-rw-r--r--  drivers/lguest/hypercalls.c        6
-rw-r--r--  drivers/lguest/lguest_device.c    11
-rw-r--r--  drivers/lguest/lguest_user.c     100
-rw-r--r--  drivers/lguest/page_tables.c      84
-rw-r--r--  drivers/lguest/x86/core.c          2
-rw-r--r--  drivers/lguest/x86/switcher_32.S   6
7 files changed, 176 insertions(+), 40 deletions(-)
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index cd058bc903ff..1e2cb846b3c9 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -217,10 +217,15 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
 
 	/*
 	 * It's possible the Guest did a NOTIFY hypercall to the
-	 * Launcher, in which case we return from the read() now.
+	 * Launcher.
 	 */
 	if (cpu->pending_notify) {
+		/*
+		 * Does it just need to write to a registered
+		 * eventfd (ie. the appropriate virtqueue thread)?
+		 */
 		if (!send_notify_to_eventfd(cpu)) {
+			/* OK, we tell the main Launcher. */
 			if (put_user(cpu->pending_notify, user))
 				return -EFAULT;
 			return sizeof(cpu->pending_notify);
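
For the curious: from the Launcher's side, the contract above is just a blocking read(). A rough userspace sketch follows; lguest_fd and handle_notify() are invented names, and the only detail taken from the hunk is that read() hands back the NOTIFY value as sizeof(unsigned long) bytes.

#include <stdio.h>
#include <unistd.h>

/* Hypothetical Launcher loop: not from the lguest tree. */
static void handle_notify(unsigned long addr)
{
	printf("Guest did LHCALL_NOTIFY on %#lx\n", addr);
}

static void launcher_loop(int lguest_fd)
{
	unsigned long notify;

	for (;;) {
		/* Blocks until the Guest needs us (or a signal arrives). */
		ssize_t r = read(lguest_fd, &notify, sizeof(notify));

		if (r == sizeof(notify))
			handle_notify(notify);
		else
			break;	/* error or Guest death: errno has details */
	}
}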
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
index 787ab4bc09f0..83511eb0923d 100644
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -59,7 +59,7 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
 	case LHCALL_SHUTDOWN: {
 		char msg[128];
 		/*
-		 * Shutdown is such a trivial hypercall that we do it in four
+		 * Shutdown is such a trivial hypercall that we do it in five
 		 * lines right here.
 		 *
 		 * If the lgread fails, it will call kill_guest() itself; the
@@ -245,6 +245,10 @@ static void initialize(struct lg_cpu *cpu)
  * device), the Guest will still see the old page. In practice, this never
  * happens: why would the Guest read a page which it has never written to? But
  * a similar scenario might one day bite us, so it's worth mentioning.
+ *
+ * Note that if we used a shared anonymous mapping in the Launcher instead of
+ * mapping /dev/zero private, we wouldn't worry about copy-on-write. And we
+ * need that to switch the Launcher to processes (away from threads) anyway.
 :*/
 
 /*H:100
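
To see the copy-on-write point concretely, here is an illustrative comparison (not from the lguest tree) of the two mappings the new comment contrasts:

#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

/* Illustrative only: two ways a Launcher could allocate Guest memory. */
static void *alloc_guest_mem(size_t len)
{
	/*
	 * What the comment says we do today: map /dev/zero privately.
	 * Every page is subject to copy-on-write.
	 */
	int fd = open("/dev/zero", O_RDWR);
	void *mem = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE, fd, 0);
	close(fd);

	/*
	 * The alternative it suggests: a shared anonymous mapping,
	 * which is never copied on write:
	 *
	 *	mem = mmap(NULL, len, PROT_READ | PROT_WRITE,
	 *		   MAP_SHARED | MAP_ANONYMOUS, -1, 0);
	 */
	return mem == MAP_FAILED ? NULL : mem;
}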
diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c
index cc000e79c3d1..1401c1ace1ec 100644
--- a/drivers/lguest/lguest_device.c
+++ b/drivers/lguest/lguest_device.c
@@ -236,7 +236,7 @@ static void lg_notify(struct virtqueue *vq)
 extern void lguest_setup_irq(unsigned int irq);
 
 /*
- * This routine finds the first virtqueue described in the configuration of
+ * This routine finds the Nth virtqueue described in the configuration of
  * this device and sets it up.
  *
  * This is kind of an ugly duckling. It'd be nicer to have a standard
@@ -244,9 +244,6 @@ extern void lguest_setup_irq(unsigned int irq);
  * everyone wants to do it differently. The KVM coders want the Guest to
  * allocate its own pages and tell the Host where they are, but for lguest it's
  * simpler for the Host to simply tell us where the pages are.
- *
- * So we provide drivers with a "find the Nth virtqueue and set it up"
- * function.
  */
 static struct virtqueue *lg_find_vq(struct virtio_device *vdev,
 				    unsigned index,
@@ -422,7 +419,11 @@ static void add_lguest_device(struct lguest_device_desc *d,
 
 	/* This devices' parent is the lguest/ dir. */
 	ldev->vdev.dev.parent = lguest_root;
-	/* We have a unique device index thanks to the dev_index counter. */
+	/*
+	 * The device type comes straight from the descriptor. There's also a
+	 * device vendor field in the virtio_device struct, which we leave as
+	 * 0.
+	 */
 	ldev->vdev.id.device = d->type;
 	/*
 	 * We have a simple set of routines for querying the device's
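
For reference, "find the Nth virtqueue" boils down to a short walk past the device descriptor. The struct layouts below are my recollection of include/linux/lguest_launcher.h, reproduced so the sketch stands alone; treat the field names as assumptions.

#include <stdint.h>
#include <stddef.h>

/* Minimal copies of the lguest ABI structs, as I recall them. */
struct lguest_vqconfig {
	uint16_t num;		/* number of ring descriptors */
	uint16_t irq;		/* interrupt to fire on Guest kick */
	uint32_t pfn;		/* Guest page number of the ring */
};

struct lguest_device_desc {
	uint8_t type;		/* virtio device type */
	uint8_t num_vq;		/* how many virtqueues */
	uint8_t feature_len;
	uint8_t config_len;
	uint8_t status;
	uint8_t config[];	/* vqconfigs, then features, then config */
};

/* The vq configs sit immediately after the descriptor, one per vq. */
static struct lguest_vqconfig *nth_vqconfig(struct lguest_device_desc *d,
					    unsigned int index)
{
	struct lguest_vqconfig *vqs = (struct lguest_vqconfig *)d->config;

	return index < d->num_vq ? &vqs[index] : NULL;
}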
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index 7e92017103dc..b4d3f7ca554f 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -1,9 +1,8 @@
-/*P:200
- * This contains all the /dev/lguest code, whereby the userspace launcher
+/*P:200 This contains all the /dev/lguest code, whereby the userspace launcher
  * controls and communicates with the Guest. For example, the first write will
- * tell us the Guest's memory layout, pagetable, entry point and kernel address
- * offset. A read will run the Guest until something happens, such as a signal
- * or the Guest doing a NOTIFY out to the Launcher.
+ * tell us the Guest's memory layout and entry point. A read will run the
+ * Guest until something happens, such as a signal or the Guest doing a NOTIFY
+ * out to the Launcher.
 :*/
 #include <linux/uaccess.h>
 #include <linux/miscdevice.h>
@@ -13,14 +12,41 @@
 #include <linux/file.h>
 #include "lg.h"
 
+/*L:056
+ * Before we move on, let's jump ahead and look at what the kernel does when
+ * it needs to look up the eventfds. That will complete our picture of how we
+ * use RCU.
+ *
+ * The notification value is in cpu->pending_notify: we return true if it went
+ * to an eventfd.
+ */
 bool send_notify_to_eventfd(struct lg_cpu *cpu)
 {
 	unsigned int i;
 	struct lg_eventfd_map *map;
 
-	/* lg->eventfds is RCU-protected */
+	/*
+	 * This "rcu_read_lock()" helps track when someone is still looking at
+	 * the (RCU-using) eventfds array. It's not actually a lock at all;
+	 * indeed it's a noop in many configurations. (You didn't expect me to
+	 * explain all the RCU secrets here, did you?)
+	 */
 	rcu_read_lock();
+	/*
+	 * rcu_dereference is the counter-side of rcu_assign_pointer(); it
+	 * makes sure we don't access the memory pointed to by
+	 * cpu->lg->eventfds before cpu->lg->eventfds is set. Sounds crazy,
+	 * but Alpha allows this! Paul McKenney points out that a really
+	 * aggressive compiler could have the same effect:
+	 * http://lists.ozlabs.org/pipermail/lguest/2009-July/001560.html
+	 *
+	 * So play safe, use rcu_dereference to get the rcu-protected pointer:
+	 */
 	map = rcu_dereference(cpu->lg->eventfds);
+	/*
+	 * Simple array search: even if they add an eventfd while we do this,
+	 * we'll continue to use the old array and just won't see the new one.
+	 */
 	for (i = 0; i < map->num; i++) {
 		if (map->map[i].addr == cpu->pending_notify) {
 			eventfd_signal(map->map[i].event, 1);
@@ -28,14 +54,43 @@ bool send_notify_to_eventfd(struct lg_cpu *cpu)
 			break;
 		}
 	}
+	/* We're done with the rcu-protected variable cpu->lg->eventfds. */
 	rcu_read_unlock();
+
+	/* If we cleared the notification, it's because we found a match. */
 	return cpu->pending_notify == 0;
 }
 
+/*L:055
+ * One of the more tricksy tricks in the Linux Kernel is a technique called
+ * Read Copy Update. Since one point of lguest is to teach lguest journeyers
+ * about kernel coding, I use it here. (In case you're curious, other purposes
+ * include learning about virtualization and instilling a deep appreciation for
+ * simplicity and puppies).
+ *
+ * We keep a simple array which maps LHCALL_NOTIFY values to eventfds, but we
+ * add new eventfds without ever blocking readers from accessing the array.
+ * The current Launcher only does this during boot, so that never happens. But
+ * Read Copy Update is cool, and adding a lock risks damaging even more puppies
+ * than this code does.
+ *
+ * We allocate a brand new one-larger array, copy the old one and add our new
+ * element. Then we make the lg eventfd pointer point to the new array.
+ * That's the easy part: now we need to free the old one, but we need to make
+ * sure no slow CPU somewhere is still looking at it. That's what
+ * synchronize_rcu does for us: it waits until every CPU has indicated that it
+ * has moved on, so we know it's no longer using the old one.
+ *
+ * If that's unclear, see http://en.wikipedia.org/wiki/Read-copy-update.
+ */
 static int add_eventfd(struct lguest *lg, unsigned long addr, int fd)
 {
 	struct lg_eventfd_map *new, *old = lg->eventfds;
 
+	/*
+	 * We don't allow notifications on value 0 anyway (pending_notify of
+	 * 0 means "nothing pending").
+	 */
 	if (!addr)
 		return -EINVAL;
 
@@ -62,12 +117,20 @@ static int add_eventfd(struct lguest *lg, unsigned long addr, int fd)
 	}
 	new->num++;
 
-	/* Now put new one in place. */
+	/*
+	 * Now put new one in place: rcu_assign_pointer() is a fancy way of
+	 * doing "lg->eventfds = new", but it uses memory barriers to make
+	 * absolutely sure that the contents of "new" written above are nailed
+	 * down before we actually do the assignment.
+	 *
+	 * We have to think about these kinds of things when we're operating on
+	 * live data without locks.
+	 */
 	rcu_assign_pointer(lg->eventfds, new);
 
 	/*
-	 * We're not in a big hurry. Wait until noone's looking at old
-	 * version, then delete it.
+	 * We're not in a big hurry. Wait until no one's looking at the old
+	 * version, then free it.
 	 */
 	synchronize_rcu();
 	kfree(old);
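
Pulling the whole grow-by-copy dance together: this is a condensed sketch of what add_eventfd() amounts to, not the function verbatim (the eventfd_ctx lookup and some error paths are trimmed, and it only makes sense inside the lguest code):

/* Sketch: the RCU copy-update pattern, condensed from add_eventfd(). */
static int grow_eventfd_array(struct lguest *lg,
			      unsigned long addr, struct eventfd_ctx *event)
{
	struct lg_eventfd_map *new, *old = lg->eventfds;

	/* 1) Build a brand new, one-larger array... */
	new = kmalloc(sizeof(*new) + sizeof(new->map[0]) * (old->num + 1),
		      GFP_KERNEL);
	if (!new)
		return -ENOMEM;

	/* 2) ...copy the old contents across and append our element... */
	memcpy(new->map, old->map, sizeof(new->map[0]) * old->num);
	new->map[old->num].addr = addr;
	new->map[old->num].event = event;
	new->num = old->num + 1;

	/* 3) ...publish it, with the barrier rcu_assign_pointer() implies... */
	rcu_assign_pointer(lg->eventfds, new);

	/* 4) ...wait out every reader of the old array, then free it. */
	synchronize_rcu();
	kfree(old);
	return 0;
}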
@@ -75,6 +138,14 @@ static int add_eventfd(struct lguest *lg, unsigned long addr, int fd)
 	return 0;
 }
 
+/*L:052
+ * Receiving notifications from the Guest is usually done by attaching a
+ * particular LHCALL_NOTIFY value to an event file descriptor. The eventfd
+ * will become readable when the Guest does an LHCALL_NOTIFY with that value.
+ *
+ * This is really convenient for processing each virtqueue in a separate
+ * thread.
+ */
 static int attach_eventfd(struct lguest *lg, const unsigned long __user *input)
 {
 	unsigned long addr, fd;
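
From the Launcher's side, this attachment is (if memory serves) an LHREQ_EVENTFD write to /dev/lguest. A userspace sketch; the three-word request layout is recalled rather than quoted from lguest_launcher.h, so verify it against your tree:

#include <unistd.h>
#include <sys/eventfd.h>
#include <linux/lguest_launcher.h>	/* LHREQ_EVENTFD: an assumption here */

/* Sketch: make the returned fd readable when the Guest notifies notify_addr. */
static int attach_notify_eventfd(int lguest_fd, unsigned long notify_addr)
{
	unsigned long args[3];
	int efd = eventfd(0, 0);

	if (efd < 0)
		return -1;
	/* All /dev/lguest writes start with an unsigned long request code. */
	args[0] = LHREQ_EVENTFD;
	args[1] = notify_addr;
	args[2] = efd;
	if (write(lguest_fd, args, sizeof(args)) != sizeof(args))
		return -1;
	return efd;
}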
@@ -86,6 +157,11 @@ static int attach_eventfd(struct lguest *lg, const unsigned long __user *input)
 	if (get_user(fd, input) != 0)
 		return -EFAULT;
 
+	/*
+	 * Just make sure two callers don't add eventfds at once. We really
+	 * only need to lock against callers adding to the same Guest, so using
+	 * the Big Lguest Lock is overkill. But this is setup, not a fast path.
+	 */
 	mutex_lock(&lguest_lock);
 	err = add_eventfd(lg, addr, fd);
 	mutex_unlock(&lguest_lock);
@@ -106,6 +182,10 @@ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input)
 	if (irq >= LGUEST_IRQS)
 		return -EINVAL;
 
+	/*
+	 * Next time the Guest runs, the core code will see if it can deliver
+	 * this interrupt.
+	 */
 	set_interrupt(cpu, irq);
 	return 0;
 }
@@ -307,10 +387,10 @@ unlock:
  * The first operation the Launcher does must be a write. All writes
  * start with an unsigned long number: for the first write this must be
  * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use
- * writes of other values to send interrupts.
+ * writes of other values to send interrupts or set up receipt of
+ * notifications.
  *
  * Note that we overload the "offset" in the /dev/lguest file to indicate what
- * CPU number we're dealing with. Currently this is always 0, since we only
+ * CPU number we're dealing with. Currently this is always 0 since we only
  * support uniprocessor Guests, but you can see the beginnings of SMP support
  * here.
  */
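
Likewise, the interrupt-injection write just described might look like this from userspace, using pwrite() so the file offset can carry the CPU number (LHREQ_IRQ is again recalled from lguest_launcher.h, not quoted):

#include <unistd.h>
#include <linux/lguest_launcher.h>	/* LHREQ_IRQ: an assumption here */

/* Sketch: ask the Host to deliver interrupt irq to Guest CPU cpu. */
static int send_guest_irq(int lguest_fd, unsigned int cpu, unsigned long irq)
{
	unsigned long args[2] = { LHREQ_IRQ, irq };

	/* The offset names the CPU; today that is always 0. */
	if (pwrite(lguest_fd, args, sizeof(args), cpu) != sizeof(args))
		return -1;
	return 0;
}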
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index 3da902e4b4cb..a8d0aee3bc0e 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -29,10 +29,10 @@
 /*H:300
  * The Page Table Code
  *
- * We use two-level page tables for the Guest. If you're not entirely
- * comfortable with virtual addresses, physical addresses and page tables then
- * I recommend you review arch/x86/lguest/boot.c's "Page Table Handling" (with
- * diagrams!).
+ * We use two-level page tables for the Guest, or three-level with PAE. If
+ * you're not entirely comfortable with virtual addresses, physical addresses
+ * and page tables then I recommend you review arch/x86/lguest/boot.c's "Page
+ * Table Handling" (with diagrams!).
  *
  * The Guest keeps page tables, but we maintain the actual ones here: these are
  * called "shadow" page tables. Which is a very Guest-centric name: these are
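
A refresher on what two-level and three-level mean on x86: a non-PAE 32-bit address splits 10/10/12 (PGD index, PTE index, page offset), and a PAE address splits 2/9/9/12. The kernel's pgd_index()/pte_index() helpers do this for real; here is the bare arithmetic as a standalone sketch:

#include <stdint.h>
#include <stdio.h>

/* Illustrative index math only; not how the kernel spells it. */
static void split_vaddr(uint32_t vaddr, int pae)
{
	if (!pae)
		/* 10 bits PGD, 10 bits PTE, 12 bits offset. */
		printf("pgd=%u pte=%u off=0x%03x\n",
		       vaddr >> 22, (vaddr >> 12) & 1023, vaddr & 4095);
	else
		/* 2 bits top level, 9 bits PMD, 9 bits PTE, 12 bits offset. */
		printf("pgd=%u pmd=%u pte=%u off=0x%03x\n",
		       vaddr >> 30, (vaddr >> 21) & 511,
		       (vaddr >> 12) & 511, vaddr & 4095);
}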
@@ -52,9 +52,8 @@
 :*/
 
 /*
- * 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is
- * conveniently placed at the top 4MB, so it uses a separate, complete PTE
- * page.
+ * The Switcher uses the complete top PTE page. That's 1024 PTE entries (4MB)
+ * or 512 PTE entries with PAE (2MB).
  */
 #define SWITCHER_PGD_INDEX	(PTRS_PER_PGD - 1)
 
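
Those 4MB and 2MB figures fall straight out of the entry sizes: non-PAE PTEs are 4 bytes (1024 per 4k page), PAE PTEs are 8 bytes (512 per page), and each entry maps one 4k page. The same arithmetic gives the Switcher addresses you'll meet in switcher_32.S:

/* Sketch: how much one PTE page maps, and where the Switcher lands. */
#define PG_SIZE		4096ULL
#define PTES_NONPAE	(PG_SIZE / 4)	/* 4-byte entries: 1024, maps 4MB */
#define PTES_PAE	(PG_SIZE / 8)	/* 8-byte entries:  512, maps 2MB */

static unsigned long switcher_base(int pae)
{
	unsigned long long span = (pae ? PTES_PAE : PTES_NONPAE) * PG_SIZE;

	/* Top of the 32-bit space, minus one PTE page's worth of mapping:
	 * 0xFFC00000 for non-PAE, 0xFFE00000 for PAE. */
	return (unsigned long)(0x100000000ULL - span);
}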
@@ -81,7 +80,8 @@ static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
 
 /*H:320
  * The page table code is curly enough to need helper functions to keep it
- * clear and clean.
+ * clear and clean. The kernel itself provides many of them; that's one
+ * advantage of insisting that the Guest and Host use the same CONFIG_PAE
+ * setting.
  *
  * There are two functions which return pointers to the shadow (aka "real")
  * page tables.
@@ -155,7 +155,7 @@ static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
 }
 
 /*
- * These two functions just like the above two, except they access the Guest
+ * These functions are just like the above two, except they access the Guest
  * page tables. Hence they return a Guest address.
  */
 static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr)
@@ -165,6 +165,7 @@ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr)
 }
 
 #ifdef CONFIG_X86_PAE
+/* Follow the PGD to the PMD. */
 static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr)
 {
 	unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
@@ -172,6 +173,7 @@ static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr)
 	return gpage + pmd_index(vaddr) * sizeof(pmd_t);
 }
 
+/* Follow the PMD to the PTE. */
 static unsigned long gpte_addr(struct lg_cpu *cpu,
 			       pmd_t gpmd, unsigned long vaddr)
 {
@@ -181,6 +183,7 @@ static unsigned long gpte_addr(struct lg_cpu *cpu,
 	return gpage + pte_index(vaddr) * sizeof(pte_t);
 }
 #else
+/* Follow the PGD to the PTE (no mid-level for !PAE). */
 static unsigned long gpte_addr(struct lg_cpu *cpu,
 			       pgd_t gpgd, unsigned long vaddr)
 {
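
Chaining the helpers together, the Guest-side walk (PAE flavour) comes out roughly like this sketch. The real logic lives in demand_page() below, with all the error handling this leaves out:

/*
 * Sketch only: what gpgd_addr/gpmd_addr/gpte_addr add up to. Each
 * lgread() fetches the next level's entry from Guest memory.
 */
static unsigned long guest_pa_of(struct lg_cpu *cpu, unsigned long vaddr)
{
	pgd_t gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
	pmd_t gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
	pte_t gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t);

	/* The PTE's frame number, plus the offset within the page. */
	return (pte_pfn(gpte) << PAGE_SHIFT) | (vaddr & (PAGE_SIZE - 1));
}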
@@ -314,6 +317,7 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 	pte_t gpte;
 	pte_t *spte;
 
+	/* Mid level for PAE. */
 #ifdef CONFIG_X86_PAE
 	pmd_t *spmd;
 	pmd_t gpmd;
@@ -391,6 +395,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
 	 */
 	gpte_ptr = gpte_addr(cpu, gpgd, vaddr);
 #endif
+
+	/* Read the actual PTE value. */
 	gpte = lgread(cpu, gpte_ptr, pte_t);
 
 	/* If this page isn't in the Guest page tables, we can't page it in. */
@@ -507,6 +513,7 @@ void pin_page(struct lg_cpu *cpu, unsigned long vaddr)
 	if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2))
 		kill_guest(cpu, "bad stack page %#lx", vaddr);
 }
+/*:*/
 
 #ifdef CONFIG_X86_PAE
 static void release_pmd(pmd_t *spmd)
@@ -543,7 +550,11 @@ static void release_pgd(pgd_t *spgd)
 }
 
 #else /* !CONFIG_X86_PAE */
-/*H:450 If we chase down the release_pgd() code, it looks like this: */
+/*H:450
+ * If we chase down the release_pgd() code, the non-PAE version looks like
+ * this. The PAE version is almost identical, but instead of calling
+ * release_pte it calls release_pmd(), which looks much like this.
+ */
 static void release_pgd(pgd_t *spgd)
 {
 	/* If the entry's not present, there's nothing to release. */
@@ -898,17 +909,21 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx)
 	/* ... throw it away. */
 	release_pgd(lg->pgdirs[pgdir].pgdir + idx);
 }
+
 #ifdef CONFIG_X86_PAE
+/* For setting a mid-level, we just throw everything away. It's easy. */
 void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)
 {
 	guest_pagetable_clear_all(&lg->cpus[0]);
 }
 #endif
 
-/*
- * Once we know how much memory we have we can construct simple identity (which
+/*H:505
+ * To get through boot, we construct simple identity page mappings (which
  * set virtual == physical) and linear mappings which will get the Guest far
- * enough into the boot to create its own.
+ * enough into the boot to create its own. The linear mapping simplifies the
+ * Guest boot, but it makes assumptions about the Guest's PAGE_OFFSET, as
+ * you'll see.
  *
  * We lay them out of the way, just below the initrd (which is why we need to
  * know its size here).
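
In miniature, those identity and linear mappings look like this: every PTE i points at physical page i, and the same PTE pages hang off the PGD twice, once at slot 0 (identity) and once at the PAGE_OFFSET slots (linear). A toy userspace model of the shape, with the hardware flag bits left out:

#include <stdint.h>

#define TOY_PTRS_PER_PTE	1024
#define TOY_PAGE_OFFSET		0xC0000000UL	/* the assumption noted above */

/* Toy: PTEs hold frame numbers, PGD slots hold a PTE-page index. */
static void toy_identity_linear(unsigned pgd[1024],
				uint32_t pte_pages[][TOY_PTRS_PER_PTE],
				unsigned long mapped_pages)
{
	unsigned long i;

	/* Identity PTEs: virtual page i maps physical page i. */
	for (i = 0; i < mapped_pages; i++)
		pte_pages[i / TOY_PTRS_PER_PTE][i % TOY_PTRS_PER_PTE] = i;

	/* Each non-PAE PGD slot covers 4MB; hook every PTE page in twice. */
	for (i = 0; i < mapped_pages; i += TOY_PTRS_PER_PTE) {
		pgd[i / TOY_PTRS_PER_PTE] = i / TOY_PTRS_PER_PTE;
		pgd[(TOY_PAGE_OFFSET >> 22) + i / TOY_PTRS_PER_PTE] =
						i / TOY_PTRS_PER_PTE;
	}
}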
@@ -944,6 +959,10 @@ static unsigned long setup_pagetables(struct lguest *lg,
 	linear = (void *)pgdir - linear_pages * PAGE_SIZE;
 
 #ifdef CONFIG_X86_PAE
+	/*
+	 * And the single mid page goes below that. We only use one, but
+	 * that's enough to map 1G, which definitely gets us through boot.
+	 */
 	pmds = (void *)linear - PAGE_SIZE;
 #endif
 	/*
@@ -957,13 +976,14 @@ static unsigned long setup_pagetables(struct lguest *lg,
 		return -EFAULT;
 	}
 
+#ifdef CONFIG_X86_PAE
 	/*
-	 * The top level points to the linear page table pages above.
-	 * We setup the identity and linear mappings here.
+	 * Make the Guest PMD entries point to the corresponding place in the
+	 * linear mapping (up to one page worth of PMD).
 	 */
-#ifdef CONFIG_X86_PAE
 	for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD;
 	     i += PTRS_PER_PTE, j++) {
+		/* FIXME: native_set_pmd is overkill here. */
 		native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i)
 		- mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER));
 
@@ -971,18 +991,36 @@ static unsigned long setup_pagetables(struct lguest *lg,
 		return -EFAULT;
 	}
 
+	/* One PGD entry, pointing to that PMD page. */
 	set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT));
+	/* Copy it in as the first PGD entry (ie. addresses 0-1G). */
 	if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0)
 		return -EFAULT;
+	/*
+	 * And the third PGD entry (ie. addresses 3G-4G).
+	 *
+	 * FIXME: This assumes that PAGE_OFFSET for the Guest is 0xC0000000.
+	 */
 	if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0)
 		return -EFAULT;
 #else
+	/*
+	 * The top level points to the linear page table pages above.
+	 * We setup the identity and linear mappings here.
+	 */
 	phys_linear = (unsigned long)linear - mem_base;
 	for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) {
 		pgd_t pgd;
+		/*
+		 * Create a PGD entry which points to the right part of the
+		 * linear PTE pages.
+		 */
 		pgd = __pgd((phys_linear + i * sizeof(pte_t)) |
 			    (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER));
 
+		/*
+		 * Copy it into the PGD page at 0 and PAGE_OFFSET.
+		 */
 		if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd))
 		    || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET)
 					   + i / PTRS_PER_PTE],
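
The index pairs above (0 and 3 for PAE, 0 and 768 for non-PAE) are pure address arithmetic: a non-PAE PGD entry covers 4MB, so PAGE_OFFSET = 0xC0000000 lands at entry 0xC0000000 >> 22 = 768, while a PAE top-level entry covers 1GB, so the same address is entry 3. A self-check:

#include <assert.h>

int main(void)
{
	unsigned long page_offset = 0xC0000000UL;

	/* Non-PAE: 10-bit PGD index, each entry spans 4MB. */
	assert((page_offset >> 22) == 768);

	/* PAE: 2-bit top-level index, each entry spans 1GB. */
	assert((page_offset >> 30) == 3);
	return 0;
}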
@@ -992,8 +1030,8 @@ static unsigned long setup_pagetables(struct lguest *lg,
 #endif
 
 	/*
-	 * We return the top level (guest-physical) address: remember where
-	 * this is.
+	 * We return the top level (guest-physical) address: we remember where
+	 * this is to write it into lguest_data when the Guest initializes.
 	 */
 	return (unsigned long)pgdir - mem_base;
 }
@@ -1031,7 +1069,9 @@ int init_guest_pagetable(struct lguest *lg)
 	lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
 	if (!lg->pgdirs[0].pgdir)
 		return -ENOMEM;
+
 #ifdef CONFIG_X86_PAE
+	/* For PAE, we also create the initial mid-level. */
 	pgd = lg->pgdirs[0].pgdir;
 	pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL);
 	if (!pmd_table)
@@ -1040,11 +1080,13 @@ int init_guest_pagetable(struct lguest *lg)
 	set_pgd(pgd + SWITCHER_PGD_INDEX,
 		__pgd(__pa(pmd_table) | _PAGE_PRESENT));
 #endif
+
+	/* This is the current page table. */
 	lg->cpus[0].cpu_pgd = 0;
 	return 0;
 }
 
-/* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
+/*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
 void page_table_guest_data_init(struct lg_cpu *cpu)
 {
 	/* We get the kernel address: above this is all kernel memory. */
@@ -1105,12 +1147,16 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
 	pmd_t switcher_pmd;
 	pmd_t *pmd_table;
 
+	/* FIXME: native_set_pmd is overkill here. */
 	native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >>
 		       PAGE_SHIFT, PAGE_KERNEL_EXEC));
 
+	/* Figure out where the pmd page is, by reading the PGD, and converting
+	 * it to a virtual address. */
 	pmd_table = __va(pgd_pfn(cpu->lg->
 			pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX])
 						<< PAGE_SHIFT);
+	/* Now write it into the shadow page table. */
 	native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd);
 #else
 	pgd_t switcher_pgd;
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index 96f7d88ec7f8..6ae388849a3b 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -187,7 +187,7 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages)
  * also simplify copy_in_guest_info(). Note that we'd still need to restore
  * things when we exit to Launcher userspace, but that's fairly easy.
  *
- * We could also try using this hooks for PGE, but that might be too expensive.
+ * We could also try using these hooks for PGE, but that might be too expensive.
  *
  * The hooks were designed for KVM, but we can also put them to good use.
 :*/
diff --git a/drivers/lguest/x86/switcher_32.S b/drivers/lguest/x86/switcher_32.S
index 6dec09793836..40634b0db9f7 100644
--- a/drivers/lguest/x86/switcher_32.S
+++ b/drivers/lguest/x86/switcher_32.S
@@ -1,7 +1,7 @@
 /*P:900
- * This is the Switcher: code which sits at 0xFFC00000 astride both the
- * Host and Guest to do the low-level Guest<->Host switch. It is as simple as
- * it can be made, but it's naturally very specific to x86.
+ * This is the Switcher: code which sits at 0xFFC00000 (or 0xFFE00000) astride
+ * both the Host and Guest to do the low-level Guest<->Host switch. It is as
+ * simple as it can be made, but it's naturally very specific to x86.
  *
  * You have now completed Preparation. If this has whet your appetite; if you
  * are feeling invigorated and refreshed then the next, more challenging stage