diff options
author | Rusty Russell <rusty@rustcorp.com.au> | 2009-07-30 18:03:45 -0400 |
---|---|---|
committer | Rusty Russell <rusty@rustcorp.com.au> | 2009-07-30 02:33:46 -0400 |
commit | a91d74a3c4de8115295ee87350c13a329164aaaf (patch) | |
tree | 02c862fccc9abedf7fc354061e69c4b5fbcce06d /drivers | |
parent | 2e04ef76916d1e29a077ea9d0f2003c8fd86724d (diff) |
lguest: update commentary
Every so often, after code shuffles, I need to go through and unbitrot
the Lguest Journey (see drivers/lguest/README). Since we now use RCU in
a simple form in one place I took the opportunity to expand that explanation.
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/lguest/core.c | 7 | ||||
-rw-r--r-- | drivers/lguest/hypercalls.c | 6 | ||||
-rw-r--r-- | drivers/lguest/lguest_device.c | 11 | ||||
-rw-r--r-- | drivers/lguest/lguest_user.c | 100 | ||||
-rw-r--r-- | drivers/lguest/page_tables.c | 84 | ||||
-rw-r--r-- | drivers/lguest/x86/core.c | 2 | ||||
-rw-r--r-- | drivers/lguest/x86/switcher_32.S | 6 |
7 files changed, 176 insertions, 40 deletions
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index cd058bc903ff..1e2cb846b3c9 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c | |||
@@ -217,10 +217,15 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) | |||
217 | 217 | ||
218 | /* | 218 | /* |
219 | * It's possible the Guest did a NOTIFY hypercall to the | 219 | * It's possible the Guest did a NOTIFY hypercall to the |
220 | * Launcher, in which case we return from the read() now. | 220 | * Launcher. |
221 | */ | 221 | */ |
222 | if (cpu->pending_notify) { | 222 | if (cpu->pending_notify) { |
223 | /* | ||
224 | * Does it just need to write to a registered | ||
225 | * eventfd (ie. the appropriate virtqueue thread)? | ||
226 | */ | ||
223 | if (!send_notify_to_eventfd(cpu)) { | 227 | if (!send_notify_to_eventfd(cpu)) { |
228 | /* OK, we tell the main Launcher. */ | ||
224 | if (put_user(cpu->pending_notify, user)) | 229 | if (put_user(cpu->pending_notify, user)) |
225 | return -EFAULT; | 230 | return -EFAULT; |
226 | return sizeof(cpu->pending_notify); | 231 | return sizeof(cpu->pending_notify); |
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c index 787ab4bc09f0..83511eb0923d 100644 --- a/drivers/lguest/hypercalls.c +++ b/drivers/lguest/hypercalls.c | |||
@@ -59,7 +59,7 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) | |||
59 | case LHCALL_SHUTDOWN: { | 59 | case LHCALL_SHUTDOWN: { |
60 | char msg[128]; | 60 | char msg[128]; |
61 | /* | 61 | /* |
62 | * Shutdown is such a trivial hypercall that we do it in four | 62 | * Shutdown is such a trivial hypercall that we do it in five |
63 | * lines right here. | 63 | * lines right here. |
64 | * | 64 | * |
65 | * If the lgread fails, it will call kill_guest() itself; the | 65 | * If the lgread fails, it will call kill_guest() itself; the |
@@ -245,6 +245,10 @@ static void initialize(struct lg_cpu *cpu) | |||
245 | * device), the Guest will still see the old page. In practice, this never | 245 | * device), the Guest will still see the old page. In practice, this never |
246 | * happens: why would the Guest read a page which it has never written to? But | 246 | * happens: why would the Guest read a page which it has never written to? But |
247 | * a similar scenario might one day bite us, so it's worth mentioning. | 247 | * a similar scenario might one day bite us, so it's worth mentioning. |
248 | * | ||
249 | * Note that if we used a shared anonymous mapping in the Launcher instead of | ||
250 | * mapping /dev/zero private, we wouldn't worry about copy-on-write. And we | ||
251 | * need that to switch the Launcher to processes (away from threads) anyway. | ||
248 | :*/ | 252 | :*/ |
249 | 253 | ||
250 | /*H:100 | 254 | /*H:100 |
diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c index cc000e79c3d1..1401c1ace1ec 100644 --- a/drivers/lguest/lguest_device.c +++ b/drivers/lguest/lguest_device.c | |||
@@ -236,7 +236,7 @@ static void lg_notify(struct virtqueue *vq) | |||
236 | extern void lguest_setup_irq(unsigned int irq); | 236 | extern void lguest_setup_irq(unsigned int irq); |
237 | 237 | ||
238 | /* | 238 | /* |
239 | * This routine finds the first virtqueue described in the configuration of | 239 | * This routine finds the Nth virtqueue described in the configuration of |
240 | * this device and sets it up. | 240 | * this device and sets it up. |
241 | * | 241 | * |
242 | * This is kind of an ugly duckling. It'd be nicer to have a standard | 242 | * This is kind of an ugly duckling. It'd be nicer to have a standard |
@@ -244,9 +244,6 @@ extern void lguest_setup_irq(unsigned int irq); | |||
244 | * everyone wants to do it differently. The KVM coders want the Guest to | 244 | * everyone wants to do it differently. The KVM coders want the Guest to |
245 | * allocate its own pages and tell the Host where they are, but for lguest it's | 245 | * allocate its own pages and tell the Host where they are, but for lguest it's |
246 | * simpler for the Host to simply tell us where the pages are. | 246 | * simpler for the Host to simply tell us where the pages are. |
247 | * | ||
248 | * So we provide drivers with a "find the Nth virtqueue and set it up" | ||
249 | * function. | ||
250 | */ | 247 | */ |
251 | static struct virtqueue *lg_find_vq(struct virtio_device *vdev, | 248 | static struct virtqueue *lg_find_vq(struct virtio_device *vdev, |
252 | unsigned index, | 249 | unsigned index, |
@@ -422,7 +419,11 @@ static void add_lguest_device(struct lguest_device_desc *d, | |||
422 | 419 | ||
423 | /* This devices' parent is the lguest/ dir. */ | 420 | /* This devices' parent is the lguest/ dir. */ |
424 | ldev->vdev.dev.parent = lguest_root; | 421 | ldev->vdev.dev.parent = lguest_root; |
425 | /* We have a unique device index thanks to the dev_index counter. */ | 422 | /* |
423 | * The device type comes straight from the descriptor. There's also a | ||
424 | * device vendor field in the virtio_device struct, which we leave as | ||
425 | * 0. | ||
426 | */ | ||
426 | ldev->vdev.id.device = d->type; | 427 | ldev->vdev.id.device = d->type; |
427 | /* | 428 | /* |
428 | * We have a simple set of routines for querying the device's | 429 | * We have a simple set of routines for querying the device's |
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index 7e92017103dc..b4d3f7ca554f 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c | |||
@@ -1,9 +1,8 @@ | |||
1 | /*P:200 | 1 | /*P:200 This contains all the /dev/lguest code, whereby the userspace launcher |
2 | * This contains all the /dev/lguest code, whereby the userspace launcher | ||
3 | * controls and communicates with the Guest. For example, the first write will | 2 | * controls and communicates with the Guest. For example, the first write will |
4 | * tell us the Guest's memory layout, pagetable, entry point and kernel address | 3 | * tell us the Guest's memory layout and entry point. A read will run the |
5 | * offset. A read will run the Guest until something happens, such as a signal | 4 | * Guest until something happens, such as a signal or the Guest doing a NOTIFY |
6 | * or the Guest doing a NOTIFY out to the Launcher. | 5 | * out to the Launcher. |
7 | :*/ | 6 | :*/ |
8 | #include <linux/uaccess.h> | 7 | #include <linux/uaccess.h> |
9 | #include <linux/miscdevice.h> | 8 | #include <linux/miscdevice.h> |
@@ -13,14 +12,41 @@ | |||
13 | #include <linux/file.h> | 12 | #include <linux/file.h> |
14 | #include "lg.h" | 13 | #include "lg.h" |
15 | 14 | ||
15 | /*L:056 | ||
16 | * Before we move on, let's jump ahead and look at what the kernel does when | ||
17 | * it needs to look up the eventfds. That will complete our picture of how we | ||
18 | * use RCU. | ||
19 | * | ||
20 | * The notification value is in cpu->pending_notify: we return true if it went | ||
21 | * to an eventfd. | ||
22 | */ | ||
16 | bool send_notify_to_eventfd(struct lg_cpu *cpu) | 23 | bool send_notify_to_eventfd(struct lg_cpu *cpu) |
17 | { | 24 | { |
18 | unsigned int i; | 25 | unsigned int i; |
19 | struct lg_eventfd_map *map; | 26 | struct lg_eventfd_map *map; |
20 | 27 | ||
21 | /* lg->eventfds is RCU-protected */ | 28 | /* |
29 | * This "rcu_read_lock()" helps track when someone is still looking at | ||
30 | * the (RCU-using) eventfds array. It's not actually a lock at all; | ||
31 | * indeed it's a noop in many configurations. (You didn't expect me to | ||
32 | * explain all the RCU secrets here, did you?) | ||
33 | */ | ||
22 | rcu_read_lock(); | 34 | rcu_read_lock(); |
35 | /* | ||
36 | * rcu_dereference is the counter-side of rcu_assign_pointer(); it | ||
37 | * makes sure we don't access the memory pointed to by | ||
38 | * cpu->lg->eventfds before cpu->lg->eventfds is set. Sounds crazy, | ||
39 | * but Alpha allows this! Paul McKenney points out that a really | ||
40 | * aggressive compiler could have the same effect: | ||
41 | * http://lists.ozlabs.org/pipermail/lguest/2009-July/001560.html | ||
42 | * | ||
43 | * So play safe, use rcu_dereference to get the rcu-protected pointer: | ||
44 | */ | ||
23 | map = rcu_dereference(cpu->lg->eventfds); | 45 | map = rcu_dereference(cpu->lg->eventfds); |
46 | /* | ||
47 | * Simple array search: even if they add an eventfd while we do this, | ||
48 | * we'll continue to use the old array and just won't see the new one. | ||
49 | */ | ||
24 | for (i = 0; i < map->num; i++) { | 50 | for (i = 0; i < map->num; i++) { |
25 | if (map->map[i].addr == cpu->pending_notify) { | 51 | if (map->map[i].addr == cpu->pending_notify) { |
26 | eventfd_signal(map->map[i].event, 1); | 52 | eventfd_signal(map->map[i].event, 1); |
@@ -28,14 +54,43 @@ bool send_notify_to_eventfd(struct lg_cpu *cpu) | |||
28 | break; | 54 | break; |
29 | } | 55 | } |
30 | } | 56 | } |
57 | /* We're done with the rcu-protected variable cpu->lg->eventfds. */ | ||
31 | rcu_read_unlock(); | 58 | rcu_read_unlock(); |
59 | |||
60 | /* If we cleared the notification, it's because we found a match. */ | ||
32 | return cpu->pending_notify == 0; | 61 | return cpu->pending_notify == 0; |
33 | } | 62 | } |
34 | 63 | ||
64 | /*L:055 | ||
65 | * One of the more tricksy tricks in the Linux Kernel is a technique called | ||
66 | * Read Copy Update. Since one point of lguest is to teach lguest journeyers | ||
67 | * about kernel coding, I use it here. (In case you're curious, other purposes | ||
68 | * include learning about virtualization and instilling a deep appreciation for | ||
69 | * simplicity and puppies). | ||
70 | * | ||
71 | * We keep a simple array which maps LHCALL_NOTIFY values to eventfds, but we | ||
72 | * add new eventfds without ever blocking readers from accessing the array. | ||
73 | * The current Launcher only does this during boot, so that never happens. But | ||
74 | * Read Copy Update is cool, and adding a lock risks damaging even more puppies | ||
75 | * than this code does. | ||
76 | * | ||
77 | * We allocate a brand new one-larger array, copy the old one and add our new | ||
78 | * element. Then we make the lg eventfd pointer point to the new array. | ||
79 | * That's the easy part: now we need to free the old one, but we need to make | ||
80 | * sure no slow CPU somewhere is still looking at it. That's what | ||
81 | * synchronize_rcu does for us: waits until every CPU has indicated that it has | ||
82 | * moved on to know it's no longer using the old one. | ||
83 | * | ||
84 | * If that's unclear, see http://en.wikipedia.org/wiki/Read-copy-update. | ||
85 | */ | ||
35 | static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) | 86 | static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) |
36 | { | 87 | { |
37 | struct lg_eventfd_map *new, *old = lg->eventfds; | 88 | struct lg_eventfd_map *new, *old = lg->eventfds; |
38 | 89 | ||
90 | /* | ||
91 | * We don't allow notifications on value 0 anyway (pending_notify of | ||
92 | * 0 means "nothing pending"). | ||
93 | */ | ||
39 | if (!addr) | 94 | if (!addr) |
40 | return -EINVAL; | 95 | return -EINVAL; |
41 | 96 | ||
@@ -62,12 +117,20 @@ static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) | |||
62 | } | 117 | } |
63 | new->num++; | 118 | new->num++; |
64 | 119 | ||
65 | /* Now put new one in place. */ | 120 | /* |
121 | * Now put new one in place: rcu_assign_pointer() is a fancy way of | ||
122 | * doing "lg->eventfds = new", but it uses memory barriers to make | ||
123 | * absolutely sure that the contents of "new" written above is nailed | ||
124 | * down before we actually do the assignment. | ||
125 | * | ||
126 | * We have to think about these kinds of things when we're operating on | ||
127 | * live data without locks. | ||
128 | */ | ||
66 | rcu_assign_pointer(lg->eventfds, new); | 129 | rcu_assign_pointer(lg->eventfds, new); |
67 | 130 | ||
68 | /* | 131 | /* |
69 | * We're not in a big hurry. Wait until no one's looking at old | 132 |
70 | * version, then delete it. | 133 | * version, then free it. |
71 | */ | 134 | */ |
72 | synchronize_rcu(); | 135 | synchronize_rcu(); |
73 | kfree(old); | 136 | kfree(old); |
@@ -75,6 +138,14 @@ static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) | |||
75 | return 0; | 138 | return 0; |
76 | } | 139 | } |
77 | 140 | ||
141 | /*L:052 | ||
142 | * Receiving notifications from the Guest is usually done by attaching a | ||
143 | * particular LHCALL_NOTIFY value to an event filedescriptor. The eventfd will | ||
144 | * become readable when the Guest does an LHCALL_NOTIFY with that value. | ||
145 | * | ||
146 | * This is really convenient for processing each virtqueue in a separate | ||
147 | * thread. | ||
148 | */ | ||
78 | static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) | 149 | static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) |
79 | { | 150 | { |
80 | unsigned long addr, fd; | 151 | unsigned long addr, fd; |
@@ -86,6 +157,11 @@ static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) | |||
86 | if (get_user(fd, input) != 0) | 157 | if (get_user(fd, input) != 0) |
87 | return -EFAULT; | 158 | return -EFAULT; |
88 | 159 | ||
160 | /* | ||
161 | * Just make sure two callers don't add eventfds at once. We really | ||
162 | * only need to lock against callers adding to the same Guest, so using | ||
163 | * the Big Lguest Lock is overkill. But this is setup, not a fast path. | ||
164 | */ | ||
89 | mutex_lock(&lguest_lock); | 165 | mutex_lock(&lguest_lock); |
90 | err = add_eventfd(lg, addr, fd); | 166 | err = add_eventfd(lg, addr, fd); |
91 | mutex_unlock(&lguest_lock); | 167 | mutex_unlock(&lguest_lock); |
@@ -106,6 +182,10 @@ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) | |||
106 | if (irq >= LGUEST_IRQS) | 182 | if (irq >= LGUEST_IRQS) |
107 | return -EINVAL; | 183 | return -EINVAL; |
108 | 184 | ||
185 | /* | ||
186 | * Next time the Guest runs, the core code will see if it can deliver | ||
187 | * this interrupt. | ||
188 | */ | ||
109 | set_interrupt(cpu, irq); | 189 | set_interrupt(cpu, irq); |
110 | return 0; | 190 | return 0; |
111 | } | 191 | } |
@@ -307,10 +387,10 @@ unlock: | |||
307 | * The first operation the Launcher does must be a write. All writes | 387 | * The first operation the Launcher does must be a write. All writes |
308 | * start with an unsigned long number: for the first write this must be | 388 | * start with an unsigned long number: for the first write this must be |
309 | * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use | 389 | * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use |
310 | * writes of other values to send interrupts. | 390 | * writes of other values to send interrupts or set up receipt of notifications. |
311 | * | 391 | * |
312 | * Note that we overload the "offset" in the /dev/lguest file to indicate what | 392 | * Note that we overload the "offset" in the /dev/lguest file to indicate what |
313 | * CPU number we're dealing with. Currently this is always 0, since we only | 393 | * CPU number we're dealing with. Currently this is always 0 since we only |
314 | * support uniprocessor Guests, but you can see the beginnings of SMP support | 394 | * support uniprocessor Guests, but you can see the beginnings of SMP support |
315 | * here. | 395 | * here. |
316 | */ | 396 | */ |
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index 3da902e4b4cb..a8d0aee3bc0e 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c | |||
@@ -29,10 +29,10 @@ | |||
29 | /*H:300 | 29 | /*H:300 |
30 | * The Page Table Code | 30 | * The Page Table Code |
31 | * | 31 | * |
32 | * We use two-level page tables for the Guest. If you're not entirely | 32 | * We use two-level page tables for the Guest, or three-level with PAE. If |
33 | * comfortable with virtual addresses, physical addresses and page tables then | 33 | * you're not entirely comfortable with virtual addresses, physical addresses |
34 | * I recommend you review arch/x86/lguest/boot.c's "Page Table Handling" (with | 34 | * and page tables then I recommend you review arch/x86/lguest/boot.c's "Page |
35 | * diagrams!). | 35 | * Table Handling" (with diagrams!). |
36 | * | 36 | * |
37 | * The Guest keeps page tables, but we maintain the actual ones here: these are | 37 | * The Guest keeps page tables, but we maintain the actual ones here: these are |
38 | * called "shadow" page tables. Which is a very Guest-centric name: these are | 38 | * called "shadow" page tables. Which is a very Guest-centric name: these are |
@@ -52,9 +52,8 @@ | |||
52 | :*/ | 52 | :*/ |
53 | 53 | ||
54 | /* | 54 | /* |
55 | * 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is | 55 | * The Switcher uses the complete top PTE page. That's 1024 PTE entries (4MB) |
56 | * conveniently placed at the top 4MB, so it uses a separate, complete PTE | 56 | * or 512 PTE entries with PAE (2MB). |
57 | * page. | ||
58 | */ | 57 | */ |
59 | #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) | 58 | #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) |
60 | 59 | ||
@@ -81,7 +80,8 @@ static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); | |||
81 | 80 | ||
82 | /*H:320 | 81 | /*H:320 |
83 | * The page table code is curly enough to need helper functions to keep it | 82 | * The page table code is curly enough to need helper functions to keep it |
84 | * clear and clean. | 83 | * clear and clean. The kernel itself provides many of them; one advantage |
84 | * of insisting that the Guest and Host use the same CONFIG_PAE setting. | ||
85 | * | 85 | * |
86 | * There are two functions which return pointers to the shadow (aka "real") | 86 | * There are two functions which return pointers to the shadow (aka "real") |
87 | * page tables. | 87 | * page tables. |
@@ -155,7 +155,7 @@ static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) | |||
155 | } | 155 | } |
156 | 156 | ||
157 | /* | 157 | /* |
158 | * These two functions just like the above two, except they access the Guest | 158 | * These functions are just like the above two, except they access the Guest |
159 | * page tables. Hence they return a Guest address. | 159 | * page tables. Hence they return a Guest address. |
160 | */ | 160 | */ |
161 | static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) | 161 | static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) |
@@ -165,6 +165,7 @@ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) | |||
165 | } | 165 | } |
166 | 166 | ||
167 | #ifdef CONFIG_X86_PAE | 167 | #ifdef CONFIG_X86_PAE |
168 | /* Follow the PGD to the PMD. */ | ||
168 | static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) | 169 | static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) |
169 | { | 170 | { |
170 | unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; | 171 | unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; |
@@ -172,6 +173,7 @@ static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) | |||
172 | return gpage + pmd_index(vaddr) * sizeof(pmd_t); | 173 | return gpage + pmd_index(vaddr) * sizeof(pmd_t); |
173 | } | 174 | } |
174 | 175 | ||
176 | /* Follow the PMD to the PTE. */ | ||
175 | static unsigned long gpte_addr(struct lg_cpu *cpu, | 177 | static unsigned long gpte_addr(struct lg_cpu *cpu, |
176 | pmd_t gpmd, unsigned long vaddr) | 178 | pmd_t gpmd, unsigned long vaddr) |
177 | { | 179 | { |
@@ -181,6 +183,7 @@ static unsigned long gpte_addr(struct lg_cpu *cpu, | |||
181 | return gpage + pte_index(vaddr) * sizeof(pte_t); | 183 | return gpage + pte_index(vaddr) * sizeof(pte_t); |
182 | } | 184 | } |
183 | #else | 185 | #else |
186 | /* Follow the PGD to the PTE (no mid-level for !PAE). */ | ||
184 | static unsigned long gpte_addr(struct lg_cpu *cpu, | 187 | static unsigned long gpte_addr(struct lg_cpu *cpu, |
185 | pgd_t gpgd, unsigned long vaddr) | 188 | pgd_t gpgd, unsigned long vaddr) |
186 | { | 189 | { |
@@ -314,6 +317,7 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | |||
314 | pte_t gpte; | 317 | pte_t gpte; |
315 | pte_t *spte; | 318 | pte_t *spte; |
316 | 319 | ||
320 | /* Mid level for PAE. */ | ||
317 | #ifdef CONFIG_X86_PAE | 321 | #ifdef CONFIG_X86_PAE |
318 | pmd_t *spmd; | 322 | pmd_t *spmd; |
319 | pmd_t gpmd; | 323 | pmd_t gpmd; |
@@ -391,6 +395,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) | |||
391 | */ | 395 | */ |
392 | gpte_ptr = gpte_addr(cpu, gpgd, vaddr); | 396 | gpte_ptr = gpte_addr(cpu, gpgd, vaddr); |
393 | #endif | 397 | #endif |
398 | |||
399 | /* Read the actual PTE value. */ | ||
394 | gpte = lgread(cpu, gpte_ptr, pte_t); | 400 | gpte = lgread(cpu, gpte_ptr, pte_t); |
395 | 401 | ||
396 | /* If this page isn't in the Guest page tables, we can't page it in. */ | 402 | /* If this page isn't in the Guest page tables, we can't page it in. */ |
@@ -507,6 +513,7 @@ void pin_page(struct lg_cpu *cpu, unsigned long vaddr) | |||
507 | if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2)) | 513 | if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2)) |
508 | kill_guest(cpu, "bad stack page %#lx", vaddr); | 514 | kill_guest(cpu, "bad stack page %#lx", vaddr); |
509 | } | 515 | } |
516 | /*:*/ | ||
510 | 517 | ||
511 | #ifdef CONFIG_X86_PAE | 518 | #ifdef CONFIG_X86_PAE |
512 | static void release_pmd(pmd_t *spmd) | 519 | static void release_pmd(pmd_t *spmd) |
@@ -543,7 +550,11 @@ static void release_pgd(pgd_t *spgd) | |||
543 | } | 550 | } |
544 | 551 | ||
545 | #else /* !CONFIG_X86_PAE */ | 552 | #else /* !CONFIG_X86_PAE */ |
546 | /*H:450 If we chase down the release_pgd() code, it looks like this: */ | 553 | /*H:450 |
554 | * If we chase down the release_pgd() code, the non-PAE version looks like | ||
555 | * this. The PAE version is almost identical, but instead of calling | ||
556 | * release_pte it calls release_pmd(), which looks much like this. | ||
557 | */ | ||
547 | static void release_pgd(pgd_t *spgd) | 558 | static void release_pgd(pgd_t *spgd) |
548 | { | 559 | { |
549 | /* If the entry's not present, there's nothing to release. */ | 560 | /* If the entry's not present, there's nothing to release. */ |
@@ -898,17 +909,21 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx) | |||
898 | /* ... throw it away. */ | 909 | /* ... throw it away. */ |
899 | release_pgd(lg->pgdirs[pgdir].pgdir + idx); | 910 | release_pgd(lg->pgdirs[pgdir].pgdir + idx); |
900 | } | 911 | } |
912 | |||
901 | #ifdef CONFIG_X86_PAE | 913 | #ifdef CONFIG_X86_PAE |
914 | /* For setting a mid-level, we just throw everything away. It's easy. */ | ||
902 | void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) | 915 | void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) |
903 | { | 916 | { |
904 | guest_pagetable_clear_all(&lg->cpus[0]); | 917 | guest_pagetable_clear_all(&lg->cpus[0]); |
905 | } | 918 | } |
906 | #endif | 919 | #endif |
907 | 920 | ||
908 | /* | 921 | /*H:505 |
909 | * Once we know how much memory we have we can construct simple identity (which | 922 | * To get through boot, we construct simple identity page mappings (which |
910 | * set virtual == physical) and linear mappings which will get the Guest far | 923 | * set virtual == physical) and linear mappings which will get the Guest far |
911 | * enough into the boot to create its own. | 924 | * enough into the boot to create its own. The linear mapping means we |
925 | * simplify the Guest boot, but it makes assumptions about their PAGE_OFFSET, | ||
926 | * as you'll see. | ||
912 | * | 927 | * |
913 | * We lay them out of the way, just below the initrd (which is why we need to | 928 | * We lay them out of the way, just below the initrd (which is why we need to |
914 | * know its size here). | 929 | * know its size here). |
@@ -944,6 +959,10 @@ static unsigned long setup_pagetables(struct lguest *lg, | |||
944 | linear = (void *)pgdir - linear_pages * PAGE_SIZE; | 959 | linear = (void *)pgdir - linear_pages * PAGE_SIZE; |
945 | 960 | ||
946 | #ifdef CONFIG_X86_PAE | 961 | #ifdef CONFIG_X86_PAE |
962 | /* | ||
963 | * And the single mid page goes below that. We only use one, but | ||
964 | * that's enough to map 1G, which definitely gets us through boot. | ||
965 | */ | ||
947 | pmds = (void *)linear - PAGE_SIZE; | 966 | pmds = (void *)linear - PAGE_SIZE; |
948 | #endif | 967 | #endif |
949 | /* | 968 | /* |
@@ -957,13 +976,14 @@ static unsigned long setup_pagetables(struct lguest *lg, | |||
957 | return -EFAULT; | 976 | return -EFAULT; |
958 | } | 977 | } |
959 | 978 | ||
979 | #ifdef CONFIG_X86_PAE | ||
960 | /* | 980 | /* |
961 | * The top level points to the linear page table pages above. | 981 | * Make the Guest PMD entries point to the corresponding place in the |
962 | * We setup the identity and linear mappings here. | 982 | * linear mapping (up to one page worth of PMD). |
963 | */ | 983 | */ |
964 | #ifdef CONFIG_X86_PAE | ||
965 | for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD; | 984 | for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD; |
966 | i += PTRS_PER_PTE, j++) { | 985 | i += PTRS_PER_PTE, j++) { |
986 | /* FIXME: native_set_pmd is overkill here. */ | ||
967 | native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i) | 987 | native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i) |
968 | - mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); | 988 | - mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); |
969 | 989 | ||
@@ -971,18 +991,36 @@ static unsigned long setup_pagetables(struct lguest *lg, | |||
971 | return -EFAULT; | 991 | return -EFAULT; |
972 | } | 992 | } |
973 | 993 | ||
994 | /* One PGD entry, pointing to that PMD page. */ | ||
974 | set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT)); | 995 | set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT)); |
996 | /* Copy it in as the first PGD entry (ie. addresses 0-1G). */ | ||
975 | if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0) | 997 | if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0) |
976 | return -EFAULT; | 998 | return -EFAULT; |
999 | /* | ||
1000 | * And the third PGD entry (ie. addresses 3G-4G). | ||
1001 | * | ||
1002 | * FIXME: This assumes that PAGE_OFFSET for the Guest is 0xC0000000. | ||
1003 | */ | ||
977 | if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0) | 1004 | if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0) |
978 | return -EFAULT; | 1005 | return -EFAULT; |
979 | #else | 1006 | #else |
1007 | /* | ||
1008 | * The top level points to the linear page table pages above. | ||
1009 | * We setup the identity and linear mappings here. | ||
1010 | */ | ||
980 | phys_linear = (unsigned long)linear - mem_base; | 1011 | phys_linear = (unsigned long)linear - mem_base; |
981 | for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) { | 1012 | for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) { |
982 | pgd_t pgd; | 1013 | pgd_t pgd; |
1014 | /* | ||
1015 | * Create a PGD entry which points to the right part of the | ||
1016 | * linear PTE pages. | ||
1017 | */ | ||
983 | pgd = __pgd((phys_linear + i * sizeof(pte_t)) | | 1018 | pgd = __pgd((phys_linear + i * sizeof(pte_t)) | |
984 | (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); | 1019 | (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); |
985 | 1020 | ||
1021 | /* | ||
1022 | * Copy it into the PGD page at 0 and PAGE_OFFSET. | ||
1023 | */ | ||
986 | if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd)) | 1024 | if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd)) |
987 | || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET) | 1025 | || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET) |
988 | + i / PTRS_PER_PTE], | 1026 | + i / PTRS_PER_PTE], |
@@ -992,8 +1030,8 @@ static unsigned long setup_pagetables(struct lguest *lg, | |||
992 | #endif | 1030 | #endif |
993 | 1031 | ||
994 | /* | 1032 | /* |
995 | * We return the top level (guest-physical) address: remember where | 1033 | * We return the top level (guest-physical) address: we remember where |
996 | * this is. | 1034 | * this is to write it into lguest_data when the Guest initializes. |
997 | */ | 1035 | */ |
998 | return (unsigned long)pgdir - mem_base; | 1036 | return (unsigned long)pgdir - mem_base; |
999 | } | 1037 | } |
@@ -1031,7 +1069,9 @@ int init_guest_pagetable(struct lguest *lg) | |||
1031 | lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); | 1069 | lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); |
1032 | if (!lg->pgdirs[0].pgdir) | 1070 | if (!lg->pgdirs[0].pgdir) |
1033 | return -ENOMEM; | 1071 | return -ENOMEM; |
1072 | |||
1034 | #ifdef CONFIG_X86_PAE | 1073 | #ifdef CONFIG_X86_PAE |
1074 | /* For PAE, we also create the initial mid-level. */ | ||
1035 | pgd = lg->pgdirs[0].pgdir; | 1075 | pgd = lg->pgdirs[0].pgdir; |
1036 | pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL); | 1076 | pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL); |
1037 | if (!pmd_table) | 1077 | if (!pmd_table) |
@@ -1040,11 +1080,13 @@ int init_guest_pagetable(struct lguest *lg) | |||
1040 | set_pgd(pgd + SWITCHER_PGD_INDEX, | 1080 | set_pgd(pgd + SWITCHER_PGD_INDEX, |
1041 | __pgd(__pa(pmd_table) | _PAGE_PRESENT)); | 1081 | __pgd(__pa(pmd_table) | _PAGE_PRESENT)); |
1042 | #endif | 1082 | #endif |
1083 | |||
1084 | /* This is the current page table. */ | ||
1043 | lg->cpus[0].cpu_pgd = 0; | 1085 | lg->cpus[0].cpu_pgd = 0; |
1044 | return 0; | 1086 | return 0; |
1045 | } | 1087 | } |
1046 | 1088 | ||
1047 | /* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ | 1089 | /*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ |
1048 | void page_table_guest_data_init(struct lg_cpu *cpu) | 1090 | void page_table_guest_data_init(struct lg_cpu *cpu) |
1049 | { | 1091 | { |
1050 | /* We get the kernel address: above this is all kernel memory. */ | 1092 | /* We get the kernel address: above this is all kernel memory. */ |
@@ -1105,12 +1147,16 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) | |||
1105 | pmd_t switcher_pmd; | 1147 | pmd_t switcher_pmd; |
1106 | pmd_t *pmd_table; | 1148 | pmd_t *pmd_table; |
1107 | 1149 | ||
1150 | /* FIXME: native_set_pmd is overkill here. */ | ||
1108 | native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >> | 1151 | native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >> |
1109 | PAGE_SHIFT, PAGE_KERNEL_EXEC)); | 1152 | PAGE_SHIFT, PAGE_KERNEL_EXEC)); |
1110 | 1153 | ||
1154 | /* Figure out where the pmd page is, by reading the PGD, and converting | ||
1155 | * it to a virtual address. */ | ||
1111 | pmd_table = __va(pgd_pfn(cpu->lg-> | 1156 | pmd_table = __va(pgd_pfn(cpu->lg-> |
1112 | pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX]) | 1157 | pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX]) |
1113 | << PAGE_SHIFT); | 1158 | << PAGE_SHIFT); |
1159 | /* Now write it into the shadow page table. */ | ||
1114 | native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd); | 1160 | native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd); |
1115 | #else | 1161 | #else |
1116 | pgd_t switcher_pgd; | 1162 | pgd_t switcher_pgd; |
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 96f7d88ec7f8..6ae388849a3b 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c | |||
@@ -187,7 +187,7 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages) | |||
187 | * also simplify copy_in_guest_info(). Note that we'd still need to restore | 187 | * also simplify copy_in_guest_info(). Note that we'd still need to restore |
188 | * things when we exit to Launcher userspace, but that's fairly easy. | 188 | * things when we exit to Launcher userspace, but that's fairly easy. |
189 | * | 189 | * |
190 | * We could also try using this hooks for PGE, but that might be too expensive. | 190 | * We could also try using these hooks for PGE, but that might be too expensive. |
191 | * | 191 | * |
192 | * The hooks were designed for KVM, but we can also put them to good use. | 192 | * The hooks were designed for KVM, but we can also put them to good use. |
193 | :*/ | 193 | :*/ |
diff --git a/drivers/lguest/x86/switcher_32.S b/drivers/lguest/x86/switcher_32.S index 6dec09793836..40634b0db9f7 100644 --- a/drivers/lguest/x86/switcher_32.S +++ b/drivers/lguest/x86/switcher_32.S | |||
@@ -1,7 +1,7 @@ | |||
1 | /*P:900 | 1 | /*P:900 |
2 | * This is the Switcher: code which sits at 0xFFC00000 astride both the | 2 | * This is the Switcher: code which sits at 0xFFC00000 (or 0xFFE00000) astride |
3 | * Host and Guest to do the low-level Guest<->Host switch. It is as simple as | 3 | * both the Host and Guest to do the low-level Guest<->Host switch. It is as |
4 | * it can be made, but it's naturally very specific to x86. | 4 | * simple as it can be made, but it's naturally very specific to x86. |
5 | * | 5 | * |
6 | * You have now completed Preparation. If this has whet your appetite; if you | 6 | * You have now completed Preparation. If this has whet your appetite; if you |
7 | * are feeling invigorated and refreshed then the next, more challenging stage | 7 | * are feeling invigorated and refreshed then the next, more challenging stage |