Diffstat (limited to 'drivers/lguest/page_tables.c')
-rw-r--r--  drivers/lguest/page_tables.c | 250
1 file changed, 136 insertions(+), 114 deletions(-)
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index b7a924ace684..2a45f0691c9b 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/random.h> | 13 | #include <linux/random.h> |
14 | #include <linux/percpu.h> | 14 | #include <linux/percpu.h> |
15 | #include <asm/tlbflush.h> | 15 | #include <asm/tlbflush.h> |
16 | #include <asm/uaccess.h> | ||
16 | #include "lg.h" | 17 | #include "lg.h" |
17 | 18 | ||
18 | /*M:008 We hold reference to pages, which prevents them from being swapped. | 19 | /*M:008 We hold reference to pages, which prevents them from being swapped. |
@@ -44,44 +45,32 @@ | |||
44 | * (vii) Setting up the page tables initially. | 45 | * (vii) Setting up the page tables initially. |
45 | :*/ | 46 | :*/ |
46 | 47 | ||
47 | /* Pages a 4k long, and each page table entry is 4 bytes long, giving us 1024 | ||
48 | * (or 2^10) entries per page. */ | ||
49 | #define PTES_PER_PAGE_SHIFT 10 | ||
50 | #define PTES_PER_PAGE (1 << PTES_PER_PAGE_SHIFT) | ||
51 | 48 | ||
52 | /* 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is | 49 | /* 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is |
53 | * conveniently placed at the top 4MB, so it uses a separate, complete PTE | 50 | * conveniently placed at the top 4MB, so it uses a separate, complete PTE |
54 | * page. */ | 51 | * page. */ |
55 | #define SWITCHER_PGD_INDEX (PTES_PER_PAGE - 1) | 52 | #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) |
56 | 53 | ||
57 | /* We actually need a separate PTE page for each CPU. Remember that after the | 54 | /* We actually need a separate PTE page for each CPU. Remember that after the |
58 | * Switcher code itself comes two pages for each CPU, and we don't want this | 55 | * Switcher code itself comes two pages for each CPU, and we don't want this |
59 | * CPU's guest to see the pages of any other CPU. */ | 56 | * CPU's guest to see the pages of any other CPU. */ |
60 | static DEFINE_PER_CPU(spte_t *, switcher_pte_pages); | 57 | static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); |
61 | #define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) | 58 | #define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) |
62 | 59 | ||
63 | /*H:320 With our shadow and Guest types established, we need to deal with | 60 | /*H:320 With our shadow and Guest types established, we need to deal with |
64 | * them: the page table code is curly enough to need helper functions to keep | 61 | * them: the page table code is curly enough to need helper functions to keep |
65 | * it clear and clean. | 62 | * it clear and clean. |
66 | * | 63 | * |
67 | * The first helper takes a virtual address, and says which entry in the top | 64 | * There are two functions which return pointers to the shadow (aka "real") |
68 | * level page table deals with that address. Since each top level entry deals | ||
69 | * with 4M, this effectively divides by 4M. */ | ||
70 | static unsigned vaddr_to_pgd_index(unsigned long vaddr) | ||
71 | { | ||
72 | return vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT); | ||
73 | } | ||
74 | |||
75 | /* There are two functions which return pointers to the shadow (aka "real") | ||
76 | * page tables. | 65 | * page tables. |
77 | * | 66 | * |
78 | * spgd_addr() takes the virtual address and returns a pointer to the top-level | 67 | * spgd_addr() takes the virtual address and returns a pointer to the top-level |
79 | * page directory entry for that address. Since we keep track of several page | 68 | * page directory entry for that address. Since we keep track of several page |
80 | * tables, the "i" argument tells us which one we're interested in (it's | 69 | * tables, the "i" argument tells us which one we're interested in (it's |
81 | * usually the current one). */ | 70 | * usually the current one). */ |
82 | static spgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr) | 71 | static pgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr) |
83 | { | 72 | { |
84 | unsigned int index = vaddr_to_pgd_index(vaddr); | 73 | unsigned int index = pgd_index(vaddr); |
85 | 74 | ||
86 | /* We kill any Guest trying to touch the Switcher addresses. */ | 75 | /* We kill any Guest trying to touch the Switcher addresses. */ |
87 | if (index >= SWITCHER_PGD_INDEX) { | 76 | if (index >= SWITCHER_PGD_INDEX) { |
@@ -95,28 +84,28 @@ static spgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr) | |||
95 | /* This routine then takes the PGD entry given above, which contains the | 84 | /* This routine then takes the PGD entry given above, which contains the |
96 | * address of the PTE page. It then returns a pointer to the PTE entry for the | 85 | * address of the PTE page. It then returns a pointer to the PTE entry for the |
97 | * given address. */ | 86 | * given address. */ |
98 | static spte_t *spte_addr(struct lguest *lg, spgd_t spgd, unsigned long vaddr) | 87 | static pte_t *spte_addr(struct lguest *lg, pgd_t spgd, unsigned long vaddr) |
99 | { | 88 | { |
100 | spte_t *page = __va(spgd.pfn << PAGE_SHIFT); | 89 | pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT); |
101 | /* You should never call this if the PGD entry wasn't valid */ | 90 | /* You should never call this if the PGD entry wasn't valid */ |
102 | BUG_ON(!(spgd.flags & _PAGE_PRESENT)); | 91 | BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); |
103 | return &page[(vaddr >> PAGE_SHIFT) % PTES_PER_PAGE]; | 92 | return &page[(vaddr >> PAGE_SHIFT) % PTRS_PER_PTE]; |
104 | } | 93 | } |
105 | 94 | ||
106 | /* These two functions just like the above two, except they access the Guest | 95 | /* These two functions just like the above two, except they access the Guest |
107 | * page tables. Hence they return a Guest address. */ | 96 | * page tables. Hence they return a Guest address. */ |
108 | static unsigned long gpgd_addr(struct lguest *lg, unsigned long vaddr) | 97 | static unsigned long gpgd_addr(struct lguest *lg, unsigned long vaddr) |
109 | { | 98 | { |
110 | unsigned int index = vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT); | 99 | unsigned int index = vaddr >> (PGDIR_SHIFT); |
111 | return lg->pgdirs[lg->pgdidx].cr3 + index * sizeof(gpgd_t); | 100 | return lg->pgdirs[lg->pgdidx].gpgdir + index * sizeof(pgd_t); |
112 | } | 101 | } |
113 | 102 | ||
114 | static unsigned long gpte_addr(struct lguest *lg, | 103 | static unsigned long gpte_addr(struct lguest *lg, |
115 | gpgd_t gpgd, unsigned long vaddr) | 104 | pgd_t gpgd, unsigned long vaddr) |
116 | { | 105 | { |
117 | unsigned long gpage = gpgd.pfn << PAGE_SHIFT; | 106 | unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; |
118 | BUG_ON(!(gpgd.flags & _PAGE_PRESENT)); | 107 | BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); |
119 | return gpage + ((vaddr>>PAGE_SHIFT) % PTES_PER_PAGE) * sizeof(gpte_t); | 108 | return gpage + ((vaddr>>PAGE_SHIFT) % PTRS_PER_PTE) * sizeof(pte_t); |
120 | } | 109 | } |
121 | 110 | ||
122 | /*H:350 This routine takes a page number given by the Guest and converts it to | 111 | /*H:350 This routine takes a page number given by the Guest and converts it to |
@@ -149,53 +138,55 @@ static unsigned long get_pfn(unsigned long virtpfn, int write) | |||
149 | * entry can be a little tricky. The flags are (almost) the same, but the | 138 | * entry can be a little tricky. The flags are (almost) the same, but the |
150 | * Guest PTE contains a virtual page number: the CPU needs the real page | 139 | * Guest PTE contains a virtual page number: the CPU needs the real page |
151 | * number. */ | 140 | * number. */ |
152 | static spte_t gpte_to_spte(struct lguest *lg, gpte_t gpte, int write) | 141 | static pte_t gpte_to_spte(struct lguest *lg, pte_t gpte, int write) |
153 | { | 142 | { |
154 | spte_t spte; | 143 | unsigned long pfn, base, flags; |
155 | unsigned long pfn; | ||
156 | 144 | ||
157 | /* The Guest sets the global flag, because it thinks that it is using | 145 | /* The Guest sets the global flag, because it thinks that it is using |
158 | * PGE. We only told it to use PGE so it would tell us whether it was | 146 | * PGE. We only told it to use PGE so it would tell us whether it was |
159 | * flushing a kernel mapping or a userspace mapping. We don't actually | 147 | * flushing a kernel mapping or a userspace mapping. We don't actually |
160 | * use the global bit, so throw it away. */ | 148 | * use the global bit, so throw it away. */ |
161 | spte.flags = (gpte.flags & ~_PAGE_GLOBAL); | 149 | flags = (pte_flags(gpte) & ~_PAGE_GLOBAL); |
150 | |||
151 | /* The Guest's pages are offset inside the Launcher. */ | ||
152 | base = (unsigned long)lg->mem_base / PAGE_SIZE; | ||
162 | 153 | ||
163 | /* We need a temporary "unsigned long" variable to hold the answer from | 154 | /* We need a temporary "unsigned long" variable to hold the answer from |
164 | * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't | 155 | * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't |
165 | * fit in spte.pfn. get_pfn() finds the real physical number of the | 156 | * fit in spte.pfn. get_pfn() finds the real physical number of the |
166 | * page, given the virtual number. */ | 157 | * page, given the virtual number. */ |
167 | pfn = get_pfn(gpte.pfn, write); | 158 | pfn = get_pfn(base + pte_pfn(gpte), write); |
168 | if (pfn == -1UL) { | 159 | if (pfn == -1UL) { |
169 | kill_guest(lg, "failed to get page %u", gpte.pfn); | 160 | kill_guest(lg, "failed to get page %lu", pte_pfn(gpte)); |
170 | /* When we destroy the Guest, we'll go through the shadow page | 161 | /* When we destroy the Guest, we'll go through the shadow page |
171 | * tables and release_pte() them. Make sure we don't think | 162 | * tables and release_pte() them. Make sure we don't think |
172 | * this one is valid! */ | 163 | * this one is valid! */ |
173 | spte.flags = 0; | 164 | flags = 0; |
174 | } | 165 | } |
175 | /* Now we assign the page number, and our shadow PTE is complete. */ | 166 | /* Now we assemble our shadow PTE from the page number and flags. */ |
176 | spte.pfn = pfn; | 167 | return pfn_pte(pfn, __pgprot(flags)); |
177 | return spte; | ||
178 | } | 168 | } |
179 | 169 | ||
180 | /*H:460 And to complete the chain, release_pte() looks like this: */ | 170 | /*H:460 And to complete the chain, release_pte() looks like this: */ |
181 | static void release_pte(spte_t pte) | 171 | static void release_pte(pte_t pte) |
182 | { | 172 | { |
183 | /* Remember that get_user_pages() took a reference to the page, in | 173 | /* Remember that get_user_pages() took a reference to the page, in |
184 | * get_pfn()? We have to put it back now. */ | 174 | * get_pfn()? We have to put it back now. */ |
185 | if (pte.flags & _PAGE_PRESENT) | 175 | if (pte_flags(pte) & _PAGE_PRESENT) |
186 | put_page(pfn_to_page(pte.pfn)); | 176 | put_page(pfn_to_page(pte_pfn(pte))); |
187 | } | 177 | } |
188 | /*:*/ | 178 | /*:*/ |
189 | 179 | ||
190 | static void check_gpte(struct lguest *lg, gpte_t gpte) | 180 | static void check_gpte(struct lguest *lg, pte_t gpte) |
191 | { | 181 | { |
192 | if ((gpte.flags & (_PAGE_PWT|_PAGE_PSE)) || gpte.pfn >= lg->pfn_limit) | 182 | if ((pte_flags(gpte) & (_PAGE_PWT|_PAGE_PSE)) |
183 | || pte_pfn(gpte) >= lg->pfn_limit) | ||
193 | kill_guest(lg, "bad page table entry"); | 184 | kill_guest(lg, "bad page table entry"); |
194 | } | 185 | } |
195 | 186 | ||
196 | static void check_gpgd(struct lguest *lg, gpgd_t gpgd) | 187 | static void check_gpgd(struct lguest *lg, pgd_t gpgd) |
197 | { | 188 | { |
198 | if ((gpgd.flags & ~_PAGE_TABLE) || gpgd.pfn >= lg->pfn_limit) | 189 | if ((pgd_flags(gpgd) & ~_PAGE_TABLE) || pgd_pfn(gpgd) >= lg->pfn_limit) |
199 | kill_guest(lg, "bad page directory entry"); | 190 | kill_guest(lg, "bad page directory entry"); |
200 | } | 191 | } |
201 | 192 | ||
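(Not part of the commit: the hunk above swaps lguest's private spte_t/gpte_t bitfields for the kernel's pte_t plus the pte_pfn()/pte_flags()/pfn_pte() accessors. The fragment below is a minimal user-space model of the split-and-rebuild step gpte_to_spte() performs — drop _PAGE_GLOBAL, offset the page number by the Launcher's base, recompose — with invented constants and names, purely for illustration.)

#include <stdio.h>

#define MODEL_PAGE_SHIFT  12          /* 4kB pages                    */
#define MODEL_FLAGS_MASK  0xfffUL     /* low 12 bits carry the flags  */
#define MODEL_PAGE_GLOBAL 0x100UL     /* x86 _PAGE_GLOBAL             */

static unsigned long model_pte_pfn(unsigned long pte)   { return pte >> MODEL_PAGE_SHIFT; }
static unsigned long model_pte_flags(unsigned long pte) { return pte & MODEL_FLAGS_MASK; }
static unsigned long model_pfn_pte(unsigned long pfn, unsigned long flags)
{
	return (pfn << MODEL_PAGE_SHIFT) | (flags & MODEL_FLAGS_MASK);
}

int main(void)
{
	unsigned long gpte  = model_pfn_pte(0x1234, 0x067); /* guest pte */
	unsigned long base  = 0x4000;       /* launcher offset in pages (invented) */
	unsigned long flags = model_pte_flags(gpte) & ~MODEL_PAGE_GLOBAL;
	unsigned long spte  = model_pfn_pte(base + model_pte_pfn(gpte), flags);

	printf("shadow pte %#lx\n", spte);
	return 0;
}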
@@ -211,21 +202,21 @@ static void check_gpgd(struct lguest *lg, gpgd_t gpgd) | |||
211 | * true. */ | 202 | * true. */ |
212 | int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) | 203 | int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) |
213 | { | 204 | { |
214 | gpgd_t gpgd; | 205 | pgd_t gpgd; |
215 | spgd_t *spgd; | 206 | pgd_t *spgd; |
216 | unsigned long gpte_ptr; | 207 | unsigned long gpte_ptr; |
217 | gpte_t gpte; | 208 | pte_t gpte; |
218 | spte_t *spte; | 209 | pte_t *spte; |
219 | 210 | ||
220 | /* First step: get the top-level Guest page table entry. */ | 211 | /* First step: get the top-level Guest page table entry. */ |
221 | gpgd = mkgpgd(lgread_u32(lg, gpgd_addr(lg, vaddr))); | 212 | gpgd = lgread(lg, gpgd_addr(lg, vaddr), pgd_t); |
222 | /* Toplevel not present? We can't map it in. */ | 213 | /* Toplevel not present? We can't map it in. */ |
223 | if (!(gpgd.flags & _PAGE_PRESENT)) | 214 | if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) |
224 | return 0; | 215 | return 0; |
225 | 216 | ||
226 | /* Now look at the matching shadow entry. */ | 217 | /* Now look at the matching shadow entry. */ |
227 | spgd = spgd_addr(lg, lg->pgdidx, vaddr); | 218 | spgd = spgd_addr(lg, lg->pgdidx, vaddr); |
228 | if (!(spgd->flags & _PAGE_PRESENT)) { | 219 | if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { |
229 | /* No shadow entry: allocate a new shadow PTE page. */ | 220 | /* No shadow entry: allocate a new shadow PTE page. */ |
230 | unsigned long ptepage = get_zeroed_page(GFP_KERNEL); | 221 | unsigned long ptepage = get_zeroed_page(GFP_KERNEL); |
231 | /* This is not really the Guest's fault, but killing it is | 222 | /* This is not really the Guest's fault, but killing it is |
@@ -238,34 +229,35 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) | |||
238 | check_gpgd(lg, gpgd); | 229 | check_gpgd(lg, gpgd); |
239 | /* And we copy the flags to the shadow PGD entry. The page | 230 | /* And we copy the flags to the shadow PGD entry. The page |
240 | * number in the shadow PGD is the page we just allocated. */ | 231 | * number in the shadow PGD is the page we just allocated. */ |
241 | spgd->raw.val = (__pa(ptepage) | gpgd.flags); | 232 | *spgd = __pgd(__pa(ptepage) | pgd_flags(gpgd)); |
242 | } | 233 | } |
243 | 234 | ||
244 | /* OK, now we look at the lower level in the Guest page table: keep its | 235 | /* OK, now we look at the lower level in the Guest page table: keep its |
245 | * address, because we might update it later. */ | 236 | * address, because we might update it later. */ |
246 | gpte_ptr = gpte_addr(lg, gpgd, vaddr); | 237 | gpte_ptr = gpte_addr(lg, gpgd, vaddr); |
247 | gpte = mkgpte(lgread_u32(lg, gpte_ptr)); | 238 | gpte = lgread(lg, gpte_ptr, pte_t); |
248 | 239 | ||
249 | /* If this page isn't in the Guest page tables, we can't page it in. */ | 240 | /* If this page isn't in the Guest page tables, we can't page it in. */ |
250 | if (!(gpte.flags & _PAGE_PRESENT)) | 241 | if (!(pte_flags(gpte) & _PAGE_PRESENT)) |
251 | return 0; | 242 | return 0; |
252 | 243 | ||
253 | /* Check they're not trying to write to a page the Guest wants | 244 | /* Check they're not trying to write to a page the Guest wants |
254 | * read-only (bit 2 of errcode == write). */ | 245 | * read-only (bit 2 of errcode == write). */ |
255 | if ((errcode & 2) && !(gpte.flags & _PAGE_RW)) | 246 | if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW)) |
256 | return 0; | 247 | return 0; |
257 | 248 | ||
258 | /* User access to a kernel page? (bit 3 == user access) */ | 249 | /* User access to a kernel page? (bit 3 == user access) */ |
259 | if ((errcode & 4) && !(gpte.flags & _PAGE_USER)) | 250 | if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER)) |
260 | return 0; | 251 | return 0; |
261 | 252 | ||
262 | /* Check that the Guest PTE flags are OK, and the page number is below | 253 | /* Check that the Guest PTE flags are OK, and the page number is below |
263 | * the pfn_limit (ie. not mapping the Launcher binary). */ | 254 | * the pfn_limit (ie. not mapping the Launcher binary). */ |
264 | check_gpte(lg, gpte); | 255 | check_gpte(lg, gpte); |
265 | /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ | 256 | /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ |
266 | gpte.flags |= _PAGE_ACCESSED; | 257 | gpte = pte_mkyoung(gpte); |
258 | |||
267 | if (errcode & 2) | 259 | if (errcode & 2) |
268 | gpte.flags |= _PAGE_DIRTY; | 260 | gpte = pte_mkdirty(gpte); |
269 | 261 | ||
270 | /* Get the pointer to the shadow PTE entry we're going to set. */ | 262 | /* Get the pointer to the shadow PTE entry we're going to set. */ |
271 | spte = spte_addr(lg, *spgd, vaddr); | 263 | spte = spte_addr(lg, *spgd, vaddr); |
@@ -275,21 +267,18 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) | |||
275 | 267 | ||
276 | /* If this is a write, we insist that the Guest page is writable (the | 268 | /* If this is a write, we insist that the Guest page is writable (the |
277 | * final arg to gpte_to_spte()). */ | 269 | * final arg to gpte_to_spte()). */ |
278 | if (gpte.flags & _PAGE_DIRTY) | 270 | if (pte_dirty(gpte)) |
279 | *spte = gpte_to_spte(lg, gpte, 1); | 271 | *spte = gpte_to_spte(lg, gpte, 1); |
280 | else { | 272 | else |
281 | /* If this is a read, don't set the "writable" bit in the page | 273 | /* If this is a read, don't set the "writable" bit in the page |
282 | * table entry, even if the Guest says it's writable. That way | 274 | * table entry, even if the Guest says it's writable. That way |
283 | * we come back here when a write does actually ocur, so we can | 275 | * we come back here when a write does actually ocur, so we can |
284 | * update the Guest's _PAGE_DIRTY flag. */ | 276 | * update the Guest's _PAGE_DIRTY flag. */ |
285 | gpte_t ro_gpte = gpte; | 277 | *spte = gpte_to_spte(lg, pte_wrprotect(gpte), 0); |
286 | ro_gpte.flags &= ~_PAGE_RW; | ||
287 | *spte = gpte_to_spte(lg, ro_gpte, 0); | ||
288 | } | ||
289 | 278 | ||
290 | /* Finally, we write the Guest PTE entry back: we've set the | 279 | /* Finally, we write the Guest PTE entry back: we've set the |
291 | * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ | 280 | * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ |
292 | lgwrite_u32(lg, gpte_ptr, gpte.raw.val); | 281 | lgwrite(lg, gpte_ptr, pte_t, gpte); |
293 | 282 | ||
294 | /* We succeeded in mapping the page! */ | 283 | /* We succeeded in mapping the page! */ |
295 | return 1; | 284 | return 1; |
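(Not part of the commit: a compilable sketch of the decision demand_page() makes in the two hunks above — refuse writes to read-only Guest pages, refuse user access to kernel pages, and on a read fault install a write-protected shadow mapping so a later write faults again and lets us set _PAGE_DIRTY. The flag values follow the x86 error-code and PTE bits; the function names are made up.)

#include <stdio.h>

#define PF_WRITE   0x2   /* page-fault error code: access was a write */
#define PF_USER    0x4   /* page-fault error code: access from CPL 3  */

#define F_PRESENT  0x001
#define F_RW       0x002
#define F_USER     0x004
#define F_DIRTY    0x040

/* Returns the shadow flags to install, or 0 if the fault must be refused. */
static unsigned long model_demand_page(unsigned long gpte_flags, unsigned long errcode)
{
	if (!(gpte_flags & F_PRESENT))
		return 0;
	if ((errcode & PF_WRITE) && !(gpte_flags & F_RW))
		return 0;
	if ((errcode & PF_USER) && !(gpte_flags & F_USER))
		return 0;
	/* A write fault marks the page dirty and can map it writable ...  */
	if (errcode & PF_WRITE)
		return gpte_flags | F_DIRTY;
	/* ... a read fault maps it read-only, even if the Guest says RW.  */
	return gpte_flags & ~F_RW;
}

int main(void)
{
	printf("read fault on RW page  -> flags %#lx\n",
	       model_demand_page(F_PRESENT | F_RW | F_USER, PF_USER));
	printf("write fault on RO page -> flags %#lx\n",
	       model_demand_page(F_PRESENT | F_USER, PF_WRITE | PF_USER));
	return 0;
}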
@@ -305,17 +294,18 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) | |||
305 | * mapped by the shadow page tables, and is it writable? */ | 294 | * mapped by the shadow page tables, and is it writable? */ |
306 | static int page_writable(struct lguest *lg, unsigned long vaddr) | 295 | static int page_writable(struct lguest *lg, unsigned long vaddr) |
307 | { | 296 | { |
308 | spgd_t *spgd; | 297 | pgd_t *spgd; |
309 | unsigned long flags; | 298 | unsigned long flags; |
310 | 299 | ||
311 | /* Look at the top level entry: is it present? */ | 300 | /* Look at the top level entry: is it present? */ |
312 | spgd = spgd_addr(lg, lg->pgdidx, vaddr); | 301 | spgd = spgd_addr(lg, lg->pgdidx, vaddr); |
313 | if (!(spgd->flags & _PAGE_PRESENT)) | 302 | if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) |
314 | return 0; | 303 | return 0; |
315 | 304 | ||
316 | /* Check the flags on the pte entry itself: it must be present and | 305 | /* Check the flags on the pte entry itself: it must be present and |
317 | * writable. */ | 306 | * writable. */ |
318 | flags = spte_addr(lg, *spgd, vaddr)->flags; | 307 | flags = pte_flags(*(spte_addr(lg, *spgd, vaddr))); |
308 | |||
319 | return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); | 309 | return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); |
320 | } | 310 | } |
321 | 311 | ||
@@ -329,22 +319,22 @@ void pin_page(struct lguest *lg, unsigned long vaddr) | |||
329 | } | 319 | } |
330 | 320 | ||
331 | /*H:450 If we chase down the release_pgd() code, it looks like this: */ | 321 | /*H:450 If we chase down the release_pgd() code, it looks like this: */ |
332 | static void release_pgd(struct lguest *lg, spgd_t *spgd) | 322 | static void release_pgd(struct lguest *lg, pgd_t *spgd) |
333 | { | 323 | { |
334 | /* If the entry's not present, there's nothing to release. */ | 324 | /* If the entry's not present, there's nothing to release. */ |
335 | if (spgd->flags & _PAGE_PRESENT) { | 325 | if (pgd_flags(*spgd) & _PAGE_PRESENT) { |
336 | unsigned int i; | 326 | unsigned int i; |
337 | /* Converting the pfn to find the actual PTE page is easy: turn | 327 | /* Converting the pfn to find the actual PTE page is easy: turn |
338 | * the page number into a physical address, then convert to a | 328 | * the page number into a physical address, then convert to a |
339 | * virtual address (easy for kernel pages like this one). */ | 329 | * virtual address (easy for kernel pages like this one). */ |
340 | spte_t *ptepage = __va(spgd->pfn << PAGE_SHIFT); | 330 | pte_t *ptepage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); |
341 | /* For each entry in the page, we might need to release it. */ | 331 | /* For each entry in the page, we might need to release it. */ |
342 | for (i = 0; i < PTES_PER_PAGE; i++) | 332 | for (i = 0; i < PTRS_PER_PTE; i++) |
343 | release_pte(ptepage[i]); | 333 | release_pte(ptepage[i]); |
344 | /* Now we can free the page of PTEs */ | 334 | /* Now we can free the page of PTEs */ |
345 | free_page((long)ptepage); | 335 | free_page((long)ptepage); |
346 | /* And zero out the PGD entry we we never release it twice. */ | 336 | /* And zero out the PGD entry we we never release it twice. */ |
347 | spgd->raw.val = 0; | 337 | *spgd = __pgd(0); |
348 | } | 338 | } |
349 | } | 339 | } |
350 | 340 | ||
@@ -356,7 +346,7 @@ static void flush_user_mappings(struct lguest *lg, int idx) | |||
356 | { | 346 | { |
357 | unsigned int i; | 347 | unsigned int i; |
358 | /* Release every pgd entry up to the kernel's address. */ | 348 | /* Release every pgd entry up to the kernel's address. */ |
359 | for (i = 0; i < vaddr_to_pgd_index(lg->page_offset); i++) | 349 | for (i = 0; i < pgd_index(lg->kernel_address); i++) |
360 | release_pgd(lg, lg->pgdirs[idx].pgdir + i); | 350 | release_pgd(lg, lg->pgdirs[idx].pgdir + i); |
361 | } | 351 | } |
362 | 352 | ||
@@ -369,6 +359,25 @@ void guest_pagetable_flush_user(struct lguest *lg) | |||
369 | } | 359 | } |
370 | /*:*/ | 360 | /*:*/ |
371 | 361 | ||
362 | /* We walk down the guest page tables to get a guest-physical address */ | ||
363 | unsigned long guest_pa(struct lguest *lg, unsigned long vaddr) | ||
364 | { | ||
365 | pgd_t gpgd; | ||
366 | pte_t gpte; | ||
367 | |||
368 | /* First step: get the top-level Guest page table entry. */ | ||
369 | gpgd = lgread(lg, gpgd_addr(lg, vaddr), pgd_t); | ||
370 | /* Toplevel not present? We can't map it in. */ | ||
371 | if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) | ||
372 | kill_guest(lg, "Bad address %#lx", vaddr); | ||
373 | |||
374 | gpte = lgread(lg, gpte_addr(lg, gpgd, vaddr), pte_t); | ||
375 | if (!(pte_flags(gpte) & _PAGE_PRESENT)) | ||
376 | kill_guest(lg, "Bad address %#lx", vaddr); | ||
377 | |||
378 | return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK); | ||
379 | } | ||
380 | |||
372 | /* We keep several page tables. This is a simple routine to find the page | 381 | /* We keep several page tables. This is a simple routine to find the page |
373 | * table (if any) corresponding to this top-level address the Guest has given | 382 | * table (if any) corresponding to this top-level address the Guest has given |
374 | * us. */ | 383 | * us. */ |
@@ -376,7 +385,7 @@ static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable) | |||
376 | { | 385 | { |
377 | unsigned int i; | 386 | unsigned int i; |
378 | for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) | 387 | for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) |
379 | if (lg->pgdirs[i].cr3 == pgtable) | 388 | if (lg->pgdirs[i].gpgdir == pgtable) |
380 | break; | 389 | break; |
381 | return i; | 390 | return i; |
382 | } | 391 | } |
@@ -385,7 +394,7 @@ static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable) | |||
385 | * allocate a new one (and so the kernel parts are not there), we set | 394 | * allocate a new one (and so the kernel parts are not there), we set |
386 | * blank_pgdir. */ | 395 | * blank_pgdir. */ |
387 | static unsigned int new_pgdir(struct lguest *lg, | 396 | static unsigned int new_pgdir(struct lguest *lg, |
388 | unsigned long cr3, | 397 | unsigned long gpgdir, |
389 | int *blank_pgdir) | 398 | int *blank_pgdir) |
390 | { | 399 | { |
391 | unsigned int next; | 400 | unsigned int next; |
@@ -395,7 +404,7 @@ static unsigned int new_pgdir(struct lguest *lg, | |||
395 | next = random32() % ARRAY_SIZE(lg->pgdirs); | 404 | next = random32() % ARRAY_SIZE(lg->pgdirs); |
396 | /* If it's never been allocated at all before, try now. */ | 405 | /* If it's never been allocated at all before, try now. */ |
397 | if (!lg->pgdirs[next].pgdir) { | 406 | if (!lg->pgdirs[next].pgdir) { |
398 | lg->pgdirs[next].pgdir = (spgd_t *)get_zeroed_page(GFP_KERNEL); | 407 | lg->pgdirs[next].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); |
399 | /* If the allocation fails, just keep using the one we have */ | 408 | /* If the allocation fails, just keep using the one we have */ |
400 | if (!lg->pgdirs[next].pgdir) | 409 | if (!lg->pgdirs[next].pgdir) |
401 | next = lg->pgdidx; | 410 | next = lg->pgdidx; |
@@ -405,7 +414,7 @@ static unsigned int new_pgdir(struct lguest *lg, | |||
405 | *blank_pgdir = 1; | 414 | *blank_pgdir = 1; |
406 | } | 415 | } |
407 | /* Record which Guest toplevel this shadows. */ | 416 | /* Record which Guest toplevel this shadows. */ |
408 | lg->pgdirs[next].cr3 = cr3; | 417 | lg->pgdirs[next].gpgdir = gpgdir; |
409 | /* Release all the non-kernel mappings. */ | 418 | /* Release all the non-kernel mappings. */ |
410 | flush_user_mappings(lg, next); | 419 | flush_user_mappings(lg, next); |
411 | 420 | ||
@@ -472,26 +481,27 @@ void guest_pagetable_clear_all(struct lguest *lg) | |||
472 | * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately. | 481 | * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately. |
473 | */ | 482 | */ |
474 | static void do_set_pte(struct lguest *lg, int idx, | 483 | static void do_set_pte(struct lguest *lg, int idx, |
475 | unsigned long vaddr, gpte_t gpte) | 484 | unsigned long vaddr, pte_t gpte) |
476 | { | 485 | { |
477 | /* Look up the matching shadow page directot entry. */ | 486 | /* Look up the matching shadow page directot entry. */ |
478 | spgd_t *spgd = spgd_addr(lg, idx, vaddr); | 487 | pgd_t *spgd = spgd_addr(lg, idx, vaddr); |
479 | 488 | ||
480 | /* If the top level isn't present, there's no entry to update. */ | 489 | /* If the top level isn't present, there's no entry to update. */ |
481 | if (spgd->flags & _PAGE_PRESENT) { | 490 | if (pgd_flags(*spgd) & _PAGE_PRESENT) { |
482 | /* Otherwise, we start by releasing the existing entry. */ | 491 | /* Otherwise, we start by releasing the existing entry. */ |
483 | spte_t *spte = spte_addr(lg, *spgd, vaddr); | 492 | pte_t *spte = spte_addr(lg, *spgd, vaddr); |
484 | release_pte(*spte); | 493 | release_pte(*spte); |
485 | 494 | ||
486 | /* If they're setting this entry as dirty or accessed, we might | 495 | /* If they're setting this entry as dirty or accessed, we might |
487 | * as well put that entry they've given us in now. This shaves | 496 | * as well put that entry they've given us in now. This shaves |
488 | * 10% off a copy-on-write micro-benchmark. */ | 497 | * 10% off a copy-on-write micro-benchmark. */ |
489 | if (gpte.flags & (_PAGE_DIRTY | _PAGE_ACCESSED)) { | 498 | if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { |
490 | check_gpte(lg, gpte); | 499 | check_gpte(lg, gpte); |
491 | *spte = gpte_to_spte(lg, gpte, gpte.flags&_PAGE_DIRTY); | 500 | *spte = gpte_to_spte(lg, gpte, |
501 | pte_flags(gpte) & _PAGE_DIRTY); | ||
492 | } else | 502 | } else |
493 | /* Otherwise we can demand_page() it in later. */ | 503 | /* Otherwise we can demand_page() it in later. */ |
494 | spte->raw.val = 0; | 504 | *spte = __pte(0); |
495 | } | 505 | } |
496 | } | 506 | } |
497 | 507 | ||
@@ -506,18 +516,18 @@ static void do_set_pte(struct lguest *lg, int idx, | |||
506 | * The benefit is that when we have to track a new page table, we can copy keep | 516 | * The benefit is that when we have to track a new page table, we can copy keep |
507 | * all the kernel mappings. This speeds up context switch immensely. */ | 517 | * all the kernel mappings. This speeds up context switch immensely. */ |
508 | void guest_set_pte(struct lguest *lg, | 518 | void guest_set_pte(struct lguest *lg, |
509 | unsigned long cr3, unsigned long vaddr, gpte_t gpte) | 519 | unsigned long gpgdir, unsigned long vaddr, pte_t gpte) |
510 | { | 520 | { |
511 | /* Kernel mappings must be changed on all top levels. Slow, but | 521 | /* Kernel mappings must be changed on all top levels. Slow, but |
512 | * doesn't happen often. */ | 522 | * doesn't happen often. */ |
513 | if (vaddr >= lg->page_offset) { | 523 | if (vaddr >= lg->kernel_address) { |
514 | unsigned int i; | 524 | unsigned int i; |
515 | for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) | 525 | for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) |
516 | if (lg->pgdirs[i].pgdir) | 526 | if (lg->pgdirs[i].pgdir) |
517 | do_set_pte(lg, i, vaddr, gpte); | 527 | do_set_pte(lg, i, vaddr, gpte); |
518 | } else { | 528 | } else { |
519 | /* Is this page table one we have a shadow for? */ | 529 | /* Is this page table one we have a shadow for? */ |
520 | int pgdir = find_pgdir(lg, cr3); | 530 | int pgdir = find_pgdir(lg, gpgdir); |
521 | if (pgdir != ARRAY_SIZE(lg->pgdirs)) | 531 | if (pgdir != ARRAY_SIZE(lg->pgdirs)) |
522 | /* If so, do the update. */ | 532 | /* If so, do the update. */ |
523 | do_set_pte(lg, pgdir, vaddr, gpte); | 533 | do_set_pte(lg, pgdir, vaddr, gpte); |
@@ -538,7 +548,7 @@ void guest_set_pte(struct lguest *lg, | |||
538 | * | 548 | * |
539 | * So with that in mind here's our code to to update a (top-level) PGD entry: | 549 | * So with that in mind here's our code to to update a (top-level) PGD entry: |
540 | */ | 550 | */ |
541 | void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 idx) | 551 | void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 idx) |
542 | { | 552 | { |
543 | int pgdir; | 553 | int pgdir; |
544 | 554 | ||
@@ -548,7 +558,7 @@ void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 idx) | |||
548 | return; | 558 | return; |
549 | 559 | ||
550 | /* If they're talking about a page table we have a shadow for... */ | 560 | /* If they're talking about a page table we have a shadow for... */ |
551 | pgdir = find_pgdir(lg, cr3); | 561 | pgdir = find_pgdir(lg, gpgdir); |
552 | if (pgdir < ARRAY_SIZE(lg->pgdirs)) | 562 | if (pgdir < ARRAY_SIZE(lg->pgdirs)) |
553 | /* ... throw it away. */ | 563 | /* ... throw it away. */ |
554 | release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx); | 564 | release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx); |
@@ -560,21 +570,34 @@ void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 idx) | |||
560 | * its first page table is. We set some things up here: */ | 570 | * its first page table is. We set some things up here: */ |
561 | int init_guest_pagetable(struct lguest *lg, unsigned long pgtable) | 571 | int init_guest_pagetable(struct lguest *lg, unsigned long pgtable) |
562 | { | 572 | { |
563 | /* In flush_user_mappings() we loop from 0 to | ||
564 | * "vaddr_to_pgd_index(lg->page_offset)". This assumes it won't hit | ||
565 | * the Switcher mappings, so check that now. */ | ||
566 | if (vaddr_to_pgd_index(lg->page_offset) >= SWITCHER_PGD_INDEX) | ||
567 | return -EINVAL; | ||
568 | /* We start on the first shadow page table, and give it a blank PGD | 573 | /* We start on the first shadow page table, and give it a blank PGD |
569 | * page. */ | 574 | * page. */ |
570 | lg->pgdidx = 0; | 575 | lg->pgdidx = 0; |
571 | lg->pgdirs[lg->pgdidx].cr3 = pgtable; | 576 | lg->pgdirs[lg->pgdidx].gpgdir = pgtable; |
572 | lg->pgdirs[lg->pgdidx].pgdir = (spgd_t*)get_zeroed_page(GFP_KERNEL); | 577 | lg->pgdirs[lg->pgdidx].pgdir = (pgd_t*)get_zeroed_page(GFP_KERNEL); |
573 | if (!lg->pgdirs[lg->pgdidx].pgdir) | 578 | if (!lg->pgdirs[lg->pgdidx].pgdir) |
574 | return -ENOMEM; | 579 | return -ENOMEM; |
575 | return 0; | 580 | return 0; |
576 | } | 581 | } |
577 | 582 | ||
583 | /* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ | ||
584 | void page_table_guest_data_init(struct lguest *lg) | ||
585 | { | ||
586 | /* We get the kernel address: above this is all kernel memory. */ | ||
587 | if (get_user(lg->kernel_address, &lg->lguest_data->kernel_address) | ||
588 | /* We tell the Guest that it can't use the top 4MB of virtual | ||
589 | * addresses used by the Switcher. */ | ||
590 | || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem) | ||
591 | || put_user(lg->pgdirs[lg->pgdidx].gpgdir,&lg->lguest_data->pgdir)) | ||
592 | kill_guest(lg, "bad guest page %p", lg->lguest_data); | ||
593 | |||
594 | /* In flush_user_mappings() we loop from 0 to | ||
595 | * "pgd_index(lg->kernel_address)". This assumes it won't hit the | ||
596 | * Switcher mappings, so check that now. */ | ||
597 | if (pgd_index(lg->kernel_address) >= SWITCHER_PGD_INDEX) | ||
598 | kill_guest(lg, "bad kernel address %#lx", lg->kernel_address); | ||
599 | } | ||
600 | |||
578 | /* When a Guest dies, our cleanup is fairly simple. */ | 601 | /* When a Guest dies, our cleanup is fairly simple. */ |
579 | void free_guest_pagetable(struct lguest *lg) | 602 | void free_guest_pagetable(struct lguest *lg) |
580 | { | 603 | { |
@@ -594,14 +617,14 @@ void free_guest_pagetable(struct lguest *lg) | |||
594 | * for each CPU already set up, we just need to hook them in. */ | 617 | * for each CPU already set up, we just need to hook them in. */ |
595 | void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages) | 618 | void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages) |
596 | { | 619 | { |
597 | spte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); | 620 | pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); |
598 | spgd_t switcher_pgd; | 621 | pgd_t switcher_pgd; |
599 | spte_t regs_pte; | 622 | pte_t regs_pte; |
600 | 623 | ||
601 | /* Make the last PGD entry for this Guest point to the Switcher's PTE | 624 | /* Make the last PGD entry for this Guest point to the Switcher's PTE |
602 | * page for this CPU (with appropriate flags). */ | 625 | * page for this CPU (with appropriate flags). */ |
603 | switcher_pgd.pfn = __pa(switcher_pte_page) >> PAGE_SHIFT; | 626 | switcher_pgd = __pgd(__pa(switcher_pte_page) | _PAGE_KERNEL); |
604 | switcher_pgd.flags = _PAGE_KERNEL; | 627 | |
605 | lg->pgdirs[lg->pgdidx].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; | 628 | lg->pgdirs[lg->pgdidx].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; |
606 | 629 | ||
607 | /* We also change the Switcher PTE page. When we're running the Guest, | 630 | /* We also change the Switcher PTE page. When we're running the Guest, |
@@ -611,10 +634,8 @@ void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages) | |||
611 | * CPU's "struct lguest_pages": if we make sure the Guest's register | 634 | * CPU's "struct lguest_pages": if we make sure the Guest's register |
612 | * page is already mapped there, we don't have to copy them out | 635 | * page is already mapped there, we don't have to copy them out |
613 | * again. */ | 636 | * again. */ |
614 | regs_pte.pfn = __pa(lg->regs_page) >> PAGE_SHIFT; | 637 | regs_pte = pfn_pte (__pa(lg->regs_page) >> PAGE_SHIFT, __pgprot(_PAGE_KERNEL)); |
615 | regs_pte.flags = _PAGE_KERNEL; | 638 | switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTRS_PER_PTE] = regs_pte; |
616 | switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTES_PER_PAGE] | ||
617 | = regs_pte; | ||
618 | } | 639 | } |
619 | /*:*/ | 640 | /*:*/ |
620 | 641 | ||
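(Not part of the commit: the Switcher hunks above index the per-CPU PTE page with "(vaddr / PAGE_SIZE) % PTRS_PER_PTE" and pick the top-level slot from the upper ten bits of the address. A throwaway sketch of that arithmetic for the classic two-level 32-bit layout, with all constants local to the example:)

#include <stdio.h>

#define MODEL_PAGE_SHIFT   12
#define MODEL_PTRS_PER_PTE 1024
#define MODEL_PGDIR_SHIFT  22

int main(void)
{
	unsigned long vaddr = 0xffc01234UL;   /* somewhere in the top 4MB */
	unsigned pgd_i = vaddr >> MODEL_PGDIR_SHIFT;
	unsigned pte_i = (vaddr >> MODEL_PAGE_SHIFT) % MODEL_PTRS_PER_PTE;

	printf("pgd index %u, pte index %u\n", pgd_i, pte_i);
	return 0;
}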
@@ -635,24 +656,25 @@ static __init void populate_switcher_pte_page(unsigned int cpu, | |||
635 | unsigned int pages) | 656 | unsigned int pages) |
636 | { | 657 | { |
637 | unsigned int i; | 658 | unsigned int i; |
638 | spte_t *pte = switcher_pte_page(cpu); | 659 | pte_t *pte = switcher_pte_page(cpu); |
639 | 660 | ||
640 | /* The first entries are easy: they map the Switcher code. */ | 661 | /* The first entries are easy: they map the Switcher code. */ |
641 | for (i = 0; i < pages; i++) { | 662 | for (i = 0; i < pages; i++) { |
642 | pte[i].pfn = page_to_pfn(switcher_page[i]); | 663 | pte[i] = mk_pte(switcher_page[i], |
643 | pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED; | 664 | __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)); |
644 | } | 665 | } |
645 | 666 | ||
646 | /* The only other thing we map is this CPU's pair of pages. */ | 667 | /* The only other thing we map is this CPU's pair of pages. */ |
647 | i = pages + cpu*2; | 668 | i = pages + cpu*2; |
648 | 669 | ||
649 | /* First page (Guest registers) is writable from the Guest */ | 670 | /* First page (Guest registers) is writable from the Guest */ |
650 | pte[i].pfn = page_to_pfn(switcher_page[i]); | 671 | pte[i] = pfn_pte(page_to_pfn(switcher_page[i]), |
651 | pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW; | 672 | __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW)); |
673 | |||
652 | /* The second page contains the "struct lguest_ro_state", and is | 674 | /* The second page contains the "struct lguest_ro_state", and is |
653 | * read-only. */ | 675 | * read-only. */ |
654 | pte[i+1].pfn = page_to_pfn(switcher_page[i+1]); | 676 | pte[i+1] = pfn_pte(page_to_pfn(switcher_page[i+1]), |
655 | pte[i+1].flags = _PAGE_PRESENT|_PAGE_ACCESSED; | 677 | __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)); |
656 | } | 678 | } |
657 | 679 | ||
658 | /*H:510 At boot or module load time, init_pagetables() allocates and populates | 680 | /*H:510 At boot or module load time, init_pagetables() allocates and populates |
@@ -662,7 +684,7 @@ __init int init_pagetables(struct page **switcher_page, unsigned int pages) | |||
662 | unsigned int i; | 684 | unsigned int i; |
663 | 685 | ||
664 | for_each_possible_cpu(i) { | 686 | for_each_possible_cpu(i) { |
665 | switcher_pte_page(i) = (spte_t *)get_zeroed_page(GFP_KERNEL); | 687 | switcher_pte_page(i) = (pte_t *)get_zeroed_page(GFP_KERNEL); |
666 | if (!switcher_pte_page(i)) { | 688 | if (!switcher_pte_page(i)) { |
667 | free_switcher_pte_pages(); | 689 | free_switcher_pte_pages(); |
668 | return -ENOMEM; | 690 | return -ENOMEM; |