diff options
author | Matias Zabaljauregui <matias.zabaljauregui@cern.ch> | 2007-10-21 21:03:33 -0400 |
---|---|---|
committer | Rusty Russell <rusty@rustcorp.com.au> | 2007-10-23 01:49:53 -0400 |
commit | df29f43e650df29456804dabdb2611de914e7c0f (patch) | |
tree | 2f8de4a2f1b7c4141e710123fc86db266f507d83 | |
parent | 47aee45ae3c708ab678e09abfba0efaf6ca0e87a (diff) |
Pagetables to use normal kernel types
This is my first step in the migration of page_tables.c to the kernel
types and functions/macros (2.6.23-rc3). Seems to be working OK.
Signed-off-by: Matias Zabaljauregui <matias.zabaljauregui@cern.ch>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
-rw-r--r-- | drivers/lguest/hypercalls.c | 2 | ||||
-rw-r--r-- | drivers/lguest/lg.h | 45 | ||||
-rw-r--r-- | drivers/lguest/page_tables.c | 192 |
3 files changed, 98 insertions, 141 deletions
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c index 2859a7687288..02d0ae268267 100644 --- a/drivers/lguest/hypercalls.c +++ b/drivers/lguest/hypercalls.c | |||
@@ -83,7 +83,7 @@ static void do_hcall(struct lguest *lg, struct hcall_args *args) | |||
83 | guest_set_stack(lg, args->arg1, args->arg2, args->arg3); | 83 | guest_set_stack(lg, args->arg1, args->arg2, args->arg3); |
84 | break; | 84 | break; |
85 | case LHCALL_SET_PTE: | 85 | case LHCALL_SET_PTE: |
86 | guest_set_pte(lg, args->arg1, args->arg2, mkgpte(args->arg3)); | 86 | guest_set_pte(lg, args->arg1, args->arg2, __pte(args->arg3)); |
87 | break; | 87 | break; |
88 | case LHCALL_SET_PMD: | 88 | case LHCALL_SET_PMD: |
89 | guest_set_pmd(lg, args->arg1, args->arg2); | 89 | guest_set_pmd(lg, args->arg1, args->arg2); |
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index c2557cfd86c7..dc15b88208ff 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h | |||
@@ -28,45 +28,10 @@ struct lguest_dma_info | |||
28 | u8 interrupt; /* 0 when not registered */ | 28 | u8 interrupt; /* 0 when not registered */ |
29 | }; | 29 | }; |
30 | 30 | ||
31 | /*H:310 The page-table code owes a great debt of gratitude to Andi Kleen. He | ||
32 | * reviewed the original code which used "u32" for all page table entries, and | ||
33 | * insisted that it would be far clearer with explicit typing. I thought it | ||
34 | * was overkill, but he was right: it is much clearer than it was before. | ||
35 | * | ||
36 | * We have separate types for the Guest's ptes & pgds and the shadow ptes & | ||
37 | * pgds. There's already a Linux type for these (pte_t and pgd_t) but they | ||
38 | * change depending on kernel config options (PAE). */ | ||
39 | |||
40 | /* Each entry is identical: lower 12 bits of flags and upper 20 bits for the | ||
41 | * "page frame number" (0 == first physical page, etc). They are different | ||
42 | * types so the compiler will warn us if we mix them improperly. */ | ||
43 | typedef union { | ||
44 | struct { unsigned flags:12, pfn:20; }; | ||
45 | struct { unsigned long val; } raw; | ||
46 | } spgd_t; | ||
47 | typedef union { | ||
48 | struct { unsigned flags:12, pfn:20; }; | ||
49 | struct { unsigned long val; } raw; | ||
50 | } spte_t; | ||
51 | typedef union { | ||
52 | struct { unsigned flags:12, pfn:20; }; | ||
53 | struct { unsigned long val; } raw; | ||
54 | } gpgd_t; | ||
55 | typedef union { | ||
56 | struct { unsigned flags:12, pfn:20; }; | ||
57 | struct { unsigned long val; } raw; | ||
58 | } gpte_t; | ||
59 | |||
60 | /* We have two convenient macros to convert a "raw" value as handed to us by | ||
61 | * the Guest into the correct Guest PGD or PTE type. */ | ||
62 | #define mkgpte(_val) ((gpte_t){.raw.val = _val}) | ||
63 | #define mkgpgd(_val) ((gpgd_t){.raw.val = _val}) | ||
64 | /*:*/ | ||
65 | |||
66 | struct pgdir | 31 | struct pgdir |
67 | { | 32 | { |
68 | unsigned long cr3; | 33 | unsigned long cr3; |
69 | spgd_t *pgdir; | 34 | pgd_t *pgdir; |
70 | }; | 35 | }; |
71 | 36 | ||
72 | /* We have two pages shared with guests, per cpu. */ | 37 | /* We have two pages shared with guests, per cpu. */ |
@@ -157,6 +122,12 @@ int lguest_address_ok(const struct lguest *lg, | |||
157 | unsigned long addr, unsigned long len); | 122 | unsigned long addr, unsigned long len); |
158 | int run_guest(struct lguest *lg, unsigned long __user *user); | 123 | int run_guest(struct lguest *lg, unsigned long __user *user); |
159 | 124 | ||
125 | /* Helper macros to obtain the first 12 or the last 20 bits, this is only the | ||
126 | * first step in the migration to the kernel types. pte_pfn is already defined | ||
127 | * in the kernel. */ | ||
128 | #define pgd_flags(x) (pgd_val(x) & ~PAGE_MASK) | ||
129 | #define pte_flags(x) (pte_val(x) & ~PAGE_MASK) | ||
130 | #define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT) | ||
160 | 131 | ||
161 | /* interrupts_and_traps.c: */ | 132 | /* interrupts_and_traps.c: */ |
162 | void maybe_do_interrupt(struct lguest *lg); | 133 | void maybe_do_interrupt(struct lguest *lg); |
@@ -187,7 +158,7 @@ void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 i); | |||
187 | void guest_pagetable_clear_all(struct lguest *lg); | 158 | void guest_pagetable_clear_all(struct lguest *lg); |
188 | void guest_pagetable_flush_user(struct lguest *lg); | 159 | void guest_pagetable_flush_user(struct lguest *lg); |
189 | void guest_set_pte(struct lguest *lg, unsigned long cr3, | 160 | void guest_set_pte(struct lguest *lg, unsigned long cr3, |
190 | unsigned long vaddr, gpte_t val); | 161 | unsigned long vaddr, pte_t val); |
191 | void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages); | 162 | void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages); |
192 | int demand_page(struct lguest *info, unsigned long cr2, int errcode); | 163 | int demand_page(struct lguest *info, unsigned long cr2, int errcode); |
193 | void pin_page(struct lguest *lg, unsigned long vaddr); | 164 | void pin_page(struct lguest *lg, unsigned long vaddr); |
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index 9cd2faceb87c..5c4c53f38cf4 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c | |||
@@ -44,44 +44,32 @@ | |||
44 | * (vii) Setting up the page tables initially. | 44 | * (vii) Setting up the page tables initially. |
45 | :*/ | 45 | :*/ |
46 | 46 | ||
47 | /* Pages are 4k long, and each page table entry is 4 bytes long, giving us 1024 | ||
48 | * (or 2^10) entries per page. */ | ||
49 | #define PTES_PER_PAGE_SHIFT 10 | ||
50 | #define PTES_PER_PAGE (1 << PTES_PER_PAGE_SHIFT) | ||
51 | 47 | ||
52 | /* 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is | 48 | /* 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is |
53 | * conveniently placed at the top 4MB, so it uses a separate, complete PTE | 49 | * conveniently placed at the top 4MB, so it uses a separate, complete PTE |
54 | * page. */ | 50 | * page. */ |
55 | #define SWITCHER_PGD_INDEX (PTES_PER_PAGE - 1) | 51 | #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) |
56 | 52 | ||
57 | /* We actually need a separate PTE page for each CPU. Remember that after the | 53 | /* We actually need a separate PTE page for each CPU. Remember that after the |
58 | * Switcher code itself comes two pages for each CPU, and we don't want this | 54 | * Switcher code itself comes two pages for each CPU, and we don't want this |
59 | * CPU's guest to see the pages of any other CPU. */ | 55 | * CPU's guest to see the pages of any other CPU. */ |
60 | static DEFINE_PER_CPU(spte_t *, switcher_pte_pages); | 56 | static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); |
61 | #define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) | 57 | #define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) |
62 | 58 | ||
63 | /*H:320 With our shadow and Guest types established, we need to deal with | 59 | /*H:320 With our shadow and Guest types established, we need to deal with |
64 | * them: the page table code is curly enough to need helper functions to keep | 60 | * them: the page table code is curly enough to need helper functions to keep |
65 | * it clear and clean. | 61 | * it clear and clean. |
66 | * | 62 | * |
67 | * The first helper takes a virtual address, and says which entry in the top | 63 | * There are two functions which return pointers to the shadow (aka "real") |
68 | * level page table deals with that address. Since each top level entry deals | ||
69 | * with 4M, this effectively divides by 4M. */ | ||
70 | static unsigned vaddr_to_pgd_index(unsigned long vaddr) | ||
71 | { | ||
72 | return vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT); | ||
73 | } | ||
74 | |||
75 | /* There are two functions which return pointers to the shadow (aka "real") | ||
76 | * page tables. | 64 | * page tables. |
77 | * | 65 | * |
78 | * spgd_addr() takes the virtual address and returns a pointer to the top-level | 66 | * spgd_addr() takes the virtual address and returns a pointer to the top-level |
79 | * page directory entry for that address. Since we keep track of several page | 67 | * page directory entry for that address. Since we keep track of several page |
80 | * tables, the "i" argument tells us which one we're interested in (it's | 68 | * tables, the "i" argument tells us which one we're interested in (it's |
81 | * usually the current one). */ | 69 | * usually the current one). */ |
82 | static spgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr) | 70 | static pgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr) |
83 | { | 71 | { |
84 | unsigned int index = vaddr_to_pgd_index(vaddr); | 72 | unsigned int index = pgd_index(vaddr); |
85 | 73 | ||
86 | /* We kill any Guest trying to touch the Switcher addresses. */ | 74 | /* We kill any Guest trying to touch the Switcher addresses. */ |
87 | if (index >= SWITCHER_PGD_INDEX) { | 75 | if (index >= SWITCHER_PGD_INDEX) { |
@@ -95,28 +83,28 @@ static spgd_t *spgd_addr(struct lguest *lg, u32 i, unsigned long vaddr) | |||
95 | /* This routine then takes the PGD entry given above, which contains the | 83 | /* This routine then takes the PGD entry given above, which contains the |
96 | * address of the PTE page. It then returns a pointer to the PTE entry for the | 84 | * address of the PTE page. It then returns a pointer to the PTE entry for the |
97 | * given address. */ | 85 | * given address. */ |
98 | static spte_t *spte_addr(struct lguest *lg, spgd_t spgd, unsigned long vaddr) | 86 | static pte_t *spte_addr(struct lguest *lg, pgd_t spgd, unsigned long vaddr) |
99 | { | 87 | { |
100 | spte_t *page = __va(spgd.pfn << PAGE_SHIFT); | 88 | pte_t *page = __va(pgd_pfn(spgd) << PAGE_SHIFT); |
101 | /* You should never call this if the PGD entry wasn't valid */ | 89 | /* You should never call this if the PGD entry wasn't valid */ |
102 | BUG_ON(!(spgd.flags & _PAGE_PRESENT)); | 90 | BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT)); |
103 | return &page[(vaddr >> PAGE_SHIFT) % PTES_PER_PAGE]; | 91 | return &page[(vaddr >> PAGE_SHIFT) % PTRS_PER_PTE]; |
104 | } | 92 | } |
105 | 93 | ||
106 | /* These two functions are just like the above two, except they access the Guest | 94 | /* These two functions are just like the above two, except they access the Guest |
107 | * page tables. Hence they return a Guest address. */ | 95 | * page tables. Hence they return a Guest address. */ |
108 | static unsigned long gpgd_addr(struct lguest *lg, unsigned long vaddr) | 96 | static unsigned long gpgd_addr(struct lguest *lg, unsigned long vaddr) |
109 | { | 97 | { |
110 | unsigned int index = vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT); | 98 | unsigned int index = vaddr >> (PGDIR_SHIFT); |
111 | return lg->pgdirs[lg->pgdidx].cr3 + index * sizeof(gpgd_t); | 99 | return lg->pgdirs[lg->pgdidx].cr3 + index * sizeof(pgd_t); |
112 | } | 100 | } |
113 | 101 | ||
114 | static unsigned long gpte_addr(struct lguest *lg, | 102 | static unsigned long gpte_addr(struct lguest *lg, |
115 | gpgd_t gpgd, unsigned long vaddr) | 103 | pgd_t gpgd, unsigned long vaddr) |
116 | { | 104 | { |
117 | unsigned long gpage = gpgd.pfn << PAGE_SHIFT; | 105 | unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; |
118 | BUG_ON(!(gpgd.flags & _PAGE_PRESENT)); | 106 | BUG_ON(!(pgd_flags(gpgd) & _PAGE_PRESENT)); |
119 | return gpage + ((vaddr>>PAGE_SHIFT) % PTES_PER_PAGE) * sizeof(gpte_t); | 107 | return gpage + ((vaddr>>PAGE_SHIFT) % PTRS_PER_PTE) * sizeof(pte_t); |
120 | } | 108 | } |
121 | 109 | ||
122 | /*H:350 This routine takes a page number given by the Guest and converts it to | 110 | /*H:350 This routine takes a page number given by the Guest and converts it to |
@@ -149,16 +137,15 @@ static unsigned long get_pfn(unsigned long virtpfn, int write) | |||
149 | * entry can be a little tricky. The flags are (almost) the same, but the | 137 | * entry can be a little tricky. The flags are (almost) the same, but the |
150 | * Guest PTE contains a virtual page number: the CPU needs the real page | 138 | * Guest PTE contains a virtual page number: the CPU needs the real page |
151 | * number. */ | 139 | * number. */ |
152 | static spte_t gpte_to_spte(struct lguest *lg, gpte_t gpte, int write) | 140 | static pte_t gpte_to_spte(struct lguest *lg, pte_t gpte, int write) |
153 | { | 141 | { |
154 | spte_t spte; | 142 | unsigned long pfn, base, flags; |
155 | unsigned long pfn, base; | ||
156 | 143 | ||
157 | /* The Guest sets the global flag, because it thinks that it is using | 144 | /* The Guest sets the global flag, because it thinks that it is using |
158 | * PGE. We only told it to use PGE so it would tell us whether it was | 145 | * PGE. We only told it to use PGE so it would tell us whether it was |
159 | * flushing a kernel mapping or a userspace mapping. We don't actually | 146 | * flushing a kernel mapping or a userspace mapping. We don't actually |
160 | * use the global bit, so throw it away. */ | 147 | * use the global bit, so throw it away. */ |
161 | spte.flags = (gpte.flags & ~_PAGE_GLOBAL); | 148 | flags = (pte_flags(gpte) & ~_PAGE_GLOBAL); |
162 | 149 | ||
163 | /* The Guest's pages are offset inside the Launcher. */ | 150 | /* The Guest's pages are offset inside the Launcher. */ |
164 | base = (unsigned long)lg->mem_base / PAGE_SIZE; | 151 | base = (unsigned long)lg->mem_base / PAGE_SIZE; |
@@ -167,38 +154,38 @@ static spte_t gpte_to_spte(struct lguest *lg, gpte_t gpte, int write) | |||
167 | * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't | 154 | * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't |
168 | * fit in spte.pfn. get_pfn() finds the real physical number of the | 155 | * fit in spte.pfn. get_pfn() finds the real physical number of the |
169 | * page, given the virtual number. */ | 156 | * page, given the virtual number. */ |
170 | pfn = get_pfn(base + gpte.pfn, write); | 157 | pfn = get_pfn(base + pte_pfn(gpte), write); |
171 | if (pfn == -1UL) { | 158 | if (pfn == -1UL) { |
172 | kill_guest(lg, "failed to get page %u", gpte.pfn); | 159 | kill_guest(lg, "failed to get page %lu", pte_pfn(gpte)); |
173 | /* When we destroy the Guest, we'll go through the shadow page | 160 | /* When we destroy the Guest, we'll go through the shadow page |
174 | * tables and release_pte() them. Make sure we don't think | 161 | * tables and release_pte() them. Make sure we don't think |
175 | * this one is valid! */ | 162 | * this one is valid! */ |
176 | spte.flags = 0; | 163 | flags = 0; |
177 | } | 164 | } |
178 | /* Now we assign the page number, and our shadow PTE is complete. */ | 165 | /* Now we assemble our shadow PTE from the page number and flags. */ |
179 | spte.pfn = pfn; | 166 | return pfn_pte(pfn, __pgprot(flags)); |
180 | return spte; | ||
181 | } | 167 | } |
182 | 168 | ||
183 | /*H:460 And to complete the chain, release_pte() looks like this: */ | 169 | /*H:460 And to complete the chain, release_pte() looks like this: */ |
184 | static void release_pte(spte_t pte) | 170 | static void release_pte(pte_t pte) |
185 | { | 171 | { |
186 | /* Remember that get_user_pages() took a reference to the page, in | 172 | /* Remember that get_user_pages() took a reference to the page, in |
187 | * get_pfn()? We have to put it back now. */ | 173 | * get_pfn()? We have to put it back now. */ |
188 | if (pte.flags & _PAGE_PRESENT) | 174 | if (pte_flags(pte) & _PAGE_PRESENT) |
189 | put_page(pfn_to_page(pte.pfn)); | 175 | put_page(pfn_to_page(pte_pfn(pte))); |
190 | } | 176 | } |
191 | /*:*/ | 177 | /*:*/ |
192 | 178 | ||
193 | static void check_gpte(struct lguest *lg, gpte_t gpte) | 179 | static void check_gpte(struct lguest *lg, pte_t gpte) |
194 | { | 180 | { |
195 | if ((gpte.flags & (_PAGE_PWT|_PAGE_PSE)) || gpte.pfn >= lg->pfn_limit) | 181 | if ((pte_flags(gpte) & (_PAGE_PWT|_PAGE_PSE)) |
182 | || pte_pfn(gpte) >= lg->pfn_limit) | ||
196 | kill_guest(lg, "bad page table entry"); | 183 | kill_guest(lg, "bad page table entry"); |
197 | } | 184 | } |
198 | 185 | ||
199 | static void check_gpgd(struct lguest *lg, gpgd_t gpgd) | 186 | static void check_gpgd(struct lguest *lg, pgd_t gpgd) |
200 | { | 187 | { |
201 | if ((gpgd.flags & ~_PAGE_TABLE) || gpgd.pfn >= lg->pfn_limit) | 188 | if ((pgd_flags(gpgd) & ~_PAGE_TABLE) || pgd_pfn(gpgd) >= lg->pfn_limit) |
202 | kill_guest(lg, "bad page directory entry"); | 189 | kill_guest(lg, "bad page directory entry"); |
203 | } | 190 | } |
204 | 191 | ||
@@ -214,21 +201,21 @@ static void check_gpgd(struct lguest *lg, gpgd_t gpgd) | |||
214 | * true. */ | 201 | * true. */ |
215 | int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) | 202 | int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) |
216 | { | 203 | { |
217 | gpgd_t gpgd; | 204 | pgd_t gpgd; |
218 | spgd_t *spgd; | 205 | pgd_t *spgd; |
219 | unsigned long gpte_ptr; | 206 | unsigned long gpte_ptr; |
220 | gpte_t gpte; | 207 | pte_t gpte; |
221 | spte_t *spte; | 208 | pte_t *spte; |
222 | 209 | ||
223 | /* First step: get the top-level Guest page table entry. */ | 210 | /* First step: get the top-level Guest page table entry. */ |
224 | gpgd = mkgpgd(lgread_u32(lg, gpgd_addr(lg, vaddr))); | 211 | gpgd = __pgd(lgread_u32(lg, gpgd_addr(lg, vaddr))); |
225 | /* Toplevel not present? We can't map it in. */ | 212 | /* Toplevel not present? We can't map it in. */ |
226 | if (!(gpgd.flags & _PAGE_PRESENT)) | 213 | if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) |
227 | return 0; | 214 | return 0; |
228 | 215 | ||
229 | /* Now look at the matching shadow entry. */ | 216 | /* Now look at the matching shadow entry. */ |
230 | spgd = spgd_addr(lg, lg->pgdidx, vaddr); | 217 | spgd = spgd_addr(lg, lg->pgdidx, vaddr); |
231 | if (!(spgd->flags & _PAGE_PRESENT)) { | 218 | if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { |
232 | /* No shadow entry: allocate a new shadow PTE page. */ | 219 | /* No shadow entry: allocate a new shadow PTE page. */ |
233 | unsigned long ptepage = get_zeroed_page(GFP_KERNEL); | 220 | unsigned long ptepage = get_zeroed_page(GFP_KERNEL); |
234 | /* This is not really the Guest's fault, but killing it is | 221 | /* This is not really the Guest's fault, but killing it is |
@@ -241,34 +228,35 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) | |||
241 | check_gpgd(lg, gpgd); | 228 | check_gpgd(lg, gpgd); |
242 | /* And we copy the flags to the shadow PGD entry. The page | 229 | /* And we copy the flags to the shadow PGD entry. The page |
243 | * number in the shadow PGD is the page we just allocated. */ | 230 | * number in the shadow PGD is the page we just allocated. */ |
244 | spgd->raw.val = (__pa(ptepage) | gpgd.flags); | 231 | *spgd = __pgd(__pa(ptepage) | pgd_flags(gpgd)); |
245 | } | 232 | } |
246 | 233 | ||
247 | /* OK, now we look at the lower level in the Guest page table: keep its | 234 | /* OK, now we look at the lower level in the Guest page table: keep its |
248 | * address, because we might update it later. */ | 235 | * address, because we might update it later. */ |
249 | gpte_ptr = gpte_addr(lg, gpgd, vaddr); | 236 | gpte_ptr = gpte_addr(lg, gpgd, vaddr); |
250 | gpte = mkgpte(lgread_u32(lg, gpte_ptr)); | 237 | gpte = __pte(lgread_u32(lg, gpte_ptr)); |
251 | 238 | ||
252 | /* If this page isn't in the Guest page tables, we can't page it in. */ | 239 | /* If this page isn't in the Guest page tables, we can't page it in. */ |
253 | if (!(gpte.flags & _PAGE_PRESENT)) | 240 | if (!(pte_flags(gpte) & _PAGE_PRESENT)) |
254 | return 0; | 241 | return 0; |
255 | 242 | ||
256 | /* Check they're not trying to write to a page the Guest wants | 243 | /* Check they're not trying to write to a page the Guest wants |
257 | * read-only (bit 2 of errcode == write). */ | 244 | * read-only (bit 2 of errcode == write). */ |
258 | if ((errcode & 2) && !(gpte.flags & _PAGE_RW)) | 245 | if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW)) |
259 | return 0; | 246 | return 0; |
260 | 247 | ||
261 | /* User access to a kernel page? (bit 3 == user access) */ | 248 | /* User access to a kernel page? (bit 3 == user access) */ |
262 | if ((errcode & 4) && !(gpte.flags & _PAGE_USER)) | 249 | if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER)) |
263 | return 0; | 250 | return 0; |
264 | 251 | ||
265 | /* Check that the Guest PTE flags are OK, and the page number is below | 252 | /* Check that the Guest PTE flags are OK, and the page number is below |
266 | * the pfn_limit (ie. not mapping the Launcher binary). */ | 253 | * the pfn_limit (ie. not mapping the Launcher binary). */ |
267 | check_gpte(lg, gpte); | 254 | check_gpte(lg, gpte); |
268 | /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ | 255 | /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ |
269 | gpte.flags |= _PAGE_ACCESSED; | 256 | gpte = pte_mkyoung(gpte); |
257 | |||
270 | if (errcode & 2) | 258 | if (errcode & 2) |
271 | gpte.flags |= _PAGE_DIRTY; | 259 | gpte = pte_mkdirty(gpte); |
272 | 260 | ||
273 | /* Get the pointer to the shadow PTE entry we're going to set. */ | 261 | /* Get the pointer to the shadow PTE entry we're going to set. */ |
274 | spte = spte_addr(lg, *spgd, vaddr); | 262 | spte = spte_addr(lg, *spgd, vaddr); |
@@ -278,21 +266,18 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) | |||
278 | 266 | ||
279 | /* If this is a write, we insist that the Guest page is writable (the | 267 | /* If this is a write, we insist that the Guest page is writable (the |
280 | * final arg to gpte_to_spte()). */ | 268 | * final arg to gpte_to_spte()). */ |
281 | if (gpte.flags & _PAGE_DIRTY) | 269 | if (pte_dirty(gpte)) |
282 | *spte = gpte_to_spte(lg, gpte, 1); | 270 | *spte = gpte_to_spte(lg, gpte, 1); |
283 | else { | 271 | else |
284 | /* If this is a read, don't set the "writable" bit in the page | 272 | /* If this is a read, don't set the "writable" bit in the page |
285 | * table entry, even if the Guest says it's writable. That way | 273 | * table entry, even if the Guest says it's writable. That way |
286 | * we come back here when a write does actually occur, so we can | 274 | * we come back here when a write does actually occur, so we can |
287 | * update the Guest's _PAGE_DIRTY flag. */ | 275 | * update the Guest's _PAGE_DIRTY flag. */ |
288 | gpte_t ro_gpte = gpte; | 276 | *spte = gpte_to_spte(lg, pte_wrprotect(gpte), 0); |
289 | ro_gpte.flags &= ~_PAGE_RW; | ||
290 | *spte = gpte_to_spte(lg, ro_gpte, 0); | ||
291 | } | ||
292 | 277 | ||
293 | /* Finally, we write the Guest PTE entry back: we've set the | 278 | /* Finally, we write the Guest PTE entry back: we've set the |
294 | * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ | 279 | * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ |
295 | lgwrite_u32(lg, gpte_ptr, gpte.raw.val); | 280 | lgwrite_u32(lg, gpte_ptr, pte_val(gpte)); |
296 | 281 | ||
297 | /* We succeeded in mapping the page! */ | 282 | /* We succeeded in mapping the page! */ |
298 | return 1; | 283 | return 1; |
@@ -308,17 +293,18 @@ int demand_page(struct lguest *lg, unsigned long vaddr, int errcode) | |||
308 | * mapped by the shadow page tables, and is it writable? */ | 293 | * mapped by the shadow page tables, and is it writable? */ |
309 | static int page_writable(struct lguest *lg, unsigned long vaddr) | 294 | static int page_writable(struct lguest *lg, unsigned long vaddr) |
310 | { | 295 | { |
311 | spgd_t *spgd; | 296 | pgd_t *spgd; |
312 | unsigned long flags; | 297 | unsigned long flags; |
313 | 298 | ||
314 | /* Look at the top level entry: is it present? */ | 299 | /* Look at the top level entry: is it present? */ |
315 | spgd = spgd_addr(lg, lg->pgdidx, vaddr); | 300 | spgd = spgd_addr(lg, lg->pgdidx, vaddr); |
316 | if (!(spgd->flags & _PAGE_PRESENT)) | 301 | if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) |
317 | return 0; | 302 | return 0; |
318 | 303 | ||
319 | /* Check the flags on the pte entry itself: it must be present and | 304 | /* Check the flags on the pte entry itself: it must be present and |
320 | * writable. */ | 305 | * writable. */ |
321 | flags = spte_addr(lg, *spgd, vaddr)->flags; | 306 | flags = pte_flags(*(spte_addr(lg, *spgd, vaddr))); |
307 | |||
322 | return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); | 308 | return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); |
323 | } | 309 | } |
324 | 310 | ||
@@ -332,22 +318,22 @@ void pin_page(struct lguest *lg, unsigned long vaddr) | |||
332 | } | 318 | } |
333 | 319 | ||
334 | /*H:450 If we chase down the release_pgd() code, it looks like this: */ | 320 | /*H:450 If we chase down the release_pgd() code, it looks like this: */ |
335 | static void release_pgd(struct lguest *lg, spgd_t *spgd) | 321 | static void release_pgd(struct lguest *lg, pgd_t *spgd) |
336 | { | 322 | { |
337 | /* If the entry's not present, there's nothing to release. */ | 323 | /* If the entry's not present, there's nothing to release. */ |
338 | if (spgd->flags & _PAGE_PRESENT) { | 324 | if (pgd_flags(*spgd) & _PAGE_PRESENT) { |
339 | unsigned int i; | 325 | unsigned int i; |
340 | /* Converting the pfn to find the actual PTE page is easy: turn | 326 | /* Converting the pfn to find the actual PTE page is easy: turn |
341 | * the page number into a physical address, then convert to a | 327 | * the page number into a physical address, then convert to a |
342 | * virtual address (easy for kernel pages like this one). */ | 328 | * virtual address (easy for kernel pages like this one). */ |
343 | spte_t *ptepage = __va(spgd->pfn << PAGE_SHIFT); | 329 | pte_t *ptepage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); |
344 | /* For each entry in the page, we might need to release it. */ | 330 | /* For each entry in the page, we might need to release it. */ |
345 | for (i = 0; i < PTES_PER_PAGE; i++) | 331 | for (i = 0; i < PTRS_PER_PTE; i++) |
346 | release_pte(ptepage[i]); | 332 | release_pte(ptepage[i]); |
347 | /* Now we can free the page of PTEs */ | 333 | /* Now we can free the page of PTEs */ |
348 | free_page((long)ptepage); | 334 | free_page((long)ptepage); |
349 | /* And zero out the PGD entry so we never release it twice. */ | 335 | /* And zero out the PGD entry so we never release it twice. */ |
350 | spgd->raw.val = 0; | 336 | *spgd = __pgd(0); |
351 | } | 337 | } |
352 | } | 338 | } |
353 | 339 | ||
@@ -359,7 +345,7 @@ static void flush_user_mappings(struct lguest *lg, int idx) | |||
359 | { | 345 | { |
360 | unsigned int i; | 346 | unsigned int i; |
361 | /* Release every pgd entry up to the kernel's address. */ | 347 | /* Release every pgd entry up to the kernel's address. */ |
362 | for (i = 0; i < vaddr_to_pgd_index(lg->page_offset); i++) | 348 | for (i = 0; i < pgd_index(lg->page_offset); i++) |
363 | release_pgd(lg, lg->pgdirs[idx].pgdir + i); | 349 | release_pgd(lg, lg->pgdirs[idx].pgdir + i); |
364 | } | 350 | } |
365 | 351 | ||
@@ -398,7 +384,7 @@ static unsigned int new_pgdir(struct lguest *lg, | |||
398 | next = random32() % ARRAY_SIZE(lg->pgdirs); | 384 | next = random32() % ARRAY_SIZE(lg->pgdirs); |
399 | /* If it's never been allocated at all before, try now. */ | 385 | /* If it's never been allocated at all before, try now. */ |
400 | if (!lg->pgdirs[next].pgdir) { | 386 | if (!lg->pgdirs[next].pgdir) { |
401 | lg->pgdirs[next].pgdir = (spgd_t *)get_zeroed_page(GFP_KERNEL); | 387 | lg->pgdirs[next].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); |
402 | /* If the allocation fails, just keep using the one we have */ | 388 | /* If the allocation fails, just keep using the one we have */ |
403 | if (!lg->pgdirs[next].pgdir) | 389 | if (!lg->pgdirs[next].pgdir) |
404 | next = lg->pgdidx; | 390 | next = lg->pgdidx; |
@@ -475,26 +461,27 @@ void guest_pagetable_clear_all(struct lguest *lg) | |||
475 | * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately. | 461 | * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately. |
476 | */ | 462 | */ |
477 | static void do_set_pte(struct lguest *lg, int idx, | 463 | static void do_set_pte(struct lguest *lg, int idx, |
478 | unsigned long vaddr, gpte_t gpte) | 464 | unsigned long vaddr, pte_t gpte) |
479 | { | 465 | { |
480 | /* Look up the matching shadow page directory entry. */ | 466 | /* Look up the matching shadow page directory entry. */ |
481 | spgd_t *spgd = spgd_addr(lg, idx, vaddr); | 467 | pgd_t *spgd = spgd_addr(lg, idx, vaddr); |
482 | 468 | ||
483 | /* If the top level isn't present, there's no entry to update. */ | 469 | /* If the top level isn't present, there's no entry to update. */ |
484 | if (spgd->flags & _PAGE_PRESENT) { | 470 | if (pgd_flags(*spgd) & _PAGE_PRESENT) { |
485 | /* Otherwise, we start by releasing the existing entry. */ | 471 | /* Otherwise, we start by releasing the existing entry. */ |
486 | spte_t *spte = spte_addr(lg, *spgd, vaddr); | 472 | pte_t *spte = spte_addr(lg, *spgd, vaddr); |
487 | release_pte(*spte); | 473 | release_pte(*spte); |
488 | 474 | ||
489 | /* If they're setting this entry as dirty or accessed, we might | 475 | /* If they're setting this entry as dirty or accessed, we might |
490 | * as well put that entry they've given us in now. This shaves | 476 | * as well put that entry they've given us in now. This shaves |
491 | * 10% off a copy-on-write micro-benchmark. */ | 477 | * 10% off a copy-on-write micro-benchmark. */ |
492 | if (gpte.flags & (_PAGE_DIRTY | _PAGE_ACCESSED)) { | 478 | if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { |
493 | check_gpte(lg, gpte); | 479 | check_gpte(lg, gpte); |
494 | *spte = gpte_to_spte(lg, gpte, gpte.flags&_PAGE_DIRTY); | 480 | *spte = gpte_to_spte(lg, gpte, |
481 | pte_flags(gpte) & _PAGE_DIRTY); | ||
495 | } else | 482 | } else |
496 | /* Otherwise we can demand_page() it in later. */ | 483 | /* Otherwise we can demand_page() it in later. */ |
497 | spte->raw.val = 0; | 484 | *spte = __pte(0); |
498 | } | 485 | } |
499 | } | 486 | } |
500 | 487 | ||
@@ -509,7 +496,7 @@ static void do_set_pte(struct lguest *lg, int idx, | |||
509 | * The benefit is that when we have to track a new page table, we can keep | 496 | * The benefit is that when we have to track a new page table, we can keep |
510 | * all the kernel mappings. This speeds up context switch immensely. */ | 497 | * all the kernel mappings. This speeds up context switch immensely. */ |
511 | void guest_set_pte(struct lguest *lg, | 498 | void guest_set_pte(struct lguest *lg, |
512 | unsigned long cr3, unsigned long vaddr, gpte_t gpte) | 499 | unsigned long cr3, unsigned long vaddr, pte_t gpte) |
513 | { | 500 | { |
514 | /* Kernel mappings must be changed on all top levels. Slow, but | 501 | /* Kernel mappings must be changed on all top levels. Slow, but |
515 | * doesn't happen often. */ | 502 | * doesn't happen often. */ |
@@ -564,15 +551,15 @@ void guest_set_pmd(struct lguest *lg, unsigned long cr3, u32 idx) | |||
564 | int init_guest_pagetable(struct lguest *lg, unsigned long pgtable) | 551 | int init_guest_pagetable(struct lguest *lg, unsigned long pgtable) |
565 | { | 552 | { |
566 | /* In flush_user_mappings() we loop from 0 to | 553 | /* In flush_user_mappings() we loop from 0 to |
567 | * "vaddr_to_pgd_index(lg->page_offset)". This assumes it won't hit | 554 | * "pgd_index(lg->page_offset)". This assumes it won't hit |
568 | * the Switcher mappings, so check that now. */ | 555 | * the Switcher mappings, so check that now. */ |
569 | if (vaddr_to_pgd_index(lg->page_offset) >= SWITCHER_PGD_INDEX) | 556 | if (pgd_index(lg->page_offset) >= SWITCHER_PGD_INDEX) |
570 | return -EINVAL; | 557 | return -EINVAL; |
571 | /* We start on the first shadow page table, and give it a blank PGD | 558 | /* We start on the first shadow page table, and give it a blank PGD |
572 | * page. */ | 559 | * page. */ |
573 | lg->pgdidx = 0; | 560 | lg->pgdidx = 0; |
574 | lg->pgdirs[lg->pgdidx].cr3 = pgtable; | 561 | lg->pgdirs[lg->pgdidx].cr3 = pgtable; |
575 | lg->pgdirs[lg->pgdidx].pgdir = (spgd_t*)get_zeroed_page(GFP_KERNEL); | 562 | lg->pgdirs[lg->pgdidx].pgdir = (pgd_t*)get_zeroed_page(GFP_KERNEL); |
576 | if (!lg->pgdirs[lg->pgdidx].pgdir) | 563 | if (!lg->pgdirs[lg->pgdidx].pgdir) |
577 | return -ENOMEM; | 564 | return -ENOMEM; |
578 | return 0; | 565 | return 0; |
@@ -597,14 +584,14 @@ void free_guest_pagetable(struct lguest *lg) | |||
597 | * for each CPU already set up, we just need to hook them in. */ | 584 | * for each CPU already set up, we just need to hook them in. */ |
598 | void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages) | 585 | void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages) |
599 | { | 586 | { |
600 | spte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); | 587 | pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); |
601 | spgd_t switcher_pgd; | 588 | pgd_t switcher_pgd; |
602 | spte_t regs_pte; | 589 | pte_t regs_pte; |
603 | 590 | ||
604 | /* Make the last PGD entry for this Guest point to the Switcher's PTE | 591 | /* Make the last PGD entry for this Guest point to the Switcher's PTE |
605 | * page for this CPU (with appropriate flags). */ | 592 | * page for this CPU (with appropriate flags). */ |
606 | switcher_pgd.pfn = __pa(switcher_pte_page) >> PAGE_SHIFT; | 593 | switcher_pgd = __pgd(__pa(switcher_pte_page) | _PAGE_KERNEL); |
607 | switcher_pgd.flags = _PAGE_KERNEL; | 594 | |
608 | lg->pgdirs[lg->pgdidx].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; | 595 | lg->pgdirs[lg->pgdidx].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; |
609 | 596 | ||
610 | /* We also change the Switcher PTE page. When we're running the Guest, | 597 | /* We also change the Switcher PTE page. When we're running the Guest, |
@@ -614,10 +601,8 @@ void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages) | |||
614 | * CPU's "struct lguest_pages": if we make sure the Guest's register | 601 | * CPU's "struct lguest_pages": if we make sure the Guest's register |
615 | * page is already mapped there, we don't have to copy them out | 602 | * page is already mapped there, we don't have to copy them out |
616 | * again. */ | 603 | * again. */ |
617 | regs_pte.pfn = __pa(lg->regs_page) >> PAGE_SHIFT; | 604 | regs_pte = pfn_pte (__pa(lg->regs_page) >> PAGE_SHIFT, __pgprot(_PAGE_KERNEL)); |
618 | regs_pte.flags = _PAGE_KERNEL; | 605 | switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTRS_PER_PTE] = regs_pte; |
619 | switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTES_PER_PAGE] | ||
620 | = regs_pte; | ||
621 | } | 606 | } |
622 | /*:*/ | 607 | /*:*/ |
623 | 608 | ||
@@ -638,24 +623,25 @@ static __init void populate_switcher_pte_page(unsigned int cpu, | |||
638 | unsigned int pages) | 623 | unsigned int pages) |
639 | { | 624 | { |
640 | unsigned int i; | 625 | unsigned int i; |
641 | spte_t *pte = switcher_pte_page(cpu); | 626 | pte_t *pte = switcher_pte_page(cpu); |
642 | 627 | ||
643 | /* The first entries are easy: they map the Switcher code. */ | 628 | /* The first entries are easy: they map the Switcher code. */ |
644 | for (i = 0; i < pages; i++) { | 629 | for (i = 0; i < pages; i++) { |
645 | pte[i].pfn = page_to_pfn(switcher_page[i]); | 630 | pte[i] = mk_pte(switcher_page[i], |
646 | pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED; | 631 | __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)); |
647 | } | 632 | } |
648 | 633 | ||
649 | /* The only other thing we map is this CPU's pair of pages. */ | 634 | /* The only other thing we map is this CPU's pair of pages. */ |
650 | i = pages + cpu*2; | 635 | i = pages + cpu*2; |
651 | 636 | ||
652 | /* First page (Guest registers) is writable from the Guest */ | 637 | /* First page (Guest registers) is writable from the Guest */ |
653 | pte[i].pfn = page_to_pfn(switcher_page[i]); | 638 | pte[i] = pfn_pte(page_to_pfn(switcher_page[i]), |
654 | pte[i].flags = _PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW; | 639 | __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW)); |
640 | |||
655 | /* The second page contains the "struct lguest_ro_state", and is | 641 | /* The second page contains the "struct lguest_ro_state", and is |
656 | * read-only. */ | 642 | * read-only. */ |
657 | pte[i+1].pfn = page_to_pfn(switcher_page[i+1]); | 643 | pte[i+1] = pfn_pte(page_to_pfn(switcher_page[i+1]), |
658 | pte[i+1].flags = _PAGE_PRESENT|_PAGE_ACCESSED; | 644 | __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)); |
659 | } | 645 | } |
660 | 646 | ||
661 | /*H:510 At boot or module load time, init_pagetables() allocates and populates | 647 | /*H:510 At boot or module load time, init_pagetables() allocates and populates |
@@ -665,7 +651,7 @@ __init int init_pagetables(struct page **switcher_page, unsigned int pages) | |||
665 | unsigned int i; | 651 | unsigned int i; |
666 | 652 | ||
667 | for_each_possible_cpu(i) { | 653 | for_each_possible_cpu(i) { |
668 | switcher_pte_page(i) = (spte_t *)get_zeroed_page(GFP_KERNEL); | 654 | switcher_pte_page(i) = (pte_t *)get_zeroed_page(GFP_KERNEL); |
669 | if (!switcher_pte_page(i)) { | 655 | if (!switcher_pte_page(i)) { |
670 | free_switcher_pte_pages(); | 656 | free_switcher_pte_pages(); |
671 | return -ENOMEM; | 657 | return -ENOMEM; |