path: root/arch/i386/xen/mmu.c
author     Jeremy Fitzhardinge <jeremy@xensource.com>    2007-07-17 21:37:04 -0400
committer  Jeremy Fitzhardinge <jeremy@goop.org>         2007-07-18 11:47:42 -0400
commit     3b827c1b3aadf3adb4c602d19863f2d24e7cbc18 (patch)
tree       c889f2e3023102be09173d53dd3620567c9e6fe3 /arch/i386/xen/mmu.c
parent     5ead97c84fa7d63a6a7a2f4e9f18f452bd109045 (diff)
xen: virtual mmu
Xen pagetable handling, including the machinery to implement direct pagetables.

Xen presents the real CPU's pagetables directly to guests, with no added shadowing or other layer of abstraction. Naturally this means the hypervisor must maintain close control over what the guest can put into the pagetable.

When the guest modifies a pte/pmd/pgd, it must convert its domain-specific notion of a "physical" pfn into a global machine frame number (mfn) before inserting the entry into the pagetable. Xen will check to make sure the domain is allowed to create a mapping of the given mfn.

Xen also requires that all mappings the guest has of its own active pagetable are read-only. This is relatively easy to implement in Linux because all pagetables share the same pte pages for kernel mappings, so updating the pte in one pagetable will implicitly update the mapping in all pagetables.

Normally a pagetable becomes active when you point to it with cr3 (or the Xen equivalent), but when you do so, Xen must check the whole pagetable for correctness, which is clearly a performance problem. Xen solves this with pinning, which keeps a pagetable effectively active even if it is currently unused, which means that all the normal update rules are enforced. As a result, Xen need not revalidate the pagetable when loading cr3.

This patch has a first-cut implementation of pinning, but it is more fully implemented in a later patch.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
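To make the pfn/mfn conversion concrete, here is a minimal, self-contained sketch of the idea (illustrative only: the table name p2m and the helper make_machine_pte are hypothetical stand-ins, not the helpers this series actually uses; the real translation code lives in the Xen headers, not in this patch):

/* Illustrative sketch, not part of this patch: converting a guest-"physical"
 * pte into a machine pte before handing it to Xen. */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT     12
#define PAGE_MASK      (~((1UL << PAGE_SHIFT) - 1))
#define _PAGE_PRESENT  0x001UL

/* Hypothetical per-domain pfn -> mfn table; in a real guest this mapping
 * is provided by Xen (the "phys-to-machine" table). */
static unsigned long p2m[16] = { 0x4d2, 0x8a0, 0x777 };

static unsigned long make_machine_pte(unsigned long guest_pte)
{
        unsigned long pfn, mfn;

        if (!(guest_pte & _PAGE_PRESENT))
                return guest_pte;       /* non-present entries pass through */

        pfn = guest_pte >> PAGE_SHIFT;          /* domain-local frame number */
        mfn = p2m[pfn];                         /* global machine frame number */
        return (mfn << PAGE_SHIFT) | (guest_pte & ~PAGE_MASK);  /* keep flags */
}

int main(void)
{
        unsigned long pte = (2UL << PAGE_SHIFT) | _PAGE_PRESENT; /* pfn 2, present */
        printf("guest pte %#lx -> machine pte %#lx\n", pte, make_machine_pte(pte));
        return 0;
}

Reading an entry back goes through the inverse (machine-to-phys) table, which is what the xen_*_val functions below do.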
Diffstat (limited to 'arch/i386/xen/mmu.c')
-rw-r--r--  arch/i386/xen/mmu.c  |  420
1 file changed, 420 insertions(+), 0 deletions(-)
diff --git a/arch/i386/xen/mmu.c b/arch/i386/xen/mmu.c
new file mode 100644
index 000000000000..de16cb5f55ca
--- /dev/null
+++ b/arch/i386/xen/mmu.c
@@ -0,0 +1,420 @@
/*
 * Xen mmu operations
 *
 * This file contains the various mmu fetch and update operations.
 * The most important job they must perform is the mapping between the
 * domain's pfn and the overall machine mfns.
 *
 * Xen allows guests to directly update the pagetable, in a controlled
 * fashion. In other words, the guest modifies the same pagetable
 * that the CPU actually uses, which eliminates the overhead of having
 * a separate shadow pagetable.
 *
 * In order to allow this, it falls on the guest domain to map its
 * notion of a "physical" pfn - which is just a domain-local linear
 * address - into a real "machine address" which the CPU's MMU can
 * use.
 *
 * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
 * inserted directly into the pagetable. When creating a new
 * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely,
 * when reading the content back with __(pgd|pmd|pte)_val, it converts
 * the mfn back into a pfn.
 *
 * The other constraint is that all pages which make up a pagetable
 * must be mapped read-only in the guest. This prevents uncontrolled
 * guest updates to the pagetable. Xen strictly enforces this, and
 * will disallow any pagetable update which will end up mapping a
 * pagetable page RW, and will disallow using any writable page as a
 * pagetable.
 *
 * Naively, when loading %cr3 with the base of a new pagetable, Xen
 * would need to validate the whole pagetable before going on.
 * Naturally, this is quite slow. The solution is to "pin" a
 * pagetable, which enforces all the constraints on the pagetable even
 * when it is not actively in use. This means that Xen can be assured
 * that it is still valid when you do load it into %cr3, and doesn't
 * need to revalidate it.
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */
#include <linux/bug.h>
#include <linux/sched.h>

#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

#include <asm/xen/hypercall.h>
#include <asm/paravirt.h>

#include <xen/page.h>
#include <xen/interface/xen.h>

#include "mmu.h"

xmaddr_t arbitrary_virt_to_machine(unsigned long address)
{
        pte_t *pte = lookup_address(address);
        unsigned offset = address & ~PAGE_MASK; /* byte offset within the page */

        BUG_ON(pte == NULL);

        return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset);
}
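
/*
 * Worked example (illustrative note, not in the original patch): with 4k
 * pages, PAGE_MASK is 0xfffff000 on i386, so for address 0xc0123456 the
 * byte offset within the page is (address & ~PAGE_MASK) == 0x456, while
 * (address & PAGE_MASK) == 0xc0123000 is the page-aligned base.
 */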

void make_lowmem_page_readonly(void *vaddr)
{
        pte_t *pte, ptev;
        unsigned long address = (unsigned long)vaddr;

        pte = lookup_address(address);
        BUG_ON(pte == NULL);

        ptev = pte_wrprotect(*pte);

        if (HYPERVISOR_update_va_mapping(address, ptev, 0))
                BUG();
}

void make_lowmem_page_readwrite(void *vaddr)
{
        pte_t *pte, ptev;
        unsigned long address = (unsigned long)vaddr;

        pte = lookup_address(address);
        BUG_ON(pte == NULL);

        ptev = pte_mkwrite(*pte);

        if (HYPERVISOR_update_va_mapping(address, ptev, 0))
                BUG();
}


void xen_set_pte(pte_t *ptep, pte_t pte)
{
        struct mmu_update u;

        u.ptr = virt_to_machine(ptep).maddr;
        u.val = pte_val_ma(pte);
        if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0)
                BUG();
}

void xen_set_pmd(pmd_t *ptr, pmd_t val)
{
        struct mmu_update u;

        u.ptr = virt_to_machine(ptr).maddr;
        u.val = pmd_val_ma(val);
        if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0)
                BUG();
}

#ifdef CONFIG_X86_PAE
void xen_set_pud(pud_t *ptr, pud_t val)
{
        struct mmu_update u;

        u.ptr = virt_to_machine(ptr).maddr;
        u.val = pud_val_ma(val);
        if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0)
                BUG();
}
#endif

/*
 * Associate a virtual page frame with a given physical page frame
 * and protection flags for that frame.
 */
void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        pgd = swapper_pg_dir + pgd_index(vaddr);
        if (pgd_none(*pgd)) {
                BUG();
                return;
        }
        pud = pud_offset(pgd, vaddr);
        if (pud_none(*pud)) {
                BUG();
                return;
        }
        pmd = pmd_offset(pud, vaddr);
        if (pmd_none(*pmd)) {
                BUG();
                return;
        }
        pte = pte_offset_kernel(pmd, vaddr);
        /* <mfn,flags> stored as-is, to permit clearing entries */
        xen_set_pte(pte, mfn_pte(mfn, flags));

        /*
         * It's enough to flush this one mapping.
         * (PGE mappings get flushed as well)
         */
        __flush_tlb_one(vaddr);
}

void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
                    pte_t *ptep, pte_t pteval)
{
        if ((mm != current->mm && mm != &init_mm) ||
            HYPERVISOR_update_va_mapping(addr, pteval, 0) != 0)
                xen_set_pte(ptep, pteval);
}

#ifdef CONFIG_X86_PAE
void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
{
        set_64bit((u64 *)ptep, pte_val_ma(pte));
}

void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
        ptep->pte_low = 0;
        smp_wmb();              /* make sure low gets written first */
        ptep->pte_high = 0;
}

void xen_pmd_clear(pmd_t *pmdp)
{
        xen_set_pmd(pmdp, __pmd(0));
}

unsigned long long xen_pte_val(pte_t pte)
{
        unsigned long long ret = 0;

        if (pte.pte_low) {
                ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low;
                ret = machine_to_phys(XMADDR(ret)).paddr | 1;
        }

        return ret;
}

unsigned long long xen_pmd_val(pmd_t pmd)
{
        unsigned long long ret = pmd.pmd;
        if (ret)
                ret = machine_to_phys(XMADDR(ret)).paddr | 1;
        return ret;
}

unsigned long long xen_pgd_val(pgd_t pgd)
{
        unsigned long long ret = pgd.pgd;
        if (ret)
                ret = machine_to_phys(XMADDR(ret)).paddr | 1;
        return ret;
}

pte_t xen_make_pte(unsigned long long pte)
{
        if (pte & 1)
                pte = phys_to_machine(XPADDR(pte)).maddr;

        return (pte_t){ pte, pte >> 32 };
}

pmd_t xen_make_pmd(unsigned long long pmd)
{
        if (pmd & 1)
                pmd = phys_to_machine(XPADDR(pmd)).maddr;

        return (pmd_t){ pmd };
}

pgd_t xen_make_pgd(unsigned long long pgd)
{
        if (pgd & _PAGE_PRESENT)
                pgd = phys_to_machine(XPADDR(pgd)).maddr;

        return (pgd_t){ pgd };
}
#else  /* !PAE */
unsigned long xen_pte_val(pte_t pte)
{
        unsigned long ret = pte.pte_low;

        if (ret & _PAGE_PRESENT)
                ret = machine_to_phys(XMADDR(ret)).paddr;

        return ret;
}

unsigned long xen_pmd_val(pmd_t pmd)
{
        /* a BUG here is a lot easier to track down than a NULL eip */
        BUG();
        return 0;
}

unsigned long xen_pgd_val(pgd_t pgd)
{
        unsigned long ret = pgd.pgd;
        if (ret)
                ret = machine_to_phys(XMADDR(ret)).paddr | 1;
        return ret;
}

pte_t xen_make_pte(unsigned long pte)
{
        if (pte & _PAGE_PRESENT)
                pte = phys_to_machine(XPADDR(pte)).maddr;

        return (pte_t){ pte };
}

pmd_t xen_make_pmd(unsigned long pmd)
{
        /* a BUG here is a lot easier to track down than a NULL eip */
        BUG();
        return __pmd(0);
}

pgd_t xen_make_pgd(unsigned long pgd)
{
        if (pgd & _PAGE_PRESENT)
                pgd = phys_to_machine(XPADDR(pgd)).maddr;

        return (pgd_t){ pgd };
}
#endif  /* CONFIG_X86_PAE */



static void pgd_walk_set_prot(void *pt, pgprot_t flags)
{
        unsigned long pfn = PFN_DOWN(__pa(pt));

        if (HYPERVISOR_update_va_mapping((unsigned long)pt,
                                         pfn_pte(pfn, flags), 0) < 0)
                BUG();
}

static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
{
        pgd_t *pgd = pgd_base;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;
        int g, u, m;

        if (xen_feature(XENFEAT_auto_translated_physmap))
                return;

        for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
                if (pgd_none(*pgd))
                        continue;
                pud = pud_offset(pgd, 0);

                if (PTRS_PER_PUD > 1)           /* not folded */
                        pgd_walk_set_prot(pud, flags);

                for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
                        if (pud_none(*pud))
                                continue;
                        pmd = pmd_offset(pud, 0);

                        if (PTRS_PER_PMD > 1)   /* not folded */
                                pgd_walk_set_prot(pmd, flags);

                        for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
                                if (pmd_none(*pmd))
                                        continue;

                                /* This can get called before mem_map
                                   is set up, so we assume nothing is
                                   highmem at that point. */
                                if (mem_map == NULL ||
                                    !PageHighMem(pmd_page(*pmd))) {
                                        pte = pte_offset_kernel(pmd, 0);
                                        pgd_walk_set_prot(pte, flags);
                                }
                        }
                }
        }

        if (HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
                                         pfn_pte(PFN_DOWN(__pa(pgd_base)),
                                                 flags),
                                         UVMF_TLB_FLUSH) < 0)
                BUG();
}


/* This is called just after a mm has been duplicated from its parent,
   but it has not been used yet. We need to make sure that its
   pagetable is all read-only, and can be pinned. */
void xen_pgd_pin(pgd_t *pgd)
{
        struct mmuext_op op;

        pgd_walk(pgd, PAGE_KERNEL_RO);

#if defined(CONFIG_X86_PAE)
        op.cmd = MMUEXT_PIN_L3_TABLE;
#else
        op.cmd = MMUEXT_PIN_L2_TABLE;
#endif
        op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
        if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
                BUG();
}

/* Release a pagetable's pages back as normal RW */
void xen_pgd_unpin(pgd_t *pgd)
{
        struct mmuext_op op;

        op.cmd = MMUEXT_UNPIN_TABLE;
        op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));

        if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
                BUG();

        pgd_walk(pgd, PAGE_KERNEL);
}


void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
{
        xen_pgd_pin(next->pgd);
}

void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
        xen_pgd_pin(mm->pgd);
}

void xen_exit_mmap(struct mm_struct *mm)
{
        struct task_struct *tsk = current;

        task_lock(tsk);

        /*
         * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
         * *much* faster this way, as no tlb flushes means bigger wrpt batches.
         */
        if (tsk->active_mm == mm) {
                tsk->active_mm = &init_mm;
                atomic_inc(&init_mm.mm_count);

                switch_mm(mm, &init_mm, tsk);

                atomic_dec(&mm->mm_count);
                BUG_ON(atomic_read(&mm->mm_count) == 0);
        }

        task_unlock(tsk);

        xen_pgd_unpin(mm->pgd);
}
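
For context (not part of this file): these functions are not called directly by generic kernel code; they are installed as paravirt hooks by the Xen setup code added in the parent patch. A rough, abbreviated sketch of that wiring, with the field names assumed to match the i386 paravirt_ops of this kernel generation, might look like:

/* Abbreviated, illustrative excerpt only -- the real table lives in
 * arch/i386/xen/enlighten.c, and only mmu-related hooks are shown. */
static struct paravirt_ops xen_paravirt_ops = {
        /* ... non-mmu hooks elided ... */
        .set_pte      = xen_set_pte,
        .set_pte_at   = xen_set_pte_at,
        .set_pmd      = xen_set_pmd,
        .pte_val      = xen_pte_val,
        .pgd_val      = xen_pgd_val,
        .make_pte     = xen_make_pte,
        .make_pgd     = xen_make_pgd,
        .activate_mm  = xen_activate_mm,
        .dup_mmap     = xen_dup_mmap,
        .exit_mmap    = xen_exit_mmap,
        /* ... */
};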