diff options
author | Jeremy Fitzhardinge <jeremy@xensource.com> | 2007-07-17 21:37:05 -0400 |
---|---|---|
committer | Jeremy Fitzhardinge <jeremy@goop.org> | 2007-07-18 11:47:43 -0400 |
commit | f4f97b3ea90130520afb478cbc2918be2b6587b8 (patch) | |
tree | 1aeebe3230b4a7eef0630eec148927c1adf340a5 /arch/i386 | |
parent | c85b04c3749507546f6d5868976e4793e35c2ec0 (diff) |
xen: Complete pagetable pinning
Xen requires all active pagetables to be marked read-only. When the
base of the pagetable is loaded into %cr3, the hypervisor validates
the entire pagetable and only allows the load to proceed if it all
checks out.
This is pretty slow, so to mitigate this cost Xen has a notion of
pinned pagetables. Pinned pagetables are pagetables which are
considered to be active even if no processor's cr3 is pointing to them.
This means that they must remain read-only and all updates are validated
by the hypervisor. This makes context switches much cheaper, because
the hypervisor doesn't need to revalidate the pagetable each time.
This also adds a new paravirt hook which is called during setup once
the zones and memory allocator have been initialized. When the
init_mm pagetable is first built, the struct page array does not yet
exist, and so there's nowhere to put the init_mm pagetable's PG_pinned
flags. Once the zones are initialized and the struct page array
exists, we can set the PG_pinned flags for those pages.
This patch also adds the Xen support for pte pages allocated out of
highmem (highpte) by implementing xen_kmap_atomic_pte.
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Cc: Zach Amsden <zach@vmware.com>
Diffstat (limited to 'arch/i386')
-rw-r--r-- | arch/i386/xen/enlighten.c | 87 | ||||
-rw-r--r-- | arch/i386/xen/mmu.c | 260 | ||||
-rw-r--r-- | arch/i386/xen/mmu.h | 2 | ||||
-rw-r--r-- | arch/i386/xen/xen-ops.h | 2 |
4 files changed, 242 insertions, 109 deletions
diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c index 25eb3592f11d..86e68e680116 100644 --- a/arch/i386/xen/enlighten.c +++ b/arch/i386/xen/enlighten.c | |||
@@ -21,6 +21,9 @@ | |||
21 | #include <linux/sched.h> | 21 | #include <linux/sched.h> |
22 | #include <linux/bootmem.h> | 22 | #include <linux/bootmem.h> |
23 | #include <linux/module.h> | 23 | #include <linux/module.h> |
24 | #include <linux/mm.h> | ||
25 | #include <linux/page-flags.h> | ||
26 | #include <linux/highmem.h> | ||
24 | 27 | ||
25 | #include <xen/interface/xen.h> | 28 | #include <xen/interface/xen.h> |
26 | #include <xen/interface/physdev.h> | 29 | #include <xen/interface/physdev.h> |
@@ -500,32 +503,59 @@ static void xen_write_cr3(unsigned long cr3) | |||
500 | } | 503 | } |
501 | } | 504 | } |
502 | 505 | ||
503 | static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) | 506 | /* Early in boot, while setting up the initial pagetable, assume |
507 | everything is pinned. */ | ||
508 | static void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn) | ||
504 | { | 509 | { |
505 | /* XXX pfn isn't necessarily a lowmem page */ | 510 | BUG_ON(mem_map); /* should only be used early */ |
506 | make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); | 511 | make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); |
507 | } | 512 | } |
508 | 513 | ||
509 | static void xen_alloc_pd(u32 pfn) | 514 | /* This needs to make sure the new pte page is pinned iff its being |
515 | attached to a pinned pagetable. */ | ||
516 | static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) | ||
510 | { | 517 | { |
511 | make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); | 518 | struct page *page = pfn_to_page(pfn); |
512 | } | ||
513 | 519 | ||
514 | static void xen_release_pd(u32 pfn) | 520 | if (PagePinned(virt_to_page(mm->pgd))) { |
515 | { | 521 | SetPagePinned(page); |
516 | make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); | 522 | |
523 | if (!PageHighMem(page)) | ||
524 | make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); | ||
525 | else | ||
526 | /* make sure there are no stray mappings of | ||
527 | this page */ | ||
528 | kmap_flush_unused(); | ||
529 | } | ||
517 | } | 530 | } |
518 | 531 | ||
532 | /* This should never happen until we're OK to use struct page */ | ||
519 | static void xen_release_pt(u32 pfn) | 533 | static void xen_release_pt(u32 pfn) |
520 | { | 534 | { |
521 | make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); | 535 | struct page *page = pfn_to_page(pfn); |
536 | |||
537 | if (PagePinned(page)) { | ||
538 | if (!PageHighMem(page)) | ||
539 | make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); | ||
540 | } | ||
522 | } | 541 | } |
523 | 542 | ||
524 | static void xen_alloc_pd_clone(u32 pfn, u32 clonepfn, | 543 | #ifdef CONFIG_HIGHPTE |
525 | u32 start, u32 count) | 544 | static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) |
526 | { | 545 | { |
527 | xen_alloc_pd(pfn); | 546 | pgprot_t prot = PAGE_KERNEL; |
547 | |||
548 | if (PagePinned(page)) | ||
549 | prot = PAGE_KERNEL_RO; | ||
550 | |||
551 | if (0 && PageHighMem(page)) | ||
552 | printk("mapping highpte %lx type %d prot %s\n", | ||
553 | page_to_pfn(page), type, | ||
554 | (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ"); | ||
555 | |||
556 | return kmap_atomic_prot(page, type, prot); | ||
528 | } | 557 | } |
558 | #endif | ||
529 | 559 | ||
530 | static __init void xen_pagetable_setup_start(pgd_t *base) | 560 | static __init void xen_pagetable_setup_start(pgd_t *base) |
531 | { | 561 | { |
@@ -553,7 +583,7 @@ static __init void xen_pagetable_setup_start(pgd_t *base) | |||
553 | memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]), | 583 | memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]), |
554 | PAGE_SIZE); | 584 | PAGE_SIZE); |
555 | 585 | ||
556 | xen_alloc_pd(PFN_DOWN(__pa(pmd))); | 586 | make_lowmem_page_readonly(pmd); |
557 | 587 | ||
558 | set_pgd(&base[i], __pgd(1 + __pa(pmd))); | 588 | set_pgd(&base[i], __pgd(1 + __pa(pmd))); |
559 | } else | 589 | } else |
@@ -574,6 +604,10 @@ static __init void xen_pagetable_setup_start(pgd_t *base) | |||
574 | 604 | ||
575 | static __init void xen_pagetable_setup_done(pgd_t *base) | 605 | static __init void xen_pagetable_setup_done(pgd_t *base) |
576 | { | 606 | { |
607 | /* This will work as long as patching hasn't happened yet | ||
608 | (which it hasn't) */ | ||
609 | paravirt_ops.alloc_pt = xen_alloc_pt; | ||
610 | |||
577 | if (!xen_feature(XENFEAT_auto_translated_physmap)) { | 611 | if (!xen_feature(XENFEAT_auto_translated_physmap)) { |
578 | /* | 612 | /* |
579 | * Create a mapping for the shared info page. | 613 | * Create a mapping for the shared info page. |
@@ -591,7 +625,19 @@ static __init void xen_pagetable_setup_done(pgd_t *base) | |||
591 | HYPERVISOR_shared_info = | 625 | HYPERVISOR_shared_info = |
592 | (struct shared_info *)__va(xen_start_info->shared_info); | 626 | (struct shared_info *)__va(xen_start_info->shared_info); |
593 | 627 | ||
594 | xen_pgd_pin(base); | 628 | /* Actually pin the pagetable down, but we can't set PG_pinned |
629 | yet because the page structures don't exist yet. */ | ||
630 | { | ||
631 | struct mmuext_op op; | ||
632 | #ifdef CONFIG_X86_PAE | ||
633 | op.cmd = MMUEXT_PIN_L3_TABLE; | ||
634 | #else | ||
635 | op.cmd = MMUEXT_PIN_L3_TABLE; | ||
636 | #endif | ||
637 | op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base))); | ||
638 | if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) | ||
639 | BUG(); | ||
640 | } | ||
595 | 641 | ||
596 | xen_vcpu_setup(smp_processor_id()); | 642 | xen_vcpu_setup(smp_processor_id()); |
597 | } | 643 | } |
@@ -608,6 +654,7 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = { | |||
608 | .memory_setup = xen_memory_setup, | 654 | .memory_setup = xen_memory_setup, |
609 | .arch_setup = xen_arch_setup, | 655 | .arch_setup = xen_arch_setup, |
610 | .init_IRQ = xen_init_IRQ, | 656 | .init_IRQ = xen_init_IRQ, |
657 | .post_allocator_init = xen_mark_init_mm_pinned, | ||
611 | 658 | ||
612 | .time_init = xen_time_init, | 659 | .time_init = xen_time_init, |
613 | .set_wallclock = xen_set_wallclock, | 660 | .set_wallclock = xen_set_wallclock, |
@@ -688,11 +735,15 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = { | |||
688 | .pagetable_setup_start = xen_pagetable_setup_start, | 735 | .pagetable_setup_start = xen_pagetable_setup_start, |
689 | .pagetable_setup_done = xen_pagetable_setup_done, | 736 | .pagetable_setup_done = xen_pagetable_setup_done, |
690 | 737 | ||
691 | .alloc_pt = xen_alloc_pt, | 738 | .alloc_pt = xen_alloc_pt_init, |
692 | .alloc_pd = xen_alloc_pd, | ||
693 | .alloc_pd_clone = xen_alloc_pd_clone, | ||
694 | .release_pd = xen_release_pd, | ||
695 | .release_pt = xen_release_pt, | 739 | .release_pt = xen_release_pt, |
740 | .alloc_pd = paravirt_nop, | ||
741 | .alloc_pd_clone = paravirt_nop, | ||
742 | .release_pd = paravirt_nop, | ||
743 | |||
744 | #ifdef CONFIG_HIGHPTE | ||
745 | .kmap_atomic_pte = xen_kmap_atomic_pte, | ||
746 | #endif | ||
696 | 747 | ||
697 | .set_pte = xen_set_pte, | 748 | .set_pte = xen_set_pte, |
698 | .set_pte_at = xen_set_pte_at, | 749 | .set_pte_at = xen_set_pte_at, |
diff --git a/arch/i386/xen/mmu.c b/arch/i386/xen/mmu.c index de16cb5f55ca..53501ce2d15c 100644 --- a/arch/i386/xen/mmu.c +++ b/arch/i386/xen/mmu.c | |||
@@ -38,19 +38,22 @@ | |||
38 | * | 38 | * |
39 | * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 | 39 | * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 |
40 | */ | 40 | */ |
41 | #include <linux/highmem.h> | ||
41 | #include <linux/bug.h> | 42 | #include <linux/bug.h> |
42 | #include <linux/sched.h> | 43 | #include <linux/sched.h> |
43 | 44 | ||
44 | #include <asm/pgtable.h> | 45 | #include <asm/pgtable.h> |
45 | #include <asm/tlbflush.h> | 46 | #include <asm/tlbflush.h> |
46 | #include <asm/mmu_context.h> | 47 | #include <asm/mmu_context.h> |
48 | #include <asm/paravirt.h> | ||
47 | 49 | ||
48 | #include <asm/xen/hypercall.h> | 50 | #include <asm/xen/hypercall.h> |
49 | #include <asm/paravirt.h> | 51 | #include <asm/xen/hypervisor.h> |
50 | 52 | ||
51 | #include <xen/page.h> | 53 | #include <xen/page.h> |
52 | #include <xen/interface/xen.h> | 54 | #include <xen/interface/xen.h> |
53 | 55 | ||
56 | #include "multicalls.h" | ||
54 | #include "mmu.h" | 57 | #include "mmu.h" |
55 | 58 | ||
56 | xmaddr_t arbitrary_virt_to_machine(unsigned long address) | 59 | xmaddr_t arbitrary_virt_to_machine(unsigned long address) |
@@ -92,16 +95,6 @@ void make_lowmem_page_readwrite(void *vaddr) | |||
92 | } | 95 | } |
93 | 96 | ||
94 | 97 | ||
95 | void xen_set_pte(pte_t *ptep, pte_t pte) | ||
96 | { | ||
97 | struct mmu_update u; | ||
98 | |||
99 | u.ptr = virt_to_machine(ptep).maddr; | ||
100 | u.val = pte_val_ma(pte); | ||
101 | if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0) | ||
102 | BUG(); | ||
103 | } | ||
104 | |||
105 | void xen_set_pmd(pmd_t *ptr, pmd_t val) | 98 | void xen_set_pmd(pmd_t *ptr, pmd_t val) |
106 | { | 99 | { |
107 | struct mmu_update u; | 100 | struct mmu_update u; |
@@ -112,18 +105,6 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val) | |||
112 | BUG(); | 105 | BUG(); |
113 | } | 106 | } |
114 | 107 | ||
115 | #ifdef CONFIG_X86_PAE | ||
116 | void xen_set_pud(pud_t *ptr, pud_t val) | ||
117 | { | ||
118 | struct mmu_update u; | ||
119 | |||
120 | u.ptr = virt_to_machine(ptr).maddr; | ||
121 | u.val = pud_val_ma(val); | ||
122 | if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0) | ||
123 | BUG(); | ||
124 | } | ||
125 | #endif | ||
126 | |||
127 | /* | 108 | /* |
128 | * Associate a virtual page frame with a given physical page frame | 109 | * Associate a virtual page frame with a given physical page frame |
129 | * and protection flags for that frame. | 110 | * and protection flags for that frame. |
@@ -170,6 +151,23 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, | |||
170 | } | 151 | } |
171 | 152 | ||
172 | #ifdef CONFIG_X86_PAE | 153 | #ifdef CONFIG_X86_PAE |
154 | void xen_set_pud(pud_t *ptr, pud_t val) | ||
155 | { | ||
156 | struct mmu_update u; | ||
157 | |||
158 | u.ptr = virt_to_machine(ptr).maddr; | ||
159 | u.val = pud_val_ma(val); | ||
160 | if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0) | ||
161 | BUG(); | ||
162 | } | ||
163 | |||
164 | void xen_set_pte(pte_t *ptep, pte_t pte) | ||
165 | { | ||
166 | ptep->pte_high = pte.pte_high; | ||
167 | smp_wmb(); | ||
168 | ptep->pte_low = pte.pte_low; | ||
169 | } | ||
170 | |||
173 | void xen_set_pte_atomic(pte_t *ptep, pte_t pte) | 171 | void xen_set_pte_atomic(pte_t *ptep, pte_t pte) |
174 | { | 172 | { |
175 | set_64bit((u64 *)ptep, pte_val_ma(pte)); | 173 | set_64bit((u64 *)ptep, pte_val_ma(pte)); |
@@ -239,6 +237,11 @@ pgd_t xen_make_pgd(unsigned long long pgd) | |||
239 | return (pgd_t){ pgd }; | 237 | return (pgd_t){ pgd }; |
240 | } | 238 | } |
241 | #else /* !PAE */ | 239 | #else /* !PAE */ |
240 | void xen_set_pte(pte_t *ptep, pte_t pte) | ||
241 | { | ||
242 | *ptep = pte; | ||
243 | } | ||
244 | |||
242 | unsigned long xen_pte_val(pte_t pte) | 245 | unsigned long xen_pte_val(pte_t pte) |
243 | { | 246 | { |
244 | unsigned long ret = pte.pte_low; | 247 | unsigned long ret = pte.pte_low; |
@@ -249,13 +252,6 @@ unsigned long xen_pte_val(pte_t pte) | |||
249 | return ret; | 252 | return ret; |
250 | } | 253 | } |
251 | 254 | ||
252 | unsigned long xen_pmd_val(pmd_t pmd) | ||
253 | { | ||
254 | /* a BUG here is a lot easier to track down than a NULL eip */ | ||
255 | BUG(); | ||
256 | return 0; | ||
257 | } | ||
258 | |||
259 | unsigned long xen_pgd_val(pgd_t pgd) | 255 | unsigned long xen_pgd_val(pgd_t pgd) |
260 | { | 256 | { |
261 | unsigned long ret = pgd.pgd; | 257 | unsigned long ret = pgd.pgd; |
@@ -272,13 +268,6 @@ pte_t xen_make_pte(unsigned long pte) | |||
272 | return (pte_t){ pte }; | 268 | return (pte_t){ pte }; |
273 | } | 269 | } |
274 | 270 | ||
275 | pmd_t xen_make_pmd(unsigned long pmd) | ||
276 | { | ||
277 | /* a BUG here is a lot easier to track down than a NULL eip */ | ||
278 | BUG(); | ||
279 | return __pmd(0); | ||
280 | } | ||
281 | |||
282 | pgd_t xen_make_pgd(unsigned long pgd) | 271 | pgd_t xen_make_pgd(unsigned long pgd) |
283 | { | 272 | { |
284 | if (pgd & _PAGE_PRESENT) | 273 | if (pgd & _PAGE_PRESENT) |
@@ -290,108 +279,199 @@ pgd_t xen_make_pgd(unsigned long pgd) | |||
290 | 279 | ||
291 | 280 | ||
292 | 281 | ||
293 | static void pgd_walk_set_prot(void *pt, pgprot_t flags) | 282 | /* |
294 | { | 283 | (Yet another) pagetable walker. This one is intended for pinning a |
295 | unsigned long pfn = PFN_DOWN(__pa(pt)); | 284 | pagetable. This means that it walks a pagetable and calls the |
296 | 285 | callback function on each page it finds making up the page table, | |
297 | if (HYPERVISOR_update_va_mapping((unsigned long)pt, | 286 | at every level. It walks the entire pagetable, but it only bothers |
298 | pfn_pte(pfn, flags), 0) < 0) | 287 | pinning pte pages which are below pte_limit. In the normal case |
299 | BUG(); | 288 | this will be TASK_SIZE, but at boot we need to pin up to |
300 | } | 289 | FIXADDR_TOP. But the important bit is that we don't pin beyond |
301 | 290 | there, because then we start getting into Xen's ptes. | |
302 | static void pgd_walk(pgd_t *pgd_base, pgprot_t flags) | 291 | */ |
292 | static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned), | ||
293 | unsigned long limit) | ||
303 | { | 294 | { |
304 | pgd_t *pgd = pgd_base; | 295 | pgd_t *pgd = pgd_base; |
305 | pud_t *pud; | 296 | int flush = 0; |
306 | pmd_t *pmd; | 297 | unsigned long addr = 0; |
307 | pte_t *pte; | 298 | unsigned long pgd_next; |
308 | int g, u, m; | 299 | |
300 | BUG_ON(limit > FIXADDR_TOP); | ||
309 | 301 | ||
310 | if (xen_feature(XENFEAT_auto_translated_physmap)) | 302 | if (xen_feature(XENFEAT_auto_translated_physmap)) |
311 | return; | 303 | return 0; |
304 | |||
305 | for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) { | ||
306 | pud_t *pud; | ||
307 | unsigned long pud_limit, pud_next; | ||
312 | 308 | ||
313 | for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) { | 309 | pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP); |
314 | if (pgd_none(*pgd)) | 310 | |
311 | if (!pgd_val(*pgd)) | ||
315 | continue; | 312 | continue; |
313 | |||
316 | pud = pud_offset(pgd, 0); | 314 | pud = pud_offset(pgd, 0); |
317 | 315 | ||
318 | if (PTRS_PER_PUD > 1) /* not folded */ | 316 | if (PTRS_PER_PUD > 1) /* not folded */ |
319 | pgd_walk_set_prot(pud, flags); | 317 | flush |= (*func)(virt_to_page(pud), 0); |
318 | |||
319 | for (; addr != pud_limit; pud++, addr = pud_next) { | ||
320 | pmd_t *pmd; | ||
321 | unsigned long pmd_limit; | ||
322 | |||
323 | pud_next = pud_addr_end(addr, pud_limit); | ||
324 | |||
325 | if (pud_next < limit) | ||
326 | pmd_limit = pud_next; | ||
327 | else | ||
328 | pmd_limit = limit; | ||
320 | 329 | ||
321 | for (u = 0; u < PTRS_PER_PUD; u++, pud++) { | ||
322 | if (pud_none(*pud)) | 330 | if (pud_none(*pud)) |
323 | continue; | 331 | continue; |
332 | |||
324 | pmd = pmd_offset(pud, 0); | 333 | pmd = pmd_offset(pud, 0); |
325 | 334 | ||
326 | if (PTRS_PER_PMD > 1) /* not folded */ | 335 | if (PTRS_PER_PMD > 1) /* not folded */ |
327 | pgd_walk_set_prot(pmd, flags); | 336 | flush |= (*func)(virt_to_page(pmd), 0); |
337 | |||
338 | for (; addr != pmd_limit; pmd++) { | ||
339 | addr += (PAGE_SIZE * PTRS_PER_PTE); | ||
340 | if ((pmd_limit-1) < (addr-1)) { | ||
341 | addr = pmd_limit; | ||
342 | break; | ||
343 | } | ||
328 | 344 | ||
329 | for (m = 0; m < PTRS_PER_PMD; m++, pmd++) { | ||
330 | if (pmd_none(*pmd)) | 345 | if (pmd_none(*pmd)) |
331 | continue; | 346 | continue; |
332 | 347 | ||
333 | /* This can get called before mem_map | 348 | flush |= (*func)(pmd_page(*pmd), 0); |
334 | is set up, so we assume nothing is | ||
335 | highmem at that point. */ | ||
336 | if (mem_map == NULL || | ||
337 | !PageHighMem(pmd_page(*pmd))) { | ||
338 | pte = pte_offset_kernel(pmd, 0); | ||
339 | pgd_walk_set_prot(pte, flags); | ||
340 | } | ||
341 | } | 349 | } |
342 | } | 350 | } |
343 | } | 351 | } |
344 | 352 | ||
345 | if (HYPERVISOR_update_va_mapping((unsigned long)pgd_base, | 353 | flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH); |
346 | pfn_pte(PFN_DOWN(__pa(pgd_base)), | 354 | |
347 | flags), | 355 | return flush; |
348 | UVMF_TLB_FLUSH) < 0) | ||
349 | BUG(); | ||
350 | } | 356 | } |
351 | 357 | ||
358 | static int pin_page(struct page *page, unsigned flags) | ||
359 | { | ||
360 | unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags); | ||
361 | int flush; | ||
362 | |||
363 | if (pgfl) | ||
364 | flush = 0; /* already pinned */ | ||
365 | else if (PageHighMem(page)) | ||
366 | /* kmaps need flushing if we found an unpinned | ||
367 | highpage */ | ||
368 | flush = 1; | ||
369 | else { | ||
370 | void *pt = lowmem_page_address(page); | ||
371 | unsigned long pfn = page_to_pfn(page); | ||
372 | struct multicall_space mcs = __xen_mc_entry(0); | ||
373 | |||
374 | flush = 0; | ||
375 | |||
376 | MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, | ||
377 | pfn_pte(pfn, PAGE_KERNEL_RO), | ||
378 | flags); | ||
379 | } | ||
380 | |||
381 | return flush; | ||
382 | } | ||
352 | 383 | ||
353 | /* This is called just after a mm has been duplicated from its parent, | 384 | /* This is called just after a mm has been created, but it has not |
354 | but it has not been used yet. We need to make sure that its | 385 | been used yet. We need to make sure that its pagetable is all |
355 | pagetable is all read-only, and can be pinned. */ | 386 | read-only, and can be pinned. */ |
356 | void xen_pgd_pin(pgd_t *pgd) | 387 | void xen_pgd_pin(pgd_t *pgd) |
357 | { | 388 | { |
358 | struct mmuext_op op; | 389 | struct multicall_space mcs; |
390 | struct mmuext_op *op; | ||
359 | 391 | ||
360 | pgd_walk(pgd, PAGE_KERNEL_RO); | 392 | xen_mc_batch(); |
361 | 393 | ||
362 | #if defined(CONFIG_X86_PAE) | 394 | if (pgd_walk(pgd, pin_page, TASK_SIZE)) |
363 | op.cmd = MMUEXT_PIN_L3_TABLE; | 395 | kmap_flush_unused(); |
396 | |||
397 | mcs = __xen_mc_entry(sizeof(*op)); | ||
398 | op = mcs.args; | ||
399 | |||
400 | #ifdef CONFIG_X86_PAE | ||
401 | op->cmd = MMUEXT_PIN_L3_TABLE; | ||
364 | #else | 402 | #else |
365 | op.cmd = MMUEXT_PIN_L2_TABLE; | 403 | op->cmd = MMUEXT_PIN_L2_TABLE; |
366 | #endif | 404 | #endif |
367 | op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd))); | 405 | op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd))); |
368 | if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0) | 406 | MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); |
369 | BUG(); | 407 | |
408 | xen_mc_issue(0); | ||
370 | } | 409 | } |
371 | 410 | ||
372 | /* Release a pagetables pages back as normal RW */ | 411 | /* The init_mm pagetable is really pinned as soon as its created, but |
373 | void xen_pgd_unpin(pgd_t *pgd) | 412 | that's before we have page structures to store the bits. So do all |
413 | the book-keeping now. */ | ||
414 | static __init int mark_pinned(struct page *page, unsigned flags) | ||
374 | { | 415 | { |
375 | struct mmuext_op op; | 416 | SetPagePinned(page); |
417 | return 0; | ||
418 | } | ||
376 | 419 | ||
377 | op.cmd = MMUEXT_UNPIN_TABLE; | 420 | void __init xen_mark_init_mm_pinned(void) |
378 | op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd))); | 421 | { |
422 | pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP); | ||
423 | } | ||
379 | 424 | ||
380 | if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0) | 425 | static int unpin_page(struct page *page, unsigned flags) |
381 | BUG(); | 426 | { |
427 | unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags); | ||
382 | 428 | ||
383 | pgd_walk(pgd, PAGE_KERNEL); | 429 | if (pgfl && !PageHighMem(page)) { |
430 | void *pt = lowmem_page_address(page); | ||
431 | unsigned long pfn = page_to_pfn(page); | ||
432 | struct multicall_space mcs = __xen_mc_entry(0); | ||
433 | |||
434 | MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, | ||
435 | pfn_pte(pfn, PAGE_KERNEL), | ||
436 | flags); | ||
437 | } | ||
438 | |||
439 | return 0; /* never need to flush on unpin */ | ||
384 | } | 440 | } |
385 | 441 | ||
442 | /* Release a pagetables pages back as normal RW */ | ||
443 | static void xen_pgd_unpin(pgd_t *pgd) | ||
444 | { | ||
445 | struct mmuext_op *op; | ||
446 | struct multicall_space mcs; | ||
447 | |||
448 | xen_mc_batch(); | ||
449 | |||
450 | mcs = __xen_mc_entry(sizeof(*op)); | ||
451 | |||
452 | op = mcs.args; | ||
453 | op->cmd = MMUEXT_UNPIN_TABLE; | ||
454 | op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd))); | ||
455 | |||
456 | MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); | ||
457 | |||
458 | pgd_walk(pgd, unpin_page, TASK_SIZE); | ||
459 | |||
460 | xen_mc_issue(0); | ||
461 | } | ||
386 | 462 | ||
387 | void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) | 463 | void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) |
388 | { | 464 | { |
465 | spin_lock(&next->page_table_lock); | ||
389 | xen_pgd_pin(next->pgd); | 466 | xen_pgd_pin(next->pgd); |
467 | spin_unlock(&next->page_table_lock); | ||
390 | } | 468 | } |
391 | 469 | ||
392 | void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) | 470 | void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) |
393 | { | 471 | { |
472 | spin_lock(&mm->page_table_lock); | ||
394 | xen_pgd_pin(mm->pgd); | 473 | xen_pgd_pin(mm->pgd); |
474 | spin_unlock(&mm->page_table_lock); | ||
395 | } | 475 | } |
396 | 476 | ||
397 | void xen_exit_mmap(struct mm_struct *mm) | 477 | void xen_exit_mmap(struct mm_struct *mm) |
diff --git a/arch/i386/xen/mmu.h b/arch/i386/xen/mmu.h index 764eaaae0a2f..49776fe9f02a 100644 --- a/arch/i386/xen/mmu.h +++ b/arch/i386/xen/mmu.h | |||
@@ -15,7 +15,7 @@ void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); | |||
15 | void xen_exit_mmap(struct mm_struct *mm); | 15 | void xen_exit_mmap(struct mm_struct *mm); |
16 | 16 | ||
17 | void xen_pgd_pin(pgd_t *pgd); | 17 | void xen_pgd_pin(pgd_t *pgd); |
18 | void xen_pgd_unpin(pgd_t *pgd); | 18 | //void xen_pgd_unpin(pgd_t *pgd); |
19 | 19 | ||
20 | #ifdef CONFIG_X86_PAE | 20 | #ifdef CONFIG_X86_PAE |
21 | unsigned long long xen_pte_val(pte_t); | 21 | unsigned long long xen_pte_val(pte_t); |
diff --git a/arch/i386/xen/xen-ops.h b/arch/i386/xen/xen-ops.h index 79648fe1ab77..54d98b52085e 100644 --- a/arch/i386/xen/xen-ops.h +++ b/arch/i386/xen/xen-ops.h | |||
@@ -20,6 +20,8 @@ unsigned long xen_get_wallclock(void); | |||
20 | int xen_set_wallclock(unsigned long time); | 20 | int xen_set_wallclock(unsigned long time); |
21 | cycle_t xen_clocksource_read(void); | 21 | cycle_t xen_clocksource_read(void); |
22 | 22 | ||
23 | void xen_mark_init_mm_pinned(void); | ||
24 | |||
23 | DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode); | 25 | DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode); |
24 | 26 | ||
25 | static inline unsigned xen_get_lazy_mode(void) | 27 | static inline unsigned xen_get_lazy_mode(void) |