about summary refs log tree commit diff stats
path: root/arch/x86/xen/mmu.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/xen/mmu.c')
-rw-r--r-- arch/x86/xen/mmu.c 315
1 files changed, 264 insertions, 51 deletions
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index aa37469da69..d4d52f5a1cf 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -40,6 +40,7 @@
40 */ 40 */
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/highmem.h> 42#include <linux/highmem.h>
43#include <linux/debugfs.h>
43#include <linux/bug.h> 44#include <linux/bug.h>
44 45
45#include <asm/pgtable.h> 46#include <asm/pgtable.h>
@@ -57,6 +58,61 @@
57 58
58#include "multicalls.h" 59#include "multicalls.h"
59#include "mmu.h" 60#include "mmu.h"
61#include "debugfs.h"
62
63#define MMU_UPDATE_HISTO 30 /* number of slots in mmu_stats.mmu_update_histo */
64
65#ifdef CONFIG_XEN_DEBUG_FS
66
67static struct { /* event counters for the Xen MMU paths; bumped via ADD_STATS() */
68 u32 pgd_update; /* every xen_set_pgd() call */
69 u32 pgd_update_pinned; /* xen_set_pgd() calls that found the pgd page pinned */
70 u32 pgd_update_batched; /* pinned pgd updates made while in PARAVIRT_LAZY_MMU mode */
71
72 u32 pud_update; /* every xen_set_pud() call */
73 u32 pud_update_pinned; /* xen_set_pud() calls that found the pud page pinned */
74 u32 pud_update_batched; /* xen_set_pud_hyper() calls made in PARAVIRT_LAZY_MMU mode */
75
76 u32 pmd_update; /* every xen_set_pmd() call */
77 u32 pmd_update_pinned; /* xen_set_pmd() calls that found the pmd page pinned */
78 u32 pmd_update_batched; /* xen_set_pmd_hyper() calls made in PARAVIRT_LAZY_MMU mode */
79
80 u32 pte_update; /* every xen_set_pte() call */
81 u32 pte_update_pinned; /* NOTE(review): its only ADD_STATS() call is commented out in the code shown */
82 u32 pte_update_batched; /* xen_set_pte() calls made in PARAVIRT_LAZY_MMU mode */
83
84 u32 mmu_update; /* new mmu_update multicall entries started */
85 u32 mmu_update_extended; /* times an existing mmu_update multicall entry was extended */
86 u32 mmu_update_histo[MMU_UPDATE_HISTO]; /* entries per update count; slot 0 also takes counts >= MMU_UPDATE_HISTO */
87
88 u32 prot_commit; /* every xen_ptep_modify_prot_commit() call */
89 u32 prot_commit_batched; /* prot commits made while in PARAVIRT_LAZY_MMU mode */
90
91 u32 set_pte_at; /* every xen_set_pte_at() call */
92 u32 set_pte_at_batched; /* set_pte_at calls queued as update_va_mapping multicalls */
93 u32 set_pte_at_pinned; /* NOTE(review): its only ADD_STATS() call is commented out in the code shown */
94 u32 set_pte_at_current; /* set_pte_at calls where mm == current->mm */
95 u32 set_pte_at_kernel; /* set_pte_at calls where mm == &init_mm */
96} mmu_stats;
97
98static u8 zero_stats; /* reset request flag; exported writable (0644) through debugfs */
99/* Clear every counter in mmu_stats once if a reset has been requested via zero_stats. */
100static inline void check_zero(void)
101{
102 if (unlikely(zero_stats)) {
103 memset(&mmu_stats, 0, sizeof(mmu_stats));
104 zero_stats = 0; /* acknowledge the request so counting restarts from zero */
105 }
106}
107
108#define ADD_STATS(elem, val) \
109 do { check_zero(); mmu_stats.elem += (val); } while(0) /* honour a pending reset, then add val to mmu_stats.elem */
110
111#else /* !CONFIG_XEN_DEBUG_FS */
112
113#define ADD_STATS(elem, val) do { (void)(val); } while(0) /* stats compiled out: val is still evaluated, elem is ignored */
114
115#endif /* CONFIG_XEN_DEBUG_FS */
60 116
61/* 117/*
62 * Just beyond the highest usermode address. STACK_TOP_MAX has a 118 * Just beyond the highest usermode address. STACK_TOP_MAX has a
@@ -229,25 +285,35 @@ void make_lowmem_page_readwrite(void *vaddr)
229} 285}
230 286
231 287
232static bool page_pinned(void *ptr) 288static bool xen_page_pinned(void *ptr)
233{ 289{
234 struct page *page = virt_to_page(ptr); 290 struct page *page = virt_to_page(ptr);
235 291
236 return PagePinned(page); 292 return PagePinned(page);
237} 293}
238 294
239static void extend_mmu_update(const struct mmu_update *update) 295static void xen_extend_mmu_update(const struct mmu_update *update)
240{ 296{
241 struct multicall_space mcs; 297 struct multicall_space mcs;
242 struct mmu_update *u; 298 struct mmu_update *u;
243 299
244 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u)); 300 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
245 301
246 if (mcs.mc != NULL) 302 if (mcs.mc != NULL) {
303 ADD_STATS(mmu_update_extended, 1);
304 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1);
305
247 mcs.mc->args[1]++; 306 mcs.mc->args[1]++;
248 else { 307
308 if (mcs.mc->args[1] < MMU_UPDATE_HISTO)
309 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1);
310 else
311 ADD_STATS(mmu_update_histo[0], 1);
312 } else {
313 ADD_STATS(mmu_update, 1);
249 mcs = __xen_mc_entry(sizeof(*u)); 314 mcs = __xen_mc_entry(sizeof(*u));
250 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); 315 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
316 ADD_STATS(mmu_update_histo[1], 1);
251 } 317 }
252 318
253 u = mcs.args; 319 u = mcs.args;
@@ -265,7 +331,9 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
265 /* ptr may be ioremapped for 64-bit pagetable setup */ 331 /* ptr may be ioremapped for 64-bit pagetable setup */
266 u.ptr = arbitrary_virt_to_machine(ptr).maddr; 332 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
267 u.val = pmd_val_ma(val); 333 u.val = pmd_val_ma(val);
268 extend_mmu_update(&u); 334 xen_extend_mmu_update(&u);
335
336 ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
269 337
270 xen_mc_issue(PARAVIRT_LAZY_MMU); 338 xen_mc_issue(PARAVIRT_LAZY_MMU);
271 339
@@ -274,13 +342,17 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
274 342
275void xen_set_pmd(pmd_t *ptr, pmd_t val) 343void xen_set_pmd(pmd_t *ptr, pmd_t val)
276{ 344{
345 ADD_STATS(pmd_update, 1);
346
277 /* If page is not pinned, we can just update the entry 347 /* If page is not pinned, we can just update the entry
278 directly */ 348 directly */
279 if (!page_pinned(ptr)) { 349 if (!xen_page_pinned(ptr)) {
280 *ptr = val; 350 *ptr = val;
281 return; 351 return;
282 } 352 }
283 353
354 ADD_STATS(pmd_update_pinned, 1);
355
284 xen_set_pmd_hyper(ptr, val); 356 xen_set_pmd_hyper(ptr, val);
285} 357}
286 358
@@ -300,12 +372,18 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
300 if (mm == &init_mm) 372 if (mm == &init_mm)
301 preempt_disable(); 373 preempt_disable();
302 374
375 ADD_STATS(set_pte_at, 1);
376// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
377 ADD_STATS(set_pte_at_current, mm == current->mm);
378 ADD_STATS(set_pte_at_kernel, mm == &init_mm);
379
303 if (mm == current->mm || mm == &init_mm) { 380 if (mm == current->mm || mm == &init_mm) {
304 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { 381 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
305 struct multicall_space mcs; 382 struct multicall_space mcs;
306 mcs = xen_mc_entry(0); 383 mcs = xen_mc_entry(0);
307 384
308 MULTI_update_va_mapping(mcs.mc, addr, pteval, 0); 385 MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
386 ADD_STATS(set_pte_at_batched, 1);
309 xen_mc_issue(PARAVIRT_LAZY_MMU); 387 xen_mc_issue(PARAVIRT_LAZY_MMU);
310 goto out; 388 goto out;
311 } else 389 } else
@@ -334,7 +412,10 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
334 412
335 u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; 413 u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
336 u.val = pte_val_ma(pte); 414 u.val = pte_val_ma(pte);
337 extend_mmu_update(&u); 415 xen_extend_mmu_update(&u);
416
417 ADD_STATS(prot_commit, 1);
418 ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
338 419
339 xen_mc_issue(PARAVIRT_LAZY_MMU); 420 xen_mc_issue(PARAVIRT_LAZY_MMU);
340} 421}
@@ -400,7 +481,9 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
400 /* ptr may be ioremapped for 64-bit pagetable setup */ 481 /* ptr may be ioremapped for 64-bit pagetable setup */
401 u.ptr = arbitrary_virt_to_machine(ptr).maddr; 482 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
402 u.val = pud_val_ma(val); 483 u.val = pud_val_ma(val);
403 extend_mmu_update(&u); 484 xen_extend_mmu_update(&u);
485
486 ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
404 487
405 xen_mc_issue(PARAVIRT_LAZY_MMU); 488 xen_mc_issue(PARAVIRT_LAZY_MMU);
406 489
@@ -409,18 +492,26 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
409 492
410void xen_set_pud(pud_t *ptr, pud_t val) 493void xen_set_pud(pud_t *ptr, pud_t val)
411{ 494{
495 ADD_STATS(pud_update, 1);
496
412 /* If page is not pinned, we can just update the entry 497 /* If page is not pinned, we can just update the entry
413 directly */ 498 directly */
414 if (!page_pinned(ptr)) { 499 if (!xen_page_pinned(ptr)) {
415 *ptr = val; 500 *ptr = val;
416 return; 501 return;
417 } 502 }
418 503
504 ADD_STATS(pud_update_pinned, 1);
505
419 xen_set_pud_hyper(ptr, val); 506 xen_set_pud_hyper(ptr, val);
420} 507}
421 508
422void xen_set_pte(pte_t *ptep, pte_t pte) 509void xen_set_pte(pte_t *ptep, pte_t pte)
423{ 510{
511 ADD_STATS(pte_update, 1);
512// ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
513 ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
514
424#ifdef CONFIG_X86_PAE 515#ifdef CONFIG_X86_PAE
425 ptep->pte_high = pte.pte_high; 516 ptep->pte_high = pte.pte_high;
426 smp_wmb(); 517 smp_wmb();
@@ -490,7 +581,7 @@ static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
490 581
491 u.ptr = virt_to_machine(ptr).maddr; 582 u.ptr = virt_to_machine(ptr).maddr;
492 u.val = pgd_val_ma(val); 583 u.val = pgd_val_ma(val);
493 extend_mmu_update(&u); 584 xen_extend_mmu_update(&u);
494} 585}
495 586
496/* 587/*
@@ -517,17 +608,22 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val)
517{ 608{
518 pgd_t *user_ptr = xen_get_user_pgd(ptr); 609 pgd_t *user_ptr = xen_get_user_pgd(ptr);
519 610
611 ADD_STATS(pgd_update, 1);
612
520 /* If page is not pinned, we can just update the entry 613 /* If page is not pinned, we can just update the entry
521 directly */ 614 directly */
522 if (!page_pinned(ptr)) { 615 if (!xen_page_pinned(ptr)) {
523 *ptr = val; 616 *ptr = val;
524 if (user_ptr) { 617 if (user_ptr) {
525 WARN_ON(page_pinned(user_ptr)); 618 WARN_ON(xen_page_pinned(user_ptr));
526 *user_ptr = val; 619 *user_ptr = val;
527 } 620 }
528 return; 621 return;
529 } 622 }
530 623
624 ADD_STATS(pgd_update_pinned, 1);
625 ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
626
531 /* If it's pinned, then we can at least batch the kernel and 627 /* If it's pinned, then we can at least batch the kernel and
532 user updates together. */ 628 user updates together. */
533 xen_mc_batch(); 629 xen_mc_batch();
@@ -555,9 +651,12 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val)
555 * For 64-bit, we must skip the Xen hole in the middle of the address 651 * For 64-bit, we must skip the Xen hole in the middle of the address
556 * space, just after the big x86-64 virtual hole. 652 * space, just after the big x86-64 virtual hole.
557 */ 653 */
558static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), 654static int xen_pgd_walk(struct mm_struct *mm,
559 unsigned long limit) 655 int (*func)(struct mm_struct *mm, struct page *,
656 enum pt_level),
657 unsigned long limit)
560{ 658{
659 pgd_t *pgd = mm->pgd;
561 int flush = 0; 660 int flush = 0;
562 unsigned hole_low, hole_high; 661 unsigned hole_low, hole_high;
563 unsigned pgdidx_limit, pudidx_limit, pmdidx_limit; 662 unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
@@ -590,8 +689,6 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
590 pmdidx_limit = 0; 689 pmdidx_limit = 0;
591#endif 690#endif
592 691
593 flush |= (*func)(virt_to_page(pgd), PT_PGD);
594
595 for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) { 692 for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
596 pud_t *pud; 693 pud_t *pud;
597 694
@@ -604,7 +701,7 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
604 pud = pud_offset(&pgd[pgdidx], 0); 701 pud = pud_offset(&pgd[pgdidx], 0);
605 702
606 if (PTRS_PER_PUD > 1) /* not folded */ 703 if (PTRS_PER_PUD > 1) /* not folded */
607 flush |= (*func)(virt_to_page(pud), PT_PUD); 704 flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
608 705
609 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) { 706 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
610 pmd_t *pmd; 707 pmd_t *pmd;
@@ -619,7 +716,7 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
619 pmd = pmd_offset(&pud[pudidx], 0); 716 pmd = pmd_offset(&pud[pudidx], 0);
620 717
621 if (PTRS_PER_PMD > 1) /* not folded */ 718 if (PTRS_PER_PMD > 1) /* not folded */
622 flush |= (*func)(virt_to_page(pmd), PT_PMD); 719 flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
623 720
624 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) { 721 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
625 struct page *pte; 722 struct page *pte;
@@ -633,28 +730,34 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
633 continue; 730 continue;
634 731
635 pte = pmd_page(pmd[pmdidx]); 732 pte = pmd_page(pmd[pmdidx]);
636 flush |= (*func)(pte, PT_PTE); 733 flush |= (*func)(mm, pte, PT_PTE);
637 } 734 }
638 } 735 }
639 } 736 }
737
640out: 738out:
739 /* Do the top level last, so that the callbacks can use it as
740 a cue to do final things like tlb flushes. */
741 flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
641 742
642 return flush; 743 return flush;
643} 744}
644 745
645static spinlock_t *lock_pte(struct page *page) 746/* If we're using split pte locks, then take the page's lock and
747 return a pointer to it. Otherwise return NULL. */
748static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
646{ 749{
647 spinlock_t *ptl = NULL; 750 spinlock_t *ptl = NULL;
648 751
649#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS 752#if USE_SPLIT_PTLOCKS
650 ptl = __pte_lockptr(page); 753 ptl = __pte_lockptr(page);
651 spin_lock(ptl); 754 spin_lock_nest_lock(ptl, &mm->page_table_lock);
652#endif 755#endif
653 756
654 return ptl; 757 return ptl;
655} 758}
656 759
657static void do_unlock(void *v) 760static void xen_pte_unlock(void *v)
658{ 761{
659 spinlock_t *ptl = v; 762 spinlock_t *ptl = v;
660 spin_unlock(ptl); 763 spin_unlock(ptl);
@@ -672,7 +775,8 @@ static void xen_do_pin(unsigned level, unsigned long pfn)
672 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 775 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
673} 776}
674 777
675static int pin_page(struct page *page, enum pt_level level) 778static int xen_pin_page(struct mm_struct *mm, struct page *page,
779 enum pt_level level)
676{ 780{
677 unsigned pgfl = TestSetPagePinned(page); 781 unsigned pgfl = TestSetPagePinned(page);
678 int flush; 782 int flush;
@@ -691,21 +795,40 @@ static int pin_page(struct page *page, enum pt_level level)
691 795
692 flush = 0; 796 flush = 0;
693 797
798 /*
799 * We need to hold the pagetable lock between the time
800 * we make the pagetable RO and when we actually pin
801 * it. If we don't, then other users may come in and
802 * attempt to update the pagetable by writing it,
803 * which will fail because the memory is RO but not
804 * pinned, so Xen won't do the trap'n'emulate.
805 *
806 * If we're using split pte locks, we can't hold the
807 * entire pagetable's worth of locks during the
808 * traverse, because we may wrap the preempt count (8
809 * bits). The solution is to mark RO and pin each PTE
810 * page while holding the lock. This means the number
811 * of locks we end up holding is never more than a
812 * batch size (~32 entries, at present).
813 *
814 * If we're not using split pte locks, we needn't pin
815 * the PTE pages independently, because we're
816 * protected by the overall pagetable lock.
817 */
694 ptl = NULL; 818 ptl = NULL;
695 if (level == PT_PTE) 819 if (level == PT_PTE)
696 ptl = lock_pte(page); 820 ptl = xen_pte_lock(page, mm);
697 821
698 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, 822 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
699 pfn_pte(pfn, PAGE_KERNEL_RO), 823 pfn_pte(pfn, PAGE_KERNEL_RO),
700 level == PT_PGD ? UVMF_TLB_FLUSH : 0); 824 level == PT_PGD ? UVMF_TLB_FLUSH : 0);
701 825
702 if (level == PT_PTE) 826 if (ptl) {
703 xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn); 827 xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
704 828
705 if (ptl) {
706 /* Queue a deferred unlock for when this batch 829 /* Queue a deferred unlock for when this batch
707 is completed. */ 830 is completed. */
708 xen_mc_callback(do_unlock, ptl); 831 xen_mc_callback(xen_pte_unlock, ptl);
709 } 832 }
710 } 833 }
711 834
@@ -715,14 +838,15 @@ static int pin_page(struct page *page, enum pt_level level)
715/* This is called just after a mm has been created, but it has not 838/* This is called just after a mm has been created, but it has not
716 been used yet. We need to make sure that its pagetable is all 839 been used yet. We need to make sure that its pagetable is all
717 read-only, and can be pinned. */ 840 read-only, and can be pinned. */
718void xen_pgd_pin(pgd_t *pgd) 841static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
719{ 842{
720 xen_mc_batch(); 843 xen_mc_batch();
721 844
722 if (pgd_walk(pgd, pin_page, USER_LIMIT)) { 845 if (xen_pgd_walk(mm, xen_pin_page, USER_LIMIT)) {
723 /* re-enable interrupts for kmap_flush_unused */ 846 /* re-enable interrupts for kmap_flush_unused */
724 xen_mc_issue(0); 847 xen_mc_issue(0);
725 kmap_flush_unused(); 848 kmap_flush_unused();
849 vm_unmap_aliases();
726 xen_mc_batch(); 850 xen_mc_batch();
727 } 851 }
728 852
@@ -733,25 +857,35 @@ void xen_pgd_pin(pgd_t *pgd)
733 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd))); 857 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
734 858
735 if (user_pgd) { 859 if (user_pgd) {
736 pin_page(virt_to_page(user_pgd), PT_PGD); 860 xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
737 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd))); 861 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
738 } 862 }
739 } 863 }
740#else /* CONFIG_X86_32 */ 864#else /* CONFIG_X86_32 */
741#ifdef CONFIG_X86_PAE 865#ifdef CONFIG_X86_PAE
742 /* Need to make sure unshared kernel PMD is pinnable */ 866 /* Need to make sure unshared kernel PMD is pinnable */
743 pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); 867 xen_pin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])),
868 PT_PMD);
744#endif 869#endif
745 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); 870 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
746#endif /* CONFIG_X86_64 */ 871#endif /* CONFIG_X86_64 */
747 xen_mc_issue(0); 872 xen_mc_issue(0);
748} 873}
749 874
875static void xen_pgd_pin(struct mm_struct *mm) /* pin the pagetable rooted at mm->pgd */
876{
877 __xen_pgd_pin(mm, mm->pgd); /* mm is passed along with its own top-level table */
878}
879
750/* 880/*
751 * On save, we need to pin all pagetables to make sure they get their 881 * On save, we need to pin all pagetables to make sure they get their
752 * mfns turned into pfns. Search the list for any unpinned pgds and pin 882 * mfns turned into pfns. Search the list for any unpinned pgds and pin
753 * them (unpinned pgds are not currently in use, probably because the 883 * them (unpinned pgds are not currently in use, probably because the
754 * process is under construction or destruction). 884 * process is under construction or destruction).
885 *
886 * Expected to be called in stop_machine() ("equivalent to taking
887 * every spinlock in the system"), so the locking doesn't really
888 * matter all that much.
755 */ 889 */
756void xen_mm_pin_all(void) 890void xen_mm_pin_all(void)
757{ 891{
@@ -762,7 +896,7 @@ void xen_mm_pin_all(void)
762 896
763 list_for_each_entry(page, &pgd_list, lru) { 897 list_for_each_entry(page, &pgd_list, lru) {
764 if (!PagePinned(page)) { 898 if (!PagePinned(page)) {
765 xen_pgd_pin((pgd_t *)page_address(page)); 899 __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
766 SetPageSavePinned(page); 900 SetPageSavePinned(page);
767 } 901 }
768 } 902 }
@@ -775,7 +909,8 @@ void xen_mm_pin_all(void)
775 * that's before we have page structures to store the bits. So do all 909 * that's before we have page structures to store the bits. So do all
776 * the book-keeping now. 910 * the book-keeping now.
777 */ 911 */
778static __init int mark_pinned(struct page *page, enum pt_level level) 912static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
913 enum pt_level level)
779{ 914{
780 SetPagePinned(page); 915 SetPagePinned(page);
781 return 0; 916 return 0;
@@ -783,10 +918,11 @@ static __init int mark_pinned(struct page *page, enum pt_level level)
783 918
784void __init xen_mark_init_mm_pinned(void) 919void __init xen_mark_init_mm_pinned(void)
785{ 920{
786 pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP); 921 xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
787} 922}
788 923
789static int unpin_page(struct page *page, enum pt_level level) 924static int xen_unpin_page(struct mm_struct *mm, struct page *page,
925 enum pt_level level)
790{ 926{
791 unsigned pgfl = TestClearPagePinned(page); 927 unsigned pgfl = TestClearPagePinned(page);
792 928
@@ -796,10 +932,18 @@ static int unpin_page(struct page *page, enum pt_level level)
796 spinlock_t *ptl = NULL; 932 spinlock_t *ptl = NULL;
797 struct multicall_space mcs; 933 struct multicall_space mcs;
798 934
935 /*
936 * Do the converse of xen_pin_page. If we're using split
937 * pte locks, we must hold the lock while the pte
938 * page is unpinned but still RO, to prevent
939 * concurrent updates from seeing it in this
940 * partially-pinned state.
941 */
799 if (level == PT_PTE) { 942 if (level == PT_PTE) {
800 ptl = lock_pte(page); 943 ptl = xen_pte_lock(page, mm);
801 944
802 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); 945 if (ptl)
946 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
803 } 947 }
804 948
805 mcs = __xen_mc_entry(0); 949 mcs = __xen_mc_entry(0);
@@ -810,7 +954,7 @@ static int unpin_page(struct page *page, enum pt_level level)
810 954
811 if (ptl) { 955 if (ptl) {
812 /* unlock when batch completed */ 956 /* unlock when batch completed */
813 xen_mc_callback(do_unlock, ptl); 957 xen_mc_callback(xen_pte_unlock, ptl);
814 } 958 }
815 } 959 }
816 960
@@ -818,7 +962,7 @@ static int unpin_page(struct page *page, enum pt_level level)
818} 962}
819 963
820/* Release a pagetables pages back as normal RW */ 964/* Release a pagetables pages back as normal RW */
821static void xen_pgd_unpin(pgd_t *pgd) 965static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
822{ 966{
823 xen_mc_batch(); 967 xen_mc_batch();
824 968
@@ -830,21 +974,27 @@ static void xen_pgd_unpin(pgd_t *pgd)
830 974
831 if (user_pgd) { 975 if (user_pgd) {
832 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd))); 976 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd)));
833 unpin_page(virt_to_page(user_pgd), PT_PGD); 977 xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
834 } 978 }
835 } 979 }
836#endif 980#endif
837 981
838#ifdef CONFIG_X86_PAE 982#ifdef CONFIG_X86_PAE
839 /* Need to make sure unshared kernel PMD is unpinned */ 983 /* Need to make sure unshared kernel PMD is unpinned */
840 pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); 984 xen_unpin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])),
985 PT_PMD);
841#endif 986#endif
842 987
843 pgd_walk(pgd, unpin_page, USER_LIMIT); 988 xen_pgd_walk(mm, xen_unpin_page, USER_LIMIT);
844 989
845 xen_mc_issue(0); 990 xen_mc_issue(0);
846} 991}
847 992
993static void xen_pgd_unpin(struct mm_struct *mm) /* unpin the pagetable rooted at mm->pgd */
994{
995 __xen_pgd_unpin(mm, mm->pgd); /* mm is passed along with its own top-level table */
996}
997
848/* 998/*
849 * On resume, undo any pinning done at save, so that the rest of the 999 * On resume, undo any pinning done at save, so that the rest of the
850 * kernel doesn't see any unexpected pinned pagetables. 1000 * kernel doesn't see any unexpected pinned pagetables.
@@ -859,7 +1009,7 @@ void xen_mm_unpin_all(void)
859 list_for_each_entry(page, &pgd_list, lru) { 1009 list_for_each_entry(page, &pgd_list, lru) {
860 if (PageSavePinned(page)) { 1010 if (PageSavePinned(page)) {
861 BUG_ON(!PagePinned(page)); 1011 BUG_ON(!PagePinned(page));
862 xen_pgd_unpin((pgd_t *)page_address(page)); 1012 __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
863 ClearPageSavePinned(page); 1013 ClearPageSavePinned(page);
864 } 1014 }
865 } 1015 }
@@ -870,14 +1020,14 @@ void xen_mm_unpin_all(void)
870void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) 1020void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
871{ 1021{
872 spin_lock(&next->page_table_lock); 1022 spin_lock(&next->page_table_lock);
873 xen_pgd_pin(next->pgd); 1023 xen_pgd_pin(next);
874 spin_unlock(&next->page_table_lock); 1024 spin_unlock(&next->page_table_lock);
875} 1025}
876 1026
877void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) 1027void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
878{ 1028{
879 spin_lock(&mm->page_table_lock); 1029 spin_lock(&mm->page_table_lock);
880 xen_pgd_pin(mm->pgd); 1030 xen_pgd_pin(mm);
881 spin_unlock(&mm->page_table_lock); 1031 spin_unlock(&mm->page_table_lock);
882} 1032}
883 1033
@@ -907,7 +1057,7 @@ static void drop_other_mm_ref(void *info)
907 } 1057 }
908} 1058}
909 1059
910static void drop_mm_ref(struct mm_struct *mm) 1060static void xen_drop_mm_ref(struct mm_struct *mm)
911{ 1061{
912 cpumask_t mask; 1062 cpumask_t mask;
913 unsigned cpu; 1063 unsigned cpu;
@@ -937,7 +1087,7 @@ static void drop_mm_ref(struct mm_struct *mm)
937 smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); 1087 smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
938} 1088}
939#else 1089#else
940static void drop_mm_ref(struct mm_struct *mm) 1090static void xen_drop_mm_ref(struct mm_struct *mm)
941{ 1091{
942 if (current->active_mm == mm) 1092 if (current->active_mm == mm)
943 load_cr3(swapper_pg_dir); 1093 load_cr3(swapper_pg_dir);
@@ -961,14 +1111,77 @@ static void drop_mm_ref(struct mm_struct *mm)
961void xen_exit_mmap(struct mm_struct *mm) 1111void xen_exit_mmap(struct mm_struct *mm)
962{ 1112{
963 get_cpu(); /* make sure we don't move around */ 1113 get_cpu(); /* make sure we don't move around */
964 drop_mm_ref(mm); 1114 xen_drop_mm_ref(mm);
965 put_cpu(); 1115 put_cpu();
966 1116
967 spin_lock(&mm->page_table_lock); 1117 spin_lock(&mm->page_table_lock);
968 1118
969 /* pgd may not be pinned in the error exit path of execve */ 1119 /* pgd may not be pinned in the error exit path of execve */
970 if (page_pinned(mm->pgd)) 1120 if (xen_page_pinned(mm->pgd))
971 xen_pgd_unpin(mm->pgd); 1121 xen_pgd_unpin(mm);
972 1122
973 spin_unlock(&mm->page_table_lock); 1123 spin_unlock(&mm->page_table_lock);
974} 1124}
1125
1126#ifdef CONFIG_XEN_DEBUG_FS
1127
1128static struct dentry *d_mmu_debug;
1129/* Initcall: export zero_stats and the mmu_stats counters as files in an "mmu" directory under the Xen debugfs root. Returns 0, or -ENOMEM if that root is unavailable. */
1130static int __init xen_mmu_debugfs(void)
1131{
1132 struct dentry *d_xen = xen_init_debugfs();
1133
1134 if (d_xen == NULL)
1135 return -ENOMEM;
1136
1137 d_mmu_debug = debugfs_create_dir("mmu", d_xen);
1138 /* The only writable file: a non-zero value makes the next ADD_STATS() clear every counter. */
1139 debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats);
1140 /* Each *_batched file must export its own counter, not the matching *_pinned one. */
1141 debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update);
1142 debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug,
1143 &mmu_stats.pgd_update_pinned);
1144 debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug,
1145 &mmu_stats.pgd_update_batched);
1146
1147 debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update);
1148 debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug,
1149 &mmu_stats.pud_update_pinned);
1150 debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug,
1151 &mmu_stats.pud_update_batched);
1152
1153 debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update);
1154 debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug,
1155 &mmu_stats.pmd_update_pinned);
1156 debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug,
1157 &mmu_stats.pmd_update_batched);
1158
1159 debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update);
1160// debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug,
1161// &mmu_stats.pte_update_pinned);
1162 debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug,
1163 &mmu_stats.pte_update_batched);
1164 /* Export the whole histogram: the array has MMU_UPDATE_HISTO slots, not a hard-coded 20. */
1165 debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update);
1166 debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug,
1167 &mmu_stats.mmu_update_extended);
1168 xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug,
1169 mmu_stats.mmu_update_histo, MMU_UPDATE_HISTO);
1170
1171 debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at);
1172 debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug,
1173 &mmu_stats.set_pte_at_batched);
1174 debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug,
1175 &mmu_stats.set_pte_at_current);
1176 debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug,
1177 &mmu_stats.set_pte_at_kernel);
1178
1179 debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit);
1180 debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
1181 &mmu_stats.prot_commit_batched);
1182
1183 return 0;
1184}
1185fs_initcall(xen_mmu_debugfs);
1186
1187#endif /* CONFIG_XEN_DEBUG_FS */