aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/xen/mmu.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/xen/mmu.c')
-rw-r--r--arch/x86/xen/mmu.c314
1 files changed, 263 insertions, 51 deletions
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index aa37469da696..ae173f6edd8b 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -40,6 +40,7 @@
40 */ 40 */
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/highmem.h> 42#include <linux/highmem.h>
43#include <linux/debugfs.h>
43#include <linux/bug.h> 44#include <linux/bug.h>
44 45
45#include <asm/pgtable.h> 46#include <asm/pgtable.h>
@@ -57,6 +58,61 @@
57 58
58#include "multicalls.h" 59#include "multicalls.h"
59#include "mmu.h" 60#include "mmu.h"
61#include "debugfs.h"
62
63#define MMU_UPDATE_HISTO 30
64
65#ifdef CONFIG_XEN_DEBUG_FS
66
67static struct {
68 u32 pgd_update;
69 u32 pgd_update_pinned;
70 u32 pgd_update_batched;
71
72 u32 pud_update;
73 u32 pud_update_pinned;
74 u32 pud_update_batched;
75
76 u32 pmd_update;
77 u32 pmd_update_pinned;
78 u32 pmd_update_batched;
79
80 u32 pte_update;
81 u32 pte_update_pinned;
82 u32 pte_update_batched;
83
84 u32 mmu_update;
85 u32 mmu_update_extended;
86 u32 mmu_update_histo[MMU_UPDATE_HISTO];
87
88 u32 prot_commit;
89 u32 prot_commit_batched;
90
91 u32 set_pte_at;
92 u32 set_pte_at_batched;
93 u32 set_pte_at_pinned;
94 u32 set_pte_at_current;
95 u32 set_pte_at_kernel;
96} mmu_stats;
97
98static u8 zero_stats;
99
100static inline void check_zero(void)
101{
102 if (unlikely(zero_stats)) {
103 memset(&mmu_stats, 0, sizeof(mmu_stats));
104 zero_stats = 0;
105 }
106}
107
108#define ADD_STATS(elem, val) \
109 do { check_zero(); mmu_stats.elem += (val); } while(0)
110
111#else /* !CONFIG_XEN_DEBUG_FS */
112
113#define ADD_STATS(elem, val) do { (void)(val); } while(0)
114
115#endif /* CONFIG_XEN_DEBUG_FS */
60 116
61/* 117/*
62 * Just beyond the highest usermode address. STACK_TOP_MAX has a 118 * Just beyond the highest usermode address. STACK_TOP_MAX has a
@@ -229,25 +285,35 @@ void make_lowmem_page_readwrite(void *vaddr)
229} 285}
230 286
231 287
232static bool page_pinned(void *ptr) 288static bool xen_page_pinned(void *ptr)
233{ 289{
234 struct page *page = virt_to_page(ptr); 290 struct page *page = virt_to_page(ptr);
235 291
236 return PagePinned(page); 292 return PagePinned(page);
237} 293}
238 294
239static void extend_mmu_update(const struct mmu_update *update) 295static void xen_extend_mmu_update(const struct mmu_update *update)
240{ 296{
241 struct multicall_space mcs; 297 struct multicall_space mcs;
242 struct mmu_update *u; 298 struct mmu_update *u;
243 299
244 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u)); 300 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
245 301
246 if (mcs.mc != NULL) 302 if (mcs.mc != NULL) {
303 ADD_STATS(mmu_update_extended, 1);
304 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1);
305
247 mcs.mc->args[1]++; 306 mcs.mc->args[1]++;
248 else { 307
308 if (mcs.mc->args[1] < MMU_UPDATE_HISTO)
309 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1);
310 else
311 ADD_STATS(mmu_update_histo[0], 1);
312 } else {
313 ADD_STATS(mmu_update, 1);
249 mcs = __xen_mc_entry(sizeof(*u)); 314 mcs = __xen_mc_entry(sizeof(*u));
250 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); 315 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
316 ADD_STATS(mmu_update_histo[1], 1);
251 } 317 }
252 318
253 u = mcs.args; 319 u = mcs.args;
@@ -265,7 +331,9 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
265 /* ptr may be ioremapped for 64-bit pagetable setup */ 331 /* ptr may be ioremapped for 64-bit pagetable setup */
266 u.ptr = arbitrary_virt_to_machine(ptr).maddr; 332 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
267 u.val = pmd_val_ma(val); 333 u.val = pmd_val_ma(val);
268 extend_mmu_update(&u); 334 xen_extend_mmu_update(&u);
335
336 ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
269 337
270 xen_mc_issue(PARAVIRT_LAZY_MMU); 338 xen_mc_issue(PARAVIRT_LAZY_MMU);
271 339
@@ -274,13 +342,17 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
274 342
275void xen_set_pmd(pmd_t *ptr, pmd_t val) 343void xen_set_pmd(pmd_t *ptr, pmd_t val)
276{ 344{
345 ADD_STATS(pmd_update, 1);
346
277 /* If page is not pinned, we can just update the entry 347 /* If page is not pinned, we can just update the entry
278 directly */ 348 directly */
279 if (!page_pinned(ptr)) { 349 if (!xen_page_pinned(ptr)) {
280 *ptr = val; 350 *ptr = val;
281 return; 351 return;
282 } 352 }
283 353
354 ADD_STATS(pmd_update_pinned, 1);
355
284 xen_set_pmd_hyper(ptr, val); 356 xen_set_pmd_hyper(ptr, val);
285} 357}
286 358
@@ -300,12 +372,18 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
300 if (mm == &init_mm) 372 if (mm == &init_mm)
301 preempt_disable(); 373 preempt_disable();
302 374
375 ADD_STATS(set_pte_at, 1);
376// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
377 ADD_STATS(set_pte_at_current, mm == current->mm);
378 ADD_STATS(set_pte_at_kernel, mm == &init_mm);
379
303 if (mm == current->mm || mm == &init_mm) { 380 if (mm == current->mm || mm == &init_mm) {
304 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { 381 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
305 struct multicall_space mcs; 382 struct multicall_space mcs;
306 mcs = xen_mc_entry(0); 383 mcs = xen_mc_entry(0);
307 384
308 MULTI_update_va_mapping(mcs.mc, addr, pteval, 0); 385 MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
386 ADD_STATS(set_pte_at_batched, 1);
309 xen_mc_issue(PARAVIRT_LAZY_MMU); 387 xen_mc_issue(PARAVIRT_LAZY_MMU);
310 goto out; 388 goto out;
311 } else 389 } else
@@ -334,7 +412,10 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
334 412
335 u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; 413 u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
336 u.val = pte_val_ma(pte); 414 u.val = pte_val_ma(pte);
337 extend_mmu_update(&u); 415 xen_extend_mmu_update(&u);
416
417 ADD_STATS(prot_commit, 1);
418 ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
338 419
339 xen_mc_issue(PARAVIRT_LAZY_MMU); 420 xen_mc_issue(PARAVIRT_LAZY_MMU);
340} 421}
@@ -400,7 +481,9 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
400 /* ptr may be ioremapped for 64-bit pagetable setup */ 481 /* ptr may be ioremapped for 64-bit pagetable setup */
401 u.ptr = arbitrary_virt_to_machine(ptr).maddr; 482 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
402 u.val = pud_val_ma(val); 483 u.val = pud_val_ma(val);
403 extend_mmu_update(&u); 484 xen_extend_mmu_update(&u);
485
486 ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
404 487
405 xen_mc_issue(PARAVIRT_LAZY_MMU); 488 xen_mc_issue(PARAVIRT_LAZY_MMU);
406 489
@@ -409,18 +492,26 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
409 492
410void xen_set_pud(pud_t *ptr, pud_t val) 493void xen_set_pud(pud_t *ptr, pud_t val)
411{ 494{
495 ADD_STATS(pud_update, 1);
496
412 /* If page is not pinned, we can just update the entry 497 /* If page is not pinned, we can just update the entry
413 directly */ 498 directly */
414 if (!page_pinned(ptr)) { 499 if (!xen_page_pinned(ptr)) {
415 *ptr = val; 500 *ptr = val;
416 return; 501 return;
417 } 502 }
418 503
504 ADD_STATS(pud_update_pinned, 1);
505
419 xen_set_pud_hyper(ptr, val); 506 xen_set_pud_hyper(ptr, val);
420} 507}
421 508
422void xen_set_pte(pte_t *ptep, pte_t pte) 509void xen_set_pte(pte_t *ptep, pte_t pte)
423{ 510{
511 ADD_STATS(pte_update, 1);
512// ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
513 ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
514
424#ifdef CONFIG_X86_PAE 515#ifdef CONFIG_X86_PAE
425 ptep->pte_high = pte.pte_high; 516 ptep->pte_high = pte.pte_high;
426 smp_wmb(); 517 smp_wmb();
@@ -490,7 +581,7 @@ static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
490 581
491 u.ptr = virt_to_machine(ptr).maddr; 582 u.ptr = virt_to_machine(ptr).maddr;
492 u.val = pgd_val_ma(val); 583 u.val = pgd_val_ma(val);
493 extend_mmu_update(&u); 584 xen_extend_mmu_update(&u);
494} 585}
495 586
496/* 587/*
@@ -517,17 +608,22 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val)
517{ 608{
518 pgd_t *user_ptr = xen_get_user_pgd(ptr); 609 pgd_t *user_ptr = xen_get_user_pgd(ptr);
519 610
611 ADD_STATS(pgd_update, 1);
612
520 /* If page is not pinned, we can just update the entry 613 /* If page is not pinned, we can just update the entry
521 directly */ 614 directly */
522 if (!page_pinned(ptr)) { 615 if (!xen_page_pinned(ptr)) {
523 *ptr = val; 616 *ptr = val;
524 if (user_ptr) { 617 if (user_ptr) {
525 WARN_ON(page_pinned(user_ptr)); 618 WARN_ON(xen_page_pinned(user_ptr));
526 *user_ptr = val; 619 *user_ptr = val;
527 } 620 }
528 return; 621 return;
529 } 622 }
530 623
624 ADD_STATS(pgd_update_pinned, 1);
625 ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
626
531 /* If it's pinned, then we can at least batch the kernel and 627 /* If it's pinned, then we can at least batch the kernel and
532 user updates together. */ 628 user updates together. */
533 xen_mc_batch(); 629 xen_mc_batch();
@@ -555,9 +651,12 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val)
555 * For 64-bit, we must skip the Xen hole in the middle of the address 651 * For 64-bit, we must skip the Xen hole in the middle of the address
556 * space, just after the big x86-64 virtual hole. 652 * space, just after the big x86-64 virtual hole.
557 */ 653 */
558static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), 654static int xen_pgd_walk(struct mm_struct *mm,
559 unsigned long limit) 655 int (*func)(struct mm_struct *mm, struct page *,
656 enum pt_level),
657 unsigned long limit)
560{ 658{
659 pgd_t *pgd = mm->pgd;
561 int flush = 0; 660 int flush = 0;
562 unsigned hole_low, hole_high; 661 unsigned hole_low, hole_high;
563 unsigned pgdidx_limit, pudidx_limit, pmdidx_limit; 662 unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
@@ -590,8 +689,6 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
590 pmdidx_limit = 0; 689 pmdidx_limit = 0;
591#endif 690#endif
592 691
593 flush |= (*func)(virt_to_page(pgd), PT_PGD);
594
595 for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) { 692 for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
596 pud_t *pud; 693 pud_t *pud;
597 694
@@ -604,7 +701,7 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
604 pud = pud_offset(&pgd[pgdidx], 0); 701 pud = pud_offset(&pgd[pgdidx], 0);
605 702
606 if (PTRS_PER_PUD > 1) /* not folded */ 703 if (PTRS_PER_PUD > 1) /* not folded */
607 flush |= (*func)(virt_to_page(pud), PT_PUD); 704 flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
608 705
609 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) { 706 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
610 pmd_t *pmd; 707 pmd_t *pmd;
@@ -619,7 +716,7 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
619 pmd = pmd_offset(&pud[pudidx], 0); 716 pmd = pmd_offset(&pud[pudidx], 0);
620 717
621 if (PTRS_PER_PMD > 1) /* not folded */ 718 if (PTRS_PER_PMD > 1) /* not folded */
622 flush |= (*func)(virt_to_page(pmd), PT_PMD); 719 flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
623 720
624 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) { 721 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
625 struct page *pte; 722 struct page *pte;
@@ -633,28 +730,34 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
633 continue; 730 continue;
634 731
635 pte = pmd_page(pmd[pmdidx]); 732 pte = pmd_page(pmd[pmdidx]);
636 flush |= (*func)(pte, PT_PTE); 733 flush |= (*func)(mm, pte, PT_PTE);
637 } 734 }
638 } 735 }
639 } 736 }
737
640out: 738out:
739 /* Do the top level last, so that the callbacks can use it as
740 a cue to do final things like tlb flushes. */
741 flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
641 742
642 return flush; 743 return flush;
643} 744}
644 745
645static spinlock_t *lock_pte(struct page *page) 746/* If we're using split pte locks, then take the page's lock and
747 return a pointer to it. Otherwise return NULL. */
748static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
646{ 749{
647 spinlock_t *ptl = NULL; 750 spinlock_t *ptl = NULL;
648 751
649#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS 752#if USE_SPLIT_PTLOCKS
650 ptl = __pte_lockptr(page); 753 ptl = __pte_lockptr(page);
651 spin_lock(ptl); 754 spin_lock_nest_lock(ptl, &mm->page_table_lock);
652#endif 755#endif
653 756
654 return ptl; 757 return ptl;
655} 758}
656 759
657static void do_unlock(void *v) 760static void xen_pte_unlock(void *v)
658{ 761{
659 spinlock_t *ptl = v; 762 spinlock_t *ptl = v;
660 spin_unlock(ptl); 763 spin_unlock(ptl);
@@ -672,7 +775,8 @@ static void xen_do_pin(unsigned level, unsigned long pfn)
672 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 775 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
673} 776}
674 777
675static int pin_page(struct page *page, enum pt_level level) 778static int xen_pin_page(struct mm_struct *mm, struct page *page,
779 enum pt_level level)
676{ 780{
677 unsigned pgfl = TestSetPagePinned(page); 781 unsigned pgfl = TestSetPagePinned(page);
678 int flush; 782 int flush;
@@ -691,21 +795,40 @@ static int pin_page(struct page *page, enum pt_level level)
691 795
692 flush = 0; 796 flush = 0;
693 797
798 /*
799 * We need to hold the pagetable lock between the time
800 * we make the pagetable RO and when we actually pin
801 * it. If we don't, then other users may come in and
802 * attempt to update the pagetable by writing it,
803 * which will fail because the memory is RO but not
804 * pinned, so Xen won't do the trap'n'emulate.
805 *
806 * If we're using split pte locks, we can't hold the
807 * entire pagetable's worth of locks during the
808 * traverse, because we may wrap the preempt count (8
809 * bits). The solution is to mark RO and pin each PTE
810 * page while holding the lock. This means the number
811 * of locks we end up holding is never more than a
812 * batch size (~32 entries, at present).
813 *
814 * If we're not using split pte locks, we needn't pin
815 * the PTE pages independently, because we're
816 * protected by the overall pagetable lock.
817 */
694 ptl = NULL; 818 ptl = NULL;
695 if (level == PT_PTE) 819 if (level == PT_PTE)
696 ptl = lock_pte(page); 820 ptl = xen_pte_lock(page, mm);
697 821
698 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, 822 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
699 pfn_pte(pfn, PAGE_KERNEL_RO), 823 pfn_pte(pfn, PAGE_KERNEL_RO),
700 level == PT_PGD ? UVMF_TLB_FLUSH : 0); 824 level == PT_PGD ? UVMF_TLB_FLUSH : 0);
701 825
702 if (level == PT_PTE) 826 if (ptl) {
703 xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn); 827 xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
704 828
705 if (ptl) {
706 /* Queue a deferred unlock for when this batch 829 /* Queue a deferred unlock for when this batch
707 is completed. */ 830 is completed. */
708 xen_mc_callback(do_unlock, ptl); 831 xen_mc_callback(xen_pte_unlock, ptl);
709 } 832 }
710 } 833 }
711 834
@@ -715,11 +838,11 @@ static int pin_page(struct page *page, enum pt_level level)
715/* This is called just after a mm has been created, but it has not 838/* This is called just after a mm has been created, but it has not
716 been used yet. We need to make sure that its pagetable is all 839 been used yet. We need to make sure that its pagetable is all
717 read-only, and can be pinned. */ 840 read-only, and can be pinned. */
718void xen_pgd_pin(pgd_t *pgd) 841static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
719{ 842{
720 xen_mc_batch(); 843 xen_mc_batch();
721 844
722 if (pgd_walk(pgd, pin_page, USER_LIMIT)) { 845 if (xen_pgd_walk(mm, xen_pin_page, USER_LIMIT)) {
723 /* re-enable interrupts for kmap_flush_unused */ 846 /* re-enable interrupts for kmap_flush_unused */
724 xen_mc_issue(0); 847 xen_mc_issue(0);
725 kmap_flush_unused(); 848 kmap_flush_unused();
@@ -733,25 +856,35 @@ void xen_pgd_pin(pgd_t *pgd)
733 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd))); 856 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
734 857
735 if (user_pgd) { 858 if (user_pgd) {
736 pin_page(virt_to_page(user_pgd), PT_PGD); 859 xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
737 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd))); 860 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
738 } 861 }
739 } 862 }
740#else /* CONFIG_X86_32 */ 863#else /* CONFIG_X86_32 */
741#ifdef CONFIG_X86_PAE 864#ifdef CONFIG_X86_PAE
742 /* Need to make sure unshared kernel PMD is pinnable */ 865 /* Need to make sure unshared kernel PMD is pinnable */
743 pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); 866 xen_pin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])),
867 PT_PMD);
744#endif 868#endif
745 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); 869 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
746#endif /* CONFIG_X86_64 */ 870#endif /* CONFIG_X86_64 */
747 xen_mc_issue(0); 871 xen_mc_issue(0);
748} 872}
749 873
874static void xen_pgd_pin(struct mm_struct *mm)
875{
876 __xen_pgd_pin(mm, mm->pgd);
877}
878
750/* 879/*
751 * On save, we need to pin all pagetables to make sure they get their 880 * On save, we need to pin all pagetables to make sure they get their
752 * mfns turned into pfns. Search the list for any unpinned pgds and pin 881 * mfns turned into pfns. Search the list for any unpinned pgds and pin
753 * them (unpinned pgds are not currently in use, probably because the 882 * them (unpinned pgds are not currently in use, probably because the
754 * process is under construction or destruction). 883 * process is under construction or destruction).
884 *
885 * Expected to be called in stop_machine() ("equivalent to taking
886 * every spinlock in the system"), so the locking doesn't really
887 * matter all that much.
755 */ 888 */
756void xen_mm_pin_all(void) 889void xen_mm_pin_all(void)
757{ 890{
@@ -762,7 +895,7 @@ void xen_mm_pin_all(void)
762 895
763 list_for_each_entry(page, &pgd_list, lru) { 896 list_for_each_entry(page, &pgd_list, lru) {
764 if (!PagePinned(page)) { 897 if (!PagePinned(page)) {
765 xen_pgd_pin((pgd_t *)page_address(page)); 898 __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
766 SetPageSavePinned(page); 899 SetPageSavePinned(page);
767 } 900 }
768 } 901 }
@@ -775,7 +908,8 @@ void xen_mm_pin_all(void)
775 * that's before we have page structures to store the bits. So do all 908 * that's before we have page structures to store the bits. So do all
776 * the book-keeping now. 909 * the book-keeping now.
777 */ 910 */
778static __init int mark_pinned(struct page *page, enum pt_level level) 911static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
912 enum pt_level level)
779{ 913{
780 SetPagePinned(page); 914 SetPagePinned(page);
781 return 0; 915 return 0;
@@ -783,10 +917,11 @@ static __init int mark_pinned(struct page *page, enum pt_level level)
783 917
784void __init xen_mark_init_mm_pinned(void) 918void __init xen_mark_init_mm_pinned(void)
785{ 919{
786 pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP); 920 xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
787} 921}
788 922
789static int unpin_page(struct page *page, enum pt_level level) 923static int xen_unpin_page(struct mm_struct *mm, struct page *page,
924 enum pt_level level)
790{ 925{
791 unsigned pgfl = TestClearPagePinned(page); 926 unsigned pgfl = TestClearPagePinned(page);
792 927
@@ -796,10 +931,18 @@ static int unpin_page(struct page *page, enum pt_level level)
796 spinlock_t *ptl = NULL; 931 spinlock_t *ptl = NULL;
797 struct multicall_space mcs; 932 struct multicall_space mcs;
798 933
934 /*
935 * Do the converse to pin_page. If we're using split
936 * pte locks, we must be holding the lock for while
937 * the pte page is unpinned but still RO to prevent
938 * concurrent updates from seeing it in this
939 * partially-pinned state.
940 */
799 if (level == PT_PTE) { 941 if (level == PT_PTE) {
800 ptl = lock_pte(page); 942 ptl = xen_pte_lock(page, mm);
801 943
802 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); 944 if (ptl)
945 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
803 } 946 }
804 947
805 mcs = __xen_mc_entry(0); 948 mcs = __xen_mc_entry(0);
@@ -810,7 +953,7 @@ static int unpin_page(struct page *page, enum pt_level level)
810 953
811 if (ptl) { 954 if (ptl) {
812 /* unlock when batch completed */ 955 /* unlock when batch completed */
813 xen_mc_callback(do_unlock, ptl); 956 xen_mc_callback(xen_pte_unlock, ptl);
814 } 957 }
815 } 958 }
816 959
@@ -818,7 +961,7 @@ static int unpin_page(struct page *page, enum pt_level level)
818} 961}
819 962
820/* Release a pagetables pages back as normal RW */ 963/* Release a pagetables pages back as normal RW */
821static void xen_pgd_unpin(pgd_t *pgd) 964static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
822{ 965{
823 xen_mc_batch(); 966 xen_mc_batch();
824 967
@@ -830,21 +973,27 @@ static void xen_pgd_unpin(pgd_t *pgd)
830 973
831 if (user_pgd) { 974 if (user_pgd) {
832 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd))); 975 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd)));
833 unpin_page(virt_to_page(user_pgd), PT_PGD); 976 xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
834 } 977 }
835 } 978 }
836#endif 979#endif
837 980
838#ifdef CONFIG_X86_PAE 981#ifdef CONFIG_X86_PAE
839 /* Need to make sure unshared kernel PMD is unpinned */ 982 /* Need to make sure unshared kernel PMD is unpinned */
840 pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); 983 xen_unpin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])),
984 PT_PMD);
841#endif 985#endif
842 986
843 pgd_walk(pgd, unpin_page, USER_LIMIT); 987 xen_pgd_walk(mm, xen_unpin_page, USER_LIMIT);
844 988
845 xen_mc_issue(0); 989 xen_mc_issue(0);
846} 990}
847 991
992static void xen_pgd_unpin(struct mm_struct *mm)
993{
994 __xen_pgd_unpin(mm, mm->pgd);
995}
996
848/* 997/*
849 * On resume, undo any pinning done at save, so that the rest of the 998 * On resume, undo any pinning done at save, so that the rest of the
850 * kernel doesn't see any unexpected pinned pagetables. 999 * kernel doesn't see any unexpected pinned pagetables.
@@ -859,7 +1008,7 @@ void xen_mm_unpin_all(void)
859 list_for_each_entry(page, &pgd_list, lru) { 1008 list_for_each_entry(page, &pgd_list, lru) {
860 if (PageSavePinned(page)) { 1009 if (PageSavePinned(page)) {
861 BUG_ON(!PagePinned(page)); 1010 BUG_ON(!PagePinned(page));
862 xen_pgd_unpin((pgd_t *)page_address(page)); 1011 __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
863 ClearPageSavePinned(page); 1012 ClearPageSavePinned(page);
864 } 1013 }
865 } 1014 }
@@ -870,14 +1019,14 @@ void xen_mm_unpin_all(void)
870void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) 1019void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
871{ 1020{
872 spin_lock(&next->page_table_lock); 1021 spin_lock(&next->page_table_lock);
873 xen_pgd_pin(next->pgd); 1022 xen_pgd_pin(next);
874 spin_unlock(&next->page_table_lock); 1023 spin_unlock(&next->page_table_lock);
875} 1024}
876 1025
877void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) 1026void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
878{ 1027{
879 spin_lock(&mm->page_table_lock); 1028 spin_lock(&mm->page_table_lock);
880 xen_pgd_pin(mm->pgd); 1029 xen_pgd_pin(mm);
881 spin_unlock(&mm->page_table_lock); 1030 spin_unlock(&mm->page_table_lock);
882} 1031}
883 1032
@@ -907,7 +1056,7 @@ static void drop_other_mm_ref(void *info)
907 } 1056 }
908} 1057}
909 1058
910static void drop_mm_ref(struct mm_struct *mm) 1059static void xen_drop_mm_ref(struct mm_struct *mm)
911{ 1060{
912 cpumask_t mask; 1061 cpumask_t mask;
913 unsigned cpu; 1062 unsigned cpu;
@@ -937,7 +1086,7 @@ static void drop_mm_ref(struct mm_struct *mm)
937 smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); 1086 smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
938} 1087}
939#else 1088#else
940static void drop_mm_ref(struct mm_struct *mm) 1089static void xen_drop_mm_ref(struct mm_struct *mm)
941{ 1090{
942 if (current->active_mm == mm) 1091 if (current->active_mm == mm)
943 load_cr3(swapper_pg_dir); 1092 load_cr3(swapper_pg_dir);
@@ -961,14 +1110,77 @@ static void drop_mm_ref(struct mm_struct *mm)
961void xen_exit_mmap(struct mm_struct *mm) 1110void xen_exit_mmap(struct mm_struct *mm)
962{ 1111{
963 get_cpu(); /* make sure we don't move around */ 1112 get_cpu(); /* make sure we don't move around */
964 drop_mm_ref(mm); 1113 xen_drop_mm_ref(mm);
965 put_cpu(); 1114 put_cpu();
966 1115
967 spin_lock(&mm->page_table_lock); 1116 spin_lock(&mm->page_table_lock);
968 1117
969 /* pgd may not be pinned in the error exit path of execve */ 1118 /* pgd may not be pinned in the error exit path of execve */
970 if (page_pinned(mm->pgd)) 1119 if (xen_page_pinned(mm->pgd))
971 xen_pgd_unpin(mm->pgd); 1120 xen_pgd_unpin(mm);
972 1121
973 spin_unlock(&mm->page_table_lock); 1122 spin_unlock(&mm->page_table_lock);
974} 1123}
1124
1125#ifdef CONFIG_XEN_DEBUG_FS
1126
1127static struct dentry *d_mmu_debug;
1128
1129static int __init xen_mmu_debugfs(void)
1130{
1131 struct dentry *d_xen = xen_init_debugfs();
1132
1133 if (d_xen == NULL)
1134 return -ENOMEM;
1135
1136 d_mmu_debug = debugfs_create_dir("mmu", d_xen);
1137
1138 debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats);
1139
1140 debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update);
1141 debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug,
1142 &mmu_stats.pgd_update_pinned);
1143 debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug,
1144 &mmu_stats.pgd_update_pinned);
1145
1146 debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update);
1147 debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug,
1148 &mmu_stats.pud_update_pinned);
1149 debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug,
1150 &mmu_stats.pud_update_pinned);
1151
1152 debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update);
1153 debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug,
1154 &mmu_stats.pmd_update_pinned);
1155 debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug,
1156 &mmu_stats.pmd_update_pinned);
1157
1158 debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update);
1159// debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug,
1160// &mmu_stats.pte_update_pinned);
1161 debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug,
1162 &mmu_stats.pte_update_pinned);
1163
1164 debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update);
1165 debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug,
1166 &mmu_stats.mmu_update_extended);
1167 xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug,
1168 mmu_stats.mmu_update_histo, 20);
1169
1170 debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at);
1171 debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug,
1172 &mmu_stats.set_pte_at_batched);
1173 debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug,
1174 &mmu_stats.set_pte_at_current);
1175 debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug,
1176 &mmu_stats.set_pte_at_kernel);
1177
1178 debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit);
1179 debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
1180 &mmu_stats.prot_commit_batched);
1181
1182 return 0;
1183}
1184fs_initcall(xen_mmu_debugfs);
1185
1186#endif /* CONFIG_XEN_DEBUG_FS */