diff options
Diffstat (limited to 'arch/x86/xen/mmu.c')
-rw-r--r-- | arch/x86/xen/mmu.c | 314 |
1 files changed, 263 insertions, 51 deletions
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index aa37469da696..ae173f6edd8b 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c | |||
@@ -40,6 +40,7 @@ | |||
40 | */ | 40 | */ |
41 | #include <linux/sched.h> | 41 | #include <linux/sched.h> |
42 | #include <linux/highmem.h> | 42 | #include <linux/highmem.h> |
43 | #include <linux/debugfs.h> | ||
43 | #include <linux/bug.h> | 44 | #include <linux/bug.h> |
44 | 45 | ||
45 | #include <asm/pgtable.h> | 46 | #include <asm/pgtable.h> |
@@ -57,6 +58,61 @@ | |||
57 | 58 | ||
58 | #include "multicalls.h" | 59 | #include "multicalls.h" |
59 | #include "mmu.h" | 60 | #include "mmu.h" |
61 | #include "debugfs.h" | ||
62 | |||
63 | #define MMU_UPDATE_HISTO 30 | ||
64 | |||
65 | #ifdef CONFIG_XEN_DEBUG_FS | ||
66 | |||
67 | static struct { | ||
68 | u32 pgd_update; | ||
69 | u32 pgd_update_pinned; | ||
70 | u32 pgd_update_batched; | ||
71 | |||
72 | u32 pud_update; | ||
73 | u32 pud_update_pinned; | ||
74 | u32 pud_update_batched; | ||
75 | |||
76 | u32 pmd_update; | ||
77 | u32 pmd_update_pinned; | ||
78 | u32 pmd_update_batched; | ||
79 | |||
80 | u32 pte_update; | ||
81 | u32 pte_update_pinned; | ||
82 | u32 pte_update_batched; | ||
83 | |||
84 | u32 mmu_update; | ||
85 | u32 mmu_update_extended; | ||
86 | u32 mmu_update_histo[MMU_UPDATE_HISTO]; | ||
87 | |||
88 | u32 prot_commit; | ||
89 | u32 prot_commit_batched; | ||
90 | |||
91 | u32 set_pte_at; | ||
92 | u32 set_pte_at_batched; | ||
93 | u32 set_pte_at_pinned; | ||
94 | u32 set_pte_at_current; | ||
95 | u32 set_pte_at_kernel; | ||
96 | } mmu_stats; | ||
97 | |||
98 | static u8 zero_stats; | ||
99 | |||
100 | static inline void check_zero(void) | ||
101 | { | ||
102 | if (unlikely(zero_stats)) { | ||
103 | memset(&mmu_stats, 0, sizeof(mmu_stats)); | ||
104 | zero_stats = 0; | ||
105 | } | ||
106 | } | ||
107 | |||
108 | #define ADD_STATS(elem, val) \ | ||
109 | do { check_zero(); mmu_stats.elem += (val); } while(0) | ||
110 | |||
111 | #else /* !CONFIG_XEN_DEBUG_FS */ | ||
112 | |||
113 | #define ADD_STATS(elem, val) do { (void)(val); } while(0) | ||
114 | |||
115 | #endif /* CONFIG_XEN_DEBUG_FS */ | ||
60 | 116 | ||
61 | /* | 117 | /* |
62 | * Just beyond the highest usermode address. STACK_TOP_MAX has a | 118 | * Just beyond the highest usermode address. STACK_TOP_MAX has a |
@@ -229,25 +285,35 @@ void make_lowmem_page_readwrite(void *vaddr) | |||
229 | } | 285 | } |
230 | 286 | ||
231 | 287 | ||
232 | static bool page_pinned(void *ptr) | 288 | static bool xen_page_pinned(void *ptr) |
233 | { | 289 | { |
234 | struct page *page = virt_to_page(ptr); | 290 | struct page *page = virt_to_page(ptr); |
235 | 291 | ||
236 | return PagePinned(page); | 292 | return PagePinned(page); |
237 | } | 293 | } |
238 | 294 | ||
239 | static void extend_mmu_update(const struct mmu_update *update) | 295 | static void xen_extend_mmu_update(const struct mmu_update *update) |
240 | { | 296 | { |
241 | struct multicall_space mcs; | 297 | struct multicall_space mcs; |
242 | struct mmu_update *u; | 298 | struct mmu_update *u; |
243 | 299 | ||
244 | mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u)); | 300 | mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u)); |
245 | 301 | ||
246 | if (mcs.mc != NULL) | 302 | if (mcs.mc != NULL) { |
303 | ADD_STATS(mmu_update_extended, 1); | ||
304 | ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1); | ||
305 | |||
247 | mcs.mc->args[1]++; | 306 | mcs.mc->args[1]++; |
248 | else { | 307 | |
308 | if (mcs.mc->args[1] < MMU_UPDATE_HISTO) | ||
309 | ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1); | ||
310 | else | ||
311 | ADD_STATS(mmu_update_histo[0], 1); | ||
312 | } else { | ||
313 | ADD_STATS(mmu_update, 1); | ||
249 | mcs = __xen_mc_entry(sizeof(*u)); | 314 | mcs = __xen_mc_entry(sizeof(*u)); |
250 | MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); | 315 | MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); |
316 | ADD_STATS(mmu_update_histo[1], 1); | ||
251 | } | 317 | } |
252 | 318 | ||
253 | u = mcs.args; | 319 | u = mcs.args; |
@@ -265,7 +331,9 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) | |||
265 | /* ptr may be ioremapped for 64-bit pagetable setup */ | 331 | /* ptr may be ioremapped for 64-bit pagetable setup */ |
266 | u.ptr = arbitrary_virt_to_machine(ptr).maddr; | 332 | u.ptr = arbitrary_virt_to_machine(ptr).maddr; |
267 | u.val = pmd_val_ma(val); | 333 | u.val = pmd_val_ma(val); |
268 | extend_mmu_update(&u); | 334 | xen_extend_mmu_update(&u); |
335 | |||
336 | ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); | ||
269 | 337 | ||
270 | xen_mc_issue(PARAVIRT_LAZY_MMU); | 338 | xen_mc_issue(PARAVIRT_LAZY_MMU); |
271 | 339 | ||
@@ -274,13 +342,17 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) | |||
274 | 342 | ||
275 | void xen_set_pmd(pmd_t *ptr, pmd_t val) | 343 | void xen_set_pmd(pmd_t *ptr, pmd_t val) |
276 | { | 344 | { |
345 | ADD_STATS(pmd_update, 1); | ||
346 | |||
277 | /* If page is not pinned, we can just update the entry | 347 | /* If page is not pinned, we can just update the entry |
278 | directly */ | 348 | directly */ |
279 | if (!page_pinned(ptr)) { | 349 | if (!xen_page_pinned(ptr)) { |
280 | *ptr = val; | 350 | *ptr = val; |
281 | return; | 351 | return; |
282 | } | 352 | } |
283 | 353 | ||
354 | ADD_STATS(pmd_update_pinned, 1); | ||
355 | |||
284 | xen_set_pmd_hyper(ptr, val); | 356 | xen_set_pmd_hyper(ptr, val); |
285 | } | 357 | } |
286 | 358 | ||
@@ -300,12 +372,18 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, | |||
300 | if (mm == &init_mm) | 372 | if (mm == &init_mm) |
301 | preempt_disable(); | 373 | preempt_disable(); |
302 | 374 | ||
375 | ADD_STATS(set_pte_at, 1); | ||
376 | // ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep)); | ||
377 | ADD_STATS(set_pte_at_current, mm == current->mm); | ||
378 | ADD_STATS(set_pte_at_kernel, mm == &init_mm); | ||
379 | |||
303 | if (mm == current->mm || mm == &init_mm) { | 380 | if (mm == current->mm || mm == &init_mm) { |
304 | if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { | 381 | if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { |
305 | struct multicall_space mcs; | 382 | struct multicall_space mcs; |
306 | mcs = xen_mc_entry(0); | 383 | mcs = xen_mc_entry(0); |
307 | 384 | ||
308 | MULTI_update_va_mapping(mcs.mc, addr, pteval, 0); | 385 | MULTI_update_va_mapping(mcs.mc, addr, pteval, 0); |
386 | ADD_STATS(set_pte_at_batched, 1); | ||
309 | xen_mc_issue(PARAVIRT_LAZY_MMU); | 387 | xen_mc_issue(PARAVIRT_LAZY_MMU); |
310 | goto out; | 388 | goto out; |
311 | } else | 389 | } else |
@@ -334,7 +412,10 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, | |||
334 | 412 | ||
335 | u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; | 413 | u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; |
336 | u.val = pte_val_ma(pte); | 414 | u.val = pte_val_ma(pte); |
337 | extend_mmu_update(&u); | 415 | xen_extend_mmu_update(&u); |
416 | |||
417 | ADD_STATS(prot_commit, 1); | ||
418 | ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); | ||
338 | 419 | ||
339 | xen_mc_issue(PARAVIRT_LAZY_MMU); | 420 | xen_mc_issue(PARAVIRT_LAZY_MMU); |
340 | } | 421 | } |
@@ -400,7 +481,9 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val) | |||
400 | /* ptr may be ioremapped for 64-bit pagetable setup */ | 481 | /* ptr may be ioremapped for 64-bit pagetable setup */ |
401 | u.ptr = arbitrary_virt_to_machine(ptr).maddr; | 482 | u.ptr = arbitrary_virt_to_machine(ptr).maddr; |
402 | u.val = pud_val_ma(val); | 483 | u.val = pud_val_ma(val); |
403 | extend_mmu_update(&u); | 484 | xen_extend_mmu_update(&u); |
485 | |||
486 | ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); | ||
404 | 487 | ||
405 | xen_mc_issue(PARAVIRT_LAZY_MMU); | 488 | xen_mc_issue(PARAVIRT_LAZY_MMU); |
406 | 489 | ||
@@ -409,18 +492,26 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val) | |||
409 | 492 | ||
410 | void xen_set_pud(pud_t *ptr, pud_t val) | 493 | void xen_set_pud(pud_t *ptr, pud_t val) |
411 | { | 494 | { |
495 | ADD_STATS(pud_update, 1); | ||
496 | |||
412 | /* If page is not pinned, we can just update the entry | 497 | /* If page is not pinned, we can just update the entry |
413 | directly */ | 498 | directly */ |
414 | if (!page_pinned(ptr)) { | 499 | if (!xen_page_pinned(ptr)) { |
415 | *ptr = val; | 500 | *ptr = val; |
416 | return; | 501 | return; |
417 | } | 502 | } |
418 | 503 | ||
504 | ADD_STATS(pud_update_pinned, 1); | ||
505 | |||
419 | xen_set_pud_hyper(ptr, val); | 506 | xen_set_pud_hyper(ptr, val); |
420 | } | 507 | } |
421 | 508 | ||
422 | void xen_set_pte(pte_t *ptep, pte_t pte) | 509 | void xen_set_pte(pte_t *ptep, pte_t pte) |
423 | { | 510 | { |
511 | ADD_STATS(pte_update, 1); | ||
512 | // ADD_STATS(pte_update_pinned, xen_page_pinned(ptep)); | ||
513 | ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); | ||
514 | |||
424 | #ifdef CONFIG_X86_PAE | 515 | #ifdef CONFIG_X86_PAE |
425 | ptep->pte_high = pte.pte_high; | 516 | ptep->pte_high = pte.pte_high; |
426 | smp_wmb(); | 517 | smp_wmb(); |
@@ -490,7 +581,7 @@ static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) | |||
490 | 581 | ||
491 | u.ptr = virt_to_machine(ptr).maddr; | 582 | u.ptr = virt_to_machine(ptr).maddr; |
492 | u.val = pgd_val_ma(val); | 583 | u.val = pgd_val_ma(val); |
493 | extend_mmu_update(&u); | 584 | xen_extend_mmu_update(&u); |
494 | } | 585 | } |
495 | 586 | ||
496 | /* | 587 | /* |
@@ -517,17 +608,22 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val) | |||
517 | { | 608 | { |
518 | pgd_t *user_ptr = xen_get_user_pgd(ptr); | 609 | pgd_t *user_ptr = xen_get_user_pgd(ptr); |
519 | 610 | ||
611 | ADD_STATS(pgd_update, 1); | ||
612 | |||
520 | /* If page is not pinned, we can just update the entry | 613 | /* If page is not pinned, we can just update the entry |
521 | directly */ | 614 | directly */ |
522 | if (!page_pinned(ptr)) { | 615 | if (!xen_page_pinned(ptr)) { |
523 | *ptr = val; | 616 | *ptr = val; |
524 | if (user_ptr) { | 617 | if (user_ptr) { |
525 | WARN_ON(page_pinned(user_ptr)); | 618 | WARN_ON(xen_page_pinned(user_ptr)); |
526 | *user_ptr = val; | 619 | *user_ptr = val; |
527 | } | 620 | } |
528 | return; | 621 | return; |
529 | } | 622 | } |
530 | 623 | ||
624 | ADD_STATS(pgd_update_pinned, 1); | ||
625 | ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); | ||
626 | |||
531 | /* If it's pinned, then we can at least batch the kernel and | 627 | /* If it's pinned, then we can at least batch the kernel and |
532 | user updates together. */ | 628 | user updates together. */ |
533 | xen_mc_batch(); | 629 | xen_mc_batch(); |
@@ -555,9 +651,12 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val) | |||
555 | * For 64-bit, we must skip the Xen hole in the middle of the address | 651 | * For 64-bit, we must skip the Xen hole in the middle of the address |
556 | * space, just after the big x86-64 virtual hole. | 652 | * space, just after the big x86-64 virtual hole. |
557 | */ | 653 | */ |
558 | static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), | 654 | static int xen_pgd_walk(struct mm_struct *mm, |
559 | unsigned long limit) | 655 | int (*func)(struct mm_struct *mm, struct page *, |
656 | enum pt_level), | ||
657 | unsigned long limit) | ||
560 | { | 658 | { |
659 | pgd_t *pgd = mm->pgd; | ||
561 | int flush = 0; | 660 | int flush = 0; |
562 | unsigned hole_low, hole_high; | 661 | unsigned hole_low, hole_high; |
563 | unsigned pgdidx_limit, pudidx_limit, pmdidx_limit; | 662 | unsigned pgdidx_limit, pudidx_limit, pmdidx_limit; |
@@ -590,8 +689,6 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), | |||
590 | pmdidx_limit = 0; | 689 | pmdidx_limit = 0; |
591 | #endif | 690 | #endif |
592 | 691 | ||
593 | flush |= (*func)(virt_to_page(pgd), PT_PGD); | ||
594 | |||
595 | for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) { | 692 | for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) { |
596 | pud_t *pud; | 693 | pud_t *pud; |
597 | 694 | ||
@@ -604,7 +701,7 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), | |||
604 | pud = pud_offset(&pgd[pgdidx], 0); | 701 | pud = pud_offset(&pgd[pgdidx], 0); |
605 | 702 | ||
606 | if (PTRS_PER_PUD > 1) /* not folded */ | 703 | if (PTRS_PER_PUD > 1) /* not folded */ |
607 | flush |= (*func)(virt_to_page(pud), PT_PUD); | 704 | flush |= (*func)(mm, virt_to_page(pud), PT_PUD); |
608 | 705 | ||
609 | for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) { | 706 | for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) { |
610 | pmd_t *pmd; | 707 | pmd_t *pmd; |
@@ -619,7 +716,7 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), | |||
619 | pmd = pmd_offset(&pud[pudidx], 0); | 716 | pmd = pmd_offset(&pud[pudidx], 0); |
620 | 717 | ||
621 | if (PTRS_PER_PMD > 1) /* not folded */ | 718 | if (PTRS_PER_PMD > 1) /* not folded */ |
622 | flush |= (*func)(virt_to_page(pmd), PT_PMD); | 719 | flush |= (*func)(mm, virt_to_page(pmd), PT_PMD); |
623 | 720 | ||
624 | for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) { | 721 | for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) { |
625 | struct page *pte; | 722 | struct page *pte; |
@@ -633,28 +730,34 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), | |||
633 | continue; | 730 | continue; |
634 | 731 | ||
635 | pte = pmd_page(pmd[pmdidx]); | 732 | pte = pmd_page(pmd[pmdidx]); |
636 | flush |= (*func)(pte, PT_PTE); | 733 | flush |= (*func)(mm, pte, PT_PTE); |
637 | } | 734 | } |
638 | } | 735 | } |
639 | } | 736 | } |
737 | |||
640 | out: | 738 | out: |
739 | /* Do the top level last, so that the callbacks can use it as | ||
740 | a cue to do final things like tlb flushes. */ | ||
741 | flush |= (*func)(mm, virt_to_page(pgd), PT_PGD); | ||
641 | 742 | ||
642 | return flush; | 743 | return flush; |
643 | } | 744 | } |
644 | 745 | ||
645 | static spinlock_t *lock_pte(struct page *page) | 746 | /* If we're using split pte locks, then take the page's lock and |
747 | return a pointer to it. Otherwise return NULL. */ | ||
748 | static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm) | ||
646 | { | 749 | { |
647 | spinlock_t *ptl = NULL; | 750 | spinlock_t *ptl = NULL; |
648 | 751 | ||
649 | #if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS | 752 | #if USE_SPLIT_PTLOCKS |
650 | ptl = __pte_lockptr(page); | 753 | ptl = __pte_lockptr(page); |
651 | spin_lock(ptl); | 754 | spin_lock_nest_lock(ptl, &mm->page_table_lock); |
652 | #endif | 755 | #endif |
653 | 756 | ||
654 | return ptl; | 757 | return ptl; |
655 | } | 758 | } |
656 | 759 | ||
657 | static void do_unlock(void *v) | 760 | static void xen_pte_unlock(void *v) |
658 | { | 761 | { |
659 | spinlock_t *ptl = v; | 762 | spinlock_t *ptl = v; |
660 | spin_unlock(ptl); | 763 | spin_unlock(ptl); |
@@ -672,7 +775,8 @@ static void xen_do_pin(unsigned level, unsigned long pfn) | |||
672 | MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); | 775 | MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); |
673 | } | 776 | } |
674 | 777 | ||
675 | static int pin_page(struct page *page, enum pt_level level) | 778 | static int xen_pin_page(struct mm_struct *mm, struct page *page, |
779 | enum pt_level level) | ||
676 | { | 780 | { |
677 | unsigned pgfl = TestSetPagePinned(page); | 781 | unsigned pgfl = TestSetPagePinned(page); |
678 | int flush; | 782 | int flush; |
@@ -691,21 +795,40 @@ static int pin_page(struct page *page, enum pt_level level) | |||
691 | 795 | ||
692 | flush = 0; | 796 | flush = 0; |
693 | 797 | ||
798 | /* | ||
799 | * We need to hold the pagetable lock between the time | ||
800 | * we make the pagetable RO and when we actually pin | ||
801 | * it. If we don't, then other users may come in and | ||
802 | * attempt to update the pagetable by writing it, | ||
803 | * which will fail because the memory is RO but not | ||
804 | * pinned, so Xen won't do the trap'n'emulate. | ||
805 | * | ||
806 | * If we're using split pte locks, we can't hold the | ||
807 | * entire pagetable's worth of locks during the | ||
808 | * traverse, because we may wrap the preempt count (8 | ||
809 | * bits). The solution is to mark RO and pin each PTE | ||
810 | * page while holding the lock. This means the number | ||
811 | * of locks we end up holding is never more than a | ||
812 | * batch size (~32 entries, at present). | ||
813 | * | ||
814 | * If we're not using split pte locks, we needn't pin | ||
815 | * the PTE pages independently, because we're | ||
816 | * protected by the overall pagetable lock. | ||
817 | */ | ||
694 | ptl = NULL; | 818 | ptl = NULL; |
695 | if (level == PT_PTE) | 819 | if (level == PT_PTE) |
696 | ptl = lock_pte(page); | 820 | ptl = xen_pte_lock(page, mm); |
697 | 821 | ||
698 | MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, | 822 | MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, |
699 | pfn_pte(pfn, PAGE_KERNEL_RO), | 823 | pfn_pte(pfn, PAGE_KERNEL_RO), |
700 | level == PT_PGD ? UVMF_TLB_FLUSH : 0); | 824 | level == PT_PGD ? UVMF_TLB_FLUSH : 0); |
701 | 825 | ||
702 | if (level == PT_PTE) | 826 | if (ptl) { |
703 | xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn); | 827 | xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn); |
704 | 828 | ||
705 | if (ptl) { | ||
706 | /* Queue a deferred unlock for when this batch | 829 | /* Queue a deferred unlock for when this batch |
707 | is completed. */ | 830 | is completed. */ |
708 | xen_mc_callback(do_unlock, ptl); | 831 | xen_mc_callback(xen_pte_unlock, ptl); |
709 | } | 832 | } |
710 | } | 833 | } |
711 | 834 | ||
@@ -715,11 +838,11 @@ static int pin_page(struct page *page, enum pt_level level) | |||
715 | /* This is called just after a mm has been created, but it has not | 838 | /* This is called just after a mm has been created, but it has not |
716 | been used yet. We need to make sure that its pagetable is all | 839 | been used yet. We need to make sure that its pagetable is all |
717 | read-only, and can be pinned. */ | 840 | read-only, and can be pinned. */ |
718 | void xen_pgd_pin(pgd_t *pgd) | 841 | static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd) |
719 | { | 842 | { |
720 | xen_mc_batch(); | 843 | xen_mc_batch(); |
721 | 844 | ||
722 | if (pgd_walk(pgd, pin_page, USER_LIMIT)) { | 845 | if (xen_pgd_walk(mm, xen_pin_page, USER_LIMIT)) { |
723 | /* re-enable interrupts for kmap_flush_unused */ | 846 | /* re-enable interrupts for kmap_flush_unused */ |
724 | xen_mc_issue(0); | 847 | xen_mc_issue(0); |
725 | kmap_flush_unused(); | 848 | kmap_flush_unused(); |
@@ -733,25 +856,35 @@ void xen_pgd_pin(pgd_t *pgd) | |||
733 | xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd))); | 856 | xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd))); |
734 | 857 | ||
735 | if (user_pgd) { | 858 | if (user_pgd) { |
736 | pin_page(virt_to_page(user_pgd), PT_PGD); | 859 | xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD); |
737 | xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd))); | 860 | xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd))); |
738 | } | 861 | } |
739 | } | 862 | } |
740 | #else /* CONFIG_X86_32 */ | 863 | #else /* CONFIG_X86_32 */ |
741 | #ifdef CONFIG_X86_PAE | 864 | #ifdef CONFIG_X86_PAE |
742 | /* Need to make sure unshared kernel PMD is pinnable */ | 865 | /* Need to make sure unshared kernel PMD is pinnable */ |
743 | pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); | 866 | xen_pin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), |
867 | PT_PMD); | ||
744 | #endif | 868 | #endif |
745 | xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); | 869 | xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); |
746 | #endif /* CONFIG_X86_64 */ | 870 | #endif /* CONFIG_X86_64 */ |
747 | xen_mc_issue(0); | 871 | xen_mc_issue(0); |
748 | } | 872 | } |
749 | 873 | ||
874 | static void xen_pgd_pin(struct mm_struct *mm) | ||
875 | { | ||
876 | __xen_pgd_pin(mm, mm->pgd); | ||
877 | } | ||
878 | |||
750 | /* | 879 | /* |
751 | * On save, we need to pin all pagetables to make sure they get their | 880 | * On save, we need to pin all pagetables to make sure they get their |
752 | * mfns turned into pfns. Search the list for any unpinned pgds and pin | 881 | * mfns turned into pfns. Search the list for any unpinned pgds and pin |
753 | * them (unpinned pgds are not currently in use, probably because the | 882 | * them (unpinned pgds are not currently in use, probably because the |
754 | * process is under construction or destruction). | 883 | * process is under construction or destruction). |
884 | * | ||
885 | * Expected to be called in stop_machine() ("equivalent to taking | ||
886 | * every spinlock in the system"), so the locking doesn't really | ||
887 | * matter all that much. | ||
755 | */ | 888 | */ |
756 | void xen_mm_pin_all(void) | 889 | void xen_mm_pin_all(void) |
757 | { | 890 | { |
@@ -762,7 +895,7 @@ void xen_mm_pin_all(void) | |||
762 | 895 | ||
763 | list_for_each_entry(page, &pgd_list, lru) { | 896 | list_for_each_entry(page, &pgd_list, lru) { |
764 | if (!PagePinned(page)) { | 897 | if (!PagePinned(page)) { |
765 | xen_pgd_pin((pgd_t *)page_address(page)); | 898 | __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page)); |
766 | SetPageSavePinned(page); | 899 | SetPageSavePinned(page); |
767 | } | 900 | } |
768 | } | 901 | } |
@@ -775,7 +908,8 @@ void xen_mm_pin_all(void) | |||
775 | * that's before we have page structures to store the bits. So do all | 908 | * that's before we have page structures to store the bits. So do all |
776 | * the book-keeping now. | 909 | * the book-keeping now. |
777 | */ | 910 | */ |
778 | static __init int mark_pinned(struct page *page, enum pt_level level) | 911 | static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page, |
912 | enum pt_level level) | ||
779 | { | 913 | { |
780 | SetPagePinned(page); | 914 | SetPagePinned(page); |
781 | return 0; | 915 | return 0; |
@@ -783,10 +917,11 @@ static __init int mark_pinned(struct page *page, enum pt_level level) | |||
783 | 917 | ||
784 | void __init xen_mark_init_mm_pinned(void) | 918 | void __init xen_mark_init_mm_pinned(void) |
785 | { | 919 | { |
786 | pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP); | 920 | xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP); |
787 | } | 921 | } |
788 | 922 | ||
789 | static int unpin_page(struct page *page, enum pt_level level) | 923 | static int xen_unpin_page(struct mm_struct *mm, struct page *page, |
924 | enum pt_level level) | ||
790 | { | 925 | { |
791 | unsigned pgfl = TestClearPagePinned(page); | 926 | unsigned pgfl = TestClearPagePinned(page); |
792 | 927 | ||
@@ -796,10 +931,18 @@ static int unpin_page(struct page *page, enum pt_level level) | |||
796 | spinlock_t *ptl = NULL; | 931 | spinlock_t *ptl = NULL; |
797 | struct multicall_space mcs; | 932 | struct multicall_space mcs; |
798 | 933 | ||
934 | /* | ||
935 | * Do the converse to pin_page. If we're using split | ||
936 | * pte locks, we must be holding the lock for while | ||
937 | * the pte page is unpinned but still RO to prevent | ||
938 | * concurrent updates from seeing it in this | ||
939 | * partially-pinned state. | ||
940 | */ | ||
799 | if (level == PT_PTE) { | 941 | if (level == PT_PTE) { |
800 | ptl = lock_pte(page); | 942 | ptl = xen_pte_lock(page, mm); |
801 | 943 | ||
802 | xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); | 944 | if (ptl) |
945 | xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); | ||
803 | } | 946 | } |
804 | 947 | ||
805 | mcs = __xen_mc_entry(0); | 948 | mcs = __xen_mc_entry(0); |
@@ -810,7 +953,7 @@ static int unpin_page(struct page *page, enum pt_level level) | |||
810 | 953 | ||
811 | if (ptl) { | 954 | if (ptl) { |
812 | /* unlock when batch completed */ | 955 | /* unlock when batch completed */ |
813 | xen_mc_callback(do_unlock, ptl); | 956 | xen_mc_callback(xen_pte_unlock, ptl); |
814 | } | 957 | } |
815 | } | 958 | } |
816 | 959 | ||
@@ -818,7 +961,7 @@ static int unpin_page(struct page *page, enum pt_level level) | |||
818 | } | 961 | } |
819 | 962 | ||
820 | /* Release a pagetables pages back as normal RW */ | 963 | /* Release a pagetables pages back as normal RW */ |
821 | static void xen_pgd_unpin(pgd_t *pgd) | 964 | static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd) |
822 | { | 965 | { |
823 | xen_mc_batch(); | 966 | xen_mc_batch(); |
824 | 967 | ||
@@ -830,21 +973,27 @@ static void xen_pgd_unpin(pgd_t *pgd) | |||
830 | 973 | ||
831 | if (user_pgd) { | 974 | if (user_pgd) { |
832 | xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd))); | 975 | xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd))); |
833 | unpin_page(virt_to_page(user_pgd), PT_PGD); | 976 | xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD); |
834 | } | 977 | } |
835 | } | 978 | } |
836 | #endif | 979 | #endif |
837 | 980 | ||
838 | #ifdef CONFIG_X86_PAE | 981 | #ifdef CONFIG_X86_PAE |
839 | /* Need to make sure unshared kernel PMD is unpinned */ | 982 | /* Need to make sure unshared kernel PMD is unpinned */ |
840 | pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); | 983 | xen_unpin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), |
984 | PT_PMD); | ||
841 | #endif | 985 | #endif |
842 | 986 | ||
843 | pgd_walk(pgd, unpin_page, USER_LIMIT); | 987 | xen_pgd_walk(mm, xen_unpin_page, USER_LIMIT); |
844 | 988 | ||
845 | xen_mc_issue(0); | 989 | xen_mc_issue(0); |
846 | } | 990 | } |
847 | 991 | ||
992 | static void xen_pgd_unpin(struct mm_struct *mm) | ||
993 | { | ||
994 | __xen_pgd_unpin(mm, mm->pgd); | ||
995 | } | ||
996 | |||
848 | /* | 997 | /* |
849 | * On resume, undo any pinning done at save, so that the rest of the | 998 | * On resume, undo any pinning done at save, so that the rest of the |
850 | * kernel doesn't see any unexpected pinned pagetables. | 999 | * kernel doesn't see any unexpected pinned pagetables. |
@@ -859,7 +1008,7 @@ void xen_mm_unpin_all(void) | |||
859 | list_for_each_entry(page, &pgd_list, lru) { | 1008 | list_for_each_entry(page, &pgd_list, lru) { |
860 | if (PageSavePinned(page)) { | 1009 | if (PageSavePinned(page)) { |
861 | BUG_ON(!PagePinned(page)); | 1010 | BUG_ON(!PagePinned(page)); |
862 | xen_pgd_unpin((pgd_t *)page_address(page)); | 1011 | __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page)); |
863 | ClearPageSavePinned(page); | 1012 | ClearPageSavePinned(page); |
864 | } | 1013 | } |
865 | } | 1014 | } |
@@ -870,14 +1019,14 @@ void xen_mm_unpin_all(void) | |||
870 | void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) | 1019 | void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) |
871 | { | 1020 | { |
872 | spin_lock(&next->page_table_lock); | 1021 | spin_lock(&next->page_table_lock); |
873 | xen_pgd_pin(next->pgd); | 1022 | xen_pgd_pin(next); |
874 | spin_unlock(&next->page_table_lock); | 1023 | spin_unlock(&next->page_table_lock); |
875 | } | 1024 | } |
876 | 1025 | ||
877 | void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) | 1026 | void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) |
878 | { | 1027 | { |
879 | spin_lock(&mm->page_table_lock); | 1028 | spin_lock(&mm->page_table_lock); |
880 | xen_pgd_pin(mm->pgd); | 1029 | xen_pgd_pin(mm); |
881 | spin_unlock(&mm->page_table_lock); | 1030 | spin_unlock(&mm->page_table_lock); |
882 | } | 1031 | } |
883 | 1032 | ||
@@ -907,7 +1056,7 @@ static void drop_other_mm_ref(void *info) | |||
907 | } | 1056 | } |
908 | } | 1057 | } |
909 | 1058 | ||
910 | static void drop_mm_ref(struct mm_struct *mm) | 1059 | static void xen_drop_mm_ref(struct mm_struct *mm) |
911 | { | 1060 | { |
912 | cpumask_t mask; | 1061 | cpumask_t mask; |
913 | unsigned cpu; | 1062 | unsigned cpu; |
@@ -937,7 +1086,7 @@ static void drop_mm_ref(struct mm_struct *mm) | |||
937 | smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); | 1086 | smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); |
938 | } | 1087 | } |
939 | #else | 1088 | #else |
940 | static void drop_mm_ref(struct mm_struct *mm) | 1089 | static void xen_drop_mm_ref(struct mm_struct *mm) |
941 | { | 1090 | { |
942 | if (current->active_mm == mm) | 1091 | if (current->active_mm == mm) |
943 | load_cr3(swapper_pg_dir); | 1092 | load_cr3(swapper_pg_dir); |
@@ -961,14 +1110,77 @@ static void drop_mm_ref(struct mm_struct *mm) | |||
961 | void xen_exit_mmap(struct mm_struct *mm) | 1110 | void xen_exit_mmap(struct mm_struct *mm) |
962 | { | 1111 | { |
963 | get_cpu(); /* make sure we don't move around */ | 1112 | get_cpu(); /* make sure we don't move around */ |
964 | drop_mm_ref(mm); | 1113 | xen_drop_mm_ref(mm); |
965 | put_cpu(); | 1114 | put_cpu(); |
966 | 1115 | ||
967 | spin_lock(&mm->page_table_lock); | 1116 | spin_lock(&mm->page_table_lock); |
968 | 1117 | ||
969 | /* pgd may not be pinned in the error exit path of execve */ | 1118 | /* pgd may not be pinned in the error exit path of execve */ |
970 | if (page_pinned(mm->pgd)) | 1119 | if (xen_page_pinned(mm->pgd)) |
971 | xen_pgd_unpin(mm->pgd); | 1120 | xen_pgd_unpin(mm); |
972 | 1121 | ||
973 | spin_unlock(&mm->page_table_lock); | 1122 | spin_unlock(&mm->page_table_lock); |
974 | } | 1123 | } |
1124 | |||
1125 | #ifdef CONFIG_XEN_DEBUG_FS | ||
1126 | |||
1127 | static struct dentry *d_mmu_debug; | ||
1128 | |||
1129 | static int __init xen_mmu_debugfs(void) | ||
1130 | { | ||
1131 | struct dentry *d_xen = xen_init_debugfs(); | ||
1132 | |||
1133 | if (d_xen == NULL) | ||
1134 | return -ENOMEM; | ||
1135 | |||
1136 | d_mmu_debug = debugfs_create_dir("mmu", d_xen); | ||
1137 | |||
1138 | debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats); | ||
1139 | |||
1140 | debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update); | ||
1141 | debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug, | ||
1142 | &mmu_stats.pgd_update_pinned); | ||
1143 | debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug, | ||
1144 | &mmu_stats.pgd_update_pinned); | ||
1145 | |||
1146 | debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update); | ||
1147 | debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug, | ||
1148 | &mmu_stats.pud_update_pinned); | ||
1149 | debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug, | ||
1150 | &mmu_stats.pud_update_pinned); | ||
1151 | |||
1152 | debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update); | ||
1153 | debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug, | ||
1154 | &mmu_stats.pmd_update_pinned); | ||
1155 | debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug, | ||
1156 | &mmu_stats.pmd_update_pinned); | ||
1157 | |||
1158 | debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update); | ||
1159 | // debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug, | ||
1160 | // &mmu_stats.pte_update_pinned); | ||
1161 | debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug, | ||
1162 | &mmu_stats.pte_update_pinned); | ||
1163 | |||
1164 | debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update); | ||
1165 | debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug, | ||
1166 | &mmu_stats.mmu_update_extended); | ||
1167 | xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug, | ||
1168 | mmu_stats.mmu_update_histo, 20); | ||
1169 | |||
1170 | debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at); | ||
1171 | debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug, | ||
1172 | &mmu_stats.set_pte_at_batched); | ||
1173 | debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug, | ||
1174 | &mmu_stats.set_pte_at_current); | ||
1175 | debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug, | ||
1176 | &mmu_stats.set_pte_at_kernel); | ||
1177 | |||
1178 | debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit); | ||
1179 | debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug, | ||
1180 | &mmu_stats.prot_commit_batched); | ||
1181 | |||
1182 | return 0; | ||
1183 | } | ||
1184 | fs_initcall(xen_mmu_debugfs); | ||
1185 | |||
1186 | #endif /* CONFIG_XEN_DEBUG_FS */ | ||