Diffstat (limited to 'arch/x86/xen/mmu.c')
-rw-r--r--   arch/x86/xen/mmu.c   314
1 file changed, 263 insertions(+), 51 deletions(-)
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index aa37469da696..ae173f6edd8b 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -40,6 +40,7 @@
  */
 #include <linux/sched.h>
 #include <linux/highmem.h>
+#include <linux/debugfs.h>
 #include <linux/bug.h>
 
 #include <asm/pgtable.h>
@@ -57,6 +58,61 @@
 
 #include "multicalls.h"
 #include "mmu.h"
+#include "debugfs.h"
+
+#define MMU_UPDATE_HISTO 30
+
+#ifdef CONFIG_XEN_DEBUG_FS
+
+static struct {
+        u32 pgd_update;
+        u32 pgd_update_pinned;
+        u32 pgd_update_batched;
+
+        u32 pud_update;
+        u32 pud_update_pinned;
+        u32 pud_update_batched;
+
+        u32 pmd_update;
+        u32 pmd_update_pinned;
+        u32 pmd_update_batched;
+
+        u32 pte_update;
+        u32 pte_update_pinned;
+        u32 pte_update_batched;
+
+        u32 mmu_update;
+        u32 mmu_update_extended;
+        u32 mmu_update_histo[MMU_UPDATE_HISTO];
+
+        u32 prot_commit;
+        u32 prot_commit_batched;
+
+        u32 set_pte_at;
+        u32 set_pte_at_batched;
+        u32 set_pte_at_pinned;
+        u32 set_pte_at_current;
+        u32 set_pte_at_kernel;
+} mmu_stats;
+
+static u8 zero_stats;
+
+static inline void check_zero(void)
+{
+        if (unlikely(zero_stats)) {
+                memset(&mmu_stats, 0, sizeof(mmu_stats));
+                zero_stats = 0;
+        }
+}
+
+#define ADD_STATS(elem, val)            \
+        do { check_zero(); mmu_stats.elem += (val); } while(0)
+
+#else  /* !CONFIG_XEN_DEBUG_FS */
+
+#define ADD_STATS(elem, val)    do { (void)(val); } while(0)
+
+#endif /* CONFIG_XEN_DEBUG_FS */
 
 /*
  * Just beyond the highest usermode address. STACK_TOP_MAX has a
@@ -229,25 +285,35 @@ void make_lowmem_page_readwrite(void *vaddr)
 }
 
 
-static bool page_pinned(void *ptr)
+static bool xen_page_pinned(void *ptr)
 {
         struct page *page = virt_to_page(ptr);
 
         return PagePinned(page);
 }
 
-static void extend_mmu_update(const struct mmu_update *update)
+static void xen_extend_mmu_update(const struct mmu_update *update)
 {
         struct multicall_space mcs;
         struct mmu_update *u;
 
         mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
 
-        if (mcs.mc != NULL)
+        if (mcs.mc != NULL) {
+                ADD_STATS(mmu_update_extended, 1);
+                ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1);
+
                 mcs.mc->args[1]++;
-        else {
+
+                if (mcs.mc->args[1] < MMU_UPDATE_HISTO)
+                        ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1);
+                else
+                        ADD_STATS(mmu_update_histo[0], 1);
+        } else {
+                ADD_STATS(mmu_update, 1);
                 mcs = __xen_mc_entry(sizeof(*u));
                 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
+                ADD_STATS(mmu_update_histo[1], 1);
         }
 
         u = mcs.args;
@@ -265,7 +331,9 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
         /* ptr may be ioremapped for 64-bit pagetable setup */
         u.ptr = arbitrary_virt_to_machine(ptr).maddr;
         u.val = pmd_val_ma(val);
-        extend_mmu_update(&u);
+        xen_extend_mmu_update(&u);
+
+        ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
 
         xen_mc_issue(PARAVIRT_LAZY_MMU);
 
@@ -274,13 +342,17 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
 
 void xen_set_pmd(pmd_t *ptr, pmd_t val)
 {
+        ADD_STATS(pmd_update, 1);
+
         /* If page is not pinned, we can just update the entry
            directly */
-        if (!page_pinned(ptr)) {
+        if (!xen_page_pinned(ptr)) {
                 *ptr = val;
                 return;
         }
 
+        ADD_STATS(pmd_update_pinned, 1);
+
         xen_set_pmd_hyper(ptr, val);
 }
 
@@ -300,12 +372,18 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
         if (mm == &init_mm)
                 preempt_disable();
 
+        ADD_STATS(set_pte_at, 1);
+//      ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
+        ADD_STATS(set_pte_at_current, mm == current->mm);
+        ADD_STATS(set_pte_at_kernel, mm == &init_mm);
+
         if (mm == current->mm || mm == &init_mm) {
                 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
                         struct multicall_space mcs;
                         mcs = xen_mc_entry(0);
 
                         MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
+                        ADD_STATS(set_pte_at_batched, 1);
                         xen_mc_issue(PARAVIRT_LAZY_MMU);
                         goto out;
                 } else
@@ -334,7 +412,10 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
 
         u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
         u.val = pte_val_ma(pte);
-        extend_mmu_update(&u);
+        xen_extend_mmu_update(&u);
+
+        ADD_STATS(prot_commit, 1);
+        ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
 
         xen_mc_issue(PARAVIRT_LAZY_MMU);
 }
@@ -400,7 +481,9 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
         /* ptr may be ioremapped for 64-bit pagetable setup */
         u.ptr = arbitrary_virt_to_machine(ptr).maddr;
         u.val = pud_val_ma(val);
-        extend_mmu_update(&u);
+        xen_extend_mmu_update(&u);
+
+        ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
 
         xen_mc_issue(PARAVIRT_LAZY_MMU);
 
@@ -409,18 +492,26 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
 
 void xen_set_pud(pud_t *ptr, pud_t val)
 {
+        ADD_STATS(pud_update, 1);
+
         /* If page is not pinned, we can just update the entry
            directly */
-        if (!page_pinned(ptr)) {
+        if (!xen_page_pinned(ptr)) {
                 *ptr = val;
                 return;
         }
 
+        ADD_STATS(pud_update_pinned, 1);
+
         xen_set_pud_hyper(ptr, val);
 }
 
 void xen_set_pte(pte_t *ptep, pte_t pte)
 {
+        ADD_STATS(pte_update, 1);
+//      ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
+        ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
+
 #ifdef CONFIG_X86_PAE
         ptep->pte_high = pte.pte_high;
         smp_wmb();
@@ -490,7 +581,7 @@ static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
 
         u.ptr = virt_to_machine(ptr).maddr;
         u.val = pgd_val_ma(val);
-        extend_mmu_update(&u);
+        xen_extend_mmu_update(&u);
 }
 
 /*
@@ -517,17 +608,22 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val)
 {
         pgd_t *user_ptr = xen_get_user_pgd(ptr);
 
+        ADD_STATS(pgd_update, 1);
+
         /* If page is not pinned, we can just update the entry
            directly */
-        if (!page_pinned(ptr)) {
+        if (!xen_page_pinned(ptr)) {
                 *ptr = val;
                 if (user_ptr) {
-                        WARN_ON(page_pinned(user_ptr));
+                        WARN_ON(xen_page_pinned(user_ptr));
                         *user_ptr = val;
                 }
                 return;
         }
 
+        ADD_STATS(pgd_update_pinned, 1);
+        ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
+
         /* If it's pinned, then we can at least batch the kernel and
            user updates together. */
         xen_mc_batch();
@@ -555,9 +651,12 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val)
  * For 64-bit, we must skip the Xen hole in the middle of the address
  * space, just after the big x86-64 virtual hole.
  */
-static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
-                    unsigned long limit)
+static int xen_pgd_walk(struct mm_struct *mm,
+                        int (*func)(struct mm_struct *mm, struct page *,
+                                    enum pt_level),
+                        unsigned long limit)
 {
+        pgd_t *pgd = mm->pgd;
         int flush = 0;
         unsigned hole_low, hole_high;
         unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
@@ -590,8 +689,6 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
         pmdidx_limit = 0;
 #endif
 
-        flush |= (*func)(virt_to_page(pgd), PT_PGD);
-
         for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
                 pud_t *pud;
 
@@ -604,7 +701,7 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
                 pud = pud_offset(&pgd[pgdidx], 0);
 
                 if (PTRS_PER_PUD > 1) /* not folded */
-                        flush |= (*func)(virt_to_page(pud), PT_PUD);
+                        flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
 
                 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
                         pmd_t *pmd;
@@ -619,7 +716,7 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
                         pmd = pmd_offset(&pud[pudidx], 0);
 
                         if (PTRS_PER_PMD > 1) /* not folded */
-                                flush |= (*func)(virt_to_page(pmd), PT_PMD);
+                                flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
 
                         for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
                                 struct page *pte;
@@ -633,28 +730,34 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
                                         continue;
 
                                 pte = pmd_page(pmd[pmdidx]);
-                                flush |= (*func)(pte, PT_PTE);
+                                flush |= (*func)(mm, pte, PT_PTE);
                         }
                 }
         }
+
 out:
+        /* Do the top level last, so that the callbacks can use it as
+           a cue to do final things like tlb flushes. */
+        flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
 
         return flush;
 }
 
-static spinlock_t *lock_pte(struct page *page)
+/* If we're using split pte locks, then take the page's lock and
+   return a pointer to it.  Otherwise return NULL. */
+static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
 {
         spinlock_t *ptl = NULL;
 
-#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+#if USE_SPLIT_PTLOCKS
         ptl = __pte_lockptr(page);
-        spin_lock(ptl);
+        spin_lock_nest_lock(ptl, &mm->page_table_lock);
 #endif
 
         return ptl;
 }
 
-static void do_unlock(void *v)
+static void xen_pte_unlock(void *v)
 {
         spinlock_t *ptl = v;
         spin_unlock(ptl);
@@ -672,7 +775,8 @@ static void xen_do_pin(unsigned level, unsigned long pfn)
         MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
 }
 
-static int pin_page(struct page *page, enum pt_level level)
+static int xen_pin_page(struct mm_struct *mm, struct page *page,
+                        enum pt_level level)
 {
         unsigned pgfl = TestSetPagePinned(page);
         int flush;
@@ -691,21 +795,40 @@ static int pin_page(struct page *page, enum pt_level level)
 
                 flush = 0;
 
+                /*
+                 * We need to hold the pagetable lock between the time
+                 * we make the pagetable RO and when we actually pin
+                 * it.  If we don't, then other users may come in and
+                 * attempt to update the pagetable by writing it,
+                 * which will fail because the memory is RO but not
+                 * pinned, so Xen won't do the trap'n'emulate.
+                 *
+                 * If we're using split pte locks, we can't hold the
+                 * entire pagetable's worth of locks during the
+                 * traverse, because we may wrap the preempt count (8
+                 * bits).  The solution is to mark RO and pin each PTE
+                 * page while holding the lock.  This means the number
+                 * of locks we end up holding is never more than a
+                 * batch size (~32 entries, at present).
+                 *
+                 * If we're not using split pte locks, we needn't pin
+                 * the PTE pages independently, because we're
+                 * protected by the overall pagetable lock.
+                 */
                 ptl = NULL;
                 if (level == PT_PTE)
-                        ptl = lock_pte(page);
+                        ptl = xen_pte_lock(page, mm);
 
                 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
                                         pfn_pte(pfn, PAGE_KERNEL_RO),
                                         level == PT_PGD ? UVMF_TLB_FLUSH : 0);
 
-                if (level == PT_PTE)
+                if (ptl) {
                         xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
 
-                if (ptl) {
                         /* Queue a deferred unlock for when this batch
                            is completed. */
-                        xen_mc_callback(do_unlock, ptl);
+                        xen_mc_callback(xen_pte_unlock, ptl);
                 }
         }
 
@@ -715,11 +838,11 @@ static int pin_page(struct page *page, enum pt_level level)
 /* This is called just after a mm has been created, but it has not
    been used yet.  We need to make sure that its pagetable is all
    read-only, and can be pinned. */
-void xen_pgd_pin(pgd_t *pgd)
+static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
 {
         xen_mc_batch();
 
-        if (pgd_walk(pgd, pin_page, USER_LIMIT)) {
+        if (xen_pgd_walk(mm, xen_pin_page, USER_LIMIT)) {
                 /* re-enable interrupts for kmap_flush_unused */
                 xen_mc_issue(0);
                 kmap_flush_unused();
@@ -733,25 +856,35 @@ void xen_pgd_pin(pgd_t *pgd)
                 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
 
                 if (user_pgd) {
-                        pin_page(virt_to_page(user_pgd), PT_PGD);
+                        xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
                         xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
                 }
         }
 #else /* CONFIG_X86_32 */
 #ifdef CONFIG_X86_PAE
         /* Need to make sure unshared kernel PMD is pinnable */
-        pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
+        xen_pin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])),
+                     PT_PMD);
 #endif
         xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
 #endif /* CONFIG_X86_64 */
         xen_mc_issue(0);
 }
 
+static void xen_pgd_pin(struct mm_struct *mm)
+{
+        __xen_pgd_pin(mm, mm->pgd);
+}
+
 /*
  * On save, we need to pin all pagetables to make sure they get their
  * mfns turned into pfns.  Search the list for any unpinned pgds and pin
  * them (unpinned pgds are not currently in use, probably because the
  * process is under construction or destruction).
+ *
+ * Expected to be called in stop_machine() ("equivalent to taking
+ * every spinlock in the system"), so the locking doesn't really
+ * matter all that much.
  */
 void xen_mm_pin_all(void)
 {
@@ -762,7 +895,7 @@ void xen_mm_pin_all(void)
 
         list_for_each_entry(page, &pgd_list, lru) {
                 if (!PagePinned(page)) {
-                        xen_pgd_pin((pgd_t *)page_address(page));
+                        __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
                         SetPageSavePinned(page);
                 }
         }
@@ -775,7 +908,8 @@ void xen_mm_pin_all(void)
  * that's before we have page structures to store the bits.  So do all
  * the book-keeping now.
  */
-static __init int mark_pinned(struct page *page, enum pt_level level)
+static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
+                                  enum pt_level level)
 {
         SetPagePinned(page);
         return 0;
@@ -783,10 +917,11 @@ static __init int mark_pinned(struct page *page, enum pt_level level)
 
 void __init xen_mark_init_mm_pinned(void)
 {
-        pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP);
+        xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
 }
 
-static int unpin_page(struct page *page, enum pt_level level)
+static int xen_unpin_page(struct mm_struct *mm, struct page *page,
+                          enum pt_level level)
 {
         unsigned pgfl = TestClearPagePinned(page);
 
@@ -796,10 +931,18 @@ static int unpin_page(struct page *page, enum pt_level level)
                 spinlock_t *ptl = NULL;
                 struct multicall_space mcs;
 
+                /*
+                 * Do the converse to pin_page.  If we're using split
+                 * pte locks, we must be holding the lock for while
+                 * the pte page is unpinned but still RO to prevent
+                 * concurrent updates from seeing it in this
+                 * partially-pinned state.
+                 */
                 if (level == PT_PTE) {
-                        ptl = lock_pte(page);
+                        ptl = xen_pte_lock(page, mm);
 
-                        xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
+                        if (ptl)
+                                xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
                 }
 
                 mcs = __xen_mc_entry(0);
@@ -810,7 +953,7 @@ static int unpin_page(struct page *page, enum pt_level level)
 
                 if (ptl) {
                         /* unlock when batch completed */
-                        xen_mc_callback(do_unlock, ptl);
+                        xen_mc_callback(xen_pte_unlock, ptl);
                 }
         }
 
@@ -818,7 +961,7 @@ static int unpin_page(struct page *page, enum pt_level level)
 }
 
 /* Release a pagetables pages back as normal RW */
-static void xen_pgd_unpin(pgd_t *pgd)
+static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
 {
         xen_mc_batch();
 
@@ -830,21 +973,27 @@ static void xen_pgd_unpin(pgd_t *pgd)
 
                 if (user_pgd) {
                         xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd)));
-                        unpin_page(virt_to_page(user_pgd), PT_PGD);
+                        xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
                 }
         }
 #endif
 
 #ifdef CONFIG_X86_PAE
         /* Need to make sure unshared kernel PMD is unpinned */
-        pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
+        xen_unpin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])),
+                       PT_PMD);
 #endif
 
-        pgd_walk(pgd, unpin_page, USER_LIMIT);
+        xen_pgd_walk(mm, xen_unpin_page, USER_LIMIT);
 
         xen_mc_issue(0);
 }
 
+static void xen_pgd_unpin(struct mm_struct *mm)
+{
+        __xen_pgd_unpin(mm, mm->pgd);
+}
+
 /*
  * On resume, undo any pinning done at save, so that the rest of the
  * kernel doesn't see any unexpected pinned pagetables.
@@ -859,7 +1008,7 @@ void xen_mm_unpin_all(void)
         list_for_each_entry(page, &pgd_list, lru) {
                 if (PageSavePinned(page)) {
                         BUG_ON(!PagePinned(page));
-                        xen_pgd_unpin((pgd_t *)page_address(page));
+                        __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
                         ClearPageSavePinned(page);
                 }
         }
@@ -870,14 +1019,14 @@ void xen_mm_unpin_all(void)
 void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
 {
         spin_lock(&next->page_table_lock);
-        xen_pgd_pin(next->pgd);
+        xen_pgd_pin(next);
         spin_unlock(&next->page_table_lock);
 }
 
 void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
 {
         spin_lock(&mm->page_table_lock);
-        xen_pgd_pin(mm->pgd);
+        xen_pgd_pin(mm);
         spin_unlock(&mm->page_table_lock);
 }
 
@@ -907,7 +1056,7 @@ static void drop_other_mm_ref(void *info)
         }
 }
 
-static void drop_mm_ref(struct mm_struct *mm)
+static void xen_drop_mm_ref(struct mm_struct *mm)
 {
         cpumask_t mask;
         unsigned cpu;
@@ -937,7 +1086,7 @@ static void drop_mm_ref(struct mm_struct *mm)
         smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
 }
 #else
-static void drop_mm_ref(struct mm_struct *mm)
+static void xen_drop_mm_ref(struct mm_struct *mm)
 {
         if (current->active_mm == mm)
                 load_cr3(swapper_pg_dir);
@@ -961,14 +1110,77 @@ static void drop_mm_ref(struct mm_struct *mm)
 void xen_exit_mmap(struct mm_struct *mm)
 {
         get_cpu();  /* make sure we don't move around */
-        drop_mm_ref(mm);
+        xen_drop_mm_ref(mm);
         put_cpu();
 
         spin_lock(&mm->page_table_lock);
 
         /* pgd may not be pinned in the error exit path of execve */
-        if (page_pinned(mm->pgd))
-                xen_pgd_unpin(mm->pgd);
+        if (xen_page_pinned(mm->pgd))
+                xen_pgd_unpin(mm);
 
         spin_unlock(&mm->page_table_lock);
 }
+
+#ifdef CONFIG_XEN_DEBUG_FS
+
+static struct dentry *d_mmu_debug;
+
+static int __init xen_mmu_debugfs(void)
+{
+        struct dentry *d_xen = xen_init_debugfs();
+
+        if (d_xen == NULL)
+                return -ENOMEM;
+
+        d_mmu_debug = debugfs_create_dir("mmu", d_xen);
+
+        debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats);
+
+        debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update);
+        debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug,
+                           &mmu_stats.pgd_update_pinned);
+        debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug,
+                           &mmu_stats.pgd_update_pinned);
+
+        debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update);
+        debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug,
+                           &mmu_stats.pud_update_pinned);
+        debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug,
+                           &mmu_stats.pud_update_pinned);
+
+        debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update);
+        debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug,
+                           &mmu_stats.pmd_update_pinned);
+        debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug,
+                           &mmu_stats.pmd_update_pinned);
+
+        debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update);
+//      debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug,
+//                         &mmu_stats.pte_update_pinned);
+        debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug,
+                           &mmu_stats.pte_update_pinned);
+
+        debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update);
+        debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug,
+                           &mmu_stats.mmu_update_extended);
+        xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug,
+                                     mmu_stats.mmu_update_histo, 20);
+
+        debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at);
+        debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug,
+                           &mmu_stats.set_pte_at_batched);
+        debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug,
+                           &mmu_stats.set_pte_at_current);
+        debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug,
+                           &mmu_stats.set_pte_at_kernel);
+
+        debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit);
+        debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
+                           &mmu_stats.prot_commit_batched);
+
+        return 0;
+}
+fs_initcall(xen_mmu_debugfs);
+
+#endif /* CONFIG_XEN_DEBUG_FS */
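
Note on the counter plumbing above: every statistic is bumped through ADD_STATS(), which first calls check_zero(), so writing 1 to the debugfs zero_stats file clears the whole counter block lazily, on the next update rather than at write time. The following standalone sketch restates just that pattern in plain userspace C for illustration; it borrows the names from the patch but uses none of the kernel APIs.

/*
 * Illustrative sketch (not kernel code) of the "zero on next update"
 * counter pattern introduced by this patch.
 */
#include <stdio.h>
#include <string.h>

static struct {
        unsigned int pte_update;
        unsigned int pte_update_batched;
} mmu_stats;

static unsigned char zero_stats;        /* set to request a reset */

static void check_zero(void)
{
        if (zero_stats) {               /* the kernel version wraps this in unlikely() */
                memset(&mmu_stats, 0, sizeof(mmu_stats));
                zero_stats = 0;
        }
}

/* Every counter bump funnels through check_zero() first. */
#define ADD_STATS(elem, val) \
        do { check_zero(); mmu_stats.elem += (val); } while (0)

int main(void)
{
        ADD_STATS(pte_update, 1);
        ADD_STATS(pte_update_batched, 1);
        printf("pte_update=%u batched=%u\n",
               mmu_stats.pte_update, mmu_stats.pte_update_batched);

        zero_stats = 1;                 /* analogous to writing 1 to debugfs zero_stats */
        ADD_STATS(pte_update, 1);       /* first update after the request clears the block */
        printf("pte_update=%u batched=%u\n",
               mmu_stats.pte_update, mmu_stats.pte_update_batched);
        return 0;
}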
