about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
authorMel Gorman <mgorman@suse.de>2012-10-25 08:16:31 -0400
committerMel Gorman <mgorman@suse.de>2012-12-11 09:42:39 -0500
commitd10e63f29488b0f312a443f9507ea9b6fd3c9090 (patch)
treeb39e3caa5d25e9e5ebad84c606a724e25c6b8e91
parent1ba6e0b50b479cbadb8f05ebde3020da9ac87201 (diff)
mm: numa: Create basic numa page hinting infrastructure
Note: This patch started as "mm/mpol: Create special PROT_NONE infrastructure" and preserves the basic idea but steals *very* heavily from "autonuma: numa hinting page faults entry points" for the actual fault handlers without the migration parts. The end result is barely recognisable as either patch so all Signed-off and Reviewed-bys are dropped. If Peter, Ingo and Andrea are ok with this version, I will re-add the signed-offs-by to reflect the history. In order to facilitate a lazy -- fault driven -- migration of pages, create a special transient PAGE_NUMA variant, we can then use the 'spurious' protection faults to drive our migrations from. The meaning of PAGE_NUMA depends on the architecture but on x86 it is effectively PROT_NONE. Actual PROT_NONE mappings will not generate these NUMA faults for the reason that the page fault code checks the permission on the VMA (and will throw a segmentation fault on actual PROT_NONE mappings), before it ever calls handle_mm_fault. [dhillf@gmail.com: Fix typo] Signed-off-by: Mel Gorman <mgorman@suse.de> Reviewed-by: Rik van Riel <riel@redhat.com>
-rw-r--r--include/linux/huge_mm.h10
-rw-r--r--mm/huge_memory.c22
-rw-r--r--mm/memory.c112
3 files changed, 141 insertions(+), 3 deletions(-)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index b31cb7da0346..a1d26a98c655 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -159,6 +159,10 @@ static inline struct page *compound_trans_head(struct page *page)
159 } 159 }
160 return page; 160 return page;
161} 161}
162
163extern int do_huge_pmd_numa_page(struct mm_struct *mm, unsigned long addr,
164 pmd_t pmd, pmd_t *pmdp);
165
162#else /* CONFIG_TRANSPARENT_HUGEPAGE */ 166#else /* CONFIG_TRANSPARENT_HUGEPAGE */
163#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) 167#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
164#define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) 168#define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
@@ -195,6 +199,12 @@ static inline int pmd_trans_huge_lock(pmd_t *pmd,
195{ 199{
196 return 0; 200 return 0;
197} 201}
202
203static inline int do_huge_pmd_numa_page(struct mm_struct *mm, unsigned long addr,
204 pmd_t pmd, pmd_t *pmdp)
205{
206}
207
198#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 208#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
199 209
200#endif /* _LINUX_HUGE_MM_H */ 210#endif /* _LINUX_HUGE_MM_H */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index cd24aa562144..f5f37630c54d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1018,6 +1018,28 @@ out:
1018 return page; 1018 return page;
1019} 1019}
1020 1020
1021/* NUMA hinting page fault entry point for trans huge pmds */
1022int do_huge_pmd_numa_page(struct mm_struct *mm, unsigned long addr,
1023 pmd_t pmd, pmd_t *pmdp)
1024{
1025 struct page *page;
1026 unsigned long haddr = addr & HPAGE_PMD_MASK;
1027
1028 spin_lock(&mm->page_table_lock);
1029 if (unlikely(!pmd_same(pmd, *pmdp)))
1030 goto out_unlock;
1031
1032 page = pmd_page(pmd);
1033 pmd = pmd_mknonnuma(pmd);
1034 set_pmd_at(mm, haddr, pmdp, pmd);
1035 VM_BUG_ON(pmd_numa(*pmdp));
1036 update_mmu_cache_pmd(vma, addr, pmdp);
1037
1038out_unlock:
1039 spin_unlock(&mm->page_table_lock);
1040 return 0;
1041}
1042
1021int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, 1043int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1022 pmd_t *pmd, unsigned long addr) 1044 pmd_t *pmd, unsigned long addr)
1023{ 1045{
diff --git a/mm/memory.c b/mm/memory.c
index cd8e0daf1912..e30616f2cc3d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3448,6 +3448,103 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3448 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); 3448 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
3449} 3449}
3450 3450
3451int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3452 unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd)
3453{
3454 struct page *page;
3455 spinlock_t *ptl;
3456
3457 /*
3458 * The "pte" at this point cannot be used safely without
3459 * validation through pte_unmap_same(). It's of NUMA type but
3460 * the pfn may be screwed if the read is non atomic.
3461 *
3462 * ptep_modify_prot_start is not called as this is clearing
3463 * the _PAGE_NUMA bit and it is not really expected that there
3464 * would be concurrent hardware modifications to the PTE.
3465 */
3466 ptl = pte_lockptr(mm, pmd);
3467 spin_lock(ptl);
3468 if (unlikely(!pte_same(*ptep, pte)))
3469 goto out_unlock;
3470 pte = pte_mknonnuma(pte);
3471 set_pte_at(mm, addr, ptep, pte);
3472 update_mmu_cache(vma, addr, ptep);
3473
3474 page = vm_normal_page(vma, addr, pte);
3475 if (!page) {
3476 pte_unmap_unlock(ptep, ptl);
3477 return 0;
3478 }
3479
3480out_unlock:
3481 pte_unmap_unlock(ptep, ptl);
3482 return 0;
3483}
3484
3485/* NUMA hinting page fault entry point for regular pmds */
3486#ifdef CONFIG_NUMA_BALANCING
3487static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3488 unsigned long addr, pmd_t *pmdp)
3489{
3490 pmd_t pmd;
3491 pte_t *pte, *orig_pte;
3492 unsigned long _addr = addr & PMD_MASK;
3493 unsigned long offset;
3494 spinlock_t *ptl;
3495 bool numa = false;
3496
3497 spin_lock(&mm->page_table_lock);
3498 pmd = *pmdp;
3499 if (pmd_numa(pmd)) {
3500 set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
3501 numa = true;
3502 }
3503 spin_unlock(&mm->page_table_lock);
3504
3505 if (!numa)
3506 return 0;
3507
3508 /* we're in a page fault so some vma must be in the range */
3509 BUG_ON(!vma);
3510 BUG_ON(vma->vm_start >= _addr + PMD_SIZE);
3511 offset = max(_addr, vma->vm_start) & ~PMD_MASK;
3512 VM_BUG_ON(offset >= PMD_SIZE);
3513 orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl);
3514 pte += offset >> PAGE_SHIFT;
3515 for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
3516 pte_t pteval = *pte;
3517 struct page *page;
3518 if (!pte_present(pteval))
3519 continue;
3520 if (!pte_numa(pteval))
3521 continue;
3522 if (addr >= vma->vm_end) {
3523 vma = find_vma(mm, addr);
3524 /* there's a pte present so there must be a vma */
3525 BUG_ON(!vma);
3526 BUG_ON(addr < vma->vm_start);
3527 }
3528 if (pte_numa(pteval)) {
3529 pteval = pte_mknonnuma(pteval);
3530 set_pte_at(mm, addr, pte, pteval);
3531 }
3532 page = vm_normal_page(vma, addr, pteval);
3533 if (unlikely(!page))
3534 continue;
3535 }
3536 pte_unmap_unlock(orig_pte, ptl);
3537
3538 return 0;
3539}
3540#else
3541static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3542 unsigned long addr, pmd_t *pmdp)
3543{
3544 BUG();
3545}
3546#endif /* CONFIG_NUMA_BALANCING */
3547
3451/* 3548/*
3452 * These routines also need to handle stuff like marking pages dirty 3549 * These routines also need to handle stuff like marking pages dirty
3453 * and/or accessed for architectures that don't do it in hardware (most 3550 * and/or accessed for architectures that don't do it in hardware (most
@@ -3486,6 +3583,9 @@ int handle_pte_fault(struct mm_struct *mm,
3486 pte, pmd, flags, entry); 3583 pte, pmd, flags, entry);
3487 } 3584 }
3488 3585
3586 if (pte_numa(entry))
3587 return do_numa_page(mm, vma, address, entry, pte, pmd);
3588
3489 ptl = pte_lockptr(mm, pmd); 3589 ptl = pte_lockptr(mm, pmd);
3490 spin_lock(ptl); 3590 spin_lock(ptl);
3491 if (unlikely(!pte_same(*pte, entry))) 3591 if (unlikely(!pte_same(*pte, entry)))
@@ -3554,9 +3654,11 @@ retry:
3554 3654
3555 barrier(); 3655 barrier();
3556 if (pmd_trans_huge(orig_pmd)) { 3656 if (pmd_trans_huge(orig_pmd)) {
3557 if (flags & FAULT_FLAG_WRITE && 3657 if (pmd_numa(*pmd))
3558 !pmd_write(orig_pmd) && 3658 return do_huge_pmd_numa_page(mm, address,
3559 !pmd_trans_splitting(orig_pmd)) { 3659 orig_pmd, pmd);
3660
3661 if ((flags & FAULT_FLAG_WRITE) && !pmd_write(orig_pmd)) {
3560 ret = do_huge_pmd_wp_page(mm, vma, address, pmd, 3662 ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
3561 orig_pmd); 3663 orig_pmd);
3562 /* 3664 /*
@@ -3568,10 +3670,14 @@ retry:
3568 goto retry; 3670 goto retry;
3569 return ret; 3671 return ret;
3570 } 3672 }
3673
3571 return 0; 3674 return 0;
3572 } 3675 }
3573 } 3676 }
3574 3677
3678 if (pmd_numa(*pmd))
3679 return do_pmd_numa_page(mm, vma, address, pmd);
3680
3575 /* 3681 /*
3576 * Use __pte_alloc instead of pte_alloc_map, because we can't 3682 * Use __pte_alloc instead of pte_alloc_map, because we can't
3577 * run pte_offset_map on the pmd, if an huge pmd could 3683 * run pte_offset_map on the pmd, if an huge pmd could