diff options
author | Mel Gorman <mgorman@suse.de> | 2012-10-25 08:16:31 -0400 |
---|---|---|
committer | Mel Gorman <mgorman@suse.de> | 2012-12-11 09:42:39 -0500 |
commit | d10e63f29488b0f312a443f9507ea9b6fd3c9090 (patch) | |
tree | b39e3caa5d25e9e5ebad84c606a724e25c6b8e91 | |
parent | 1ba6e0b50b479cbadb8f05ebde3020da9ac87201 (diff) |
mm: numa: Create basic numa page hinting infrastructure
Note: This patch started as "mm/mpol: Create special PROT_NONE
infrastructure" and preserves the basic idea but steals *very*
heavily from "autonuma: numa hinting page faults entry points" for
the actual fault handlers without the migration parts. The end
result is barely recognisable as either patch so all Signed-off
and Reviewed-bys are dropped. If Peter, Ingo and Andrea are ok with
this version, I will re-add the signed-offs-by to reflect the history.
In order to facilitate a lazy -- fault driven -- migration of pages, create
a special transient PAGE_NUMA variant; the 'spurious' protection faults it
generates can then be used to drive our migrations.
The meaning of PAGE_NUMA depends on the architecture but on x86 it is
effectively PROT_NONE. Actual PROT_NONE mappings will not generate these
NUMA faults for the reason that the page fault code checks the permission on
the VMA (and will throw a segmentation fault on actual PROT_NONE mappings),
before it ever calls handle_mm_fault.
[dhillf@gmail.com: Fix typo]
Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
-rw-r--r-- | include/linux/huge_mm.h | 10 | ||||
-rw-r--r-- | mm/huge_memory.c | 22 | ||||
-rw-r--r-- | mm/memory.c | 112 |
3 files changed, 141 insertions, 3 deletions
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index b31cb7da0346..a1d26a98c655 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h | |||
@@ -159,6 +159,10 @@ static inline struct page *compound_trans_head(struct page *page) | |||
159 | } | 159 | } |
160 | return page; | 160 | return page; |
161 | } | 161 | } |
162 | |||
163 | extern int do_huge_pmd_numa_page(struct mm_struct *mm, unsigned long addr, | ||
164 | pmd_t pmd, pmd_t *pmdp); | ||
165 | |||
162 | #else /* CONFIG_TRANSPARENT_HUGEPAGE */ | 166 | #else /* CONFIG_TRANSPARENT_HUGEPAGE */ |
163 | #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) | 167 | #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) |
164 | #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) | 168 | #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) |
@@ -195,6 +199,12 @@ static inline int pmd_trans_huge_lock(pmd_t *pmd, | |||
195 | { | 199 | { |
196 | return 0; | 200 | return 0; |
197 | } | 201 | } |
202 | |||
203 | static inline int do_huge_pmd_numa_page(struct mm_struct *mm, unsigned long addr, | ||
204 | pmd_t pmd, pmd_t *pmdp) | ||
205 | { | ||
206 | } | ||
207 | |||
198 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 208 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
199 | 209 | ||
200 | #endif /* _LINUX_HUGE_MM_H */ | 210 | #endif /* _LINUX_HUGE_MM_H */ |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index cd24aa562144..f5f37630c54d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -1018,6 +1018,28 @@ out: | |||
1018 | return page; | 1018 | return page; |
1019 | } | 1019 | } |
1020 | 1020 | ||
1021 | /* NUMA hinting page fault entry point for trans huge pmds */ | ||
1022 | int do_huge_pmd_numa_page(struct mm_struct *mm, unsigned long addr, | ||
1023 | pmd_t pmd, pmd_t *pmdp) | ||
1024 | { | ||
1025 | struct page *page; | ||
1026 | unsigned long haddr = addr & HPAGE_PMD_MASK; | ||
1027 | |||
1028 | spin_lock(&mm->page_table_lock); | ||
1029 | if (unlikely(!pmd_same(pmd, *pmdp))) | ||
1030 | goto out_unlock; | ||
1031 | |||
1032 | page = pmd_page(pmd); | ||
1033 | pmd = pmd_mknonnuma(pmd); | ||
1034 | set_pmd_at(mm, haddr, pmdp, pmd); | ||
1035 | VM_BUG_ON(pmd_numa(*pmdp)); | ||
1036 | update_mmu_cache_pmd(vma, addr, pmdp); | ||
1037 | |||
1038 | out_unlock: | ||
1039 | spin_unlock(&mm->page_table_lock); | ||
1040 | return 0; | ||
1041 | } | ||
1042 | |||
1021 | int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | 1043 | int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, |
1022 | pmd_t *pmd, unsigned long addr) | 1044 | pmd_t *pmd, unsigned long addr) |
1023 | { | 1045 | { |
diff --git a/mm/memory.c b/mm/memory.c index cd8e0daf1912..e30616f2cc3d 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -3448,6 +3448,103 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3448 | return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); | 3448 | return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); |
3449 | } | 3449 | } |
3450 | 3450 | ||
3451 | int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
3452 | unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd) | ||
3453 | { | ||
3454 | struct page *page; | ||
3455 | spinlock_t *ptl; | ||
3456 | |||
3457 | /* | ||
3458 | * The "pte" at this point cannot be used safely without | ||
3459 | * validation through pte_unmap_same(). It's of NUMA type but | ||
3460 | * the pfn may be screwed if the read is non atomic. | ||
3461 | * | ||
3462 | * ptep_modify_prot_start is not called as this is clearing | ||
3463 | * the _PAGE_NUMA bit and it is not really expected that there | ||
3464 | * would be concurrent hardware modifications to the PTE. | ||
3465 | */ | ||
3466 | ptl = pte_lockptr(mm, pmd); | ||
3467 | spin_lock(ptl); | ||
3468 | if (unlikely(!pte_same(*ptep, pte))) | ||
3469 | goto out_unlock; | ||
3470 | pte = pte_mknonnuma(pte); | ||
3471 | set_pte_at(mm, addr, ptep, pte); | ||
3472 | update_mmu_cache(vma, addr, ptep); | ||
3473 | |||
3474 | page = vm_normal_page(vma, addr, pte); | ||
3475 | if (!page) { | ||
3476 | pte_unmap_unlock(ptep, ptl); | ||
3477 | return 0; | ||
3478 | } | ||
3479 | |||
3480 | out_unlock: | ||
3481 | pte_unmap_unlock(ptep, ptl); | ||
3482 | return 0; | ||
3483 | } | ||
3484 | |||
3485 | /* NUMA hinting page fault entry point for regular pmds */ | ||
3486 | #ifdef CONFIG_NUMA_BALANCING | ||
3487 | static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
3488 | unsigned long addr, pmd_t *pmdp) | ||
3489 | { | ||
3490 | pmd_t pmd; | ||
3491 | pte_t *pte, *orig_pte; | ||
3492 | unsigned long _addr = addr & PMD_MASK; | ||
3493 | unsigned long offset; | ||
3494 | spinlock_t *ptl; | ||
3495 | bool numa = false; | ||
3496 | |||
3497 | spin_lock(&mm->page_table_lock); | ||
3498 | pmd = *pmdp; | ||
3499 | if (pmd_numa(pmd)) { | ||
3500 | set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd)); | ||
3501 | numa = true; | ||
3502 | } | ||
3503 | spin_unlock(&mm->page_table_lock); | ||
3504 | |||
3505 | if (!numa) | ||
3506 | return 0; | ||
3507 | |||
3508 | /* we're in a page fault so some vma must be in the range */ | ||
3509 | BUG_ON(!vma); | ||
3510 | BUG_ON(vma->vm_start >= _addr + PMD_SIZE); | ||
3511 | offset = max(_addr, vma->vm_start) & ~PMD_MASK; | ||
3512 | VM_BUG_ON(offset >= PMD_SIZE); | ||
3513 | orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl); | ||
3514 | pte += offset >> PAGE_SHIFT; | ||
3515 | for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) { | ||
3516 | pte_t pteval = *pte; | ||
3517 | struct page *page; | ||
3518 | if (!pte_present(pteval)) | ||
3519 | continue; | ||
3520 | if (!pte_numa(pteval)) | ||
3521 | continue; | ||
3522 | if (addr >= vma->vm_end) { | ||
3523 | vma = find_vma(mm, addr); | ||
3524 | /* there's a pte present so there must be a vma */ | ||
3525 | BUG_ON(!vma); | ||
3526 | BUG_ON(addr < vma->vm_start); | ||
3527 | } | ||
3528 | if (pte_numa(pteval)) { | ||
3529 | pteval = pte_mknonnuma(pteval); | ||
3530 | set_pte_at(mm, addr, pte, pteval); | ||
3531 | } | ||
3532 | page = vm_normal_page(vma, addr, pteval); | ||
3533 | if (unlikely(!page)) | ||
3534 | continue; | ||
3535 | } | ||
3536 | pte_unmap_unlock(orig_pte, ptl); | ||
3537 | |||
3538 | return 0; | ||
3539 | } | ||
3540 | #else | ||
3541 | static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
3542 | unsigned long addr, pmd_t *pmdp) | ||
3543 | { | ||
3544 | BUG(); | ||
3545 | } | ||
3546 | #endif /* CONFIG_NUMA_BALANCING */ | ||
3547 | |||
3451 | /* | 3548 | /* |
3452 | * These routines also need to handle stuff like marking pages dirty | 3549 | * These routines also need to handle stuff like marking pages dirty |
3453 | * and/or accessed for architectures that don't do it in hardware (most | 3550 | * and/or accessed for architectures that don't do it in hardware (most |
@@ -3486,6 +3583,9 @@ int handle_pte_fault(struct mm_struct *mm, | |||
3486 | pte, pmd, flags, entry); | 3583 | pte, pmd, flags, entry); |
3487 | } | 3584 | } |
3488 | 3585 | ||
3586 | if (pte_numa(entry)) | ||
3587 | return do_numa_page(mm, vma, address, entry, pte, pmd); | ||
3588 | |||
3489 | ptl = pte_lockptr(mm, pmd); | 3589 | ptl = pte_lockptr(mm, pmd); |
3490 | spin_lock(ptl); | 3590 | spin_lock(ptl); |
3491 | if (unlikely(!pte_same(*pte, entry))) | 3591 | if (unlikely(!pte_same(*pte, entry))) |
@@ -3554,9 +3654,11 @@ retry: | |||
3554 | 3654 | ||
3555 | barrier(); | 3655 | barrier(); |
3556 | if (pmd_trans_huge(orig_pmd)) { | 3656 | if (pmd_trans_huge(orig_pmd)) { |
3557 | if (flags & FAULT_FLAG_WRITE && | 3657 | if (pmd_numa(*pmd)) |
3558 | !pmd_write(orig_pmd) && | 3658 | return do_huge_pmd_numa_page(mm, address, |
3559 | !pmd_trans_splitting(orig_pmd)) { | 3659 | orig_pmd, pmd); |
3660 | |||
3661 | if ((flags & FAULT_FLAG_WRITE) && !pmd_write(orig_pmd)) { | ||
3560 | ret = do_huge_pmd_wp_page(mm, vma, address, pmd, | 3662 | ret = do_huge_pmd_wp_page(mm, vma, address, pmd, |
3561 | orig_pmd); | 3663 | orig_pmd); |
3562 | /* | 3664 | /* |
@@ -3568,10 +3670,14 @@ retry: | |||
3568 | goto retry; | 3670 | goto retry; |
3569 | return ret; | 3671 | return ret; |
3570 | } | 3672 | } |
3673 | |||
3571 | return 0; | 3674 | return 0; |
3572 | } | 3675 | } |
3573 | } | 3676 | } |
3574 | 3677 | ||
3678 | if (pmd_numa(*pmd)) | ||
3679 | return do_pmd_numa_page(mm, vma, address, pmd); | ||
3680 | |||
3575 | /* | 3681 | /* |
3576 | * Use __pte_alloc instead of pte_alloc_map, because we can't | 3682 | * Use __pte_alloc instead of pte_alloc_map, because we can't |
3577 | * run pte_offset_map on the pmd, if an huge pmd could | 3683 | * run pte_offset_map on the pmd, if an huge pmd could |