45 files changed, 1938 insertions, 172 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 20e248cc03a9..ea8e5b485576 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2032,6 +2032,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted. | |||
2032 | 2032 | ||
2033 | nr_uarts= [SERIAL] maximum number of UARTs to be registered. | 2033 | nr_uarts= [SERIAL] maximum number of UARTs to be registered. |
2034 | 2034 | ||
2035 | numa_balancing= [KNL,X86] Enable or disable automatic NUMA balancing. | ||
2036 | Allowed values are enable and disable | ||
2037 | |||
2035 | numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA. | 2038 | numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA. |
2036 | one of ['zone', 'node', 'default'] can be specified | 2039 | one of ['zone', 'node', 'default'] can be specified |
2037 | This can be set from sysctl after boot. | 2040 | This can be set from sysctl after boot. |
diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig
index cb8f9920f4dd..0f7c852f355c 100644
--- a/arch/sh/mm/Kconfig
+++ b/arch/sh/mm/Kconfig
@@ -111,6 +111,7 @@ config VSYSCALL | |||
111 | config NUMA | 111 | config NUMA |
112 | bool "Non Uniform Memory Access (NUMA) Support" | 112 | bool "Non Uniform Memory Access (NUMA) Support" |
113 | depends on MMU && SYS_SUPPORTS_NUMA && EXPERIMENTAL | 113 | depends on MMU && SYS_SUPPORTS_NUMA && EXPERIMENTAL |
114 | select ARCH_WANT_NUMA_VARIABLE_LOCALITY | ||
114 | default n | 115 | default n |
115 | help | 116 | help |
116 | Some SH systems have many various memories scattered around | 117 | Some SH systems have many various memories scattered around |
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 65a872bf72f9..97f8c5ad8c2d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -22,6 +22,8 @@ config X86 | |||
22 | def_bool y | 22 | def_bool y |
23 | select HAVE_AOUT if X86_32 | 23 | select HAVE_AOUT if X86_32 |
24 | select HAVE_UNSTABLE_SCHED_CLOCK | 24 | select HAVE_UNSTABLE_SCHED_CLOCK |
25 | select ARCH_SUPPORTS_NUMA_BALANCING | ||
26 | select ARCH_WANTS_PROT_NUMA_PROT_NONE | ||
25 | select HAVE_IDE | 27 | select HAVE_IDE |
26 | select HAVE_OPROFILE | 28 | select HAVE_OPROFILE |
27 | select HAVE_PCSPKR_PLATFORM | 29 | select HAVE_PCSPKR_PLATFORM |
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index a1f780d45f76..5199db2923d3 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -404,7 +404,14 @@ static inline int pte_same(pte_t a, pte_t b) | |||
404 | 404 | ||
405 | static inline int pte_present(pte_t a) | 405 | static inline int pte_present(pte_t a) |
406 | { | 406 | { |
407 | return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); | 407 | return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE | |
408 | _PAGE_NUMA); | ||
409 | } | ||
410 | |||
411 | #define pte_accessible pte_accessible | ||
412 | static inline int pte_accessible(pte_t a) | ||
413 | { | ||
414 | return pte_flags(a) & _PAGE_PRESENT; | ||
408 | } | 415 | } |
409 | 416 | ||
410 | static inline int pte_hidden(pte_t pte) | 417 | static inline int pte_hidden(pte_t pte) |
@@ -420,7 +427,8 @@ static inline int pmd_present(pmd_t pmd) | |||
420 | * the _PAGE_PSE flag will remain set at all times while the | 427 | * the _PAGE_PSE flag will remain set at all times while the |
421 | * _PAGE_PRESENT bit is clear). | 428 | * _PAGE_PRESENT bit is clear). |
422 | */ | 429 | */ |
423 | return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE); | 430 | return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE | |
431 | _PAGE_NUMA); | ||
424 | } | 432 | } |
425 | 433 | ||
426 | static inline int pmd_none(pmd_t pmd) | 434 | static inline int pmd_none(pmd_t pmd) |
@@ -479,6 +487,11 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address) | |||
479 | 487 | ||
480 | static inline int pmd_bad(pmd_t pmd) | 488 | static inline int pmd_bad(pmd_t pmd) |
481 | { | 489 | { |
490 | #ifdef CONFIG_NUMA_BALANCING | ||
491 | /* pmd_numa check */ | ||
492 | if ((pmd_flags(pmd) & (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA) | ||
493 | return 0; | ||
494 | #endif | ||
482 | return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; | 495 | return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; |
483 | } | 496 | } |
484 | 497 | ||
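The pte_present()/pte_numa() pairing above is easier to follow with the flag bits written out. Below is a minimal userspace sketch (plain C, not kernel code) using bit values that mirror the x86 definitions referenced in this series (_PAGE_PRESENT in bit 0, _PAGE_PROTNONE/_PAGE_NUMA sharing bit 8); the helpers work on a bare flags word rather than a real pte_t.

/* Minimal sketch: one flags word encodes "present", "NUMA hinting" and
 * "none/swap". Bit positions mirror the x86 values quoted in the series. */
#include <stdint.h>
#include <stdio.h>

#define _PAGE_PRESENT  (1ULL << 0)
#define _PAGE_PROTNONE (1ULL << 8)
#define _PAGE_NUMA     _PAGE_PROTNONE   /* same bit, different meaning */

static int pte_present(uint64_t flags)
{
        /* NUMA-hinting entries still count as present for generic mm code */
        return !!(flags & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_NUMA));
}

static int pte_numa(uint64_t flags)
{
        /* the shared bit only means "NUMA hinting" while _PAGE_PRESENT is clear */
        return (flags & (_PAGE_NUMA | _PAGE_PRESENT)) == _PAGE_NUMA;
}

int main(void)
{
        uint64_t normal = _PAGE_PRESENT;
        uint64_t numa   = _PAGE_NUMA;           /* hardware present bit cleared */
        uint64_t none   = 0;

        printf("normal: present=%d numa=%d\n", pte_present(normal), pte_numa(normal));
        printf("numa:   present=%d numa=%d\n", pte_present(numa),   pte_numa(numa));
        printf("none:   present=%d numa=%d\n", pte_present(none),   pte_numa(none));
        return 0;
}

A NUMA-hinting entry is therefore still reported as present, so the rest of mm/ keeps treating the page as mapped, while the hardware faults on the next access because the real present bit is clear.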
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index ec8a1fc9505d..3c32db8c539d 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -64,6 +64,26 @@ | |||
64 | #define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) | 64 | #define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) |
65 | #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) | 65 | #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) |
66 | 66 | ||
67 | /* | ||
68 | * _PAGE_NUMA indicates that this page will trigger a numa hinting | ||
69 | * minor page fault to gather numa placement statistics (see | ||
70 | * pte_numa()). The bit picked (8) is within the range between | ||
71 | * _PAGE_FILE (6) and _PAGE_PROTNONE (8) bits. Therefore, it doesn't | ||
72 | * require changes to the swp entry format because that bit is always | ||
73 | * zero when the pte is not present. | ||
74 | * | ||
75 | * The bit picked must be always zero when the pmd is present and not | ||
76 | * present, so that we don't lose information when we set it while | ||
77 | * atomically clearing the present bit. | ||
78 | * | ||
79 | * Because we shared the same bit (8) with _PAGE_PROTNONE this can be | ||
80 | * interpreted as _PAGE_NUMA only in places that _PAGE_PROTNONE | ||
81 | * couldn't reach, like handle_mm_fault() (see access_error in | ||
82 | * arch/x86/mm/fault.c, the vma protection must not be PROT_NONE for | ||
83 | * handle_mm_fault() to be invoked). | ||
84 | */ | ||
85 | #define _PAGE_NUMA _PAGE_PROTNONE | ||
86 | |||
67 | #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ | 87 | #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ |
68 | _PAGE_ACCESSED | _PAGE_DIRTY) | 88 | _PAGE_ACCESSED | _PAGE_DIRTY) |
69 | #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ | 89 | #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ |
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 217eb705fac0..e27fbf887f3b 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -301,6 +301,13 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) | |||
301 | free_page((unsigned long)pgd); | 301 | free_page((unsigned long)pgd); |
302 | } | 302 | } |
303 | 303 | ||
304 | /* | ||
305 | * Used to set accessed or dirty bits in the page table entries | ||
306 | * on other architectures. On x86, the accessed and dirty bits | ||
307 | * are tracked by hardware. However, do_wp_page calls this function | ||
308 | * to also make the pte writeable at the same time the dirty bit is | ||
309 | * set. In that case we do actually need to write the PTE. | ||
310 | */ | ||
304 | int ptep_set_access_flags(struct vm_area_struct *vma, | 311 | int ptep_set_access_flags(struct vm_area_struct *vma, |
305 | unsigned long address, pte_t *ptep, | 312 | unsigned long address, pte_t *ptep, |
306 | pte_t entry, int dirty) | 313 | pte_t entry, int dirty) |
@@ -310,7 +317,6 @@ int ptep_set_access_flags(struct vm_area_struct *vma, | |||
310 | if (changed && dirty) { | 317 | if (changed && dirty) { |
311 | *ptep = entry; | 318 | *ptep = entry; |
312 | pte_update_defer(vma->vm_mm, address, ptep); | 319 | pte_update_defer(vma->vm_mm, address, ptep); |
313 | flush_tlb_page(vma, address); | ||
314 | } | 320 | } |
315 | 321 | ||
316 | return changed; | 322 | return changed; |
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 284e80831d2c..701beab27aab 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -219,6 +219,10 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) | |||
219 | #define move_pte(pte, prot, old_addr, new_addr) (pte) | 219 | #define move_pte(pte, prot, old_addr, new_addr) (pte) |
220 | #endif | 220 | #endif |
221 | 221 | ||
222 | #ifndef pte_accessible | ||
223 | # define pte_accessible(pte) ((void)(pte),1) | ||
224 | #endif | ||
225 | |||
222 | #ifndef flush_tlb_fix_spurious_fault | 226 | #ifndef flush_tlb_fix_spurious_fault |
223 | #define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address) | 227 | #define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address) |
224 | #endif | 228 | #endif |
@@ -580,6 +584,112 @@ static inline int pmd_trans_unstable(pmd_t *pmd) | |||
580 | #endif | 584 | #endif |
581 | } | 585 | } |
582 | 586 | ||
587 | #ifdef CONFIG_NUMA_BALANCING | ||
588 | #ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE | ||
589 | /* | ||
590 | * _PAGE_NUMA works identical to _PAGE_PROTNONE (it's actually the | ||
591 | * same bit too). It's set only when _PAGE_PRESENT is not set and it's | ||
592 | * never set if _PAGE_PRESENT is set. | ||
593 | * | ||
594 | * pte/pmd_present() returns true if pte/pmd_numa returns true. Page | ||
595 | * fault triggers on those regions if pte/pmd_numa returns true | ||
596 | * (because _PAGE_PRESENT is not set). | ||
597 | */ | ||
598 | #ifndef pte_numa | ||
599 | static inline int pte_numa(pte_t pte) | ||
600 | { | ||
601 | return (pte_flags(pte) & | ||
602 | (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA; | ||
603 | } | ||
604 | #endif | ||
605 | |||
606 | #ifndef pmd_numa | ||
607 | static inline int pmd_numa(pmd_t pmd) | ||
608 | { | ||
609 | return (pmd_flags(pmd) & | ||
610 | (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA; | ||
611 | } | ||
612 | #endif | ||
613 | |||
614 | /* | ||
615 | * pte/pmd_mknuma sets the _PAGE_ACCESSED bitflag automatically | ||
616 | * because they're called by the NUMA hinting minor page fault. If we | ||
617 | * wouldn't set the _PAGE_ACCESSED bitflag here, the TLB miss handler | ||
618 | * would be forced to set it later while filling the TLB after we | ||
619 | * return to userland. That would trigger a second write to memory | ||
620 | * that we optimize away by setting _PAGE_ACCESSED here. | ||
621 | */ | ||
622 | #ifndef pte_mknonnuma | ||
623 | static inline pte_t pte_mknonnuma(pte_t pte) | ||
624 | { | ||
625 | pte = pte_clear_flags(pte, _PAGE_NUMA); | ||
626 | return pte_set_flags(pte, _PAGE_PRESENT|_PAGE_ACCESSED); | ||
627 | } | ||
628 | #endif | ||
629 | |||
630 | #ifndef pmd_mknonnuma | ||
631 | static inline pmd_t pmd_mknonnuma(pmd_t pmd) | ||
632 | { | ||
633 | pmd = pmd_clear_flags(pmd, _PAGE_NUMA); | ||
634 | return pmd_set_flags(pmd, _PAGE_PRESENT|_PAGE_ACCESSED); | ||
635 | } | ||
636 | #endif | ||
637 | |||
638 | #ifndef pte_mknuma | ||
639 | static inline pte_t pte_mknuma(pte_t pte) | ||
640 | { | ||
641 | pte = pte_set_flags(pte, _PAGE_NUMA); | ||
642 | return pte_clear_flags(pte, _PAGE_PRESENT); | ||
643 | } | ||
644 | #endif | ||
645 | |||
646 | #ifndef pmd_mknuma | ||
647 | static inline pmd_t pmd_mknuma(pmd_t pmd) | ||
648 | { | ||
649 | pmd = pmd_set_flags(pmd, _PAGE_NUMA); | ||
650 | return pmd_clear_flags(pmd, _PAGE_PRESENT); | ||
651 | } | ||
652 | #endif | ||
653 | #else | ||
654 | extern int pte_numa(pte_t pte); | ||
655 | extern int pmd_numa(pmd_t pmd); | ||
656 | extern pte_t pte_mknonnuma(pte_t pte); | ||
657 | extern pmd_t pmd_mknonnuma(pmd_t pmd); | ||
658 | extern pte_t pte_mknuma(pte_t pte); | ||
659 | extern pmd_t pmd_mknuma(pmd_t pmd); | ||
660 | #endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ | ||
661 | #else | ||
662 | static inline int pmd_numa(pmd_t pmd) | ||
663 | { | ||
664 | return 0; | ||
665 | } | ||
666 | |||
667 | static inline int pte_numa(pte_t pte) | ||
668 | { | ||
669 | return 0; | ||
670 | } | ||
671 | |||
672 | static inline pte_t pte_mknonnuma(pte_t pte) | ||
673 | { | ||
674 | return pte; | ||
675 | } | ||
676 | |||
677 | static inline pmd_t pmd_mknonnuma(pmd_t pmd) | ||
678 | { | ||
679 | return pmd; | ||
680 | } | ||
681 | |||
682 | static inline pte_t pte_mknuma(pte_t pte) | ||
683 | { | ||
684 | return pte; | ||
685 | } | ||
686 | |||
687 | static inline pmd_t pmd_mknuma(pmd_t pmd) | ||
688 | { | ||
689 | return pmd; | ||
690 | } | ||
691 | #endif /* CONFIG_NUMA_BALANCING */ | ||
692 | |||
583 | #endif /* CONFIG_MMU */ | 693 | #endif /* CONFIG_MMU */ |
584 | 694 | ||
585 | #endif /* !__ASSEMBLY__ */ | 695 | #endif /* !__ASSEMBLY__ */ |
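Building on the flag sketch earlier, the generic pte_mknuma()/pte_mknonnuma() helpers are pure flag transforms: marking an entry clears the present bit so the next access faults, and clearing the mark restores present plus accessed in one step. The round trip can be modelled on a plain flags word as below; the bit positions are assumptions borrowed from x86 and the names are illustrative only.

/* Illustrative round trip through pte_mknuma()/pte_mknonnuma() on a bare
 * flags word. Assumed bits: present=0, accessed=5, NUMA aliases bit 8. */
#include <assert.h>
#include <stdint.h>

#define _PAGE_PRESENT  (1ULL << 0)
#define _PAGE_ACCESSED (1ULL << 5)
#define _PAGE_NUMA     (1ULL << 8)

static uint64_t pte_mknuma(uint64_t pte)
{
        pte |= _PAGE_NUMA;                /* mark for a NUMA hinting fault */
        return pte & ~_PAGE_PRESENT;      /* clear present so the access faults */
}

static uint64_t pte_mknonnuma(uint64_t pte)
{
        pte &= ~_PAGE_NUMA;
        /* restore present and pre-set accessed to avoid a second memory write */
        return pte | _PAGE_PRESENT | _PAGE_ACCESSED;
}

int main(void)
{
        uint64_t pte = _PAGE_PRESENT | _PAGE_ACCESSED;

        pte = pte_mknuma(pte);
        assert(!(pte & _PAGE_PRESENT));                           /* faults now */
        assert((pte & (_PAGE_NUMA | _PAGE_PRESENT)) == _PAGE_NUMA);

        pte = pte_mknonnuma(pte);       /* what the hinting fault handler undoes */
        assert(pte & _PAGE_PRESENT);
        assert(!(pte & _PAGE_NUMA));
        return 0;
}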
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 092dc5305a32..1d76f8ca90f0 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -31,7 +31,8 @@ extern int move_huge_pmd(struct vm_area_struct *vma, | |||
31 | unsigned long new_addr, unsigned long old_end, | 31 | unsigned long new_addr, unsigned long old_end, |
32 | pmd_t *old_pmd, pmd_t *new_pmd); | 32 | pmd_t *old_pmd, pmd_t *new_pmd); |
33 | extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | 33 | extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, |
34 | unsigned long addr, pgprot_t newprot); | 34 | unsigned long addr, pgprot_t newprot, |
35 | int prot_numa); | ||
35 | 36 | ||
36 | enum transparent_hugepage_flag { | 37 | enum transparent_hugepage_flag { |
37 | TRANSPARENT_HUGEPAGE_FLAG, | 38 | TRANSPARENT_HUGEPAGE_FLAG, |
@@ -111,7 +112,7 @@ extern void __split_huge_page_pmd(struct vm_area_struct *vma, | |||
111 | #define wait_split_huge_page(__anon_vma, __pmd) \ | 112 | #define wait_split_huge_page(__anon_vma, __pmd) \ |
112 | do { \ | 113 | do { \ |
113 | pmd_t *____pmd = (__pmd); \ | 114 | pmd_t *____pmd = (__pmd); \ |
114 | anon_vma_lock(__anon_vma); \ | 115 | anon_vma_lock_write(__anon_vma); \ |
115 | anon_vma_unlock(__anon_vma); \ | 116 | anon_vma_unlock(__anon_vma); \ |
116 | BUG_ON(pmd_trans_splitting(*____pmd) || \ | 117 | BUG_ON(pmd_trans_splitting(*____pmd) || \ |
117 | pmd_trans_huge(*____pmd)); \ | 118 | pmd_trans_huge(*____pmd)); \ |
@@ -171,6 +172,10 @@ static inline struct page *compound_trans_head(struct page *page) | |||
171 | } | 172 | } |
172 | return page; | 173 | return page; |
173 | } | 174 | } |
175 | |||
176 | extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
177 | unsigned long addr, pmd_t pmd, pmd_t *pmdp); | ||
178 | |||
174 | #else /* CONFIG_TRANSPARENT_HUGEPAGE */ | 179 | #else /* CONFIG_TRANSPARENT_HUGEPAGE */ |
175 | #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) | 180 | #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) |
176 | #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) | 181 | #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) |
@@ -209,6 +214,13 @@ static inline int pmd_trans_huge_lock(pmd_t *pmd, | |||
209 | { | 214 | { |
210 | return 0; | 215 | return 0; |
211 | } | 216 | } |
217 | |||
218 | static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
219 | unsigned long addr, pmd_t pmd, pmd_t *pmdp) | ||
220 | { | ||
221 | return 0; | ||
222 | } | ||
223 | |||
212 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 224 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
213 | 225 | ||
214 | #endif /* _LINUX_HUGE_MM_H */ | 226 | #endif /* _LINUX_HUGE_MM_H */ |
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 3e7fa1acf09c..0c80d3f57a5b 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -87,7 +87,7 @@ struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, | |||
87 | pud_t *pud, int write); | 87 | pud_t *pud, int write); |
88 | int pmd_huge(pmd_t pmd); | 88 | int pmd_huge(pmd_t pmd); |
89 | int pud_huge(pud_t pmd); | 89 | int pud_huge(pud_t pmd); |
90 | void hugetlb_change_protection(struct vm_area_struct *vma, | 90 | unsigned long hugetlb_change_protection(struct vm_area_struct *vma, |
91 | unsigned long address, unsigned long end, pgprot_t newprot); | 91 | unsigned long address, unsigned long end, pgprot_t newprot); |
92 | 92 | ||
93 | #else /* !CONFIG_HUGETLB_PAGE */ | 93 | #else /* !CONFIG_HUGETLB_PAGE */ |
@@ -132,7 +132,11 @@ static inline void copy_huge_page(struct page *dst, struct page *src) | |||
132 | { | 132 | { |
133 | } | 133 | } |
134 | 134 | ||
135 | #define hugetlb_change_protection(vma, address, end, newprot) | 135 | static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma, |
136 | unsigned long address, unsigned long end, pgprot_t newprot) | ||
137 | { | ||
138 | return 0; | ||
139 | } | ||
136 | 140 | ||
137 | static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb, | 141 | static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb, |
138 | struct vm_area_struct *vma, unsigned long start, | 142 | struct vm_area_struct *vma, unsigned long start, |
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index dbd212723b74..9adc270de7ef 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -188,6 +188,8 @@ static inline int vma_migratable(struct vm_area_struct *vma) | |||
188 | return 1; | 188 | return 1; |
189 | } | 189 | } |
190 | 190 | ||
191 | extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long); | ||
192 | |||
191 | #else | 193 | #else |
192 | 194 | ||
193 | struct mempolicy {}; | 195 | struct mempolicy {}; |
@@ -307,5 +309,11 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, | |||
307 | return 0; | 309 | return 0; |
308 | } | 310 | } |
309 | 311 | ||
312 | static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma, | ||
313 | unsigned long address) | ||
314 | { | ||
315 | return -1; /* no node preference */ | ||
316 | } | ||
317 | |||
310 | #endif /* CONFIG_NUMA */ | 318 | #endif /* CONFIG_NUMA */ |
311 | #endif | 319 | #endif |
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 0b5865c61efd..1e9f627967a3 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -23,6 +23,15 @@ typedef struct page *new_page_t(struct page *, unsigned long private, int **); | |||
23 | #define MIGRATEPAGE_BALLOON_SUCCESS 1 /* special ret code for balloon page | 23 | #define MIGRATEPAGE_BALLOON_SUCCESS 1 /* special ret code for balloon page |
24 | * successful migration case. | 24 | * successful migration case. |
25 | */ | 25 | */ |
26 | enum migrate_reason { | ||
27 | MR_COMPACTION, | ||
28 | MR_MEMORY_FAILURE, | ||
29 | MR_MEMORY_HOTPLUG, | ||
30 | MR_SYSCALL, /* also applies to cpusets */ | ||
31 | MR_MEMPOLICY_MBIND, | ||
32 | MR_NUMA_MISPLACED, | ||
33 | MR_CMA | ||
34 | }; | ||
26 | 35 | ||
27 | #ifdef CONFIG_MIGRATION | 36 | #ifdef CONFIG_MIGRATION |
28 | 37 | ||
@@ -32,7 +41,7 @@ extern int migrate_page(struct address_space *, | |||
32 | struct page *, struct page *, enum migrate_mode); | 41 | struct page *, struct page *, enum migrate_mode); |
33 | extern int migrate_pages(struct list_head *l, new_page_t x, | 42 | extern int migrate_pages(struct list_head *l, new_page_t x, |
34 | unsigned long private, bool offlining, | 43 | unsigned long private, bool offlining, |
35 | enum migrate_mode mode); | 44 | enum migrate_mode mode, int reason); |
36 | extern int migrate_huge_page(struct page *, new_page_t x, | 45 | extern int migrate_huge_page(struct page *, new_page_t x, |
37 | unsigned long private, bool offlining, | 46 | unsigned long private, bool offlining, |
38 | enum migrate_mode mode); | 47 | enum migrate_mode mode); |
@@ -54,7 +63,7 @@ static inline void putback_lru_pages(struct list_head *l) {} | |||
54 | static inline void putback_movable_pages(struct list_head *l) {} | 63 | static inline void putback_movable_pages(struct list_head *l) {} |
55 | static inline int migrate_pages(struct list_head *l, new_page_t x, | 64 | static inline int migrate_pages(struct list_head *l, new_page_t x, |
56 | unsigned long private, bool offlining, | 65 | unsigned long private, bool offlining, |
57 | enum migrate_mode mode) { return -ENOSYS; } | 66 | enum migrate_mode mode, int reason) { return -ENOSYS; } |
58 | static inline int migrate_huge_page(struct page *page, new_page_t x, | 67 | static inline int migrate_huge_page(struct page *page, new_page_t x, |
59 | unsigned long private, bool offlining, | 68 | unsigned long private, bool offlining, |
60 | enum migrate_mode mode) { return -ENOSYS; } | 69 | enum migrate_mode mode) { return -ENOSYS; } |
@@ -83,4 +92,37 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, | |||
83 | #define fail_migrate_page NULL | 92 | #define fail_migrate_page NULL |
84 | 93 | ||
85 | #endif /* CONFIG_MIGRATION */ | 94 | #endif /* CONFIG_MIGRATION */ |
95 | |||
96 | #ifdef CONFIG_NUMA_BALANCING | ||
97 | extern int migrate_misplaced_page(struct page *page, int node); | ||
98 | extern int migrate_misplaced_page(struct page *page, int node); | ||
99 | extern bool migrate_ratelimited(int node); | ||
100 | #else | ||
101 | static inline int migrate_misplaced_page(struct page *page, int node) | ||
102 | { | ||
103 | return -EAGAIN; /* can't migrate now */ | ||
104 | } | ||
105 | static inline bool migrate_ratelimited(int node) | ||
106 | { | ||
107 | return false; | ||
108 | } | ||
109 | #endif /* CONFIG_NUMA_BALANCING */ | ||
110 | |||
111 | #if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) | ||
112 | extern int migrate_misplaced_transhuge_page(struct mm_struct *mm, | ||
113 | struct vm_area_struct *vma, | ||
114 | pmd_t *pmd, pmd_t entry, | ||
115 | unsigned long address, | ||
116 | struct page *page, int node); | ||
117 | #else | ||
118 | static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm, | ||
119 | struct vm_area_struct *vma, | ||
120 | pmd_t *pmd, pmd_t entry, | ||
121 | unsigned long address, | ||
122 | struct page *page, int node) | ||
123 | { | ||
124 | return -EAGAIN; | ||
125 | } | ||
126 | #endif /* CONFIG_NUMA_BALANCING && CONFIG_TRANSPARENT_HUGEPAGE*/ | ||
127 | |||
86 | #endif /* _LINUX_MIGRATE_H */ | 128 | #endif /* _LINUX_MIGRATE_H */ |
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4af4f0b1be4c..7f4f906190bd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -693,6 +693,36 @@ static inline int page_to_nid(const struct page *page) | |||
693 | } | 693 | } |
694 | #endif | 694 | #endif |
695 | 695 | ||
696 | #ifdef CONFIG_NUMA_BALANCING | ||
697 | static inline int page_xchg_last_nid(struct page *page, int nid) | ||
698 | { | ||
699 | return xchg(&page->_last_nid, nid); | ||
700 | } | ||
701 | |||
702 | static inline int page_last_nid(struct page *page) | ||
703 | { | ||
704 | return page->_last_nid; | ||
705 | } | ||
706 | static inline void reset_page_last_nid(struct page *page) | ||
707 | { | ||
708 | page->_last_nid = -1; | ||
709 | } | ||
710 | #else | ||
711 | static inline int page_xchg_last_nid(struct page *page, int nid) | ||
712 | { | ||
713 | return page_to_nid(page); | ||
714 | } | ||
715 | |||
716 | static inline int page_last_nid(struct page *page) | ||
717 | { | ||
718 | return page_to_nid(page); | ||
719 | } | ||
720 | |||
721 | static inline void reset_page_last_nid(struct page *page) | ||
722 | { | ||
723 | } | ||
724 | #endif | ||
725 | |||
696 | static inline struct zone *page_zone(const struct page *page) | 726 | static inline struct zone *page_zone(const struct page *page) |
697 | { | 727 | { |
698 | return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)]; | 728 | return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)]; |
@@ -1078,6 +1108,9 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma, | |||
1078 | extern unsigned long do_mremap(unsigned long addr, | 1108 | extern unsigned long do_mremap(unsigned long addr, |
1079 | unsigned long old_len, unsigned long new_len, | 1109 | unsigned long old_len, unsigned long new_len, |
1080 | unsigned long flags, unsigned long new_addr); | 1110 | unsigned long flags, unsigned long new_addr); |
1111 | extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, | ||
1112 | unsigned long end, pgprot_t newprot, | ||
1113 | int dirty_accountable, int prot_numa); | ||
1081 | extern int mprotect_fixup(struct vm_area_struct *vma, | 1114 | extern int mprotect_fixup(struct vm_area_struct *vma, |
1082 | struct vm_area_struct **pprev, unsigned long start, | 1115 | struct vm_area_struct **pprev, unsigned long start, |
1083 | unsigned long end, unsigned long newflags); | 1116 | unsigned long end, unsigned long newflags); |
@@ -1579,6 +1612,11 @@ static inline pgprot_t vm_get_page_prot(unsigned long vm_flags) | |||
1579 | } | 1612 | } |
1580 | #endif | 1613 | #endif |
1581 | 1614 | ||
1615 | #ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE | ||
1616 | unsigned long change_prot_numa(struct vm_area_struct *vma, | ||
1617 | unsigned long start, unsigned long end); | ||
1618 | #endif | ||
1619 | |||
1582 | struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); | 1620 | struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); |
1583 | int remap_pfn_range(struct vm_area_struct *, unsigned long addr, | 1621 | int remap_pfn_range(struct vm_area_struct *, unsigned long addr, |
1584 | unsigned long pfn, unsigned long size, pgprot_t); | 1622 | unsigned long pfn, unsigned long size, pgprot_t); |
@@ -1600,6 +1638,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address, | |||
1600 | #define FOLL_MLOCK 0x40 /* mark page as mlocked */ | 1638 | #define FOLL_MLOCK 0x40 /* mark page as mlocked */ |
1601 | #define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */ | 1639 | #define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */ |
1602 | #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ | 1640 | #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ |
1641 | #define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ | ||
1603 | 1642 | ||
1604 | typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, | 1643 | typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, |
1605 | void *data); | 1644 | void *data); |
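The new _last_nid field is only ever updated with an exchange, so a single atomic step both records the node that touched the page and hands back the node that touched it previously, which is what lets the fault path notice pages bouncing between nodes. A userspace sketch of that pattern with C11 atomics (assumed names, not the kernel's struct page):

/* Sketch of the page_xchg_last_nid() pattern with C11 atomics. */
#include <stdatomic.h>
#include <stdio.h>

struct page { _Atomic int last_nid; };

static int page_xchg_last_nid(struct page *page, int nid)
{
        /* store the new node, return the node that faulted on it before */
        return atomic_exchange(&page->last_nid, nid);
}

int main(void)
{
        struct page page = { .last_nid = -1 };          /* -1: never touched */

        int prev = page_xchg_last_nid(&page, 0);        /* fault from node 0 */
        printf("prev=%d now=%d\n", prev, atomic_load(&page.last_nid));

        prev = page_xchg_last_nid(&page, 1);            /* later fault from node 1 */
        if (prev != 1)
                printf("page moved interest: node %d -> node 1\n", prev);
        return 0;
}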
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7ade2731b5d6..7d9ebb7cc982 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -175,6 +175,10 @@ struct page { | |||
175 | */ | 175 | */ |
176 | void *shadow; | 176 | void *shadow; |
177 | #endif | 177 | #endif |
178 | |||
179 | #ifdef CONFIG_NUMA_BALANCING | ||
180 | int _last_nid; | ||
181 | #endif | ||
178 | } | 182 | } |
179 | /* | 183 | /* |
180 | * The struct page can be forced to be double word aligned so that atomic ops | 184 | * The struct page can be forced to be double word aligned so that atomic ops |
@@ -411,9 +415,36 @@ struct mm_struct { | |||
411 | #ifdef CONFIG_CPUMASK_OFFSTACK | 415 | #ifdef CONFIG_CPUMASK_OFFSTACK |
412 | struct cpumask cpumask_allocation; | 416 | struct cpumask cpumask_allocation; |
413 | #endif | 417 | #endif |
418 | #ifdef CONFIG_NUMA_BALANCING | ||
419 | /* | ||
420 | * numa_next_scan is the next time when the PTEs will be marked | ||
421 | * pte_numa to gather statistics and migrate pages to new nodes | ||
422 | * if necessary | ||
423 | */ | ||
424 | unsigned long numa_next_scan; | ||
425 | |||
426 | /* numa_next_reset is when the PTE scanner period will be reset */ | ||
427 | unsigned long numa_next_reset; | ||
428 | |||
429 | /* Restart point for scanning and setting pte_numa */ | ||
430 | unsigned long numa_scan_offset; | ||
431 | |||
432 | /* numa_scan_seq prevents two threads setting pte_numa */ | ||
433 | int numa_scan_seq; | ||
434 | |||
435 | /* | ||
436 | * The first node a task was scheduled on. If a task runs on | ||
437 | * a different node than Make PTE Scan Go Now. | ||
438 | */ | ||
439 | int first_nid; | ||
440 | #endif | ||
414 | struct uprobes_state uprobes_state; | 441 | struct uprobes_state uprobes_state; |
415 | }; | 442 | }; |
416 | 443 | ||
444 | /* first nid will either be a valid NID or one of these values */ | ||
445 | #define NUMA_PTE_SCAN_INIT -1 | ||
446 | #define NUMA_PTE_SCAN_ACTIVE -2 | ||
447 | |||
417 | static inline void mm_init_cpumask(struct mm_struct *mm) | 448 | static inline void mm_init_cpumask(struct mm_struct *mm) |
418 | { | 449 | { |
419 | #ifdef CONFIG_CPUMASK_OFFSTACK | 450 | #ifdef CONFIG_CPUMASK_OFFSTACK |
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index cd55dad56aac..4bec5be82cab 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -735,6 +735,19 @@ typedef struct pglist_data { | |||
735 | struct task_struct *kswapd; /* Protected by lock_memory_hotplug() */ | 735 | struct task_struct *kswapd; /* Protected by lock_memory_hotplug() */ |
736 | int kswapd_max_order; | 736 | int kswapd_max_order; |
737 | enum zone_type classzone_idx; | 737 | enum zone_type classzone_idx; |
738 | #ifdef CONFIG_NUMA_BALANCING | ||
739 | /* | ||
740 | * Lock serializing the per destination node AutoNUMA memory | ||
741 | * migration rate limiting data. | ||
742 | */ | ||
743 | spinlock_t numabalancing_migrate_lock; | ||
744 | |||
745 | /* Rate limiting time interval */ | ||
746 | unsigned long numabalancing_migrate_next_window; | ||
747 | |||
748 | /* Number of pages migrated during the rate limiting time interval */ | ||
749 | unsigned long numabalancing_migrate_nr_pages; | ||
750 | #endif | ||
738 | } pg_data_t; | 751 | } pg_data_t; |
739 | 752 | ||
740 | #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) | 753 | #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) |
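The three pg_data_t fields above implement a simple rate-limiting window: when the window has expired the counters reset, otherwise migration is refused once the per-window page budget is spent (the spinlock only serializes the update on SMP and is left out here). A standalone sketch of that check follows; the window length and page cap are made-up values, since the real limits live elsewhere in the series, not in this hunk.

/* Sketch of per-node NUMA migration rate limiting, in wall-clock seconds. */
#include <stdbool.h>
#include <stdio.h>

struct node_ratelimit {
        unsigned long next_window;      /* when the current window expires */
        unsigned long nr_pages;         /* pages migrated within the window */
};

#define WINDOW_SECS     1UL
#define WINDOW_PAGES    1024UL          /* assumed cap, for illustration only */

static bool numa_migrate_allowed(struct node_ratelimit *rl, unsigned long now,
                                 unsigned long nr_pages)
{
        if (now > rl->next_window) {            /* window expired: start a new one */
                rl->next_window = now + WINDOW_SECS;
                rl->nr_pages = 0;
        }
        if (rl->nr_pages + nr_pages > WINDOW_PAGES)
                return false;                   /* rate limited: skip this migration */
        rl->nr_pages += nr_pages;
        return true;
}

int main(void)
{
        struct node_ratelimit rl = { 0, 0 };

        printf("first batch:  %d\n", numa_migrate_allowed(&rl, 10, 800));   /* 1 */
        printf("second batch: %d\n", numa_migrate_allowed(&rl, 10, 800));   /* 0 */
        printf("next window:  %d\n", numa_migrate_allowed(&rl, 12, 800));   /* 1 */
        return 0;
}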
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index bfe1f4780644..c20635c527a9 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -7,7 +7,7 @@ | |||
7 | #include <linux/list.h> | 7 | #include <linux/list.h> |
8 | #include <linux/slab.h> | 8 | #include <linux/slab.h> |
9 | #include <linux/mm.h> | 9 | #include <linux/mm.h> |
10 | #include <linux/mutex.h> | 10 | #include <linux/rwsem.h> |
11 | #include <linux/memcontrol.h> | 11 | #include <linux/memcontrol.h> |
12 | 12 | ||
13 | /* | 13 | /* |
@@ -25,8 +25,8 @@ | |||
25 | * pointing to this anon_vma once its vma list is empty. | 25 | * pointing to this anon_vma once its vma list is empty. |
26 | */ | 26 | */ |
27 | struct anon_vma { | 27 | struct anon_vma { |
28 | struct anon_vma *root; /* Root of this anon_vma tree */ | 28 | struct anon_vma *root; /* Root of this anon_vma tree */ |
29 | struct mutex mutex; /* Serialize access to vma list */ | 29 | struct rw_semaphore rwsem; /* W: modification, R: walking the list */ |
30 | /* | 30 | /* |
31 | * The refcount is taken on an anon_vma when there is no | 31 | * The refcount is taken on an anon_vma when there is no |
32 | * guarantee that the vma of page tables will exist for | 32 | * guarantee that the vma of page tables will exist for |
@@ -64,7 +64,7 @@ struct anon_vma_chain { | |||
64 | struct vm_area_struct *vma; | 64 | struct vm_area_struct *vma; |
65 | struct anon_vma *anon_vma; | 65 | struct anon_vma *anon_vma; |
66 | struct list_head same_vma; /* locked by mmap_sem & page_table_lock */ | 66 | struct list_head same_vma; /* locked by mmap_sem & page_table_lock */ |
67 | struct rb_node rb; /* locked by anon_vma->mutex */ | 67 | struct rb_node rb; /* locked by anon_vma->rwsem */ |
68 | unsigned long rb_subtree_last; | 68 | unsigned long rb_subtree_last; |
69 | #ifdef CONFIG_DEBUG_VM_RB | 69 | #ifdef CONFIG_DEBUG_VM_RB |
70 | unsigned long cached_vma_start, cached_vma_last; | 70 | unsigned long cached_vma_start, cached_vma_last; |
@@ -108,26 +108,37 @@ static inline void vma_lock_anon_vma(struct vm_area_struct *vma) | |||
108 | { | 108 | { |
109 | struct anon_vma *anon_vma = vma->anon_vma; | 109 | struct anon_vma *anon_vma = vma->anon_vma; |
110 | if (anon_vma) | 110 | if (anon_vma) |
111 | mutex_lock(&anon_vma->root->mutex); | 111 | down_write(&anon_vma->root->rwsem); |
112 | } | 112 | } |
113 | 113 | ||
114 | static inline void vma_unlock_anon_vma(struct vm_area_struct *vma) | 114 | static inline void vma_unlock_anon_vma(struct vm_area_struct *vma) |
115 | { | 115 | { |
116 | struct anon_vma *anon_vma = vma->anon_vma; | 116 | struct anon_vma *anon_vma = vma->anon_vma; |
117 | if (anon_vma) | 117 | if (anon_vma) |
118 | mutex_unlock(&anon_vma->root->mutex); | 118 | up_write(&anon_vma->root->rwsem); |
119 | } | 119 | } |
120 | 120 | ||
121 | static inline void anon_vma_lock(struct anon_vma *anon_vma) | 121 | static inline void anon_vma_lock_write(struct anon_vma *anon_vma) |
122 | { | 122 | { |
123 | mutex_lock(&anon_vma->root->mutex); | 123 | down_write(&anon_vma->root->rwsem); |
124 | } | 124 | } |
125 | 125 | ||
126 | static inline void anon_vma_unlock(struct anon_vma *anon_vma) | 126 | static inline void anon_vma_unlock(struct anon_vma *anon_vma) |
127 | { | 127 | { |
128 | mutex_unlock(&anon_vma->root->mutex); | 128 | up_write(&anon_vma->root->rwsem); |
129 | } | 129 | } |
130 | 130 | ||
131 | static inline void anon_vma_lock_read(struct anon_vma *anon_vma) | ||
132 | { | ||
133 | down_read(&anon_vma->root->rwsem); | ||
134 | } | ||
135 | |||
136 | static inline void anon_vma_unlock_read(struct anon_vma *anon_vma) | ||
137 | { | ||
138 | up_read(&anon_vma->root->rwsem); | ||
139 | } | ||
140 | |||
141 | |||
131 | /* | 142 | /* |
132 | * anon_vma helper functions. | 143 | * anon_vma helper functions. |
133 | */ | 144 | */ |
@@ -220,8 +231,8 @@ int try_to_munlock(struct page *); | |||
220 | /* | 231 | /* |
221 | * Called by memory-failure.c to kill processes. | 232 | * Called by memory-failure.c to kill processes. |
222 | */ | 233 | */ |
223 | struct anon_vma *page_lock_anon_vma(struct page *page); | 234 | struct anon_vma *page_lock_anon_vma_read(struct page *page); |
224 | void page_unlock_anon_vma(struct anon_vma *anon_vma); | 235 | void page_unlock_anon_vma_read(struct anon_vma *anon_vma); |
225 | int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); | 236 | int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); |
226 | 237 | ||
227 | /* | 238 | /* |
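Turning the anon_vma mutex into a rw_semaphore lets rmap walkers that only read the interval tree (the new page_lock_anon_vma_read() path) run in parallel, while inserting or unlinking VMAs still takes the lock exclusively. As a rough userspace analogy only, the same read/write split expressed with POSIX rwlocks (build with -pthread):

/* Analogy, not kernel code: readers share, writers exclude. */
#include <pthread.h>

struct anon_vma {
        pthread_rwlock_t rwsem;         /* W: modification, R: walking the list */
};

static void anon_vma_lock_write(struct anon_vma *av)
{
        pthread_rwlock_wrlock(&av->rwsem);      /* e.g. linking/unlinking VMAs */
}

static void anon_vma_lock_read(struct anon_vma *av)
{
        pthread_rwlock_rdlock(&av->rwsem);      /* e.g. an rmap walk */
}

static void anon_vma_unlock(struct anon_vma *av)
{
        pthread_rwlock_unlock(&av->rwsem);      /* same call for both modes */
}

int main(void)
{
        struct anon_vma av;

        pthread_rwlock_init(&av.rwsem, NULL);
        anon_vma_lock_read(&av);        /* many readers may hold this at once */
        anon_vma_unlock(&av);
        anon_vma_lock_write(&av);       /* a writer excludes everyone else */
        anon_vma_unlock(&av);
        pthread_rwlock_destroy(&av.rwsem);
        return 0;
}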
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2c2f3072beef..b089c92c609b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1527,6 +1527,14 @@ struct task_struct { | |||
1527 | short il_next; | 1527 | short il_next; |
1528 | short pref_node_fork; | 1528 | short pref_node_fork; |
1529 | #endif | 1529 | #endif |
1530 | #ifdef CONFIG_NUMA_BALANCING | ||
1531 | int numa_scan_seq; | ||
1532 | int numa_migrate_seq; | ||
1533 | unsigned int numa_scan_period; | ||
1534 | u64 node_stamp; /* migration stamp */ | ||
1535 | struct callback_head numa_work; | ||
1536 | #endif /* CONFIG_NUMA_BALANCING */ | ||
1537 | |||
1530 | struct rcu_head rcu; | 1538 | struct rcu_head rcu; |
1531 | 1539 | ||
1532 | /* | 1540 | /* |
@@ -1601,6 +1609,18 @@ struct task_struct { | |||
1601 | /* Future-safe accessor for struct task_struct's cpus_allowed. */ | 1609 | /* Future-safe accessor for struct task_struct's cpus_allowed. */ |
1602 | #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) | 1610 | #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) |
1603 | 1611 | ||
1612 | #ifdef CONFIG_NUMA_BALANCING | ||
1613 | extern void task_numa_fault(int node, int pages, bool migrated); | ||
1614 | extern void set_numabalancing_state(bool enabled); | ||
1615 | #else | ||
1616 | static inline void task_numa_fault(int node, int pages, bool migrated) | ||
1617 | { | ||
1618 | } | ||
1619 | static inline void set_numabalancing_state(bool enabled) | ||
1620 | { | ||
1621 | } | ||
1622 | #endif | ||
1623 | |||
1604 | /* | 1624 | /* |
1605 | * Priority of a process goes from 0..MAX_PRIO-1, valid RT | 1625 | * Priority of a process goes from 0..MAX_PRIO-1, valid RT |
1606 | * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH | 1626 | * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH |
@@ -2030,6 +2050,13 @@ enum sched_tunable_scaling { | |||
2030 | }; | 2050 | }; |
2031 | extern enum sched_tunable_scaling sysctl_sched_tunable_scaling; | 2051 | extern enum sched_tunable_scaling sysctl_sched_tunable_scaling; |
2032 | 2052 | ||
2053 | extern unsigned int sysctl_numa_balancing_scan_delay; | ||
2054 | extern unsigned int sysctl_numa_balancing_scan_period_min; | ||
2055 | extern unsigned int sysctl_numa_balancing_scan_period_max; | ||
2056 | extern unsigned int sysctl_numa_balancing_scan_period_reset; | ||
2057 | extern unsigned int sysctl_numa_balancing_scan_size; | ||
2058 | extern unsigned int sysctl_numa_balancing_settle_count; | ||
2059 | |||
2033 | #ifdef CONFIG_SCHED_DEBUG | 2060 | #ifdef CONFIG_SCHED_DEBUG |
2034 | extern unsigned int sysctl_sched_migration_cost; | 2061 | extern unsigned int sysctl_sched_migration_cost; |
2035 | extern unsigned int sysctl_sched_nr_migrate; | 2062 | extern unsigned int sysctl_sched_nr_migrate; |
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index fe786f07d2bd..fce0a2799d43 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -38,8 +38,18 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, | |||
38 | KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY, | 38 | KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY, |
39 | KSWAPD_SKIP_CONGESTION_WAIT, | 39 | KSWAPD_SKIP_CONGESTION_WAIT, |
40 | PAGEOUTRUN, ALLOCSTALL, PGROTATED, | 40 | PAGEOUTRUN, ALLOCSTALL, PGROTATED, |
41 | #ifdef CONFIG_NUMA_BALANCING | ||
42 | NUMA_PTE_UPDATES, | ||
43 | NUMA_HINT_FAULTS, | ||
44 | NUMA_HINT_FAULTS_LOCAL, | ||
45 | NUMA_PAGE_MIGRATE, | ||
46 | #endif | ||
47 | #ifdef CONFIG_MIGRATION | ||
48 | PGMIGRATE_SUCCESS, PGMIGRATE_FAIL, | ||
49 | #endif | ||
41 | #ifdef CONFIG_COMPACTION | 50 | #ifdef CONFIG_COMPACTION |
42 | COMPACTBLOCKS, COMPACTPAGES, COMPACTPAGEFAILED, | 51 | COMPACTMIGRATE_SCANNED, COMPACTFREE_SCANNED, |
52 | COMPACTISOLATED, | ||
43 | COMPACTSTALL, COMPACTFAIL, COMPACTSUCCESS, | 53 | COMPACTSTALL, COMPACTFAIL, COMPACTSUCCESS, |
44 | #endif | 54 | #endif |
45 | #ifdef CONFIG_HUGETLB_PAGE | 55 | #ifdef CONFIG_HUGETLB_PAGE |
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 92a86b2cce33..a13291f7da88 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -80,6 +80,14 @@ static inline void vm_events_fold_cpu(int cpu) | |||
80 | 80 | ||
81 | #endif /* CONFIG_VM_EVENT_COUNTERS */ | 81 | #endif /* CONFIG_VM_EVENT_COUNTERS */ |
82 | 82 | ||
83 | #ifdef CONFIG_NUMA_BALANCING | ||
84 | #define count_vm_numa_event(x) count_vm_event(x) | ||
85 | #define count_vm_numa_events(x, y) count_vm_events(x, y) | ||
86 | #else | ||
87 | #define count_vm_numa_event(x) do {} while (0) | ||
88 | #define count_vm_numa_events(x, y) do {} while (0) | ||
89 | #endif /* CONFIG_NUMA_BALANCING */ | ||
90 | |||
83 | #define __count_zone_vm_events(item, zone, delta) \ | 91 | #define __count_zone_vm_events(item, zone, delta) \ |
84 | __count_vm_events(item##_NORMAL - ZONE_NORMAL + \ | 92 | __count_vm_events(item##_NORMAL - ZONE_NORMAL + \ |
85 | zone_idx(zone), delta) | 93 | zone_idx(zone), delta) |
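count_vm_numa_event()/count_vm_numa_events() compile away to empty statements when NUMA_BALANCING is off, so call sites in mm/ need no #ifdefs of their own. A self-contained illustration of that compile-out pattern; the array-backed counters are stand-ins, not the kernel's per-cpu vm_event_states.

/* Demonstration of the conditional no-op counter macros. */
#include <stdio.h>

#define CONFIG_NUMA_BALANCING 1

enum { NUMA_HINT_FAULTS, NUMA_PAGE_MIGRATE, NR_EVENTS };
static unsigned long events[NR_EVENTS];

#if CONFIG_NUMA_BALANCING
# define count_vm_numa_event(x)         (events[x] += 1)
# define count_vm_numa_events(x, y)     (events[x] += (y))
#else
# define count_vm_numa_event(x)         do { } while (0)
# define count_vm_numa_events(x, y)     do { } while (0)
#endif

int main(void)
{
        count_vm_numa_event(NUMA_HINT_FAULTS);
        count_vm_numa_events(NUMA_PAGE_MIGRATE, 8);
        printf("faults=%lu migrated=%lu\n",
               events[NUMA_HINT_FAULTS], events[NUMA_PAGE_MIGRATE]);
        return 0;
}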
diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h
new file mode 100644
index 000000000000..ec2a6ccfd7e5
--- /dev/null
+++ b/include/trace/events/migrate.h
@@ -0,0 +1,51 @@ | |||
1 | #undef TRACE_SYSTEM | ||
2 | #define TRACE_SYSTEM migrate | ||
3 | |||
4 | #if !defined(_TRACE_MIGRATE_H) || defined(TRACE_HEADER_MULTI_READ) | ||
5 | #define _TRACE_MIGRATE_H | ||
6 | |||
7 | #define MIGRATE_MODE \ | ||
8 | {MIGRATE_ASYNC, "MIGRATE_ASYNC"}, \ | ||
9 | {MIGRATE_SYNC_LIGHT, "MIGRATE_SYNC_LIGHT"}, \ | ||
10 | {MIGRATE_SYNC, "MIGRATE_SYNC"} | ||
11 | |||
12 | #define MIGRATE_REASON \ | ||
13 | {MR_COMPACTION, "compaction"}, \ | ||
14 | {MR_MEMORY_FAILURE, "memory_failure"}, \ | ||
15 | {MR_MEMORY_HOTPLUG, "memory_hotplug"}, \ | ||
16 | {MR_SYSCALL, "syscall_or_cpuset"}, \ | ||
17 | {MR_MEMPOLICY_MBIND, "mempolicy_mbind"}, \ | ||
18 | {MR_CMA, "cma"} | ||
19 | |||
20 | TRACE_EVENT(mm_migrate_pages, | ||
21 | |||
22 | TP_PROTO(unsigned long succeeded, unsigned long failed, | ||
23 | enum migrate_mode mode, int reason), | ||
24 | |||
25 | TP_ARGS(succeeded, failed, mode, reason), | ||
26 | |||
27 | TP_STRUCT__entry( | ||
28 | __field( unsigned long, succeeded) | ||
29 | __field( unsigned long, failed) | ||
30 | __field( enum migrate_mode, mode) | ||
31 | __field( int, reason) | ||
32 | ), | ||
33 | |||
34 | TP_fast_assign( | ||
35 | __entry->succeeded = succeeded; | ||
36 | __entry->failed = failed; | ||
37 | __entry->mode = mode; | ||
38 | __entry->reason = reason; | ||
39 | ), | ||
40 | |||
41 | TP_printk("nr_succeeded=%lu nr_failed=%lu mode=%s reason=%s", | ||
42 | __entry->succeeded, | ||
43 | __entry->failed, | ||
44 | __print_symbolic(__entry->mode, MIGRATE_MODE), | ||
45 | __print_symbolic(__entry->reason, MIGRATE_REASON)) | ||
46 | ); | ||
47 | |||
48 | #endif /* _TRACE_MIGRATE_H */ | ||
49 | |||
50 | /* This part must be outside protection */ | ||
51 | #include <trace/define_trace.h> | ||
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 23e62e0537e2..0d11c3dcd3a1 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -20,6 +20,7 @@ enum { | |||
20 | MPOL_PREFERRED, | 20 | MPOL_PREFERRED, |
21 | MPOL_BIND, | 21 | MPOL_BIND, |
22 | MPOL_INTERLEAVE, | 22 | MPOL_INTERLEAVE, |
23 | MPOL_LOCAL, | ||
23 | MPOL_MAX, /* always last member of enum */ | 24 | MPOL_MAX, /* always last member of enum */ |
24 | }; | 25 | }; |
25 | 26 | ||
@@ -47,9 +48,15 @@ enum mpol_rebind_step { | |||
47 | 48 | ||
48 | /* Flags for mbind */ | 49 | /* Flags for mbind */ |
49 | #define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */ | 50 | #define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */ |
50 | #define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform to mapping */ | 51 | #define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform |
51 | #define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to mapping */ | 52 | to policy */ |
52 | #define MPOL_MF_INTERNAL (1<<3) /* Internal flags start here */ | 53 | #define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to policy */ |
54 | #define MPOL_MF_LAZY (1<<3) /* Modifies '_MOVE: lazy migrate on fault */ | ||
55 | #define MPOL_MF_INTERNAL (1<<4) /* Internal flags start here */ | ||
56 | |||
57 | #define MPOL_MF_VALID (MPOL_MF_STRICT | \ | ||
58 | MPOL_MF_MOVE | \ | ||
59 | MPOL_MF_MOVE_ALL) | ||
53 | 60 | ||
54 | /* | 61 | /* |
55 | * Internal flags that share the struct mempolicy flags word with | 62 | * Internal flags that share the struct mempolicy flags word with |
@@ -59,6 +66,8 @@ enum mpol_rebind_step { | |||
59 | #define MPOL_F_SHARED (1 << 0) /* identify shared policies */ | 66 | #define MPOL_F_SHARED (1 << 0) /* identify shared policies */ |
60 | #define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */ | 67 | #define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */ |
61 | #define MPOL_F_REBINDING (1 << 2) /* identify policies in rebinding */ | 68 | #define MPOL_F_REBINDING (1 << 2) /* identify policies in rebinding */ |
69 | #define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */ | ||
70 | #define MPOL_F_MORON (1 << 4) /* Migrate On pte_numa Reference On Node */ | ||
62 | 71 | ||
63 | 72 | ||
64 | #endif /* _UAPI_LINUX_MEMPOLICY_H */ | 73 | #endif /* _UAPI_LINUX_MEMPOLICY_H */ |
diff --git a/init/Kconfig b/init/Kconfig
index 2054e048bb98..1a207efca591 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -717,6 +717,50 @@ config LOG_BUF_SHIFT | |||
717 | config HAVE_UNSTABLE_SCHED_CLOCK | 717 | config HAVE_UNSTABLE_SCHED_CLOCK |
718 | bool | 718 | bool |
719 | 719 | ||
720 | # | ||
721 | # For architectures that want to enable the support for NUMA-affine scheduler | ||
722 | # balancing logic: | ||
723 | # | ||
724 | config ARCH_SUPPORTS_NUMA_BALANCING | ||
725 | bool | ||
726 | |||
727 | # For architectures that (ab)use NUMA to represent different memory regions | ||
728 | # all cpu-local but of different latencies, such as SuperH. | ||
729 | # | ||
730 | config ARCH_WANT_NUMA_VARIABLE_LOCALITY | ||
731 | bool | ||
732 | |||
733 | # | ||
734 | # For architectures that are willing to define _PAGE_NUMA as _PAGE_PROTNONE | ||
735 | config ARCH_WANTS_PROT_NUMA_PROT_NONE | ||
736 | bool | ||
737 | |||
738 | config ARCH_USES_NUMA_PROT_NONE | ||
739 | bool | ||
740 | default y | ||
741 | depends on ARCH_WANTS_PROT_NUMA_PROT_NONE | ||
742 | depends on NUMA_BALANCING | ||
743 | |||
744 | config NUMA_BALANCING_DEFAULT_ENABLED | ||
745 | bool "Automatically enable NUMA aware memory/task placement" | ||
746 | default y | ||
747 | depends on NUMA_BALANCING | ||
748 | help | ||
749 | If set, automatic NUMA balancing will be enabled if running on a NUMA | ||
750 | machine. | ||
751 | |||
752 | config NUMA_BALANCING | ||
753 | bool "Memory placement aware NUMA scheduler" | ||
754 | depends on ARCH_SUPPORTS_NUMA_BALANCING | ||
755 | depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY | ||
756 | depends on SMP && NUMA && MIGRATION | ||
757 | help | ||
758 | This option adds support for automatic NUMA aware memory/task placement. | ||
759 | The mechanism is quite primitive and is based on migrating memory, | ||
760 | when it is referenced, to the node the task is running on. | ||
761 | |||
762 | This system will be inactive on UMA systems. | ||
763 | |||
720 | menuconfig CGROUPS | 764 | menuconfig CGROUPS |
721 | boolean "Control Group support" | 765 | boolean "Control Group support" |
722 | depends on EVENTFD | 766 | depends on EVENTFD |
diff --git a/kernel/fork.c b/kernel/fork.c
index 3c31e874afad..115d6c2e4cca 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -823,6 +823,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk) | |||
823 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 823 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
824 | mm->pmd_huge_pte = NULL; | 824 | mm->pmd_huge_pte = NULL; |
825 | #endif | 825 | #endif |
826 | #ifdef CONFIG_NUMA_BALANCING | ||
827 | mm->first_nid = NUMA_PTE_SCAN_INIT; | ||
828 | #endif | ||
826 | if (!mm_init(mm, tsk)) | 829 | if (!mm_init(mm, tsk)) |
827 | goto fail_nomem; | 830 | goto fail_nomem; |
828 | 831 | ||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0533496b6228..c1fb82104bfb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -193,23 +193,10 @@ static void sched_feat_disable(int i) { }; | |||
193 | static void sched_feat_enable(int i) { }; | 193 | static void sched_feat_enable(int i) { }; |
194 | #endif /* HAVE_JUMP_LABEL */ | 194 | #endif /* HAVE_JUMP_LABEL */ |
195 | 195 | ||
196 | static ssize_t | 196 | static int sched_feat_set(char *cmp) |
197 | sched_feat_write(struct file *filp, const char __user *ubuf, | ||
198 | size_t cnt, loff_t *ppos) | ||
199 | { | 197 | { |
200 | char buf[64]; | ||
201 | char *cmp; | ||
202 | int neg = 0; | ||
203 | int i; | 198 | int i; |
204 | 199 | int neg = 0; | |
205 | if (cnt > 63) | ||
206 | cnt = 63; | ||
207 | |||
208 | if (copy_from_user(&buf, ubuf, cnt)) | ||
209 | return -EFAULT; | ||
210 | |||
211 | buf[cnt] = 0; | ||
212 | cmp = strstrip(buf); | ||
213 | 200 | ||
214 | if (strncmp(cmp, "NO_", 3) == 0) { | 201 | if (strncmp(cmp, "NO_", 3) == 0) { |
215 | neg = 1; | 202 | neg = 1; |
@@ -229,6 +216,27 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
229 | } | 216 | } |
230 | } | 217 | } |
231 | 218 | ||
219 | return i; | ||
220 | } | ||
221 | |||
222 | static ssize_t | ||
223 | sched_feat_write(struct file *filp, const char __user *ubuf, | ||
224 | size_t cnt, loff_t *ppos) | ||
225 | { | ||
226 | char buf[64]; | ||
227 | char *cmp; | ||
228 | int i; | ||
229 | |||
230 | if (cnt > 63) | ||
231 | cnt = 63; | ||
232 | |||
233 | if (copy_from_user(&buf, ubuf, cnt)) | ||
234 | return -EFAULT; | ||
235 | |||
236 | buf[cnt] = 0; | ||
237 | cmp = strstrip(buf); | ||
238 | |||
239 | i = sched_feat_set(cmp); | ||
232 | if (i == __SCHED_FEAT_NR) | 240 | if (i == __SCHED_FEAT_NR) |
233 | return -EINVAL; | 241 | return -EINVAL; |
234 | 242 | ||
@@ -1560,7 +1568,40 @@ static void __sched_fork(struct task_struct *p) | |||
1560 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 1568 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
1561 | INIT_HLIST_HEAD(&p->preempt_notifiers); | 1569 | INIT_HLIST_HEAD(&p->preempt_notifiers); |
1562 | #endif | 1570 | #endif |
1571 | |||
1572 | #ifdef CONFIG_NUMA_BALANCING | ||
1573 | if (p->mm && atomic_read(&p->mm->mm_users) == 1) { | ||
1574 | p->mm->numa_next_scan = jiffies; | ||
1575 | p->mm->numa_next_reset = jiffies; | ||
1576 | p->mm->numa_scan_seq = 0; | ||
1577 | } | ||
1578 | |||
1579 | p->node_stamp = 0ULL; | ||
1580 | p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; | ||
1581 | p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0; | ||
1582 | p->numa_scan_period = sysctl_numa_balancing_scan_delay; | ||
1583 | p->numa_work.next = &p->numa_work; | ||
1584 | #endif /* CONFIG_NUMA_BALANCING */ | ||
1585 | } | ||
1586 | |||
1587 | #ifdef CONFIG_NUMA_BALANCING | ||
1588 | #ifdef CONFIG_SCHED_DEBUG | ||
1589 | void set_numabalancing_state(bool enabled) | ||
1590 | { | ||
1591 | if (enabled) | ||
1592 | sched_feat_set("NUMA"); | ||
1593 | else | ||
1594 | sched_feat_set("NO_NUMA"); | ||
1595 | } | ||
1596 | #else | ||
1597 | __read_mostly bool numabalancing_enabled; | ||
1598 | |||
1599 | void set_numabalancing_state(bool enabled) | ||
1600 | { | ||
1601 | numabalancing_enabled = enabled; | ||
1563 | } | 1602 | } |
1603 | #endif /* CONFIG_SCHED_DEBUG */ | ||
1604 | #endif /* CONFIG_NUMA_BALANCING */ | ||
1564 | 1605 | ||
1565 | /* | 1606 | /* |
1566 | * fork()/clone()-time setup: | 1607 | * fork()/clone()-time setup: |
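Factoring the body of sched_feat_write() into sched_feat_set() is what allows set_numabalancing_state() to flip the NUMA feature from inside the kernel using the same "NO_" prefix convention as the debugfs file. A simplified, standalone sketch of that convention; the two-entry feature table and the return value handling are illustrative only.

/* Sketch of the "NO_<FEATURE>" toggle convention. */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define NR_FEATS 2
static const char * const feat_names[NR_FEATS] = { "NUMA", "NUMA_FORCE" };
static bool feat_enabled[NR_FEATS];

static int sched_feat_set(const char *cmp)
{
        int neg = 0, i;

        if (strncmp(cmp, "NO_", 3) == 0) {      /* "NO_FOO" disables FOO */
                neg = 1;
                cmp += 3;
        }
        for (i = 0; i < NR_FEATS; i++) {
                if (strcmp(cmp, feat_names[i]) == 0) {
                        feat_enabled[i] = !neg;
                        break;
                }
        }
        return i;                               /* == NR_FEATS: no such feature */
}

int main(void)
{
        sched_feat_set("NUMA");         /* roughly set_numabalancing_state(true) */
        sched_feat_set("NO_NUMA");      /* ...and what false maps to */
        printf("NUMA=%d bogus=%s\n", feat_enabled[0],
               sched_feat_set("BOGUS") == NR_FEATS ? "rejected" : "accepted");
        return 0;
}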
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 756f9f9e8542..9af5af979a13 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -26,6 +26,9 @@ | |||
26 | #include <linux/slab.h> | 26 | #include <linux/slab.h> |
27 | #include <linux/profile.h> | 27 | #include <linux/profile.h> |
28 | #include <linux/interrupt.h> | 28 | #include <linux/interrupt.h> |
29 | #include <linux/mempolicy.h> | ||
30 | #include <linux/migrate.h> | ||
31 | #include <linux/task_work.h> | ||
29 | 32 | ||
30 | #include <trace/events/sched.h> | 33 | #include <trace/events/sched.h> |
31 | 34 | ||
@@ -774,6 +777,227 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
774 | * Scheduling class queueing methods: | 777 | * Scheduling class queueing methods: |
775 | */ | 778 | */ |
776 | 779 | ||
780 | #ifdef CONFIG_NUMA_BALANCING | ||
781 | /* | ||
782 | * numa task sample period in ms | ||
783 | */ | ||
784 | unsigned int sysctl_numa_balancing_scan_period_min = 100; | ||
785 | unsigned int sysctl_numa_balancing_scan_period_max = 100*50; | ||
786 | unsigned int sysctl_numa_balancing_scan_period_reset = 100*600; | ||
787 | |||
788 | /* Portion of address space to scan in MB */ | ||
789 | unsigned int sysctl_numa_balancing_scan_size = 256; | ||
790 | |||
791 | /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ | ||
792 | unsigned int sysctl_numa_balancing_scan_delay = 1000; | ||
793 | |||
794 | static void task_numa_placement(struct task_struct *p) | ||
795 | { | ||
796 | int seq = ACCESS_ONCE(p->mm->numa_scan_seq); | ||
797 | |||
798 | if (p->numa_scan_seq == seq) | ||
799 | return; | ||
800 | p->numa_scan_seq = seq; | ||
801 | |||
802 | /* FIXME: Scheduling placement policy hints go here */ | ||
803 | } | ||
804 | |||
805 | /* | ||
806 | * Got a PROT_NONE fault for a page on @node. | ||
807 | */ | ||
808 | void task_numa_fault(int node, int pages, bool migrated) | ||
809 | { | ||
810 | struct task_struct *p = current; | ||
811 | |||
812 | if (!sched_feat_numa(NUMA)) | ||
813 | return; | ||
814 | |||
815 | /* FIXME: Allocate task-specific structure for placement policy here */ | ||
816 | |||
817 | /* | ||
818 | * If pages are properly placed (did not migrate) then scan slower. | ||
819 | * This is reset periodically in case of phase changes | ||
820 | */ | ||
821 | if (!migrated) | ||
822 | p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, | ||
823 | p->numa_scan_period + jiffies_to_msecs(10)); | ||
824 | |||
825 | task_numa_placement(p); | ||
826 | } | ||
827 | |||
828 | static void reset_ptenuma_scan(struct task_struct *p) | ||
829 | { | ||
830 | ACCESS_ONCE(p->mm->numa_scan_seq)++; | ||
831 | p->mm->numa_scan_offset = 0; | ||
832 | } | ||
833 | |||
834 | /* | ||
835 | * The expensive part of numa migration is done from task_work context. | ||
836 | * Triggered from task_tick_numa(). | ||
837 | */ | ||
838 | void task_numa_work(struct callback_head *work) | ||
839 | { | ||
840 | unsigned long migrate, next_scan, now = jiffies; | ||
841 | struct task_struct *p = current; | ||
842 | struct mm_struct *mm = p->mm; | ||
843 | struct vm_area_struct *vma; | ||
844 | unsigned long start, end; | ||
845 | long pages; | ||
846 | |||
847 | WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); | ||
848 | |||
849 | work->next = work; /* protect against double add */ | ||
850 | /* | ||
851 | * Who cares about NUMA placement when they're dying. | ||
852 | * | ||
853 | * NOTE: make sure not to dereference p->mm before this check, | ||
854 | * exit_task_work() happens _after_ exit_mm() so we could be called | ||
855 | * without p->mm even though we still had it when we enqueued this | ||
856 | * work. | ||
857 | */ | ||
858 | if (p->flags & PF_EXITING) | ||
859 | return; | ||
860 | |||
861 | /* | ||
862 | * We do not care about task placement until a task runs on a node | ||
863 | * other than the first one used by the address space. This is | ||
864 | * largely because migrations are driven by what CPU the task | ||
865 | * is running on. If it's never scheduled on another node, it'll | ||
866 | * not migrate so why bother trapping the fault. | ||
867 | */ | ||
868 | if (mm->first_nid == NUMA_PTE_SCAN_INIT) | ||
869 | mm->first_nid = numa_node_id(); | ||
870 | if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) { | ||
871 | /* Are we running on a new node yet? */ | ||
872 | if (numa_node_id() == mm->first_nid && | ||
873 | !sched_feat_numa(NUMA_FORCE)) | ||
874 | return; | ||
875 | |||
876 | mm->first_nid = NUMA_PTE_SCAN_ACTIVE; | ||
877 | } | ||
878 | |||
879 | /* | ||
880 | * Reset the scan period if enough time has gone by. Objective is that | ||
881 | * scanning will be reduced if pages are properly placed. As tasks | ||
882 | * can enter different phases this needs to be re-examined. Lacking | ||
883 | * proper tracking of reference behaviour, this blunt hammer is used. | ||
884 | */ | ||
885 | migrate = mm->numa_next_reset; | ||
886 | if (time_after(now, migrate)) { | ||
887 | p->numa_scan_period = sysctl_numa_balancing_scan_period_min; | ||
888 | next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset); | ||
889 | xchg(&mm->numa_next_reset, next_scan); | ||
890 | } | ||
891 | |||
892 | /* | ||
893 | * Enforce maximal scan/migration frequency.. | ||
894 | */ | ||
895 | migrate = mm->numa_next_scan; | ||
896 | if (time_before(now, migrate)) | ||
897 | return; | ||
898 | |||
899 | if (p->numa_scan_period == 0) | ||
900 | p->numa_scan_period = sysctl_numa_balancing_scan_period_min; | ||
901 | |||
902 | next_scan = now + msecs_to_jiffies(p->numa_scan_period); | ||
903 | if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) | ||
904 | return; | ||
905 | |||
906 | /* | ||
907 | * Do not set pte_numa if the current running node is rate-limited. | ||
908 | * This loses statistics on the fault but if we are unwilling to | ||
909 | * migrate to this node, it is less likely we can do useful work | ||
910 | */ | ||
911 | if (migrate_ratelimited(numa_node_id())) | ||
912 | return; | ||
913 | |||
914 | start = mm->numa_scan_offset; | ||
915 | pages = sysctl_numa_balancing_scan_size; | ||
916 | pages <<= 20 - PAGE_SHIFT; /* MB in pages */ | ||
917 | if (!pages) | ||
918 | return; | ||
919 | |||
920 | down_read(&mm->mmap_sem); | ||
921 | vma = find_vma(mm, start); | ||
922 | if (!vma) { | ||
923 | reset_ptenuma_scan(p); | ||
924 | start = 0; | ||
925 | vma = mm->mmap; | ||
926 | } | ||
927 | for (; vma; vma = vma->vm_next) { | ||
928 | if (!vma_migratable(vma)) | ||
929 | continue; | ||
930 | |||
931 | /* Skip small VMAs. They are not likely to be of relevance */ | ||
932 | if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) < HPAGE_PMD_NR) | ||
933 | continue; | ||
934 | |||
935 | do { | ||
936 | start = max(start, vma->vm_start); | ||
937 | end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); | ||
938 | end = min(end, vma->vm_end); | ||
939 | pages -= change_prot_numa(vma, start, end); | ||
940 | |||
941 | start = end; | ||
942 | if (pages <= 0) | ||
943 | goto out; | ||
944 | } while (end != vma->vm_end); | ||
945 | } | ||
946 | |||
947 | out: | ||
948 | /* | ||
949 | * It is possible to reach the end of the VMA list but the last few VMAs are | ||
950 | * not guaranteed to be vma_migratable. If they are not, we would find the | ||
951 | * !migratable VMA on the next scan but not reset the scanner to the start | ||
952 | * so check it now. | ||
953 | */ | ||
954 | if (vma) | ||
955 | mm->numa_scan_offset = start; | ||
956 | else | ||
957 | reset_ptenuma_scan(p); | ||
958 | up_read(&mm->mmap_sem); | ||
959 | } | ||
960 | |||
961 | /* | ||
962 | * Drive the periodic memory faults. | ||
963 | */ | ||
964 | void task_tick_numa(struct rq *rq, struct task_struct *curr) | ||
965 | { | ||
966 | struct callback_head *work = &curr->numa_work; | ||
967 | u64 period, now; | ||
968 | |||
969 | /* | ||
970 | * We don't care about NUMA placement if we don't have memory. | ||
971 | */ | ||
972 | if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work) | ||
973 | return; | ||
974 | |||
975 | /* | ||
976 | * Using runtime rather than walltime has the dual advantage that | ||
977 | * we (mostly) drive the selection from busy threads and that the | ||
978 | * task needs to have done some actual work before we bother with | ||
979 | * NUMA placement. | ||
980 | */ | ||
981 | now = curr->se.sum_exec_runtime; | ||
982 | period = (u64)curr->numa_scan_period * NSEC_PER_MSEC; | ||
983 | |||
984 | if (now - curr->node_stamp > period) { | ||
985 | if (!curr->node_stamp) | ||
986 | curr->numa_scan_period = sysctl_numa_balancing_scan_period_min; | ||
987 | curr->node_stamp = now; | ||
988 | |||
989 | if (!time_before(jiffies, curr->mm->numa_next_scan)) { | ||
990 | init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ | ||
991 | task_work_add(curr, work, true); | ||
992 | } | ||
993 | } | ||
994 | } | ||
995 | #else | ||
996 | static void task_tick_numa(struct rq *rq, struct task_struct *curr) | ||
997 | { | ||
998 | } | ||
999 | #endif /* CONFIG_NUMA_BALANCING */ | ||
1000 | |||
777 | static void | 1001 | static void |
778 | account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 1002 | account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
779 | { | 1003 | { |
@@ -5501,6 +5725,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) | |||
5501 | entity_tick(cfs_rq, se, queued); | 5725 | entity_tick(cfs_rq, se, queued); |
5502 | } | 5726 | } |
5503 | 5727 | ||
5728 | if (sched_feat_numa(NUMA)) | ||
5729 | task_tick_numa(rq, curr); | ||
5730 | |||
5504 | update_rq_runnable_avg(rq, 1); | 5731 | update_rq_runnable_avg(rq, 1); |
5505 | } | 5732 | } |
5506 | 5733 | ||
diff --git a/kernel/sched/features.h b/kernel/sched/features.h index e68e69ab917d..1ad1d2b5395f 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h | |||
@@ -66,3 +66,14 @@ SCHED_FEAT(TTWU_QUEUE, true) | |||
66 | SCHED_FEAT(FORCE_SD_OVERLAP, false) | 66 | SCHED_FEAT(FORCE_SD_OVERLAP, false) |
67 | SCHED_FEAT(RT_RUNTIME_SHARE, true) | 67 | SCHED_FEAT(RT_RUNTIME_SHARE, true) |
68 | SCHED_FEAT(LB_MIN, false) | 68 | SCHED_FEAT(LB_MIN, false) |
69 | |||
70 | /* | ||
71 | * Apply the automatic NUMA scheduling policy. Enabled automatically | ||
72 | * at runtime if running on a NUMA machine. Can be controlled via | ||
73 | * numa_balancing=. Allow PTE scanning to be forced on UMA machines | ||
74 | * for debugging the core machinery. | ||
75 | */ | ||
76 | #ifdef CONFIG_NUMA_BALANCING | ||
77 | SCHED_FEAT(NUMA, false) | ||
78 | SCHED_FEAT(NUMA_FORCE, false) | ||
79 | #endif | ||
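With CONFIG_SCHED_DEBUG these are ordinary scheduler feature bits, so (assuming the usual debugfs sched_features interface) the core machinery can be exercised on a single-node test box with something like 'echo NUMA_FORCE > /sys/kernel/debug/sched_features'; the NUMA bit itself defaults to false here and, per the comment, is expected to be switched on at runtime when the kernel finds itself on a real NUMA machine.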
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5eca173b563f..fc886441436a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -663,6 +663,18 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; | |||
663 | #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) | 663 | #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) |
664 | #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ | 664 | #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ |
665 | 665 | ||
666 | #ifdef CONFIG_NUMA_BALANCING | ||
667 | #define sched_feat_numa(x) sched_feat(x) | ||
668 | #ifdef CONFIG_SCHED_DEBUG | ||
669 | #define numabalancing_enabled sched_feat_numa(NUMA) | ||
670 | #else | ||
671 | extern bool numabalancing_enabled; | ||
672 | #endif /* CONFIG_SCHED_DEBUG */ | ||
673 | #else | ||
674 | #define sched_feat_numa(x) (0) | ||
675 | #define numabalancing_enabled (0) | ||
676 | #endif /* CONFIG_NUMA_BALANCING */ | ||
677 | |||
666 | static inline u64 global_rt_period(void) | 678 | static inline u64 global_rt_period(void) |
667 | { | 679 | { |
668 | return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; | 680 | return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 33f71f37267e..c88878db491e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -256,9 +256,11 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */ | |||
256 | static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ | 256 | static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ |
257 | static int min_wakeup_granularity_ns; /* 0 usecs */ | 257 | static int min_wakeup_granularity_ns; /* 0 usecs */ |
258 | static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ | 258 | static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ |
259 | #ifdef CONFIG_SMP | ||
259 | static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; | 260 | static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; |
260 | static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; | 261 | static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; |
261 | #endif | 262 | #endif /* CONFIG_SMP */ |
263 | #endif /* CONFIG_SCHED_DEBUG */ | ||
262 | 264 | ||
263 | #ifdef CONFIG_COMPACTION | 265 | #ifdef CONFIG_COMPACTION |
264 | static int min_extfrag_threshold; | 266 | static int min_extfrag_threshold; |
@@ -301,6 +303,7 @@ static struct ctl_table kern_table[] = { | |||
301 | .extra1 = &min_wakeup_granularity_ns, | 303 | .extra1 = &min_wakeup_granularity_ns, |
302 | .extra2 = &max_wakeup_granularity_ns, | 304 | .extra2 = &max_wakeup_granularity_ns, |
303 | }, | 305 | }, |
306 | #ifdef CONFIG_SMP | ||
304 | { | 307 | { |
305 | .procname = "sched_tunable_scaling", | 308 | .procname = "sched_tunable_scaling", |
306 | .data = &sysctl_sched_tunable_scaling, | 309 | .data = &sysctl_sched_tunable_scaling, |
@@ -347,7 +350,45 @@ static struct ctl_table kern_table[] = { | |||
347 | .extra1 = &zero, | 350 | .extra1 = &zero, |
348 | .extra2 = &one, | 351 | .extra2 = &one, |
349 | }, | 352 | }, |
350 | #endif | 353 | #endif /* CONFIG_SMP */ |
354 | #ifdef CONFIG_NUMA_BALANCING | ||
355 | { | ||
356 | .procname = "numa_balancing_scan_delay_ms", | ||
357 | .data = &sysctl_numa_balancing_scan_delay, | ||
358 | .maxlen = sizeof(unsigned int), | ||
359 | .mode = 0644, | ||
360 | .proc_handler = proc_dointvec, | ||
361 | }, | ||
362 | { | ||
363 | .procname = "numa_balancing_scan_period_min_ms", | ||
364 | .data = &sysctl_numa_balancing_scan_period_min, | ||
365 | .maxlen = sizeof(unsigned int), | ||
366 | .mode = 0644, | ||
367 | .proc_handler = proc_dointvec, | ||
368 | }, | ||
369 | { | ||
370 | .procname = "numa_balancing_scan_period_reset", | ||
371 | .data = &sysctl_numa_balancing_scan_period_reset, | ||
372 | .maxlen = sizeof(unsigned int), | ||
373 | .mode = 0644, | ||
374 | .proc_handler = proc_dointvec, | ||
375 | }, | ||
376 | { | ||
377 | .procname = "numa_balancing_scan_period_max_ms", | ||
378 | .data = &sysctl_numa_balancing_scan_period_max, | ||
379 | .maxlen = sizeof(unsigned int), | ||
380 | .mode = 0644, | ||
381 | .proc_handler = proc_dointvec, | ||
382 | }, | ||
383 | { | ||
384 | .procname = "numa_balancing_scan_size_mb", | ||
385 | .data = &sysctl_numa_balancing_scan_size, | ||
386 | .maxlen = sizeof(unsigned int), | ||
387 | .mode = 0644, | ||
388 | .proc_handler = proc_dointvec, | ||
389 | }, | ||
390 | #endif /* CONFIG_NUMA_BALANCING */ | ||
391 | #endif /* CONFIG_SCHED_DEBUG */ | ||
351 | { | 392 | { |
352 | .procname = "sched_rt_period_us", | 393 | .procname = "sched_rt_period_us", |
353 | .data = &sysctl_sched_rt_period, | 394 | .data = &sysctl_sched_rt_period, |
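Since the new entries live in kern_table (and inside the CONFIG_SCHED_DEBUG block), the knobs appear under /proc/sys/kernel/ on suitably configured kernels. For example, 'sysctl kernel.numa_balancing_scan_size_mb=256' and 'sysctl kernel.numa_balancing_scan_period_min_ms=1000' (illustrative values, not recommendations) control how much address space one scan pass may mark pte_numa and how frequently a task is allowed to start a new pass.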
diff --git a/mm/compaction.c b/mm/compaction.c index 129791218226..5ad7f4f4d6f7 100644 --- a/mm/compaction.c +++ b/mm/compaction.c | |||
@@ -303,6 +303,10 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, | |||
303 | if (blockpfn == end_pfn) | 303 | if (blockpfn == end_pfn) |
304 | update_pageblock_skip(cc, valid_page, total_isolated, false); | 304 | update_pageblock_skip(cc, valid_page, total_isolated, false); |
305 | 305 | ||
306 | count_vm_events(COMPACTFREE_SCANNED, nr_scanned); | ||
307 | if (total_isolated) | ||
308 | count_vm_events(COMPACTISOLATED, total_isolated); | ||
309 | |||
306 | return total_isolated; | 310 | return total_isolated; |
307 | } | 311 | } |
308 | 312 | ||
@@ -609,6 +613,10 @@ next_pageblock: | |||
609 | 613 | ||
610 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); | 614 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); |
611 | 615 | ||
616 | count_vm_events(COMPACTMIGRATE_SCANNED, nr_scanned); | ||
617 | if (nr_isolated) | ||
618 | count_vm_events(COMPACTISOLATED, nr_isolated); | ||
619 | |||
612 | return low_pfn; | 620 | return low_pfn; |
613 | } | 621 | } |
614 | 622 | ||
@@ -1015,14 +1023,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
1015 | nr_migrate = cc->nr_migratepages; | 1023 | nr_migrate = cc->nr_migratepages; |
1016 | err = migrate_pages(&cc->migratepages, compaction_alloc, | 1024 | err = migrate_pages(&cc->migratepages, compaction_alloc, |
1017 | (unsigned long)cc, false, | 1025 | (unsigned long)cc, false, |
1018 | cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC); | 1026 | cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, |
1027 | MR_COMPACTION); | ||
1019 | update_nr_listpages(cc); | 1028 | update_nr_listpages(cc); |
1020 | nr_remaining = cc->nr_migratepages; | 1029 | nr_remaining = cc->nr_migratepages; |
1021 | 1030 | ||
1022 | count_vm_event(COMPACTBLOCKS); | ||
1023 | count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining); | ||
1024 | if (nr_remaining) | ||
1025 | count_vm_events(COMPACTPAGEFAILED, nr_remaining); | ||
1026 | trace_mm_compaction_migratepages(nr_migrate - nr_remaining, | 1031 | trace_mm_compaction_migratepages(nr_migrate - nr_remaining, |
1027 | nr_remaining); | 1032 | nr_remaining); |
1028 | 1033 | ||
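The compaction hunks are one instance of a change that recurs through the rest of this diff: migrate_pages() gains a trailing reason argument (MR_COMPACTION here; MR_MEMORY_FAILURE, MR_MEMORY_HOTPLUG, MR_SYSCALL and MR_MEMPOLICY_MBIND at the other call sites below), and the old per-call COMPACTBLOCKS/COMPACTPAGES counters give way to scanned/isolated counters, so migration activity can be attributed to its cause rather than lumped together.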
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 827d9c813051..d7ee1691fd21 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -19,6 +19,7 @@ | |||
19 | #include <linux/freezer.h> | 19 | #include <linux/freezer.h> |
20 | #include <linux/mman.h> | 20 | #include <linux/mman.h> |
21 | #include <linux/pagemap.h> | 21 | #include <linux/pagemap.h> |
22 | #include <linux/migrate.h> | ||
22 | 23 | ||
23 | #include <asm/tlb.h> | 24 | #include <asm/tlb.h> |
24 | #include <asm/pgalloc.h> | 25 | #include <asm/pgalloc.h> |
@@ -690,7 +691,7 @@ out: | |||
690 | } | 691 | } |
691 | __setup("transparent_hugepage=", setup_transparent_hugepage); | 692 | __setup("transparent_hugepage=", setup_transparent_hugepage); |
692 | 693 | ||
693 | static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) | 694 | pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) |
694 | { | 695 | { |
695 | if (likely(vma->vm_flags & VM_WRITE)) | 696 | if (likely(vma->vm_flags & VM_WRITE)) |
696 | pmd = pmd_mkwrite(pmd); | 697 | pmd = pmd_mkwrite(pmd); |
@@ -848,7 +849,8 @@ out: | |||
848 | * run pte_offset_map on the pmd, if an huge pmd could | 849 | * run pte_offset_map on the pmd, if an huge pmd could |
849 | * materialize from under us from a different thread. | 850 | * materialize from under us from a different thread. |
850 | */ | 851 | */ |
851 | if (unlikely(__pte_alloc(mm, vma, pmd, address))) | 852 | if (unlikely(pmd_none(*pmd)) && |
853 | unlikely(__pte_alloc(mm, vma, pmd, address))) | ||
852 | return VM_FAULT_OOM; | 854 | return VM_FAULT_OOM; |
853 | /* if an huge pmd materialized from under us just retry later */ | 855 | /* if an huge pmd materialized from under us just retry later */ |
854 | if (unlikely(pmd_trans_huge(*pmd))) | 856 | if (unlikely(pmd_trans_huge(*pmd))) |
@@ -1287,6 +1289,81 @@ out: | |||
1287 | return page; | 1289 | return page; |
1288 | } | 1290 | } |
1289 | 1291 | ||
1292 | /* NUMA hinting page fault entry point for trans huge pmds */ | ||
1293 | int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
1294 | unsigned long addr, pmd_t pmd, pmd_t *pmdp) | ||
1295 | { | ||
1296 | struct page *page; | ||
1297 | unsigned long haddr = addr & HPAGE_PMD_MASK; | ||
1298 | int target_nid; | ||
1299 | int current_nid = -1; | ||
1300 | bool migrated; | ||
1301 | bool page_locked = false; | ||
1302 | |||
1303 | spin_lock(&mm->page_table_lock); | ||
1304 | if (unlikely(!pmd_same(pmd, *pmdp))) | ||
1305 | goto out_unlock; | ||
1306 | |||
1307 | page = pmd_page(pmd); | ||
1308 | get_page(page); | ||
1309 | current_nid = page_to_nid(page); | ||
1310 | count_vm_numa_event(NUMA_HINT_FAULTS); | ||
1311 | if (current_nid == numa_node_id()) | ||
1312 | count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); | ||
1313 | |||
1314 | target_nid = mpol_misplaced(page, vma, haddr); | ||
1315 | if (target_nid == -1) { | ||
1316 | put_page(page); | ||
1317 | goto clear_pmdnuma; | ||
1318 | } | ||
1319 | |||
1320 | /* Acquire the page lock to serialise THP migrations */ | ||
1321 | spin_unlock(&mm->page_table_lock); | ||
1322 | lock_page(page); | ||
1323 | page_locked = true; | ||
1324 | |||
1325 | /* Confirm the PMD did not change while the page_table_lock was dropped */ | ||
1326 | spin_lock(&mm->page_table_lock); | ||
1327 | if (unlikely(!pmd_same(pmd, *pmdp))) { | ||
1328 | unlock_page(page); | ||
1329 | put_page(page); | ||
1330 | goto out_unlock; | ||
1331 | } | ||
1332 | spin_unlock(&mm->page_table_lock); | ||
1333 | |||
1334 | /* Migrate the THP to the requested node */ | ||
1335 | migrated = migrate_misplaced_transhuge_page(mm, vma, | ||
1336 | pmdp, pmd, addr, | ||
1337 | page, target_nid); | ||
1338 | if (migrated) | ||
1339 | current_nid = target_nid; | ||
1340 | else { | ||
1341 | spin_lock(&mm->page_table_lock); | ||
1342 | if (unlikely(!pmd_same(pmd, *pmdp))) { | ||
1343 | unlock_page(page); | ||
1344 | goto out_unlock; | ||
1345 | } | ||
1346 | goto clear_pmdnuma; | ||
1347 | } | ||
1348 | |||
1349 | task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); | ||
1350 | return 0; | ||
1351 | |||
1352 | clear_pmdnuma: | ||
1353 | pmd = pmd_mknonnuma(pmd); | ||
1354 | set_pmd_at(mm, haddr, pmdp, pmd); | ||
1355 | VM_BUG_ON(pmd_numa(*pmdp)); | ||
1356 | update_mmu_cache_pmd(vma, addr, pmdp); | ||
1357 | if (page_locked) | ||
1358 | unlock_page(page); | ||
1359 | |||
1360 | out_unlock: | ||
1361 | spin_unlock(&mm->page_table_lock); | ||
1362 | if (current_nid != -1) | ||
1363 | task_numa_fault(current_nid, HPAGE_PMD_NR, migrated); | ||
1364 | return 0; | ||
1365 | } | ||
1366 | |||
1290 | int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | 1367 | int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, |
1291 | pmd_t *pmd, unsigned long addr) | 1368 | pmd_t *pmd, unsigned long addr) |
1292 | { | 1369 | { |
@@ -1375,7 +1452,7 @@ out: | |||
1375 | } | 1452 | } |
1376 | 1453 | ||
1377 | int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | 1454 | int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, |
1378 | unsigned long addr, pgprot_t newprot) | 1455 | unsigned long addr, pgprot_t newprot, int prot_numa) |
1379 | { | 1456 | { |
1380 | struct mm_struct *mm = vma->vm_mm; | 1457 | struct mm_struct *mm = vma->vm_mm; |
1381 | int ret = 0; | 1458 | int ret = 0; |
@@ -1383,7 +1460,17 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | |||
1383 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { | 1460 | if (__pmd_trans_huge_lock(pmd, vma) == 1) { |
1384 | pmd_t entry; | 1461 | pmd_t entry; |
1385 | entry = pmdp_get_and_clear(mm, addr, pmd); | 1462 | entry = pmdp_get_and_clear(mm, addr, pmd); |
1386 | entry = pmd_modify(entry, newprot); | 1463 | if (!prot_numa) |
1464 | entry = pmd_modify(entry, newprot); | ||
1465 | else { | ||
1466 | struct page *page = pmd_page(*pmd); | ||
1467 | |||
1468 | /* only check non-shared pages */ | ||
1469 | if (page_mapcount(page) == 1 && | ||
1470 | !pmd_numa(*pmd)) { | ||
1471 | entry = pmd_mknuma(entry); | ||
1472 | } | ||
1473 | } | ||
1387 | BUG_ON(pmd_write(entry)); | 1474 | BUG_ON(pmd_write(entry)); |
1388 | set_pmd_at(mm, addr, pmd, entry); | 1475 | set_pmd_at(mm, addr, pmd, entry); |
1389 | spin_unlock(&vma->vm_mm->page_table_lock); | 1476 | spin_unlock(&vma->vm_mm->page_table_lock); |
@@ -1474,7 +1561,7 @@ static int __split_huge_page_splitting(struct page *page, | |||
1474 | * We can't temporarily set the pmd to null in order | 1561 | * We can't temporarily set the pmd to null in order |
1475 | * to split it, the pmd must remain marked huge at all | 1562 | * to split it, the pmd must remain marked huge at all |
1476 | * times or the VM won't take the pmd_trans_huge paths | 1563 | * times or the VM won't take the pmd_trans_huge paths |
1477 | * and it won't wait on the anon_vma->root->mutex to | 1564 | * and it won't wait on the anon_vma->root->rwsem to |
1478 | * serialize against split_huge_page*. | 1565 | * serialize against split_huge_page*. |
1479 | */ | 1566 | */ |
1480 | pmdp_splitting_flush(vma, address, pmd); | 1567 | pmdp_splitting_flush(vma, address, pmd); |
@@ -1565,6 +1652,7 @@ static void __split_huge_page_refcount(struct page *page) | |||
1565 | page_tail->mapping = page->mapping; | 1652 | page_tail->mapping = page->mapping; |
1566 | 1653 | ||
1567 | page_tail->index = page->index + i; | 1654 | page_tail->index = page->index + i; |
1655 | page_xchg_last_nid(page_tail, page_last_nid(page)); | ||
1568 | 1656 | ||
1569 | BUG_ON(!PageAnon(page_tail)); | 1657 | BUG_ON(!PageAnon(page_tail)); |
1570 | BUG_ON(!PageUptodate(page_tail)); | 1658 | BUG_ON(!PageUptodate(page_tail)); |
@@ -1632,6 +1720,8 @@ static int __split_huge_page_map(struct page *page, | |||
1632 | BUG_ON(page_mapcount(page) != 1); | 1720 | BUG_ON(page_mapcount(page) != 1); |
1633 | if (!pmd_young(*pmd)) | 1721 | if (!pmd_young(*pmd)) |
1634 | entry = pte_mkold(entry); | 1722 | entry = pte_mkold(entry); |
1723 | if (pmd_numa(*pmd)) | ||
1724 | entry = pte_mknuma(entry); | ||
1635 | pte = pte_offset_map(&_pmd, haddr); | 1725 | pte = pte_offset_map(&_pmd, haddr); |
1636 | BUG_ON(!pte_none(*pte)); | 1726 | BUG_ON(!pte_none(*pte)); |
1637 | set_pte_at(mm, haddr, pte, entry); | 1727 | set_pte_at(mm, haddr, pte, entry); |
@@ -1674,7 +1764,7 @@ static int __split_huge_page_map(struct page *page, | |||
1674 | return ret; | 1764 | return ret; |
1675 | } | 1765 | } |
1676 | 1766 | ||
1677 | /* must be called with anon_vma->root->mutex hold */ | 1767 | /* must be called with anon_vma->root->rwsem held */ |
1678 | static void __split_huge_page(struct page *page, | 1768 | static void __split_huge_page(struct page *page, |
1679 | struct anon_vma *anon_vma) | 1769 | struct anon_vma *anon_vma) |
1680 | { | 1770 | { |
@@ -1729,7 +1819,7 @@ int split_huge_page(struct page *page) | |||
1729 | 1819 | ||
1730 | BUG_ON(is_huge_zero_pfn(page_to_pfn(page))); | 1820 | BUG_ON(is_huge_zero_pfn(page_to_pfn(page))); |
1731 | BUG_ON(!PageAnon(page)); | 1821 | BUG_ON(!PageAnon(page)); |
1732 | anon_vma = page_lock_anon_vma(page); | 1822 | anon_vma = page_lock_anon_vma_read(page); |
1733 | if (!anon_vma) | 1823 | if (!anon_vma) |
1734 | goto out; | 1824 | goto out; |
1735 | ret = 0; | 1825 | ret = 0; |
@@ -1742,7 +1832,7 @@ int split_huge_page(struct page *page) | |||
1742 | 1832 | ||
1743 | BUG_ON(PageCompound(page)); | 1833 | BUG_ON(PageCompound(page)); |
1744 | out_unlock: | 1834 | out_unlock: |
1745 | page_unlock_anon_vma(anon_vma); | 1835 | page_unlock_anon_vma_read(anon_vma); |
1746 | out: | 1836 | out: |
1747 | return ret; | 1837 | return ret; |
1748 | } | 1838 | } |
@@ -2234,7 +2324,7 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
2234 | if (pmd_trans_huge(*pmd)) | 2324 | if (pmd_trans_huge(*pmd)) |
2235 | goto out; | 2325 | goto out; |
2236 | 2326 | ||
2237 | anon_vma_lock(vma->anon_vma); | 2327 | anon_vma_lock_write(vma->anon_vma); |
2238 | 2328 | ||
2239 | pte = pte_offset_map(pmd, address); | 2329 | pte = pte_offset_map(pmd, address); |
2240 | ptl = pte_lockptr(mm, pmd); | 2330 | ptl = pte_lockptr(mm, pmd); |
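The locking renames repeated through this file and the ones below (anon_vma_lock() becoming anon_vma_lock_write(), page_lock_anon_vma() becoming page_lock_anon_vma_read(), and the mutex wording in comments becoming rwsem) appear to track a companion conversion of the anon_vma root lock to an rwsem, letting read-mostly walkers such as the migration rmap walk take it shared while the split and collapse paths still take it exclusively.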
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 88e7293b96bd..e5318c7793ae 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -3016,7 +3016,7 @@ same_page: | |||
3016 | return i ? i : -EFAULT; | 3016 | return i ? i : -EFAULT; |
3017 | } | 3017 | } |
3018 | 3018 | ||
3019 | void hugetlb_change_protection(struct vm_area_struct *vma, | 3019 | unsigned long hugetlb_change_protection(struct vm_area_struct *vma, |
3020 | unsigned long address, unsigned long end, pgprot_t newprot) | 3020 | unsigned long address, unsigned long end, pgprot_t newprot) |
3021 | { | 3021 | { |
3022 | struct mm_struct *mm = vma->vm_mm; | 3022 | struct mm_struct *mm = vma->vm_mm; |
@@ -3024,6 +3024,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
3024 | pte_t *ptep; | 3024 | pte_t *ptep; |
3025 | pte_t pte; | 3025 | pte_t pte; |
3026 | struct hstate *h = hstate_vma(vma); | 3026 | struct hstate *h = hstate_vma(vma); |
3027 | unsigned long pages = 0; | ||
3027 | 3028 | ||
3028 | BUG_ON(address >= end); | 3029 | BUG_ON(address >= end); |
3029 | flush_cache_range(vma, address, end); | 3030 | flush_cache_range(vma, address, end); |
@@ -3034,12 +3035,15 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
3034 | ptep = huge_pte_offset(mm, address); | 3035 | ptep = huge_pte_offset(mm, address); |
3035 | if (!ptep) | 3036 | if (!ptep) |
3036 | continue; | 3037 | continue; |
3037 | if (huge_pmd_unshare(mm, &address, ptep)) | 3038 | if (huge_pmd_unshare(mm, &address, ptep)) { |
3039 | pages++; | ||
3038 | continue; | 3040 | continue; |
3041 | } | ||
3039 | if (!huge_pte_none(huge_ptep_get(ptep))) { | 3042 | if (!huge_pte_none(huge_ptep_get(ptep))) { |
3040 | pte = huge_ptep_get_and_clear(mm, address, ptep); | 3043 | pte = huge_ptep_get_and_clear(mm, address, ptep); |
3041 | pte = pte_mkhuge(pte_modify(pte, newprot)); | 3044 | pte = pte_mkhuge(pte_modify(pte, newprot)); |
3042 | set_huge_pte_at(mm, address, ptep, pte); | 3045 | set_huge_pte_at(mm, address, ptep, pte); |
3046 | pages++; | ||
3043 | } | 3047 | } |
3044 | } | 3048 | } |
3045 | spin_unlock(&mm->page_table_lock); | 3049 | spin_unlock(&mm->page_table_lock); |
@@ -3051,6 +3055,8 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
3051 | */ | 3055 | */ |
3052 | flush_tlb_range(vma, start, end); | 3056 | flush_tlb_range(vma, start, end); |
3053 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); | 3057 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); |
3058 | |||
3059 | return pages << h->order; | ||
3054 | } | 3060 | } |
3055 | 3061 | ||
3056 | int hugetlb_reserve_pages(struct inode *inode, | 3062 | int hugetlb_reserve_pages(struct inode *inode, |
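hugetlb_change_protection() now reports its work in base pages, scaling huge-page updates by the hstate order: with 2 MiB huge pages on x86 (order 9), three updated huge PTEs are returned as 3 << 9 = 1536 pages, presumably so callers that accumulate protection-change counts see the same units as the normal-page path.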
diff --git a/mm/internal.h b/mm/internal.h index 52d1fa957194..d597f94cc205 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -217,15 +217,18 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) | |||
217 | { | 217 | { |
218 | if (TestClearPageMlocked(page)) { | 218 | if (TestClearPageMlocked(page)) { |
219 | unsigned long flags; | 219 | unsigned long flags; |
220 | int nr_pages = hpage_nr_pages(page); | ||
220 | 221 | ||
221 | local_irq_save(flags); | 222 | local_irq_save(flags); |
222 | __dec_zone_page_state(page, NR_MLOCK); | 223 | __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); |
223 | SetPageMlocked(newpage); | 224 | SetPageMlocked(newpage); |
224 | __inc_zone_page_state(newpage, NR_MLOCK); | 225 | __mod_zone_page_state(page_zone(newpage), NR_MLOCK, nr_pages); |
225 | local_irq_restore(flags); | 226 | local_irq_restore(flags); |
226 | } | 227 | } |
227 | } | 228 | } |
228 | 229 | ||
230 | extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); | ||
231 | |||
229 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 232 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
230 | extern unsigned long vma_address(struct page *page, | 233 | extern unsigned long vma_address(struct page *page, |
231 | struct vm_area_struct *vma); | 234 | struct vm_area_struct *vma); |
@@ -1624,7 +1624,7 @@ again: | |||
1624 | struct anon_vma_chain *vmac; | 1624 | struct anon_vma_chain *vmac; |
1625 | struct vm_area_struct *vma; | 1625 | struct vm_area_struct *vma; |
1626 | 1626 | ||
1627 | anon_vma_lock(anon_vma); | 1627 | anon_vma_lock_write(anon_vma); |
1628 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, | 1628 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, |
1629 | 0, ULONG_MAX) { | 1629 | 0, ULONG_MAX) { |
1630 | vma = vmac->vma; | 1630 | vma = vmac->vma; |
@@ -1678,7 +1678,7 @@ again: | |||
1678 | struct anon_vma_chain *vmac; | 1678 | struct anon_vma_chain *vmac; |
1679 | struct vm_area_struct *vma; | 1679 | struct vm_area_struct *vma; |
1680 | 1680 | ||
1681 | anon_vma_lock(anon_vma); | 1681 | anon_vma_lock_write(anon_vma); |
1682 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, | 1682 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, |
1683 | 0, ULONG_MAX) { | 1683 | 0, ULONG_MAX) { |
1684 | vma = vmac->vma; | 1684 | vma = vmac->vma; |
@@ -1731,7 +1731,7 @@ again: | |||
1731 | struct anon_vma_chain *vmac; | 1731 | struct anon_vma_chain *vmac; |
1732 | struct vm_area_struct *vma; | 1732 | struct vm_area_struct *vma; |
1733 | 1733 | ||
1734 | anon_vma_lock(anon_vma); | 1734 | anon_vma_lock_write(anon_vma); |
1735 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, | 1735 | anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, |
1736 | 0, ULONG_MAX) { | 1736 | 0, ULONG_MAX) { |
1737 | vma = vmac->vma; | 1737 | vma = vmac->vma; |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6c055929c8cc..bbfac5063ca8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -3289,15 +3289,18 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, | |||
3289 | struct mem_cgroup **memcgp) | 3289 | struct mem_cgroup **memcgp) |
3290 | { | 3290 | { |
3291 | struct mem_cgroup *memcg = NULL; | 3291 | struct mem_cgroup *memcg = NULL; |
3292 | unsigned int nr_pages = 1; | ||
3292 | struct page_cgroup *pc; | 3293 | struct page_cgroup *pc; |
3293 | enum charge_type ctype; | 3294 | enum charge_type ctype; |
3294 | 3295 | ||
3295 | *memcgp = NULL; | 3296 | *memcgp = NULL; |
3296 | 3297 | ||
3297 | VM_BUG_ON(PageTransHuge(page)); | ||
3298 | if (mem_cgroup_disabled()) | 3298 | if (mem_cgroup_disabled()) |
3299 | return; | 3299 | return; |
3300 | 3300 | ||
3301 | if (PageTransHuge(page)) | ||
3302 | nr_pages <<= compound_order(page); | ||
3303 | |||
3301 | pc = lookup_page_cgroup(page); | 3304 | pc = lookup_page_cgroup(page); |
3302 | lock_page_cgroup(pc); | 3305 | lock_page_cgroup(pc); |
3303 | if (PageCgroupUsed(pc)) { | 3306 | if (PageCgroupUsed(pc)) { |
@@ -3359,7 +3362,7 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, | |||
3359 | * charged to the res_counter since we plan on replacing the | 3362 | * charged to the res_counter since we plan on replacing the |
3360 | * old one and only one page is going to be left afterwards. | 3363 | * old one and only one page is going to be left afterwards. |
3361 | */ | 3364 | */ |
3362 | __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false); | 3365 | __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false); |
3363 | } | 3366 | } |
3364 | 3367 | ||
3365 | /* remove redundant charge if migration failed*/ | 3368 | /* remove redundant charge if migration failed*/ |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 108c52fa60f6..c6e4dd3e1c08 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -402,7 +402,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, | |||
402 | struct anon_vma *av; | 402 | struct anon_vma *av; |
403 | pgoff_t pgoff; | 403 | pgoff_t pgoff; |
404 | 404 | ||
405 | av = page_lock_anon_vma(page); | 405 | av = page_lock_anon_vma_read(page); |
406 | if (av == NULL) /* Not actually mapped anymore */ | 406 | if (av == NULL) /* Not actually mapped anymore */ |
407 | return; | 407 | return; |
408 | 408 | ||
@@ -423,7 +423,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, | |||
423 | } | 423 | } |
424 | } | 424 | } |
425 | read_unlock(&tasklist_lock); | 425 | read_unlock(&tasklist_lock); |
426 | page_unlock_anon_vma(av); | 426 | page_unlock_anon_vma_read(av); |
427 | } | 427 | } |
428 | 428 | ||
429 | /* | 429 | /* |
@@ -1566,7 +1566,8 @@ int soft_offline_page(struct page *page, int flags) | |||
1566 | page_is_file_cache(page)); | 1566 | page_is_file_cache(page)); |
1567 | list_add(&page->lru, &pagelist); | 1567 | list_add(&page->lru, &pagelist); |
1568 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, | 1568 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, |
1569 | false, MIGRATE_SYNC); | 1569 | false, MIGRATE_SYNC, |
1570 | MR_MEMORY_FAILURE); | ||
1570 | if (ret) { | 1571 | if (ret) { |
1571 | putback_lru_pages(&pagelist); | 1572 | putback_lru_pages(&pagelist); |
1572 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", | 1573 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", |
diff --git a/mm/memory.c b/mm/memory.c index db2e9e797a05..e6a3b933517e 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -57,6 +57,7 @@ | |||
57 | #include <linux/swapops.h> | 57 | #include <linux/swapops.h> |
58 | #include <linux/elf.h> | 58 | #include <linux/elf.h> |
59 | #include <linux/gfp.h> | 59 | #include <linux/gfp.h> |
60 | #include <linux/migrate.h> | ||
60 | 61 | ||
61 | #include <asm/io.h> | 62 | #include <asm/io.h> |
62 | #include <asm/pgalloc.h> | 63 | #include <asm/pgalloc.h> |
@@ -1503,6 +1504,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | |||
1503 | page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); | 1504 | page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); |
1504 | goto out; | 1505 | goto out; |
1505 | } | 1506 | } |
1507 | if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) | ||
1508 | goto no_page_table; | ||
1506 | if (pmd_trans_huge(*pmd)) { | 1509 | if (pmd_trans_huge(*pmd)) { |
1507 | if (flags & FOLL_SPLIT) { | 1510 | if (flags & FOLL_SPLIT) { |
1508 | split_huge_page_pmd(vma, address, pmd); | 1511 | split_huge_page_pmd(vma, address, pmd); |
@@ -1532,6 +1535,8 @@ split_fallthrough: | |||
1532 | pte = *ptep; | 1535 | pte = *ptep; |
1533 | if (!pte_present(pte)) | 1536 | if (!pte_present(pte)) |
1534 | goto no_page; | 1537 | goto no_page; |
1538 | if ((flags & FOLL_NUMA) && pte_numa(pte)) | ||
1539 | goto no_page; | ||
1535 | if ((flags & FOLL_WRITE) && !pte_write(pte)) | 1540 | if ((flags & FOLL_WRITE) && !pte_write(pte)) |
1536 | goto unlock; | 1541 | goto unlock; |
1537 | 1542 | ||
@@ -1683,6 +1688,19 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1683 | (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); | 1688 | (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); |
1684 | vm_flags &= (gup_flags & FOLL_FORCE) ? | 1689 | vm_flags &= (gup_flags & FOLL_FORCE) ? |
1685 | (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); | 1690 | (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); |
1691 | |||
1692 | /* | ||
1693 | * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault | ||
1694 | * would be called on PROT_NONE ranges. We must never invoke | ||
1695 | * handle_mm_fault on PROT_NONE ranges or the NUMA hinting | ||
1696 | * page faults would unprotect the PROT_NONE ranges if | ||
1697 | * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd | ||
1698 | * bitflag. So to avoid that, don't set FOLL_NUMA if | ||
1699 | * FOLL_FORCE is set. | ||
1700 | */ | ||
1701 | if (!(gup_flags & FOLL_FORCE)) | ||
1702 | gup_flags |= FOLL_NUMA; | ||
1703 | |||
1686 | i = 0; | 1704 | i = 0; |
1687 | 1705 | ||
1688 | do { | 1706 | do { |
@@ -3412,6 +3430,169 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3412 | return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); | 3430 | return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); |
3413 | } | 3431 | } |
3414 | 3432 | ||
3433 | int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, | ||
3434 | unsigned long addr, int current_nid) | ||
3435 | { | ||
3436 | get_page(page); | ||
3437 | |||
3438 | count_vm_numa_event(NUMA_HINT_FAULTS); | ||
3439 | if (current_nid == numa_node_id()) | ||
3440 | count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); | ||
3441 | |||
3442 | return mpol_misplaced(page, vma, addr); | ||
3443 | } | ||
3444 | |||
3445 | int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
3446 | unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd) | ||
3447 | { | ||
3448 | struct page *page = NULL; | ||
3449 | spinlock_t *ptl; | ||
3450 | int current_nid = -1; | ||
3451 | int target_nid; | ||
3452 | bool migrated = false; | ||
3453 | |||
3454 | /* | ||
3455 | * The "pte" at this point cannot be used safely without | ||
3456 | * validation through pte_unmap_same(). It's of NUMA type but | ||
3457 | * the pfn may be screwed if the read is non atomic. | ||
3458 | * | ||
3459 | * ptep_modify_prot_start is not called as this is clearing | ||
3460 | * the _PAGE_NUMA bit and it is not really expected that there | ||
3461 | * would be concurrent hardware modifications to the PTE. | ||
3462 | */ | ||
3463 | ptl = pte_lockptr(mm, pmd); | ||
3464 | spin_lock(ptl); | ||
3465 | if (unlikely(!pte_same(*ptep, pte))) { | ||
3466 | pte_unmap_unlock(ptep, ptl); | ||
3467 | goto out; | ||
3468 | } | ||
3469 | |||
3470 | pte = pte_mknonnuma(pte); | ||
3471 | set_pte_at(mm, addr, ptep, pte); | ||
3472 | update_mmu_cache(vma, addr, ptep); | ||
3473 | |||
3474 | page = vm_normal_page(vma, addr, pte); | ||
3475 | if (!page) { | ||
3476 | pte_unmap_unlock(ptep, ptl); | ||
3477 | return 0; | ||
3478 | } | ||
3479 | |||
3480 | current_nid = page_to_nid(page); | ||
3481 | target_nid = numa_migrate_prep(page, vma, addr, current_nid); | ||
3482 | pte_unmap_unlock(ptep, ptl); | ||
3483 | if (target_nid == -1) { | ||
3484 | /* | ||
3485 | * Account for the fault against the current node if it is not | ||
3486 | * being replaced regardless of where the page is located. | ||
3487 | */ | ||
3488 | current_nid = numa_node_id(); | ||
3489 | put_page(page); | ||
3490 | goto out; | ||
3491 | } | ||
3492 | |||
3493 | /* Migrate to the requested node */ | ||
3494 | migrated = migrate_misplaced_page(page, target_nid); | ||
3495 | if (migrated) | ||
3496 | current_nid = target_nid; | ||
3497 | |||
3498 | out: | ||
3499 | if (current_nid != -1) | ||
3500 | task_numa_fault(current_nid, 1, migrated); | ||
3501 | return 0; | ||
3502 | } | ||
3503 | |||
3504 | /* NUMA hinting page fault entry point for regular pmds */ | ||
3505 | #ifdef CONFIG_NUMA_BALANCING | ||
3506 | static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
3507 | unsigned long addr, pmd_t *pmdp) | ||
3508 | { | ||
3509 | pmd_t pmd; | ||
3510 | pte_t *pte, *orig_pte; | ||
3511 | unsigned long _addr = addr & PMD_MASK; | ||
3512 | unsigned long offset; | ||
3513 | spinlock_t *ptl; | ||
3514 | bool numa = false; | ||
3515 | int local_nid = numa_node_id(); | ||
3516 | |||
3517 | spin_lock(&mm->page_table_lock); | ||
3518 | pmd = *pmdp; | ||
3519 | if (pmd_numa(pmd)) { | ||
3520 | set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd)); | ||
3521 | numa = true; | ||
3522 | } | ||
3523 | spin_unlock(&mm->page_table_lock); | ||
3524 | |||
3525 | if (!numa) | ||
3526 | return 0; | ||
3527 | |||
3528 | /* we're in a page fault so some vma must be in the range */ | ||
3529 | BUG_ON(!vma); | ||
3530 | BUG_ON(vma->vm_start >= _addr + PMD_SIZE); | ||
3531 | offset = max(_addr, vma->vm_start) & ~PMD_MASK; | ||
3532 | VM_BUG_ON(offset >= PMD_SIZE); | ||
3533 | orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl); | ||
3534 | pte += offset >> PAGE_SHIFT; | ||
3535 | for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) { | ||
3536 | pte_t pteval = *pte; | ||
3537 | struct page *page; | ||
3538 | int curr_nid = local_nid; | ||
3539 | int target_nid; | ||
3540 | bool migrated; | ||
3541 | if (!pte_present(pteval)) | ||
3542 | continue; | ||
3543 | if (!pte_numa(pteval)) | ||
3544 | continue; | ||
3545 | if (addr >= vma->vm_end) { | ||
3546 | vma = find_vma(mm, addr); | ||
3547 | /* there's a pte present so there must be a vma */ | ||
3548 | BUG_ON(!vma); | ||
3549 | BUG_ON(addr < vma->vm_start); | ||
3550 | } | ||
3551 | if (pte_numa(pteval)) { | ||
3552 | pteval = pte_mknonnuma(pteval); | ||
3553 | set_pte_at(mm, addr, pte, pteval); | ||
3554 | } | ||
3555 | page = vm_normal_page(vma, addr, pteval); | ||
3556 | if (unlikely(!page)) | ||
3557 | continue; | ||
3558 | /* only check non-shared pages */ | ||
3559 | if (unlikely(page_mapcount(page) != 1)) | ||
3560 | continue; | ||
3561 | |||
3562 | /* | ||
3563 | * Note that the NUMA fault is later accounted to either | ||
3564 | * the node that is currently running or where the page is | ||
3565 | * migrated to. | ||
3566 | */ | ||
3567 | curr_nid = local_nid; | ||
3568 | target_nid = numa_migrate_prep(page, vma, addr, | ||
3569 | page_to_nid(page)); | ||
3570 | if (target_nid == -1) { | ||
3571 | put_page(page); | ||
3572 | continue; | ||
3573 | } | ||
3574 | |||
3575 | /* Migrate to the requested node */ | ||
3576 | pte_unmap_unlock(pte, ptl); | ||
3577 | migrated = migrate_misplaced_page(page, target_nid); | ||
3578 | if (migrated) | ||
3579 | curr_nid = target_nid; | ||
3580 | task_numa_fault(curr_nid, 1, migrated); | ||
3581 | |||
3582 | pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); | ||
3583 | } | ||
3584 | pte_unmap_unlock(orig_pte, ptl); | ||
3585 | |||
3586 | return 0; | ||
3587 | } | ||
3588 | #else | ||
3589 | static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
3590 | unsigned long addr, pmd_t *pmdp) | ||
3591 | { | ||
3592 | BUG(); | ||
3593 | } | ||
3594 | #endif /* CONFIG_NUMA_BALANCING */ | ||
3595 | |||
3415 | /* | 3596 | /* |
3416 | * These routines also need to handle stuff like marking pages dirty | 3597 | * These routines also need to handle stuff like marking pages dirty |
3417 | * and/or accessed for architectures that don't do it in hardware (most | 3598 | * and/or accessed for architectures that don't do it in hardware (most |
@@ -3450,6 +3631,9 @@ int handle_pte_fault(struct mm_struct *mm, | |||
3450 | pte, pmd, flags, entry); | 3631 | pte, pmd, flags, entry); |
3451 | } | 3632 | } |
3452 | 3633 | ||
3634 | if (pte_numa(entry)) | ||
3635 | return do_numa_page(mm, vma, address, entry, pte, pmd); | ||
3636 | |||
3453 | ptl = pte_lockptr(mm, pmd); | 3637 | ptl = pte_lockptr(mm, pmd); |
3454 | spin_lock(ptl); | 3638 | spin_lock(ptl); |
3455 | if (unlikely(!pte_same(*pte, entry))) | 3639 | if (unlikely(!pte_same(*pte, entry))) |
@@ -3520,8 +3704,11 @@ retry: | |||
3520 | if (pmd_trans_huge(orig_pmd)) { | 3704 | if (pmd_trans_huge(orig_pmd)) { |
3521 | unsigned int dirty = flags & FAULT_FLAG_WRITE; | 3705 | unsigned int dirty = flags & FAULT_FLAG_WRITE; |
3522 | 3706 | ||
3523 | if (dirty && !pmd_write(orig_pmd) && | 3707 | if (pmd_numa(orig_pmd)) |
3524 | !pmd_trans_splitting(orig_pmd)) { | 3708 | return do_huge_pmd_numa_page(mm, vma, address, |
3709 | orig_pmd, pmd); | ||
3710 | |||
3711 | if (dirty && !pmd_write(orig_pmd)) { | ||
3525 | ret = do_huge_pmd_wp_page(mm, vma, address, pmd, | 3712 | ret = do_huge_pmd_wp_page(mm, vma, address, pmd, |
3526 | orig_pmd); | 3713 | orig_pmd); |
3527 | /* | 3714 | /* |
@@ -3536,16 +3723,21 @@ retry: | |||
3536 | huge_pmd_set_accessed(mm, vma, address, pmd, | 3723 | huge_pmd_set_accessed(mm, vma, address, pmd, |
3537 | orig_pmd, dirty); | 3724 | orig_pmd, dirty); |
3538 | } | 3725 | } |
3726 | |||
3539 | return 0; | 3727 | return 0; |
3540 | } | 3728 | } |
3541 | } | 3729 | } |
3542 | 3730 | ||
3731 | if (pmd_numa(*pmd)) | ||
3732 | return do_pmd_numa_page(mm, vma, address, pmd); | ||
3733 | |||
3543 | /* | 3734 | /* |
3544 | * Use __pte_alloc instead of pte_alloc_map, because we can't | 3735 | * Use __pte_alloc instead of pte_alloc_map, because we can't |
3545 | * run pte_offset_map on the pmd, if an huge pmd could | 3736 | * run pte_offset_map on the pmd, if an huge pmd could |
3546 | * materialize from under us from a different thread. | 3737 | * materialize from under us from a different thread. |
3547 | */ | 3738 | */ |
3548 | if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address)) | 3739 | if (unlikely(pmd_none(*pmd)) && |
3740 | unlikely(__pte_alloc(mm, vma, pmd, address))) | ||
3549 | return VM_FAULT_OOM; | 3741 | return VM_FAULT_OOM; |
3550 | /* if an huge pmd materialized from under us just retry later */ | 3742 | /* if an huge pmd materialized from under us just retry later */ |
3551 | if (unlikely(pmd_trans_huge(*pmd))) | 3743 | if (unlikely(pmd_trans_huge(*pmd))) |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 518baa896e83..962e353aa86f 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -1055,7 +1055,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
1055 | * migrate_pages returns # of failed pages. | 1055 | * migrate_pages returns # of failed pages. |
1056 | */ | 1056 | */ |
1057 | ret = migrate_pages(&source, alloc_migrate_target, 0, | 1057 | ret = migrate_pages(&source, alloc_migrate_target, 0, |
1058 | true, MIGRATE_SYNC); | 1058 | true, MIGRATE_SYNC, |
1059 | MR_MEMORY_HOTPLUG); | ||
1059 | if (ret) | 1060 | if (ret) |
1060 | putback_lru_pages(&source); | 1061 | putback_lru_pages(&source); |
1061 | } | 1062 | } |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index aaf54566cb6b..d1b315e98627 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -90,6 +90,7 @@ | |||
90 | #include <linux/syscalls.h> | 90 | #include <linux/syscalls.h> |
91 | #include <linux/ctype.h> | 91 | #include <linux/ctype.h> |
92 | #include <linux/mm_inline.h> | 92 | #include <linux/mm_inline.h> |
93 | #include <linux/mmu_notifier.h> | ||
93 | 94 | ||
94 | #include <asm/tlbflush.h> | 95 | #include <asm/tlbflush.h> |
95 | #include <asm/uaccess.h> | 96 | #include <asm/uaccess.h> |
@@ -117,6 +118,26 @@ static struct mempolicy default_policy = { | |||
117 | .flags = MPOL_F_LOCAL, | 118 | .flags = MPOL_F_LOCAL, |
118 | }; | 119 | }; |
119 | 120 | ||
121 | static struct mempolicy preferred_node_policy[MAX_NUMNODES]; | ||
122 | |||
123 | static struct mempolicy *get_task_policy(struct task_struct *p) | ||
124 | { | ||
125 | struct mempolicy *pol = p->mempolicy; | ||
126 | int node; | ||
127 | |||
128 | if (!pol) { | ||
129 | node = numa_node_id(); | ||
130 | if (node != -1) | ||
131 | pol = &preferred_node_policy[node]; | ||
132 | |||
133 | /* preferred_node_policy is not initialised early in boot */ | ||
134 | if (!pol->mode) | ||
135 | pol = NULL; | ||
136 | } | ||
137 | |||
138 | return pol; | ||
139 | } | ||
140 | |||
120 | static const struct mempolicy_operations { | 141 | static const struct mempolicy_operations { |
121 | int (*create)(struct mempolicy *pol, const nodemask_t *nodes); | 142 | int (*create)(struct mempolicy *pol, const nodemask_t *nodes); |
122 | /* | 143 | /* |
@@ -254,7 +275,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, | |||
254 | if (mode == MPOL_DEFAULT) { | 275 | if (mode == MPOL_DEFAULT) { |
255 | if (nodes && !nodes_empty(*nodes)) | 276 | if (nodes && !nodes_empty(*nodes)) |
256 | return ERR_PTR(-EINVAL); | 277 | return ERR_PTR(-EINVAL); |
257 | return NULL; /* simply delete any existing policy */ | 278 | return NULL; |
258 | } | 279 | } |
259 | VM_BUG_ON(!nodes); | 280 | VM_BUG_ON(!nodes); |
260 | 281 | ||
@@ -269,6 +290,10 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, | |||
269 | (flags & MPOL_F_RELATIVE_NODES))) | 290 | (flags & MPOL_F_RELATIVE_NODES))) |
270 | return ERR_PTR(-EINVAL); | 291 | return ERR_PTR(-EINVAL); |
271 | } | 292 | } |
293 | } else if (mode == MPOL_LOCAL) { | ||
294 | if (!nodes_empty(*nodes)) | ||
295 | return ERR_PTR(-EINVAL); | ||
296 | mode = MPOL_PREFERRED; | ||
272 | } else if (nodes_empty(*nodes)) | 297 | } else if (nodes_empty(*nodes)) |
273 | return ERR_PTR(-EINVAL); | 298 | return ERR_PTR(-EINVAL); |
274 | policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); | 299 | policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); |
@@ -561,6 +586,36 @@ static inline int check_pgd_range(struct vm_area_struct *vma, | |||
561 | return 0; | 586 | return 0; |
562 | } | 587 | } |
563 | 588 | ||
589 | #ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE | ||
590 | /* | ||
591 | * This is used to mark a range of virtual addresses as inaccessible. | ||
592 | * These are later cleared by a NUMA hinting fault. Depending on these | ||
593 | * faults, pages may be migrated for better NUMA placement. | ||
594 | * | ||
595 | * This is assuming that NUMA faults are handled using PROT_NONE. If | ||
596 | * an architecture makes a different choice, it will need further | ||
597 | * changes to the core. | ||
598 | */ | ||
599 | unsigned long change_prot_numa(struct vm_area_struct *vma, | ||
600 | unsigned long addr, unsigned long end) | ||
601 | { | ||
602 | int nr_updated; | ||
603 | BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE); | ||
604 | |||
605 | nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); | ||
606 | if (nr_updated) | ||
607 | count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); | ||
608 | |||
609 | return nr_updated; | ||
610 | } | ||
611 | #else | ||
612 | static unsigned long change_prot_numa(struct vm_area_struct *vma, | ||
613 | unsigned long addr, unsigned long end) | ||
614 | { | ||
615 | return 0; | ||
616 | } | ||
617 | #endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ | ||
618 | |||
564 | /* | 619 | /* |
565 | * Check if all pages in a range are on a set of nodes. | 620 | * Check if all pages in a range are on a set of nodes. |
566 | * If pagelist != NULL then isolate pages from the LRU and | 621 | * If pagelist != NULL then isolate pages from the LRU and |
@@ -579,22 +634,32 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
579 | return ERR_PTR(-EFAULT); | 634 | return ERR_PTR(-EFAULT); |
580 | prev = NULL; | 635 | prev = NULL; |
581 | for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { | 636 | for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { |
637 | unsigned long endvma = vma->vm_end; | ||
638 | |||
639 | if (endvma > end) | ||
640 | endvma = end; | ||
641 | if (vma->vm_start > start) | ||
642 | start = vma->vm_start; | ||
643 | |||
582 | if (!(flags & MPOL_MF_DISCONTIG_OK)) { | 644 | if (!(flags & MPOL_MF_DISCONTIG_OK)) { |
583 | if (!vma->vm_next && vma->vm_end < end) | 645 | if (!vma->vm_next && vma->vm_end < end) |
584 | return ERR_PTR(-EFAULT); | 646 | return ERR_PTR(-EFAULT); |
585 | if (prev && prev->vm_end < vma->vm_start) | 647 | if (prev && prev->vm_end < vma->vm_start) |
586 | return ERR_PTR(-EFAULT); | 648 | return ERR_PTR(-EFAULT); |
587 | } | 649 | } |
588 | if (!is_vm_hugetlb_page(vma) && | 650 | |
589 | ((flags & MPOL_MF_STRICT) || | 651 | if (is_vm_hugetlb_page(vma)) |
652 | goto next; | ||
653 | |||
654 | if (flags & MPOL_MF_LAZY) { | ||
655 | change_prot_numa(vma, start, endvma); | ||
656 | goto next; | ||
657 | } | ||
658 | |||
659 | if ((flags & MPOL_MF_STRICT) || | ||
590 | ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && | 660 | ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && |
591 | vma_migratable(vma)))) { | 661 | vma_migratable(vma))) { |
592 | unsigned long endvma = vma->vm_end; | ||
593 | 662 | ||
594 | if (endvma > end) | ||
595 | endvma = end; | ||
596 | if (vma->vm_start > start) | ||
597 | start = vma->vm_start; | ||
598 | err = check_pgd_range(vma, start, endvma, nodes, | 663 | err = check_pgd_range(vma, start, endvma, nodes, |
599 | flags, private); | 664 | flags, private); |
600 | if (err) { | 665 | if (err) { |
@@ -602,6 +667,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
602 | break; | 667 | break; |
603 | } | 668 | } |
604 | } | 669 | } |
670 | next: | ||
605 | prev = vma; | 671 | prev = vma; |
606 | } | 672 | } |
607 | return first; | 673 | return first; |
@@ -961,7 +1027,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, | |||
961 | 1027 | ||
962 | if (!list_empty(&pagelist)) { | 1028 | if (!list_empty(&pagelist)) { |
963 | err = migrate_pages(&pagelist, new_node_page, dest, | 1029 | err = migrate_pages(&pagelist, new_node_page, dest, |
964 | false, MIGRATE_SYNC); | 1030 | false, MIGRATE_SYNC, |
1031 | MR_SYSCALL); | ||
965 | if (err) | 1032 | if (err) |
966 | putback_lru_pages(&pagelist); | 1033 | putback_lru_pages(&pagelist); |
967 | } | 1034 | } |
@@ -1133,8 +1200,7 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1133 | int err; | 1200 | int err; |
1134 | LIST_HEAD(pagelist); | 1201 | LIST_HEAD(pagelist); |
1135 | 1202 | ||
1136 | if (flags & ~(unsigned long)(MPOL_MF_STRICT | | 1203 | if (flags & ~(unsigned long)MPOL_MF_VALID) |
1137 | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) | ||
1138 | return -EINVAL; | 1204 | return -EINVAL; |
1139 | if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) | 1205 | if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) |
1140 | return -EPERM; | 1206 | return -EPERM; |
@@ -1157,6 +1223,9 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1157 | if (IS_ERR(new)) | 1223 | if (IS_ERR(new)) |
1158 | return PTR_ERR(new); | 1224 | return PTR_ERR(new); |
1159 | 1225 | ||
1226 | if (flags & MPOL_MF_LAZY) | ||
1227 | new->flags |= MPOL_F_MOF; | ||
1228 | |||
1160 | /* | 1229 | /* |
1161 | * If we are using the default policy then operation | 1230 | * If we are using the default policy then operation |
1162 | * on discontinuous address spaces is okay after all | 1231 | * on discontinuous address spaces is okay after all |
@@ -1193,21 +1262,24 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1193 | vma = check_range(mm, start, end, nmask, | 1262 | vma = check_range(mm, start, end, nmask, |
1194 | flags | MPOL_MF_INVERT, &pagelist); | 1263 | flags | MPOL_MF_INVERT, &pagelist); |
1195 | 1264 | ||
1196 | err = PTR_ERR(vma); | 1265 | err = PTR_ERR(vma); /* maybe ... */ |
1197 | if (!IS_ERR(vma)) { | 1266 | if (!IS_ERR(vma)) |
1198 | int nr_failed = 0; | ||
1199 | |||
1200 | err = mbind_range(mm, start, end, new); | 1267 | err = mbind_range(mm, start, end, new); |
1201 | 1268 | ||
1269 | if (!err) { | ||
1270 | int nr_failed = 0; | ||
1271 | |||
1202 | if (!list_empty(&pagelist)) { | 1272 | if (!list_empty(&pagelist)) { |
1273 | WARN_ON_ONCE(flags & MPOL_MF_LAZY); | ||
1203 | nr_failed = migrate_pages(&pagelist, new_vma_page, | 1274 | nr_failed = migrate_pages(&pagelist, new_vma_page, |
1204 | (unsigned long)vma, | 1275 | (unsigned long)vma, |
1205 | false, MIGRATE_SYNC); | 1276 | false, MIGRATE_SYNC, |
1277 | MR_MEMPOLICY_MBIND); | ||
1206 | if (nr_failed) | 1278 | if (nr_failed) |
1207 | putback_lru_pages(&pagelist); | 1279 | putback_lru_pages(&pagelist); |
1208 | } | 1280 | } |
1209 | 1281 | ||
1210 | if (!err && nr_failed && (flags & MPOL_MF_STRICT)) | 1282 | if (nr_failed && (flags & MPOL_MF_STRICT)) |
1211 | err = -EIO; | 1283 | err = -EIO; |
1212 | } else | 1284 | } else |
1213 | putback_lru_pages(&pagelist); | 1285 | putback_lru_pages(&pagelist); |
@@ -1546,7 +1618,7 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, | |||
1546 | struct mempolicy *get_vma_policy(struct task_struct *task, | 1618 | struct mempolicy *get_vma_policy(struct task_struct *task, |
1547 | struct vm_area_struct *vma, unsigned long addr) | 1619 | struct vm_area_struct *vma, unsigned long addr) |
1548 | { | 1620 | { |
1549 | struct mempolicy *pol = task->mempolicy; | 1621 | struct mempolicy *pol = get_task_policy(task); |
1550 | 1622 | ||
1551 | if (vma) { | 1623 | if (vma) { |
1552 | if (vma->vm_ops && vma->vm_ops->get_policy) { | 1624 | if (vma->vm_ops && vma->vm_ops->get_policy) { |
@@ -1956,7 +2028,7 @@ retry_cpuset: | |||
1956 | */ | 2028 | */ |
1957 | struct page *alloc_pages_current(gfp_t gfp, unsigned order) | 2029 | struct page *alloc_pages_current(gfp_t gfp, unsigned order) |
1958 | { | 2030 | { |
1959 | struct mempolicy *pol = current->mempolicy; | 2031 | struct mempolicy *pol = get_task_policy(current); |
1960 | struct page *page; | 2032 | struct page *page; |
1961 | unsigned int cpuset_mems_cookie; | 2033 | unsigned int cpuset_mems_cookie; |
1962 | 2034 | ||
@@ -2140,6 +2212,115 @@ static void sp_free(struct sp_node *n) | |||
2140 | kmem_cache_free(sn_cache, n); | 2212 | kmem_cache_free(sn_cache, n); |
2141 | } | 2213 | } |
2142 | 2214 | ||
2215 | /** | ||
2216 | * mpol_misplaced - check whether current page node is valid in policy | ||
2217 | * | ||
2218 | * @page - page to be checked | ||
2219 | * @vma - vm area where page mapped | ||
2220 | * @addr - virtual address where page mapped | ||
2221 | * | ||
2222 | * Lookup current policy node id for vma,addr and "compare to" page's | ||
2223 | * node id. | ||
2224 | * | ||
2225 | * Returns: | ||
2226 | * -1 - not misplaced, page is in the right node | ||
2227 | * node - node id where the page should be | ||
2228 | * | ||
2229 | * Policy determination "mimics" alloc_page_vma(). | ||
2230 | * Called from fault path where we know the vma and faulting address. | ||
2231 | */ | ||
2232 | int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) | ||
2233 | { | ||
2234 | struct mempolicy *pol; | ||
2235 | struct zone *zone; | ||
2236 | int curnid = page_to_nid(page); | ||
2237 | unsigned long pgoff; | ||
2238 | int polnid = -1; | ||
2239 | int ret = -1; | ||
2240 | |||
2241 | BUG_ON(!vma); | ||
2242 | |||
2243 | pol = get_vma_policy(current, vma, addr); | ||
2244 | if (!(pol->flags & MPOL_F_MOF)) | ||
2245 | goto out; | ||
2246 | |||
2247 | switch (pol->mode) { | ||
2248 | case MPOL_INTERLEAVE: | ||
2249 | BUG_ON(addr >= vma->vm_end); | ||
2250 | BUG_ON(addr < vma->vm_start); | ||
2251 | |||
2252 | pgoff = vma->vm_pgoff; | ||
2253 | pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; | ||
2254 | polnid = offset_il_node(pol, vma, pgoff); | ||
2255 | break; | ||
2256 | |||
2257 | case MPOL_PREFERRED: | ||
2258 | if (pol->flags & MPOL_F_LOCAL) | ||
2259 | polnid = numa_node_id(); | ||
2260 | else | ||
2261 | polnid = pol->v.preferred_node; | ||
2262 | break; | ||
2263 | |||
2264 | case MPOL_BIND: | ||
2265 | /* | ||
2266 | * allows binding to multiple nodes. | ||
2267 | * use current page if in policy nodemask, | ||
2268 | * else select nearest allowed node, if any. | ||
2269 | * If no allowed nodes, use current [!misplaced]. | ||
2270 | */ | ||
2271 | if (node_isset(curnid, pol->v.nodes)) | ||
2272 | goto out; | ||
2273 | (void)first_zones_zonelist( | ||
2274 | node_zonelist(numa_node_id(), GFP_HIGHUSER), | ||
2275 | gfp_zone(GFP_HIGHUSER), | ||
2276 | &pol->v.nodes, &zone); | ||
2277 | polnid = zone->node; | ||
2278 | break; | ||
2279 | |||
2280 | default: | ||
2281 | BUG(); | ||
2282 | } | ||
2283 | |||
2284 | /* Migrate the page towards the node whose CPU is referencing it */ | ||
2285 | if (pol->flags & MPOL_F_MORON) { | ||
2286 | int last_nid; | ||
2287 | |||
2288 | polnid = numa_node_id(); | ||
2289 | |||
2290 | /* | ||
2291 | * Multi-stage node selection is used in conjunction | ||
2292 | * with a periodic migration fault to build a temporal | ||
2293 | * task<->page relation. By using a two-stage filter we | ||
2294 | * remove short/unlikely relations. | ||
2295 | * | ||
2296 | * Using P(p) ~ n_p / n_t as per frequentist | ||
2297 | * probability, we can equate a task's usage of a | ||
2298 | * particular page (n_p) per total usage of this | ||
2299 | * page (n_t) (in a given time-span) to a probability. | ||
2300 | * | ||
2301 | * Our periodic faults will sample this probability and | ||
2302 | * getting the same result twice in a row, given these | ||
2303 | * samples are fully independent, is then given by | ||
2304 | * P(n)^2, provided our sample period is sufficiently | ||
2305 | * short compared to the usage pattern. | ||
2306 | * | ||
2307 | * This quadratic squishes small probabilities, making | ||
2308 | * it less likely we act on an unlikely task<->page | ||
2309 | * relation. | ||
2310 | */ | ||
2311 | last_nid = page_xchg_last_nid(page, polnid); | ||
2312 | if (last_nid != polnid) | ||
2313 | goto out; | ||
2314 | } | ||
2315 | |||
2316 | if (curnid != polnid) | ||
2317 | ret = polnid; | ||
2318 | out: | ||
2319 | mpol_cond_put(pol); | ||
2320 | |||
2321 | return ret; | ||
2322 | } | ||
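
The two-stage filter is easier to see outside the kernel. Below is a minimal user-space sketch of the same idea (the access pattern, node count and helper names are invented for illustration; this is not kernel code): a node only becomes the migration target when it faults on the page twice in a row, so a task<->page relation of probability p is acted on with probability roughly p^2.

/* Toy model of the two-stage NUMA placement filter (illustrative only). */
#include <stdio.h>
#include <stdlib.h>

static int page_last_nid = -1;	/* stands in for page_xchg_last_nid() state */

/* Return a migration target, or -1 to skip this fault (first filter stage). */
static int two_stage_filter(int faulting_nid)
{
	int last_nid = page_last_nid;

	page_last_nid = faulting_nid;			/* the "xchg" */
	return (last_nid == faulting_nid) ? faulting_nid : -1;
}

int main(void)
{
	long selected[2] = { 0, 0 };
	long faults = 200000;

	srand(1);
	for (long i = 0; i < faults; i++) {
		/* node 0 touches the page 90% of the time, node 1 only 10% */
		int nid = (rand() % 10 < 9) ? 0 : 1;
		int target = two_stage_filter(nid);

		if (target >= 0)
			selected[target]++;
	}

	/* node 1 is picked on roughly 1% of faults, node 0 on roughly 81% */
	for (int n = 0; n < 2; n++)
		printf("node %d chosen on %.1f%% of faults\n",
		       n, 100.0 * selected[n] / faults);
	return 0;
}
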
2323 | |||
2143 | static void sp_delete(struct shared_policy *sp, struct sp_node *n) | 2324 | static void sp_delete(struct shared_policy *sp, struct sp_node *n) |
2144 | { | 2325 | { |
2145 | pr_debug("deleting %lx-%lx\n", n->start, n->end); | 2326 | pr_debug("deleting %lx-%lx\n", n->start, n->end); |
@@ -2305,6 +2486,50 @@ void mpol_free_shared_policy(struct shared_policy *p) | |||
2305 | mutex_unlock(&p->mutex); | 2486 | mutex_unlock(&p->mutex); |
2306 | } | 2487 | } |
2307 | 2488 | ||
2489 | #ifdef CONFIG_NUMA_BALANCING | ||
2490 | static bool __initdata numabalancing_override; | ||
2491 | |||
2492 | static void __init check_numabalancing_enable(void) | ||
2493 | { | ||
2494 | bool numabalancing_default = false; | ||
2495 | |||
2496 | if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) | ||
2497 | numabalancing_default = true; | ||
2498 | |||
2499 | if (nr_node_ids > 1 && !numabalancing_override) { | ||
2500 | printk(KERN_INFO "Enabling automatic NUMA balancing. " | ||
2501 | "Configure with numa_balancing= or sysctl"); | ||
2502 | set_numabalancing_state(numabalancing_default); | ||
2503 | } | ||
2504 | } | ||
2505 | |||
2506 | static int __init setup_numabalancing(char *str) | ||
2507 | { | ||
2508 | int ret = 0; | ||
2509 | if (!str) | ||
2510 | goto out; | ||
2511 | numabalancing_override = true; | ||
2512 | |||
2513 | if (!strcmp(str, "enable")) { | ||
2514 | set_numabalancing_state(true); | ||
2515 | ret = 1; | ||
2516 | } else if (!strcmp(str, "disable")) { | ||
2517 | set_numabalancing_state(false); | ||
2518 | ret = 1; | ||
2519 | } | ||
2520 | out: | ||
2521 | if (!ret) | ||
2522 | printk(KERN_WARNING "Unable to parse numa_balancing=\n"); | ||
2523 | |||
2524 | return ret; | ||
2525 | } | ||
2526 | __setup("numa_balancing=", setup_numabalancing); | ||
2527 | #else | ||
2528 | static inline void __init check_numabalancing_enable(void) | ||
2529 | { | ||
2530 | } | ||
2531 | #endif /* CONFIG_NUMA_BALANCING */ | ||
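
For what it is worth, the return convention above follows the usual __setup() rule that a non-zero return marks the boot argument as handled, which is why an unrecognised value falls through to the warning. A stand-alone sketch of the same string handling (a user-space rewrite with invented names, not kernel code):

/* Stand-alone sketch of the numa_balancing= value parsing (not kernel code). */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool numabalancing_enabled;

/* Mirrors setup_numabalancing(): returns 1 only if the value was recognised. */
static int parse_numabalancing(const char *str)
{
	if (!str)
		return 0;
	if (!strcmp(str, "enable")) {
		numabalancing_enabled = true;
		return 1;
	}
	if (!strcmp(str, "disable")) {
		numabalancing_enabled = false;
		return 1;
	}
	return 0;
}

int main(void)
{
	const char *args[] = { "enable", "disable", "bogus" };

	for (int i = 0; i < 3; i++) {
		if (parse_numabalancing(args[i]))
			printf("numa_balancing=%s -> %s\n", args[i],
			       numabalancing_enabled ? "on" : "off");
		else
			printf("Unable to parse numa_balancing=%s\n", args[i]);
	}
	return 0;
}
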
2532 | |||
2308 | /* assumes fs == KERNEL_DS */ | 2533 | /* assumes fs == KERNEL_DS */ |
2309 | void __init numa_policy_init(void) | 2534 | void __init numa_policy_init(void) |
2310 | { | 2535 | { |
@@ -2320,6 +2545,15 @@ void __init numa_policy_init(void) | |||
2320 | sizeof(struct sp_node), | 2545 | sizeof(struct sp_node), |
2321 | 0, SLAB_PANIC, NULL); | 2546 | 0, SLAB_PANIC, NULL); |
2322 | 2547 | ||
2548 | for_each_node(nid) { | ||
2549 | preferred_node_policy[nid] = (struct mempolicy) { | ||
2550 | .refcnt = ATOMIC_INIT(1), | ||
2551 | .mode = MPOL_PREFERRED, | ||
2552 | .flags = MPOL_F_MOF | MPOL_F_MORON, | ||
2553 | .v = { .preferred_node = nid, }, | ||
2554 | }; | ||
2555 | } | ||
2556 | |||
2323 | /* | 2557 | /* |
2324 | * Set interleaving policy for system init. Interleaving is only | 2558 | * Set interleaving policy for system init. Interleaving is only |
2325 | * enabled across suitably sized nodes (default is >= 16MB), or | 2559 | * enabled across suitably sized nodes (default is >= 16MB), or |
@@ -2346,6 +2580,8 @@ void __init numa_policy_init(void) | |||
2346 | 2580 | ||
2347 | if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) | 2581 | if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) |
2348 | printk("numa_policy_init: interleaving failed\n"); | 2582 | printk("numa_policy_init: interleaving failed\n"); |
2583 | |||
2584 | check_numabalancing_enable(); | ||
2349 | } | 2585 | } |
2350 | 2586 | ||
2351 | /* Reset policy of current process to default */ | 2587 | /* Reset policy of current process to default */ |
@@ -2362,14 +2598,13 @@ void numa_default_policy(void) | |||
2362 | * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag | 2598 | * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag |
2363 | * Used only for mpol_parse_str() and mpol_to_str() | 2599 | * Used only for mpol_parse_str() and mpol_to_str() |
2364 | */ | 2600 | */ |
2365 | #define MPOL_LOCAL MPOL_MAX | ||
2366 | static const char * const policy_modes[] = | 2601 | static const char * const policy_modes[] = |
2367 | { | 2602 | { |
2368 | [MPOL_DEFAULT] = "default", | 2603 | [MPOL_DEFAULT] = "default", |
2369 | [MPOL_PREFERRED] = "prefer", | 2604 | [MPOL_PREFERRED] = "prefer", |
2370 | [MPOL_BIND] = "bind", | 2605 | [MPOL_BIND] = "bind", |
2371 | [MPOL_INTERLEAVE] = "interleave", | 2606 | [MPOL_INTERLEAVE] = "interleave", |
2372 | [MPOL_LOCAL] = "local" | 2607 | [MPOL_LOCAL] = "local", |
2373 | }; | 2608 | }; |
2374 | 2609 | ||
2375 | 2610 | ||
@@ -2415,12 +2650,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) | |||
2415 | if (flags) | 2650 | if (flags) |
2416 | *flags++ = '\0'; /* terminate mode string */ | 2651 | *flags++ = '\0'; /* terminate mode string */ |
2417 | 2652 | ||
2418 | for (mode = 0; mode <= MPOL_LOCAL; mode++) { | 2653 | for (mode = 0; mode < MPOL_MAX; mode++) { |
2419 | if (!strcmp(str, policy_modes[mode])) { | 2654 | if (!strcmp(str, policy_modes[mode])) { |
2420 | break; | 2655 | break; |
2421 | } | 2656 | } |
2422 | } | 2657 | } |
2423 | if (mode > MPOL_LOCAL) | 2658 | if (mode >= MPOL_MAX) |
2424 | goto out; | 2659 | goto out; |
2425 | 2660 | ||
2426 | switch (mode) { | 2661 | switch (mode) { |
diff --git a/mm/migrate.c b/mm/migrate.c index cae02711181d..32efd8028bc9 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -39,6 +39,9 @@ | |||
39 | 39 | ||
40 | #include <asm/tlbflush.h> | 40 | #include <asm/tlbflush.h> |
41 | 41 | ||
42 | #define CREATE_TRACE_POINTS | ||
43 | #include <trace/events/migrate.h> | ||
44 | |||
42 | #include "internal.h" | 45 | #include "internal.h" |
43 | 46 | ||
44 | /* | 47 | /* |
@@ -293,7 +296,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
293 | struct page *newpage, struct page *page, | 296 | struct page *newpage, struct page *page, |
294 | struct buffer_head *head, enum migrate_mode mode) | 297 | struct buffer_head *head, enum migrate_mode mode) |
295 | { | 298 | { |
296 | int expected_count; | 299 | int expected_count = 0; |
297 | void **pslot; | 300 | void **pslot; |
298 | 301 | ||
299 | if (!mapping) { | 302 | if (!mapping) { |
@@ -421,7 +424,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, | |||
421 | */ | 424 | */ |
422 | void migrate_page_copy(struct page *newpage, struct page *page) | 425 | void migrate_page_copy(struct page *newpage, struct page *page) |
423 | { | 426 | { |
424 | if (PageHuge(page)) | 427 | if (PageHuge(page) || PageTransHuge(page)) |
425 | copy_huge_page(newpage, page); | 428 | copy_huge_page(newpage, page); |
426 | else | 429 | else |
427 | copy_highpage(newpage, page); | 430 | copy_highpage(newpage, page); |
@@ -765,7 +768,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
765 | */ | 768 | */ |
766 | if (PageAnon(page)) { | 769 | if (PageAnon(page)) { |
767 | /* | 770 | /* |
768 | * Only page_lock_anon_vma() understands the subtleties of | 771 | * Only page_lock_anon_vma_read() understands the subtleties of |
769 | * getting a hold on an anon_vma from outside one of its mms. | 772 | * getting a hold on an anon_vma from outside one of its mms. |
770 | */ | 773 | */ |
771 | anon_vma = page_get_anon_vma(page); | 774 | anon_vma = page_get_anon_vma(page); |
@@ -998,10 +1001,11 @@ out: | |||
998 | */ | 1001 | */ |
999 | int migrate_pages(struct list_head *from, | 1002 | int migrate_pages(struct list_head *from, |
1000 | new_page_t get_new_page, unsigned long private, bool offlining, | 1003 | new_page_t get_new_page, unsigned long private, bool offlining, |
1001 | enum migrate_mode mode) | 1004 | enum migrate_mode mode, int reason) |
1002 | { | 1005 | { |
1003 | int retry = 1; | 1006 | int retry = 1; |
1004 | int nr_failed = 0; | 1007 | int nr_failed = 0; |
1008 | int nr_succeeded = 0; | ||
1005 | int pass = 0; | 1009 | int pass = 0; |
1006 | struct page *page; | 1010 | struct page *page; |
1007 | struct page *page2; | 1011 | struct page *page2; |
@@ -1028,6 +1032,7 @@ int migrate_pages(struct list_head *from, | |||
1028 | retry++; | 1032 | retry++; |
1029 | break; | 1033 | break; |
1030 | case MIGRATEPAGE_SUCCESS: | 1034 | case MIGRATEPAGE_SUCCESS: |
1035 | nr_succeeded++; | ||
1031 | break; | 1036 | break; |
1032 | default: | 1037 | default: |
1033 | /* Permanent failure */ | 1038 | /* Permanent failure */ |
@@ -1038,6 +1043,12 @@ int migrate_pages(struct list_head *from, | |||
1038 | } | 1043 | } |
1039 | rc = nr_failed + retry; | 1044 | rc = nr_failed + retry; |
1040 | out: | 1045 | out: |
1046 | if (nr_succeeded) | ||
1047 | count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); | ||
1048 | if (nr_failed) | ||
1049 | count_vm_events(PGMIGRATE_FAIL, nr_failed); | ||
1050 | trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason); | ||
1051 | |||
1041 | if (!swapwrite) | 1052 | if (!swapwrite) |
1042 | current->flags &= ~PF_SWAPWRITE; | 1053 | current->flags &= ~PF_SWAPWRITE; |
1043 | 1054 | ||
@@ -1176,7 +1187,8 @@ set_status: | |||
1176 | err = 0; | 1187 | err = 0; |
1177 | if (!list_empty(&pagelist)) { | 1188 | if (!list_empty(&pagelist)) { |
1178 | err = migrate_pages(&pagelist, new_page_node, | 1189 | err = migrate_pages(&pagelist, new_page_node, |
1179 | (unsigned long)pm, 0, MIGRATE_SYNC); | 1190 | (unsigned long)pm, 0, MIGRATE_SYNC, |
1191 | MR_SYSCALL); | ||
1180 | if (err) | 1192 | if (err) |
1181 | putback_lru_pages(&pagelist); | 1193 | putback_lru_pages(&pagelist); |
1182 | } | 1194 | } |
@@ -1440,4 +1452,317 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to, | |||
1440 | } | 1452 | } |
1441 | return err; | 1453 | return err; |
1442 | } | 1454 | } |
1443 | #endif | 1455 | |
1456 | #ifdef CONFIG_NUMA_BALANCING | ||
1457 | /* | ||
1458 | * Returns true if this is a safe migration target node for misplaced NUMA | ||
1459 | * pages. Currently it only checks the watermarks which crude | ||
1460 | */ | ||
1461 | static bool migrate_balanced_pgdat(struct pglist_data *pgdat, | ||
1462 | int nr_migrate_pages) | ||
1463 | { | ||
1464 | int z; | ||
1465 | for (z = pgdat->nr_zones - 1; z >= 0; z--) { | ||
1466 | struct zone *zone = pgdat->node_zones + z; | ||
1467 | |||
1468 | if (!populated_zone(zone)) | ||
1469 | continue; | ||
1470 | |||
1471 | if (zone->all_unreclaimable) | ||
1472 | continue; | ||
1473 | |||
1474 | /* Avoid waking kswapd by allocating nr_migrate_pages pages. */ | ||
1475 | if (!zone_watermark_ok(zone, 0, | ||
1476 | high_wmark_pages(zone) + | ||
1477 | nr_migrate_pages, | ||
1478 | 0, 0)) | ||
1479 | continue; | ||
1480 | return true; | ||
1481 | } | ||
1482 | return false; | ||
1483 | } | ||
1484 | |||
1485 | static struct page *alloc_misplaced_dst_page(struct page *page, | ||
1486 | unsigned long data, | ||
1487 | int **result) | ||
1488 | { | ||
1489 | int nid = (int) data; | ||
1490 | struct page *newpage; | ||
1491 | |||
1492 | newpage = alloc_pages_exact_node(nid, | ||
1493 | (GFP_HIGHUSER_MOVABLE | GFP_THISNODE | | ||
1494 | __GFP_NOMEMALLOC | __GFP_NORETRY | | ||
1495 | __GFP_NOWARN) & | ||
1496 | ~GFP_IOFS, 0); | ||
1497 | if (newpage) | ||
1498 | page_xchg_last_nid(newpage, page_last_nid(page)); | ||
1499 | |||
1500 | return newpage; | ||
1501 | } | ||
1502 | |||
1503 | /* | ||
1504 | * page migration rate limiting control. | ||
1505 | * Do not migrate more than @ratelimit_pages in a @migrate_interval_millisecs | ||
1506 | * window of time. The defaults here allow at most 1280MB to be migrated per second. | ||
1507 | * If a node is rate-limited then PTE NUMA updates are also rate-limited. However, | ||
1508 | * as it is faults that reset the window, PTE updates happen unconditionally once | ||
1509 | * @pteupdate_interval_millisecs have passed since the throttle window closed | ||
1510 | * without a fault resetting it. | ||
1511 | */ | ||
1512 | static unsigned int migrate_interval_millisecs __read_mostly = 100; | ||
1513 | static unsigned int pteupdate_interval_millisecs __read_mostly = 1000; | ||
1514 | static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT); | ||
1515 | |||
1516 | /* Returns true if NUMA migration is currently rate limited */ | ||
1517 | bool migrate_ratelimited(int node) | ||
1518 | { | ||
1519 | pg_data_t *pgdat = NODE_DATA(node); | ||
1520 | |||
1521 | if (time_after(jiffies, pgdat->numabalancing_migrate_next_window + | ||
1522 | msecs_to_jiffies(pteupdate_interval_millisecs))) | ||
1523 | return false; | ||
1524 | |||
1525 | if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages) | ||
1526 | return false; | ||
1527 | |||
1528 | return true; | ||
1529 | } | ||
1530 | |||
1531 | /* Returns true if the node is migrate rate-limited after the update */ | ||
1532 | bool numamigrate_update_ratelimit(pg_data_t *pgdat, unsigned long nr_pages) | ||
1533 | { | ||
1534 | bool rate_limited = false; | ||
1535 | |||
1536 | /* | ||
1537 | * Rate-limit the amount of data that is being migrated to a node. | ||
1538 | * Optimal placement is no good if the memory bus is saturated and | ||
1539 | * all the time is being spent migrating! | ||
1540 | */ | ||
1541 | spin_lock(&pgdat->numabalancing_migrate_lock); | ||
1542 | if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) { | ||
1543 | pgdat->numabalancing_migrate_nr_pages = 0; | ||
1544 | pgdat->numabalancing_migrate_next_window = jiffies + | ||
1545 | msecs_to_jiffies(migrate_interval_millisecs); | ||
1546 | } | ||
1547 | if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) | ||
1548 | rate_limited = true; | ||
1549 | else | ||
1550 | pgdat->numabalancing_migrate_nr_pages += nr_pages; | ||
1551 | spin_unlock(&pgdat->numabalancing_migrate_lock); | ||
1552 | |||
1553 | return rate_limited; | ||
1554 | } | ||
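
The limiter above is a fixed-window counter: the first update after the window has expired resets the budget, and once more than ratelimit_pages have been charged inside the current window, further migrations are refused until the next window opens. A stand-alone sketch of that shape (seconds instead of jiffies, invented constants, no locking, not kernel code):

/* Fixed-window rate limiter in the style of numamigrate_update_ratelimit(). */
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

#define WINDOW_SECS	1
#define BUDGET_PAGES	32768	/* ~128MB with 4K pages, like ratelimit_pages */

static time_t window_end;
static unsigned long window_pages;

/* Returns true if the caller should skip migrating @nr_pages right now. */
static bool update_ratelimit(unsigned long nr_pages)
{
	time_t now = time(NULL);

	if (now >= window_end) {		/* window expired: reset budget */
		window_end = now + WINDOW_SECS;
		window_pages = 0;
	}
	if (window_pages > BUDGET_PAGES)	/* budget already blown */
		return true;
	window_pages += nr_pages;		/* charge this request */
	return false;
}

int main(void)
{
	unsigned long refused = 0;

	for (int i = 0; i < 100000; i++)
		if (update_ratelimit(1))
			refused++;
	printf("refused %lu of 100000 single-page migrations\n", refused);
	return 0;
}
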
1555 | |||
1556 | int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) | ||
1557 | { | ||
1558 | int ret = 0; | ||
1559 | |||
1560 | /* Avoid migrating to a node that is nearly full */ | ||
1561 | if (migrate_balanced_pgdat(pgdat, 1)) { | ||
1562 | int page_lru; | ||
1563 | |||
1564 | if (isolate_lru_page(page)) { | ||
1565 | put_page(page); | ||
1566 | return 0; | ||
1567 | } | ||
1568 | |||
1569 | /* Page is isolated */ | ||
1570 | ret = 1; | ||
1571 | page_lru = page_is_file_cache(page); | ||
1572 | if (!PageTransHuge(page)) | ||
1573 | inc_zone_page_state(page, NR_ISOLATED_ANON + page_lru); | ||
1574 | else | ||
1575 | mod_zone_page_state(page_zone(page), | ||
1576 | NR_ISOLATED_ANON + page_lru, | ||
1577 | HPAGE_PMD_NR); | ||
1578 | } | ||
1579 | |||
1580 | /* | ||
1581 | * Page is either isolated or there is not enough space on the target | ||
1582 | * node. If isolated, then it has taken a reference count and the | ||
1583 | * caller's reference can be safely dropped without the page | ||
1584 | * disappearing underneath us during migration. Otherwise the page is | ||
1585 | * not to be migrated but the caller's reference should still be | ||
1586 | * dropped so it does not leak. | ||
1587 | */ | ||
1588 | put_page(page); | ||
1589 | |||
1590 | return ret; | ||
1591 | } | ||
1592 | |||
1593 | /* | ||
1594 | * Attempt to migrate a misplaced page to the specified destination | ||
1595 | * node. Caller is expected to have an elevated reference count on | ||
1596 | * the page that will be dropped by this function before returning. | ||
1597 | */ | ||
1598 | int migrate_misplaced_page(struct page *page, int node) | ||
1599 | { | ||
1600 | pg_data_t *pgdat = NODE_DATA(node); | ||
1601 | int isolated = 0; | ||
1602 | int nr_remaining; | ||
1603 | LIST_HEAD(migratepages); | ||
1604 | |||
1605 | /* | ||
1606 | * Don't migrate pages that are mapped in multiple processes. | ||
1607 | * TODO: Handle false sharing detection instead of this hammer | ||
1608 | */ | ||
1609 | if (page_mapcount(page) != 1) { | ||
1610 | put_page(page); | ||
1611 | goto out; | ||
1612 | } | ||
1613 | |||
1614 | /* | ||
1615 | * Rate-limit the amount of data that is being migrated to a node. | ||
1616 | * Optimal placement is no good if the memory bus is saturated and | ||
1617 | * all the time is being spent migrating! | ||
1618 | */ | ||
1619 | if (numamigrate_update_ratelimit(pgdat, 1)) { | ||
1620 | put_page(page); | ||
1621 | goto out; | ||
1622 | } | ||
1623 | |||
1624 | isolated = numamigrate_isolate_page(pgdat, page); | ||
1625 | if (!isolated) | ||
1626 | goto out; | ||
1627 | |||
1628 | list_add(&page->lru, &migratepages); | ||
1629 | nr_remaining = migrate_pages(&migratepages, | ||
1630 | alloc_misplaced_dst_page, | ||
1631 | node, false, MIGRATE_ASYNC, | ||
1632 | MR_NUMA_MISPLACED); | ||
1633 | if (nr_remaining) { | ||
1634 | putback_lru_pages(&migratepages); | ||
1635 | isolated = 0; | ||
1636 | } else | ||
1637 | count_vm_numa_event(NUMA_PAGE_MIGRATE); | ||
1638 | BUG_ON(!list_empty(&migratepages)); | ||
1639 | out: | ||
1640 | return isolated; | ||
1641 | } | ||
1642 | #endif /* CONFIG_NUMA_BALANCING */ | ||
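
One point from the comment above is worth restating: migrate_misplaced_page() consumes the caller's page reference on every exit path, whether or not the migration goes ahead, so the caller must not drop it again. A stand-alone sketch of that ownership rule (the fake_page type and helpers are invented for illustration and only mimic the contract, not the real isolation and migration steps):

/* Illustration of the "callee always consumes the caller's reference" rule. */
#include <stdio.h>

struct fake_page {
	int refcount;
};

static void get_page(struct fake_page *p) { p->refcount++; }
static void put_page(struct fake_page *p) { p->refcount--; }

/*
 * Mirrors the contract of migrate_misplaced_page(): every exit path drops
 * the reference the caller handed in, so the caller never put_page()s
 * afterwards regardless of the return value.
 */
static int try_migrate(struct fake_page *p, int mapcount, int node_has_room)
{
	if (mapcount != 1) {		/* shared page: refuse, but still drop */
		put_page(p);
		return 0;
	}
	if (!node_has_room) {		/* target full: refuse, but still drop */
		put_page(p);
		return 0;
	}
	/* "migrate" ... */
	put_page(p);
	return 1;
}

int main(void)
{
	struct fake_page page = { .refcount = 1 };
	int migrated;

	get_page(&page);		/* reference handed to try_migrate() */
	migrated = try_migrate(&page, 1, 1);
	printf("migrated=%d refcount=%d\n", migrated, page.refcount);

	get_page(&page);		/* shared page: refused, ref still consumed */
	migrated = try_migrate(&page, 2, 1);
	printf("migrated=%d refcount=%d\n", migrated, page.refcount);
	return 0;
}
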
1643 | |||
1644 | #if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) | ||
1645 | int migrate_misplaced_transhuge_page(struct mm_struct *mm, | ||
1646 | struct vm_area_struct *vma, | ||
1647 | pmd_t *pmd, pmd_t entry, | ||
1648 | unsigned long address, | ||
1649 | struct page *page, int node) | ||
1650 | { | ||
1651 | unsigned long haddr = address & HPAGE_PMD_MASK; | ||
1652 | pg_data_t *pgdat = NODE_DATA(node); | ||
1653 | int isolated = 0; | ||
1654 | struct page *new_page = NULL; | ||
1655 | struct mem_cgroup *memcg = NULL; | ||
1656 | int page_lru = page_is_file_cache(page); | ||
1657 | |||
1658 | /* | ||
1659 | * Don't migrate pages that are mapped in multiple processes. | ||
1660 | * TODO: Handle false sharing detection instead of this hammer | ||
1661 | */ | ||
1662 | if (page_mapcount(page) != 1) | ||
1663 | goto out_dropref; | ||
1664 | |||
1665 | /* | ||
1666 | * Rate-limit the amount of data that is being migrated to a node. | ||
1667 | * Optimal placement is no good if the memory bus is saturated and | ||
1668 | * all the time is being spent migrating! | ||
1669 | */ | ||
1670 | if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR)) | ||
1671 | goto out_dropref; | ||
1672 | |||
1673 | new_page = alloc_pages_node(node, | ||
1674 | (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER); | ||
1675 | if (!new_page) { | ||
1676 | count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); | ||
1677 | goto out_dropref; | ||
1678 | } | ||
1679 | page_xchg_last_nid(new_page, page_last_nid(page)); | ||
1680 | |||
1681 | isolated = numamigrate_isolate_page(pgdat, page); | ||
1682 | if (!isolated) { | ||
1683 | count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); | ||
1684 | put_page(new_page); | ||
1685 | goto out_keep_locked; | ||
1686 | } | ||
1687 | |||
1688 | /* Prepare a page as a migration target */ | ||
1689 | __set_page_locked(new_page); | ||
1690 | SetPageSwapBacked(new_page); | ||
1691 | |||
1692 | /* anon mapping, we can simply copy page->mapping to the new page: */ | ||
1693 | new_page->mapping = page->mapping; | ||
1694 | new_page->index = page->index; | ||
1695 | migrate_page_copy(new_page, page); | ||
1696 | WARN_ON(PageLRU(new_page)); | ||
1697 | |||
1698 | /* Recheck the target PMD */ | ||
1699 | spin_lock(&mm->page_table_lock); | ||
1700 | if (unlikely(!pmd_same(*pmd, entry))) { | ||
1701 | spin_unlock(&mm->page_table_lock); | ||
1702 | |||
1703 | /* Reverse changes made by migrate_page_copy() */ | ||
1704 | if (TestClearPageActive(new_page)) | ||
1705 | SetPageActive(page); | ||
1706 | if (TestClearPageUnevictable(new_page)) | ||
1707 | SetPageUnevictable(page); | ||
1708 | mlock_migrate_page(page, new_page); | ||
1709 | |||
1710 | unlock_page(new_page); | ||
1711 | put_page(new_page); /* Free it */ | ||
1712 | |||
1713 | unlock_page(page); | ||
1714 | putback_lru_page(page); | ||
1715 | |||
1716 | count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR); | ||
1717 | goto out; | ||
1718 | } | ||
1719 | |||
1720 | /* | ||
1721 | * Traditional migration needs to prepare the memcg charge | ||
1722 | * transaction early to prevent the old page from being | ||
1723 | * uncharged when installing migration entries. Here we can | ||
1724 | * save the potential rollback and start the charge transfer | ||
1725 | * only when migration is already known to end successfully. | ||
1726 | */ | ||
1727 | mem_cgroup_prepare_migration(page, new_page, &memcg); | ||
1728 | |||
1729 | entry = mk_pmd(new_page, vma->vm_page_prot); | ||
1730 | entry = pmd_mknonnuma(entry); | ||
1731 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | ||
1732 | entry = pmd_mkhuge(entry); | ||
1733 | |||
1734 | page_add_new_anon_rmap(new_page, vma, haddr); | ||
1735 | |||
1736 | set_pmd_at(mm, haddr, pmd, entry); | ||
1737 | update_mmu_cache_pmd(vma, address, entry); | ||
1738 | page_remove_rmap(page); | ||
1739 | /* | ||
1740 | * Finish the charge transaction under the page table lock to | ||
1741 | * prevent split_huge_page() from dividing up the charge | ||
1742 | * before it's fully transferred to the new page. | ||
1743 | */ | ||
1744 | mem_cgroup_end_migration(memcg, page, new_page, true); | ||
1745 | spin_unlock(&mm->page_table_lock); | ||
1746 | |||
1747 | unlock_page(new_page); | ||
1748 | unlock_page(page); | ||
1749 | put_page(page); /* Drop the rmap reference */ | ||
1750 | put_page(page); /* Drop the LRU isolation reference */ | ||
1751 | |||
1752 | count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR); | ||
1753 | count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR); | ||
1754 | |||
1755 | out: | ||
1756 | mod_zone_page_state(page_zone(page), | ||
1757 | NR_ISOLATED_ANON + page_lru, | ||
1758 | -HPAGE_PMD_NR); | ||
1759 | return isolated; | ||
1760 | |||
1761 | out_dropref: | ||
1762 | put_page(page); | ||
1763 | out_keep_locked: | ||
1764 | return 0; | ||
1765 | } | ||
1766 | #endif /* CONFIG_NUMA_BALANCING */ | ||
1767 | |||
1768 | #endif /* CONFIG_NUMA */ | ||
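
The huge-page path above is an optimistic prepare/recheck/commit sequence: the copy is made without the page table lock held, the lock is then retaken and pmd_same() verifies nothing changed underneath, and on a mismatch all the preparation is rolled back. The same pattern in a stand-alone sketch (a mutex and a generation counter standing in for the page table lock and the PMD check; all names invented, not kernel code):

/* Optimistic prepare-copy-recheck-commit, as in the THP migration path. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static char current_data[64] = "old contents";
static unsigned long generation;	/* bumped whenever current_data changes */

static bool migrate_copy(void)
{
	char snapshot[64];
	char new_copy[64];
	unsigned long snap_gen;

	/* Expensive work (the copy) is done without the lock held. */
	pthread_mutex_lock(&lock);
	memcpy(snapshot, current_data, sizeof(snapshot));
	snap_gen = generation;
	pthread_mutex_unlock(&lock);

	snprintf(new_copy, sizeof(new_copy), "migrated: %s", snapshot);

	/* Recheck under the lock, like the pmd_same() test. */
	pthread_mutex_lock(&lock);
	if (generation != snap_gen) {
		pthread_mutex_unlock(&lock);
		return false;		/* somebody changed it: roll back */
	}
	memcpy(current_data, new_copy, sizeof(current_data));
	generation++;
	pthread_mutex_unlock(&lock);
	return true;
}

int main(void)
{
	bool committed = migrate_copy();

	printf("commit=%d data=\"%s\"\n", committed, current_data);
	return 0;
}
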
@@ -736,7 +736,7 @@ again: remove_next = 1 + (end > next->vm_end); | |||
736 | if (anon_vma) { | 736 | if (anon_vma) { |
737 | VM_BUG_ON(adjust_next && next->anon_vma && | 737 | VM_BUG_ON(adjust_next && next->anon_vma && |
738 | anon_vma != next->anon_vma); | 738 | anon_vma != next->anon_vma); |
739 | anon_vma_lock(anon_vma); | 739 | anon_vma_lock_write(anon_vma); |
740 | anon_vma_interval_tree_pre_update_vma(vma); | 740 | anon_vma_interval_tree_pre_update_vma(vma); |
741 | if (adjust_next) | 741 | if (adjust_next) |
742 | anon_vma_interval_tree_pre_update_vma(next); | 742 | anon_vma_interval_tree_pre_update_vma(next); |
@@ -2886,15 +2886,15 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) | |||
2886 | * The LSB of head.next can't change from under us | 2886 | * The LSB of head.next can't change from under us |
2887 | * because we hold the mm_all_locks_mutex. | 2887 | * because we hold the mm_all_locks_mutex. |
2888 | */ | 2888 | */ |
2889 | mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem); | 2889 | down_write(&anon_vma->root->rwsem); |
2890 | /* | 2890 | /* |
2891 | * We can safely modify head.next after taking the | 2891 | * We can safely modify head.next after taking the |
2892 | * anon_vma->root->mutex. If some other vma in this mm shares | 2892 | * anon_vma->root->rwsem. If some other vma in this mm shares |
2893 | * the same anon_vma we won't take it again. | 2893 | * the same anon_vma we won't take it again. |
2894 | * | 2894 | * |
2895 | * No need of atomic instructions here, head.next | 2895 | * No need of atomic instructions here, head.next |
2896 | * can't change from under us thanks to the | 2896 | * can't change from under us thanks to the |
2897 | * anon_vma->root->mutex. | 2897 | * anon_vma->root->rwsem. |
2898 | */ | 2898 | */ |
2899 | if (__test_and_set_bit(0, (unsigned long *) | 2899 | if (__test_and_set_bit(0, (unsigned long *) |
2900 | &anon_vma->root->rb_root.rb_node)) | 2900 | &anon_vma->root->rb_root.rb_node)) |
@@ -2996,7 +2996,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma) | |||
2996 | * | 2996 | * |
2997 | * No need of atomic instructions here, head.next | 2997 | * No need of atomic instructions here, head.next |
2998 | * can't change from under us until we release the | 2998 | * can't change from under us until we release the |
2999 | * anon_vma->root->mutex. | 2999 | * anon_vma->root->rwsem. |
3000 | */ | 3000 | */ |
3001 | if (!__test_and_clear_bit(0, (unsigned long *) | 3001 | if (!__test_and_clear_bit(0, (unsigned long *) |
3002 | &anon_vma->root->rb_root.rb_node)) | 3002 | &anon_vma->root->rb_root.rb_node)) |
diff --git a/mm/mprotect.c b/mm/mprotect.c index e8c3938db6fa..3dca970367db 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -35,12 +35,16 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) | |||
35 | } | 35 | } |
36 | #endif | 36 | #endif |
37 | 37 | ||
38 | static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | 38 | static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, |
39 | unsigned long addr, unsigned long end, pgprot_t newprot, | 39 | unsigned long addr, unsigned long end, pgprot_t newprot, |
40 | int dirty_accountable) | 40 | int dirty_accountable, int prot_numa, bool *ret_all_same_node) |
41 | { | 41 | { |
42 | struct mm_struct *mm = vma->vm_mm; | ||
42 | pte_t *pte, oldpte; | 43 | pte_t *pte, oldpte; |
43 | spinlock_t *ptl; | 44 | spinlock_t *ptl; |
45 | unsigned long pages = 0; | ||
46 | bool all_same_node = true; | ||
47 | int last_nid = -1; | ||
44 | 48 | ||
45 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); | 49 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); |
46 | arch_enter_lazy_mmu_mode(); | 50 | arch_enter_lazy_mmu_mode(); |
@@ -48,17 +52,43 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
48 | oldpte = *pte; | 52 | oldpte = *pte; |
49 | if (pte_present(oldpte)) { | 53 | if (pte_present(oldpte)) { |
50 | pte_t ptent; | 54 | pte_t ptent; |
55 | bool updated = false; | ||
51 | 56 | ||
52 | ptent = ptep_modify_prot_start(mm, addr, pte); | 57 | ptent = ptep_modify_prot_start(mm, addr, pte); |
53 | ptent = pte_modify(ptent, newprot); | 58 | if (!prot_numa) { |
59 | ptent = pte_modify(ptent, newprot); | ||
60 | updated = true; | ||
61 | } else { | ||
62 | struct page *page; | ||
63 | |||
64 | page = vm_normal_page(vma, addr, oldpte); | ||
65 | if (page) { | ||
66 | int this_nid = page_to_nid(page); | ||
67 | if (last_nid == -1) | ||
68 | last_nid = this_nid; | ||
69 | if (last_nid != this_nid) | ||
70 | all_same_node = false; | ||
71 | |||
72 | /* only check non-shared pages */ | ||
73 | if (!pte_numa(oldpte) && | ||
74 | page_mapcount(page) == 1) { | ||
75 | ptent = pte_mknuma(ptent); | ||
76 | updated = true; | ||
77 | } | ||
78 | } | ||
79 | } | ||
54 | 80 | ||
55 | /* | 81 | /* |
56 | * Avoid taking write faults for pages we know to be | 82 | * Avoid taking write faults for pages we know to be |
57 | * dirty. | 83 | * dirty. |
58 | */ | 84 | */ |
59 | if (dirty_accountable && pte_dirty(ptent)) | 85 | if (dirty_accountable && pte_dirty(ptent)) { |
60 | ptent = pte_mkwrite(ptent); | 86 | ptent = pte_mkwrite(ptent); |
87 | updated = true; | ||
88 | } | ||
61 | 89 | ||
90 | if (updated) | ||
91 | pages++; | ||
62 | ptep_modify_prot_commit(mm, addr, pte, ptent); | 92 | ptep_modify_prot_commit(mm, addr, pte, ptent); |
63 | } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { | 93 | } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { |
64 | swp_entry_t entry = pte_to_swp_entry(oldpte); | 94 | swp_entry_t entry = pte_to_swp_entry(oldpte); |
@@ -72,18 +102,40 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
72 | set_pte_at(mm, addr, pte, | 102 | set_pte_at(mm, addr, pte, |
73 | swp_entry_to_pte(entry)); | 103 | swp_entry_to_pte(entry)); |
74 | } | 104 | } |
105 | pages++; | ||
75 | } | 106 | } |
76 | } while (pte++, addr += PAGE_SIZE, addr != end); | 107 | } while (pte++, addr += PAGE_SIZE, addr != end); |
77 | arch_leave_lazy_mmu_mode(); | 108 | arch_leave_lazy_mmu_mode(); |
78 | pte_unmap_unlock(pte - 1, ptl); | 109 | pte_unmap_unlock(pte - 1, ptl); |
110 | |||
111 | *ret_all_same_node = all_same_node; | ||
112 | return pages; | ||
79 | } | 113 | } |
80 | 114 | ||
81 | static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, | 115 | #ifdef CONFIG_NUMA_BALANCING |
116 | static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, | ||
117 | pmd_t *pmd) | ||
118 | { | ||
119 | spin_lock(&mm->page_table_lock); | ||
120 | set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd)); | ||
121 | spin_unlock(&mm->page_table_lock); | ||
122 | } | ||
123 | #else | ||
124 | static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, | ||
125 | pmd_t *pmd) | ||
126 | { | ||
127 | BUG(); | ||
128 | } | ||
129 | #endif /* CONFIG_NUMA_BALANCING */ | ||
130 | |||
131 | static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *pud, | ||
82 | unsigned long addr, unsigned long end, pgprot_t newprot, | 132 | unsigned long addr, unsigned long end, pgprot_t newprot, |
83 | int dirty_accountable) | 133 | int dirty_accountable, int prot_numa) |
84 | { | 134 | { |
85 | pmd_t *pmd; | 135 | pmd_t *pmd; |
86 | unsigned long next; | 136 | unsigned long next; |
137 | unsigned long pages = 0; | ||
138 | bool all_same_node; | ||
87 | 139 | ||
88 | pmd = pmd_offset(pud, addr); | 140 | pmd = pmd_offset(pud, addr); |
89 | do { | 141 | do { |
@@ -91,42 +143,59 @@ static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, | |||
91 | if (pmd_trans_huge(*pmd)) { | 143 | if (pmd_trans_huge(*pmd)) { |
92 | if (next - addr != HPAGE_PMD_SIZE) | 144 | if (next - addr != HPAGE_PMD_SIZE) |
93 | split_huge_page_pmd(vma, addr, pmd); | 145 | split_huge_page_pmd(vma, addr, pmd); |
94 | else if (change_huge_pmd(vma, pmd, addr, newprot)) | 146 | else if (change_huge_pmd(vma, pmd, addr, newprot, prot_numa)) { |
147 | pages += HPAGE_PMD_NR; | ||
95 | continue; | 148 | continue; |
149 | } | ||
96 | /* fall through */ | 150 | /* fall through */ |
97 | } | 151 | } |
98 | if (pmd_none_or_clear_bad(pmd)) | 152 | if (pmd_none_or_clear_bad(pmd)) |
99 | continue; | 153 | continue; |
100 | change_pte_range(vma->vm_mm, pmd, addr, next, newprot, | 154 | pages += change_pte_range(vma, pmd, addr, next, newprot, |
101 | dirty_accountable); | 155 | dirty_accountable, prot_numa, &all_same_node); |
156 | |||
157 | /* | ||
158 | * If we are changing protections for NUMA hinting faults then | ||
159 | * set pmd_numa if the examined pages were all on the same | ||
160 | * node. This allows a regular PMD to be handled as one fault | ||
161 | * and effectively batches the taking of the PTL | ||
162 | */ | ||
163 | if (prot_numa && all_same_node) | ||
164 | change_pmd_protnuma(vma->vm_mm, addr, pmd); | ||
102 | } while (pmd++, addr = next, addr != end); | 165 | } while (pmd++, addr = next, addr != end); |
166 | |||
167 | return pages; | ||
103 | } | 168 | } |
104 | 169 | ||
105 | static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, | 170 | static inline unsigned long change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, |
106 | unsigned long addr, unsigned long end, pgprot_t newprot, | 171 | unsigned long addr, unsigned long end, pgprot_t newprot, |
107 | int dirty_accountable) | 172 | int dirty_accountable, int prot_numa) |
108 | { | 173 | { |
109 | pud_t *pud; | 174 | pud_t *pud; |
110 | unsigned long next; | 175 | unsigned long next; |
176 | unsigned long pages = 0; | ||
111 | 177 | ||
112 | pud = pud_offset(pgd, addr); | 178 | pud = pud_offset(pgd, addr); |
113 | do { | 179 | do { |
114 | next = pud_addr_end(addr, end); | 180 | next = pud_addr_end(addr, end); |
115 | if (pud_none_or_clear_bad(pud)) | 181 | if (pud_none_or_clear_bad(pud)) |
116 | continue; | 182 | continue; |
117 | change_pmd_range(vma, pud, addr, next, newprot, | 183 | pages += change_pmd_range(vma, pud, addr, next, newprot, |
118 | dirty_accountable); | 184 | dirty_accountable, prot_numa); |
119 | } while (pud++, addr = next, addr != end); | 185 | } while (pud++, addr = next, addr != end); |
186 | |||
187 | return pages; | ||
120 | } | 188 | } |
121 | 189 | ||
122 | static void change_protection(struct vm_area_struct *vma, | 190 | static unsigned long change_protection_range(struct vm_area_struct *vma, |
123 | unsigned long addr, unsigned long end, pgprot_t newprot, | 191 | unsigned long addr, unsigned long end, pgprot_t newprot, |
124 | int dirty_accountable) | 192 | int dirty_accountable, int prot_numa) |
125 | { | 193 | { |
126 | struct mm_struct *mm = vma->vm_mm; | 194 | struct mm_struct *mm = vma->vm_mm; |
127 | pgd_t *pgd; | 195 | pgd_t *pgd; |
128 | unsigned long next; | 196 | unsigned long next; |
129 | unsigned long start = addr; | 197 | unsigned long start = addr; |
198 | unsigned long pages = 0; | ||
130 | 199 | ||
131 | BUG_ON(addr >= end); | 200 | BUG_ON(addr >= end); |
132 | pgd = pgd_offset(mm, addr); | 201 | pgd = pgd_offset(mm, addr); |
@@ -135,10 +204,32 @@ static void change_protection(struct vm_area_struct *vma, | |||
135 | next = pgd_addr_end(addr, end); | 204 | next = pgd_addr_end(addr, end); |
136 | if (pgd_none_or_clear_bad(pgd)) | 205 | if (pgd_none_or_clear_bad(pgd)) |
137 | continue; | 206 | continue; |
138 | change_pud_range(vma, pgd, addr, next, newprot, | 207 | pages += change_pud_range(vma, pgd, addr, next, newprot, |
139 | dirty_accountable); | 208 | dirty_accountable, prot_numa); |
140 | } while (pgd++, addr = next, addr != end); | 209 | } while (pgd++, addr = next, addr != end); |
141 | flush_tlb_range(vma, start, end); | 210 | |
211 | /* Only flush the TLB if we actually modified any entries: */ | ||
212 | if (pages) | ||
213 | flush_tlb_range(vma, start, end); | ||
214 | |||
215 | return pages; | ||
216 | } | ||
217 | |||
218 | unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, | ||
219 | unsigned long end, pgprot_t newprot, | ||
220 | int dirty_accountable, int prot_numa) | ||
221 | { | ||
222 | struct mm_struct *mm = vma->vm_mm; | ||
223 | unsigned long pages; | ||
224 | |||
225 | mmu_notifier_invalidate_range_start(mm, start, end); | ||
226 | if (is_vm_hugetlb_page(vma)) | ||
227 | pages = hugetlb_change_protection(vma, start, end, newprot); | ||
228 | else | ||
229 | pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa); | ||
230 | mmu_notifier_invalidate_range_end(mm, start, end); | ||
231 | |||
232 | return pages; | ||
142 | } | 233 | } |
143 | 234 | ||
144 | int | 235 | int |
@@ -213,12 +304,8 @@ success: | |||
213 | dirty_accountable = 1; | 304 | dirty_accountable = 1; |
214 | } | 305 | } |
215 | 306 | ||
216 | mmu_notifier_invalidate_range_start(mm, start, end); | 307 | change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable, 0); |
217 | if (is_vm_hugetlb_page(vma)) | 308 | |
218 | hugetlb_change_protection(vma, start, end, vma->vm_page_prot); | ||
219 | else | ||
220 | change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); | ||
221 | mmu_notifier_invalidate_range_end(mm, start, end); | ||
222 | vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); | 309 | vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); |
223 | vm_stat_account(mm, newflags, vma->vm_file, nrpages); | 310 | vm_stat_account(mm, newflags, vma->vm_file, nrpages); |
224 | perf_event_mmap(vma); | 311 | perf_event_mmap(vma); |
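
The reworked walkers above share one shape: count how many entries were actually modified so the final TLB flush can be skipped when nothing changed, and, in the prot_numa case, remember whether every examined page sat on the same node so the covering PMD can be marked in one step. A stand-alone sketch of that shape (flat arrays standing in for page tables, all names invented, not kernel code):

/* Sketch of "count what changed, batch when uniform" from change_pte_range(). */
#include <stdbool.h>
#include <stdio.h>

#define NR_ENTRIES 8

struct fake_pte {
	int nid;		/* node the backing page lives on */
	bool numa_hinting;	/* stands in for the _PAGE_NUMA bit */
};

/* Returns how many entries were updated; reports whether all shared a node. */
static unsigned long mark_range(struct fake_pte *pte, int n, bool *all_same_node)
{
	unsigned long pages = 0;
	int last_nid = -1;

	*all_same_node = true;
	for (int i = 0; i < n; i++) {
		if (last_nid == -1)
			last_nid = pte[i].nid;
		if (pte[i].nid != last_nid)
			*all_same_node = false;
		if (!pte[i].numa_hinting) {	/* only touch unmarked entries */
			pte[i].numa_hinting = true;
			pages++;
		}
	}
	return pages;
}

int main(void)
{
	struct fake_pte ptes[NR_ENTRIES] = { 0 };	/* all on node 0, unmarked */
	bool all_same_node;
	unsigned long pages;

	pages = mark_range(ptes, NR_ENTRIES, &all_same_node);
	if (pages)			/* only "flush" if something changed */
		printf("flush: %lu entries updated\n", pages);
	if (all_same_node)		/* batch the covering "PMD" in one go */
		printf("all on one node: mark the covering entry too\n");
	return 0;
}
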
diff --git a/mm/mremap.c b/mm/mremap.c index eabb24da6c9e..e1031e1f6a61 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -104,7 +104,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
104 | } | 104 | } |
105 | if (vma->anon_vma) { | 105 | if (vma->anon_vma) { |
106 | anon_vma = vma->anon_vma; | 106 | anon_vma = vma->anon_vma; |
107 | anon_vma_lock(anon_vma); | 107 | anon_vma_lock_write(anon_vma); |
108 | } | 108 | } |
109 | } | 109 | } |
110 | 110 | ||
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 83637dfba110..d037c8bc1512 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -611,6 +611,7 @@ static inline int free_pages_check(struct page *page) | |||
611 | bad_page(page); | 611 | bad_page(page); |
612 | return 1; | 612 | return 1; |
613 | } | 613 | } |
614 | reset_page_last_nid(page); | ||
614 | if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | 615 | if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
615 | page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; | 616 | page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; |
616 | return 0; | 617 | return 0; |
@@ -3883,6 +3884,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
3883 | mminit_verify_page_links(page, zone, nid, pfn); | 3884 | mminit_verify_page_links(page, zone, nid, pfn); |
3884 | init_page_count(page); | 3885 | init_page_count(page); |
3885 | reset_page_mapcount(page); | 3886 | reset_page_mapcount(page); |
3887 | reset_page_last_nid(page); | ||
3886 | SetPageReserved(page); | 3888 | SetPageReserved(page); |
3887 | /* | 3889 | /* |
3888 | * Mark the block movable so that blocks are reserved for | 3890 | * Mark the block movable so that blocks are reserved for |
@@ -4526,6 +4528,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4526 | int ret; | 4528 | int ret; |
4527 | 4529 | ||
4528 | pgdat_resize_init(pgdat); | 4530 | pgdat_resize_init(pgdat); |
4531 | #ifdef CONFIG_NUMA_BALANCING | ||
4532 | spin_lock_init(&pgdat->numabalancing_migrate_lock); | ||
4533 | pgdat->numabalancing_migrate_nr_pages = 0; | ||
4534 | pgdat->numabalancing_migrate_next_window = jiffies; | ||
4535 | #endif | ||
4529 | init_waitqueue_head(&pgdat->kswapd_wait); | 4536 | init_waitqueue_head(&pgdat->kswapd_wait); |
4530 | init_waitqueue_head(&pgdat->pfmemalloc_wait); | 4537 | init_waitqueue_head(&pgdat->pfmemalloc_wait); |
4531 | pgdat_page_cgroup_init(pgdat); | 4538 | pgdat_page_cgroup_init(pgdat); |
@@ -5800,7 +5807,8 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, | |||
5800 | 5807 | ||
5801 | ret = migrate_pages(&cc->migratepages, | 5808 | ret = migrate_pages(&cc->migratepages, |
5802 | alloc_migrate_target, | 5809 | alloc_migrate_target, |
5803 | 0, false, MIGRATE_SYNC); | 5810 | 0, false, MIGRATE_SYNC, |
5811 | MR_CMA); | ||
5804 | } | 5812 | } |
5805 | 5813 | ||
5806 | putback_movable_pages(&cc->migratepages); | 5814 | putback_movable_pages(&cc->migratepages); |
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index e642627da6b7..0c8323fe6c8f 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c | |||
@@ -12,8 +12,8 @@ | |||
12 | 12 | ||
13 | #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS | 13 | #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS |
14 | /* | 14 | /* |
15 | * Only sets the access flags (dirty, accessed, and | 15 | * Only sets the access flags (dirty, accessed), as well as write |
16 | * writable). Furthermore, we know it always gets set to a "more | 16 | * permission. Furthermore, we know it always gets set to a "more |
17 | * permissive" setting, which allows most architectures to optimize | 17 | * permissive" setting, which allows most architectures to optimize |
18 | * this. We return whether the PTE actually changed, which in turn | 18 | * this. We return whether the PTE actually changed, which in turn |
19 | * instructs the caller to do things like update__mmu_cache. This | 19 | * instructs the caller to do things like update__mmu_cache. This |
@@ -27,7 +27,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, | |||
27 | int changed = !pte_same(*ptep, entry); | 27 | int changed = !pte_same(*ptep, entry); |
28 | if (changed) { | 28 | if (changed) { |
29 | set_pte_at(vma->vm_mm, address, ptep, entry); | 29 | set_pte_at(vma->vm_mm, address, ptep, entry); |
30 | flush_tlb_page(vma, address); | 30 | flush_tlb_fix_spurious_fault(vma, address); |
31 | } | 31 | } |
32 | return changed; | 32 | return changed; |
33 | } | 33 | } |
@@ -88,7 +88,8 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, | |||
88 | { | 88 | { |
89 | pte_t pte; | 89 | pte_t pte; |
90 | pte = ptep_get_and_clear((vma)->vm_mm, address, ptep); | 90 | pte = ptep_get_and_clear((vma)->vm_mm, address, ptep); |
91 | flush_tlb_page(vma, address); | 91 | if (pte_accessible(pte)) |
92 | flush_tlb_page(vma, address); | ||
92 | return pte; | 93 | return pte; |
93 | } | 94 | } |
94 | #endif | 95 | #endif |
@@ -24,7 +24,7 @@ | |||
24 | * mm->mmap_sem | 24 | * mm->mmap_sem |
25 | * page->flags PG_locked (lock_page) | 25 | * page->flags PG_locked (lock_page) |
26 | * mapping->i_mmap_mutex | 26 | * mapping->i_mmap_mutex |
27 | * anon_vma->mutex | 27 | * anon_vma->rwsem |
28 | * mm->page_table_lock or pte_lock | 28 | * mm->page_table_lock or pte_lock |
29 | * zone->lru_lock (in mark_page_accessed, isolate_lru_page) | 29 | * zone->lru_lock (in mark_page_accessed, isolate_lru_page) |
30 | * swap_lock (in swap_duplicate, swap_info_get) | 30 | * swap_lock (in swap_duplicate, swap_info_get) |
@@ -37,7 +37,7 @@ | |||
37 | * in arch-dependent flush_dcache_mmap_lock, | 37 | * in arch-dependent flush_dcache_mmap_lock, |
38 | * within bdi.wb->list_lock in __sync_single_inode) | 38 | * within bdi.wb->list_lock in __sync_single_inode) |
39 | * | 39 | * |
40 | * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon) | 40 | * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon) |
41 | * ->tasklist_lock | 41 | * ->tasklist_lock |
42 | * pte map lock | 42 | * pte map lock |
43 | */ | 43 | */ |
@@ -87,24 +87,24 @@ static inline void anon_vma_free(struct anon_vma *anon_vma) | |||
87 | VM_BUG_ON(atomic_read(&anon_vma->refcount)); | 87 | VM_BUG_ON(atomic_read(&anon_vma->refcount)); |
88 | 88 | ||
89 | /* | 89 | /* |
90 | * Synchronize against page_lock_anon_vma() such that | 90 | * Synchronize against page_lock_anon_vma_read() such that |
91 | * we can safely hold the lock without the anon_vma getting | 91 | * we can safely hold the lock without the anon_vma getting |
92 | * freed. | 92 | * freed. |
93 | * | 93 | * |
94 | * Relies on the full mb implied by the atomic_dec_and_test() from | 94 | * Relies on the full mb implied by the atomic_dec_and_test() from |
95 | * put_anon_vma() against the acquire barrier implied by | 95 | * put_anon_vma() against the acquire barrier implied by |
96 | * mutex_trylock() from page_lock_anon_vma(). This orders: | 96 | * down_read_trylock() from page_lock_anon_vma_read(). This orders: |
97 | * | 97 | * |
98 | * page_lock_anon_vma() VS put_anon_vma() | 98 | * page_lock_anon_vma_read() VS put_anon_vma() |
99 | * mutex_trylock() atomic_dec_and_test() | 99 | * down_read_trylock() atomic_dec_and_test() |
100 | * LOCK MB | 100 | * LOCK MB |
101 | * atomic_read() mutex_is_locked() | 101 | * atomic_read() rwsem_is_locked() |
102 | * | 102 | * |
103 | * LOCK should suffice since the actual taking of the lock must | 103 | * LOCK should suffice since the actual taking of the lock must |
104 | * happen _before_ what follows. | 104 | * happen _before_ what follows. |
105 | */ | 105 | */ |
106 | if (mutex_is_locked(&anon_vma->root->mutex)) { | 106 | if (rwsem_is_locked(&anon_vma->root->rwsem)) { |
107 | anon_vma_lock(anon_vma); | 107 | anon_vma_lock_write(anon_vma); |
108 | anon_vma_unlock(anon_vma); | 108 | anon_vma_unlock(anon_vma); |
109 | } | 109 | } |
110 | 110 | ||
@@ -146,7 +146,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, | |||
146 | * allocate a new one. | 146 | * allocate a new one. |
147 | * | 147 | * |
148 | * Anon-vma allocations are very subtle, because we may have | 148 | * Anon-vma allocations are very subtle, because we may have |
149 | * optimistically looked up an anon_vma in page_lock_anon_vma() | 149 | * optimistically looked up an anon_vma in page_lock_anon_vma_read() |
150 | * and that may actually touch the spinlock even in the newly | 150 | * and that may actually touch the spinlock even in the newly |
151 | * allocated vma (it depends on RCU to make sure that the | 151 | * allocated vma (it depends on RCU to make sure that the |
152 | * anon_vma isn't actually destroyed). | 152 | * anon_vma isn't actually destroyed). |
@@ -181,7 +181,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) | |||
181 | allocated = anon_vma; | 181 | allocated = anon_vma; |
182 | } | 182 | } |
183 | 183 | ||
184 | anon_vma_lock(anon_vma); | 184 | anon_vma_lock_write(anon_vma); |
185 | /* page_table_lock to protect against threads */ | 185 | /* page_table_lock to protect against threads */ |
186 | spin_lock(&mm->page_table_lock); | 186 | spin_lock(&mm->page_table_lock); |
187 | if (likely(!vma->anon_vma)) { | 187 | if (likely(!vma->anon_vma)) { |
@@ -219,9 +219,9 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct | |||
219 | struct anon_vma *new_root = anon_vma->root; | 219 | struct anon_vma *new_root = anon_vma->root; |
220 | if (new_root != root) { | 220 | if (new_root != root) { |
221 | if (WARN_ON_ONCE(root)) | 221 | if (WARN_ON_ONCE(root)) |
222 | mutex_unlock(&root->mutex); | 222 | up_write(&root->rwsem); |
223 | root = new_root; | 223 | root = new_root; |
224 | mutex_lock(&root->mutex); | 224 | down_write(&root->rwsem); |
225 | } | 225 | } |
226 | return root; | 226 | return root; |
227 | } | 227 | } |
@@ -229,7 +229,7 @@ static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct | |||
229 | static inline void unlock_anon_vma_root(struct anon_vma *root) | 229 | static inline void unlock_anon_vma_root(struct anon_vma *root) |
230 | { | 230 | { |
231 | if (root) | 231 | if (root) |
232 | mutex_unlock(&root->mutex); | 232 | up_write(&root->rwsem); |
233 | } | 233 | } |
234 | 234 | ||
235 | /* | 235 | /* |
@@ -306,7 +306,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) | |||
306 | get_anon_vma(anon_vma->root); | 306 | get_anon_vma(anon_vma->root); |
307 | /* Mark this anon_vma as the one where our new (COWed) pages go. */ | 307 | /* Mark this anon_vma as the one where our new (COWed) pages go. */ |
308 | vma->anon_vma = anon_vma; | 308 | vma->anon_vma = anon_vma; |
309 | anon_vma_lock(anon_vma); | 309 | anon_vma_lock_write(anon_vma); |
310 | anon_vma_chain_link(vma, avc, anon_vma); | 310 | anon_vma_chain_link(vma, avc, anon_vma); |
311 | anon_vma_unlock(anon_vma); | 311 | anon_vma_unlock(anon_vma); |
312 | 312 | ||
@@ -349,7 +349,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma) | |||
349 | /* | 349 | /* |
350 | * Iterate the list once more, it now only contains empty and unlinked | 350 | * Iterate the list once more, it now only contains empty and unlinked |
351 | * anon_vmas, destroy them. Could not do before due to __put_anon_vma() | 351 | * anon_vmas, destroy them. Could not do before due to __put_anon_vma() |
352 | * needing to acquire the anon_vma->root->mutex. | 352 | * needing to write-acquire the anon_vma->root->rwsem. |
353 | */ | 353 | */ |
354 | list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { | 354 | list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { |
355 | struct anon_vma *anon_vma = avc->anon_vma; | 355 | struct anon_vma *anon_vma = avc->anon_vma; |
@@ -365,7 +365,7 @@ static void anon_vma_ctor(void *data) | |||
365 | { | 365 | { |
366 | struct anon_vma *anon_vma = data; | 366 | struct anon_vma *anon_vma = data; |
367 | 367 | ||
368 | mutex_init(&anon_vma->mutex); | 368 | init_rwsem(&anon_vma->rwsem); |
369 | atomic_set(&anon_vma->refcount, 0); | 369 | atomic_set(&anon_vma->refcount, 0); |
370 | anon_vma->rb_root = RB_ROOT; | 370 | anon_vma->rb_root = RB_ROOT; |
371 | } | 371 | } |
@@ -442,7 +442,7 @@ out: | |||
442 | * atomic op -- the trylock. If we fail the trylock, we fall back to getting a | 442 | * atomic op -- the trylock. If we fail the trylock, we fall back to getting a |
443 | * reference like with page_get_anon_vma() and then block on the mutex. | 443 | * reference like with page_get_anon_vma() and then block on the mutex. |
444 | */ | 444 | */ |
445 | struct anon_vma *page_lock_anon_vma(struct page *page) | 445 | struct anon_vma *page_lock_anon_vma_read(struct page *page) |
446 | { | 446 | { |
447 | struct anon_vma *anon_vma = NULL; | 447 | struct anon_vma *anon_vma = NULL; |
448 | struct anon_vma *root_anon_vma; | 448 | struct anon_vma *root_anon_vma; |
@@ -457,14 +457,14 @@ struct anon_vma *page_lock_anon_vma(struct page *page) | |||
457 | 457 | ||
458 | anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); | 458 | anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); |
459 | root_anon_vma = ACCESS_ONCE(anon_vma->root); | 459 | root_anon_vma = ACCESS_ONCE(anon_vma->root); |
460 | if (mutex_trylock(&root_anon_vma->mutex)) { | 460 | if (down_read_trylock(&root_anon_vma->rwsem)) { |
461 | /* | 461 | /* |
462 | * If the page is still mapped, then this anon_vma is still | 462 | * If the page is still mapped, then this anon_vma is still |
463 | * its anon_vma, and holding the mutex ensures that it will | 463 | * its anon_vma, and holding the mutex ensures that it will |
464 | * not go away, see anon_vma_free(). | 464 | * not go away, see anon_vma_free(). |
465 | */ | 465 | */ |
466 | if (!page_mapped(page)) { | 466 | if (!page_mapped(page)) { |
467 | mutex_unlock(&root_anon_vma->mutex); | 467 | up_read(&root_anon_vma->rwsem); |
468 | anon_vma = NULL; | 468 | anon_vma = NULL; |
469 | } | 469 | } |
470 | goto out; | 470 | goto out; |
@@ -484,15 +484,15 @@ struct anon_vma *page_lock_anon_vma(struct page *page) | |||
484 | 484 | ||
485 | /* we pinned the anon_vma, it's safe to sleep */ | 485 | /* we pinned the anon_vma, it's safe to sleep */ |
486 | rcu_read_unlock(); | 486 | rcu_read_unlock(); |
487 | anon_vma_lock(anon_vma); | 487 | anon_vma_lock_read(anon_vma); |
488 | 488 | ||
489 | if (atomic_dec_and_test(&anon_vma->refcount)) { | 489 | if (atomic_dec_and_test(&anon_vma->refcount)) { |
490 | /* | 490 | /* |
491 | * Oops, we held the last refcount, release the lock | 491 | * Oops, we held the last refcount, release the lock |
492 | * and bail -- can't simply use put_anon_vma() because | 492 | * and bail -- can't simply use put_anon_vma() because |
493 | * we'll deadlock on the anon_vma_lock() recursion. | 493 | * we'll deadlock on the anon_vma_lock_write() recursion. |
494 | */ | 494 | */ |
495 | anon_vma_unlock(anon_vma); | 495 | anon_vma_unlock_read(anon_vma); |
496 | __put_anon_vma(anon_vma); | 496 | __put_anon_vma(anon_vma); |
497 | anon_vma = NULL; | 497 | anon_vma = NULL; |
498 | } | 498 | } |
@@ -504,9 +504,9 @@ out: | |||
504 | return anon_vma; | 504 | return anon_vma; |
505 | } | 505 | } |
506 | 506 | ||
507 | void page_unlock_anon_vma(struct anon_vma *anon_vma) | 507 | void page_unlock_anon_vma_read(struct anon_vma *anon_vma) |
508 | { | 508 | { |
509 | anon_vma_unlock(anon_vma); | 509 | anon_vma_unlock_read(anon_vma); |
510 | } | 510 | } |
511 | 511 | ||
512 | /* | 512 | /* |
@@ -744,7 +744,7 @@ static int page_referenced_anon(struct page *page, | |||
744 | struct anon_vma_chain *avc; | 744 | struct anon_vma_chain *avc; |
745 | int referenced = 0; | 745 | int referenced = 0; |
746 | 746 | ||
747 | anon_vma = page_lock_anon_vma(page); | 747 | anon_vma = page_lock_anon_vma_read(page); |
748 | if (!anon_vma) | 748 | if (!anon_vma) |
749 | return referenced; | 749 | return referenced; |
750 | 750 | ||
@@ -766,7 +766,7 @@ static int page_referenced_anon(struct page *page, | |||
766 | break; | 766 | break; |
767 | } | 767 | } |
768 | 768 | ||
769 | page_unlock_anon_vma(anon_vma); | 769 | page_unlock_anon_vma_read(anon_vma); |
770 | return referenced; | 770 | return referenced; |
771 | } | 771 | } |
772 | 772 | ||
@@ -1315,7 +1315,7 @@ out_mlock: | |||
1315 | /* | 1315 | /* |
1316 | * We need mmap_sem locking. Otherwise the VM_LOCKED check gives an | 1316 | * We need mmap_sem locking. Otherwise the VM_LOCKED check gives an |
1317 | * unstable, racy result. Plus, we can't wait here because | 1317 | * unstable, racy result. Plus, we can't wait here because |
1318 | * we now hold anon_vma->mutex or mapping->i_mmap_mutex. | 1318 | * we now hold anon_vma->rwsem or mapping->i_mmap_mutex. |
1319 | * if the trylock failed, the page remains in the evictable lru and later | 1319 | * if the trylock failed, the page remains in the evictable lru and later |
1320 | * vmscan could retry to move the page to unevictable lru if the | 1320 | * vmscan could retry to move the page to unevictable lru if the |
1321 | * page is actually mlocked. | 1321 | * page is actually mlocked. |
@@ -1480,7 +1480,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | |||
1480 | struct anon_vma_chain *avc; | 1480 | struct anon_vma_chain *avc; |
1481 | int ret = SWAP_AGAIN; | 1481 | int ret = SWAP_AGAIN; |
1482 | 1482 | ||
1483 | anon_vma = page_lock_anon_vma(page); | 1483 | anon_vma = page_lock_anon_vma_read(page); |
1484 | if (!anon_vma) | 1484 | if (!anon_vma) |
1485 | return ret; | 1485 | return ret; |
1486 | 1486 | ||
@@ -1507,7 +1507,7 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | |||
1507 | break; | 1507 | break; |
1508 | } | 1508 | } |
1509 | 1509 | ||
1510 | page_unlock_anon_vma(anon_vma); | 1510 | page_unlock_anon_vma_read(anon_vma); |
1511 | return ret; | 1511 | return ret; |
1512 | } | 1512 | } |
1513 | 1513 | ||
@@ -1702,7 +1702,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | |||
1702 | int ret = SWAP_AGAIN; | 1702 | int ret = SWAP_AGAIN; |
1703 | 1703 | ||
1704 | /* | 1704 | /* |
1705 | * Note: remove_migration_ptes() cannot use page_lock_anon_vma() | 1705 | * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read() |
1706 | * because that depends on page_mapped(); but not all its usages | 1706 | * because that depends on page_mapped(); but not all its usages |
1707 | * are holding mmap_sem. Users without mmap_sem are required to | 1707 | * are holding mmap_sem. Users without mmap_sem are required to |
1708 | * take a reference count to prevent the anon_vma disappearing | 1708 | * take a reference count to prevent the anon_vma disappearing |
@@ -1710,7 +1710,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | |||
1710 | anon_vma = page_anon_vma(page); | 1710 | anon_vma = page_anon_vma(page); |
1711 | if (!anon_vma) | 1711 | if (!anon_vma) |
1712 | return ret; | 1712 | return ret; |
1713 | anon_vma_lock(anon_vma); | 1713 | anon_vma_lock_read(anon_vma); |
1714 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { | 1714 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { |
1715 | struct vm_area_struct *vma = avc->vma; | 1715 | struct vm_area_struct *vma = avc->vma; |
1716 | unsigned long address = vma_address(page, vma); | 1716 | unsigned long address = vma_address(page, vma); |
@@ -1718,7 +1718,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | |||
1718 | if (ret != SWAP_AGAIN) | 1718 | if (ret != SWAP_AGAIN) |
1719 | break; | 1719 | break; |
1720 | } | 1720 | } |
1721 | anon_vma_unlock(anon_vma); | 1721 | anon_vma_unlock_read(anon_vma); |
1722 | return ret; | 1722 | return ret; |
1723 | } | 1723 | } |
1724 | 1724 | ||
diff --git a/mm/vmstat.c b/mm/vmstat.c index df14808f0a36..9800306c8195 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -774,10 +774,20 @@ const char * const vmstat_text[] = { | |||
774 | 774 | ||
775 | "pgrotated", | 775 | "pgrotated", |
776 | 776 | ||
777 | #ifdef CONFIG_NUMA_BALANCING | ||
778 | "numa_pte_updates", | ||
779 | "numa_hint_faults", | ||
780 | "numa_hint_faults_local", | ||
781 | "numa_pages_migrated", | ||
782 | #endif | ||
783 | #ifdef CONFIG_MIGRATION | ||
784 | "pgmigrate_success", | ||
785 | "pgmigrate_fail", | ||
786 | #endif | ||
777 | #ifdef CONFIG_COMPACTION | 787 | #ifdef CONFIG_COMPACTION |
778 | "compact_blocks_moved", | 788 | "compact_migrate_scanned", |
779 | "compact_pages_moved", | 789 | "compact_free_scanned", |
780 | "compact_pagemigrate_failed", | 790 | "compact_isolated", |
781 | "compact_stall", | 791 | "compact_stall", |
782 | "compact_fail", | 792 | "compact_fail", |
783 | "compact_success", | 793 | "compact_success", |