author     Linus Torvalds <torvalds@linux-foundation.org>   2012-12-16 17:33:25 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2012-12-16 18:18:08 -0500
commit     3d59eebc5e137bd89c6351e4c70e90ba1d0dc234 (patch)
tree       b4ddfd0b057454a7437a3b4e3074a3b8b4b03817 /include
parent     11520e5e7c1855fc3bf202bb3be35a39d9efa034 (diff)
parent     4fc3f1d66b1ef0d7b8dc11f4ff1cc510f78b37d6 (diff)
Merge tag 'balancenuma-v11' of git://git.kernel.org/pub/scm/linux/kernel/git/mel/linux-balancenuma
Pull Automatic NUMA Balancing bare-bones from Mel Gorman:
 "There are three implementations for NUMA balancing, this tree
  (balancenuma), numacore which has been developed in tip/master and
  autonuma which is in aa.git.

  In almost all respects balancenuma is the dumbest of the three
  because its main impact is on the VM side with no attempt to be
  smart about scheduling.  In the interest of getting the ball
  rolling, it would be desirable to see this much merged for 3.8 with
  the view to building scheduler smarts on top and adapting the VM
  where required for 3.9.

  The most recent set of comparisons available from different people
  are

    mel:    https://lkml.org/lkml/2012/12/9/108
    mingo:  https://lkml.org/lkml/2012/12/7/331
    tglx:   https://lkml.org/lkml/2012/12/10/437
    srikar: https://lkml.org/lkml/2012/12/10/397

  The results are a mixed bag.  In my own tests, balancenuma does
  reasonably well.  It's dumb as rocks and does not regress against
  mainline.  On the other hand, Ingo's tests show that balancenuma is
  incapable of converging for the workloads driven by perf, which is
  bad but is potentially explained by the lack of scheduler smarts.
  Thomas' results show balancenuma improves on mainline but falls far
  short of numacore or autonuma.  Srikar's results indicate we all
  suffer on a large machine with imbalanced node sizes.

  My own testing showed that recent numacore results have improved
  dramatically, particularly in the last week, but not universally.
  We've butted heads heavily on system CPU usage and high levels of
  migration even when it shows that overall performance is better.
  There are also cases where it regresses.  Of interest is that for
  specjbb in some configurations it will regress for lower numbers of
  warehouses and show gains for higher numbers, which is not reported
  by the tool by default and sometimes missed in reports.  Recently I
  reported for numacore that the JVM was crashing with
  NullPointerExceptions, but currently it's unclear what the source
  of this problem is.  Initially I thought it was in how numacore
  handles PTEs in batch, but I no longer think this is the case.
  It's possible numacore is just able to trigger it due to higher
  rates of migration.

  These reports were quite late in the cycle so I/we would like to
  start with this tree as it contains much of the code we can agree
  on and has not changed significantly over the last 2-3 weeks."

* tag 'balancenuma-v11' of git://git.kernel.org/pub/scm/linux/kernel/git/mel/linux-balancenuma: (50 commits)
  mm/rmap, migration: Make rmap_walk_anon() and try_to_unmap_anon() more scalable
  mm/rmap: Convert the struct anon_vma::mutex to an rwsem
  mm: migrate: Account a transhuge page properly when rate limiting
  mm: numa: Account for failed allocations and isolations as migration failures
  mm: numa: Add THP migration for the NUMA working set scanning fault case build fix
  mm: numa: Add THP migration for the NUMA working set scanning fault case.
  mm: sched: numa: Delay PTE scanning until a task is scheduled on a new node
  mm: sched: numa: Control enabling and disabling of NUMA balancing if !SCHED_DEBUG
  mm: sched: numa: Control enabling and disabling of NUMA balancing
  mm: sched: Adapt the scanning rate if a NUMA hinting fault does not migrate
  mm: numa: Use a two-stage filter to restrict pages being migrated for unlikely task<->node relationships
  mm: numa: migrate: Set last_nid on newly allocated page
  mm: numa: split_huge_page: Transfer last_nid on tail page
  mm: numa: Introduce last_nid to the page frame
  sched: numa: Slowly increase the scanning period as NUMA faults are handled
  mm: numa: Rate limit setting of pte_numa if node is saturated
  mm: numa: Rate limit the amount of memory that is migrated between nodes
  mm: numa: Structures for Migrate On Fault per NUMA migration rate limiting
  mm: numa: Migrate pages handled during a pmd_numa hinting fault
  mm: numa: Migrate on reference policy
  ...
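For orientation before the header changes below: the mechanism the commits above implement can be sketched using only the interfaces this merge adds. This is an illustrative simplification, not the mm/memory.c handler being merged; handle_pte_numa_fault() is a made-up name, and locking, TLB flushing and return-value handling are all elided.

/* Illustrative sketch only -- not the fault handler merged in mm/memory.c. */
static int handle_pte_numa_fault(struct mm_struct *mm, struct vm_area_struct *vma,
				 unsigned long addr, pte_t *ptep, pte_t pte)
{
	struct page *page = vm_normal_page(vma, addr, pte);
	int target_nid;
	bool migrated = false;

	/* Make the PTE present again so the task can make progress. */
	set_pte_at(mm, addr, ptep, pte_mknonnuma(pte));

	if (!page)
		return 0;

	/* Ask the memory policy layer whether the page is on the wrong node. */
	target_nid = mpol_misplaced(page, vma, addr);
	if (target_nid != -1 && !migrate_ratelimited(target_nid)) {
		/* Sketch: treat a positive return as "page was moved". */
		migrated = migrate_misplaced_page(page, target_nid) > 0;
	}

	/* Feed the hinting fault into the scheduler's per-task statistics. */
	task_numa_fault(migrated ? target_nid : page_to_nid(page), 1, migrated);
	return 0;
}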
Diffstat (limited to 'include')
-rw-r--r--  include/asm-generic/pgtable.h   | 110
-rw-r--r--  include/linux/huge_mm.h         |  16
-rw-r--r--  include/linux/hugetlb.h         |   8
-rw-r--r--  include/linux/mempolicy.h       |   8
-rw-r--r--  include/linux/migrate.h         |  46
-rw-r--r--  include/linux/mm.h              |  39
-rw-r--r--  include/linux/mm_types.h        |  31
-rw-r--r--  include/linux/mmzone.h          |  13
-rw-r--r--  include/linux/rmap.h            |  33
-rw-r--r--  include/linux/sched.h           |  27
-rw-r--r--  include/linux/vm_event_item.h   |  12
-rw-r--r--  include/linux/vmstat.h          |   8
-rw-r--r--  include/trace/events/migrate.h  |  51
-rw-r--r--  include/uapi/linux/mempolicy.h  |  15
14 files changed, 396 insertions, 21 deletions
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 284e80831d2c..701beab27aab 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -219,6 +219,10 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
 #define move_pte(pte, prot, old_addr, new_addr)	(pte)
 #endif
 
+#ifndef pte_accessible
+# define pte_accessible(pte)		((void)(pte),1)
+#endif
+
 #ifndef flush_tlb_fix_spurious_fault
 #define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address)
 #endif
@@ -580,6 +584,112 @@ static inline int pmd_trans_unstable(pmd_t *pmd)
 #endif
 }
 
+#ifdef CONFIG_NUMA_BALANCING
+#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
+/*
+ * _PAGE_NUMA works identical to _PAGE_PROTNONE (it's actually the
+ * same bit too). It's set only when _PAGE_PRESENT is not set and it's
+ * never set if _PAGE_PRESENT is set.
+ *
+ * pte/pmd_present() returns true if pte/pmd_numa returns true. Page
+ * fault triggers on those regions if pte/pmd_numa returns true
+ * (because _PAGE_PRESENT is not set).
+ */
+#ifndef pte_numa
+static inline int pte_numa(pte_t pte)
+{
+	return (pte_flags(pte) &
+		(_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA;
+}
+#endif
+
+#ifndef pmd_numa
+static inline int pmd_numa(pmd_t pmd)
+{
+	return (pmd_flags(pmd) &
+		(_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA;
+}
+#endif
+
+/*
+ * pte/pmd_mknuma sets the _PAGE_ACCESSED bitflag automatically
+ * because they're called by the NUMA hinting minor page fault. If we
+ * wouldn't set the _PAGE_ACCESSED bitflag here, the TLB miss handler
+ * would be forced to set it later while filling the TLB after we
+ * return to userland. That would trigger a second write to memory
+ * that we optimize away by setting _PAGE_ACCESSED here.
+ */
+#ifndef pte_mknonnuma
+static inline pte_t pte_mknonnuma(pte_t pte)
+{
+	pte = pte_clear_flags(pte, _PAGE_NUMA);
+	return pte_set_flags(pte, _PAGE_PRESENT|_PAGE_ACCESSED);
+}
+#endif
+
+#ifndef pmd_mknonnuma
+static inline pmd_t pmd_mknonnuma(pmd_t pmd)
+{
+	pmd = pmd_clear_flags(pmd, _PAGE_NUMA);
+	return pmd_set_flags(pmd, _PAGE_PRESENT|_PAGE_ACCESSED);
+}
+#endif
+
+#ifndef pte_mknuma
+static inline pte_t pte_mknuma(pte_t pte)
+{
+	pte = pte_set_flags(pte, _PAGE_NUMA);
+	return pte_clear_flags(pte, _PAGE_PRESENT);
+}
+#endif
+
+#ifndef pmd_mknuma
+static inline pmd_t pmd_mknuma(pmd_t pmd)
+{
+	pmd = pmd_set_flags(pmd, _PAGE_NUMA);
+	return pmd_clear_flags(pmd, _PAGE_PRESENT);
+}
+#endif
+#else
+extern int pte_numa(pte_t pte);
+extern int pmd_numa(pmd_t pmd);
+extern pte_t pte_mknonnuma(pte_t pte);
+extern pmd_t pmd_mknonnuma(pmd_t pmd);
+extern pte_t pte_mknuma(pte_t pte);
+extern pmd_t pmd_mknuma(pmd_t pmd);
+#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
+#else
+static inline int pmd_numa(pmd_t pmd)
+{
+	return 0;
+}
+
+static inline int pte_numa(pte_t pte)
+{
+	return 0;
+}
+
+static inline pte_t pte_mknonnuma(pte_t pte)
+{
+	return pte;
+}
+
+static inline pmd_t pmd_mknonnuma(pmd_t pmd)
+{
+	return pmd;
+}
+
+static inline pte_t pte_mknuma(pte_t pte)
+{
+	return pte;
+}
+
+static inline pmd_t pmd_mknuma(pmd_t pmd)
+{
+	return pmd;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 #endif /* CONFIG_MMU */
 
 #endif /* !__ASSEMBLY__ */
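The helpers above are deliberately tiny: all the scanner in this series has to do per PTE is flip _PAGE_PRESENT off and _PAGE_NUMA on. A minimal sketch of that step (the function name is made up; the real change_prot_numa()/change_protection() path batches the TLB flushing per range, which is omitted here):

/* Sketch: mark one present PTE so the next access takes a hinting fault. */
static void make_pte_numa_hint(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	pte_t pte = *ptep;

	if (!pte_present(pte) || pte_numa(pte))
		return;				/* nothing to do */

	/* Present -> NUMA: _PAGE_PRESENT cleared, _PAGE_NUMA set. */
	set_pte_at(mm, addr, ptep, pte_mknuma(pte));

	/* The fault handler later undoes this with pte_mknonnuma(). */
}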
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 092dc5305a32..1d76f8ca90f0 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -31,7 +31,8 @@ extern int move_huge_pmd(struct vm_area_struct *vma,
 			 unsigned long new_addr, unsigned long old_end,
 			 pmd_t *old_pmd, pmd_t *new_pmd);
 extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
-			unsigned long addr, pgprot_t newprot);
+			unsigned long addr, pgprot_t newprot,
+			int prot_numa);
 
 enum transparent_hugepage_flag {
 	TRANSPARENT_HUGEPAGE_FLAG,
@@ -111,7 +112,7 @@ extern void __split_huge_page_pmd(struct vm_area_struct *vma,
 #define wait_split_huge_page(__anon_vma, __pmd)				\
 	do {								\
 		pmd_t *____pmd = (__pmd);				\
-		anon_vma_lock(__anon_vma);				\
+		anon_vma_lock_write(__anon_vma);			\
 		anon_vma_unlock(__anon_vma);				\
 		BUG_ON(pmd_trans_splitting(*____pmd) ||			\
 		       pmd_trans_huge(*____pmd));			\
@@ -171,6 +172,10 @@ static inline struct page *compound_trans_head(struct page *page)
 	}
 	return page;
 }
+
+extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+				 unsigned long addr, pmd_t pmd, pmd_t *pmdp);
+
 #else /* CONFIG_TRANSPARENT_HUGEPAGE */
 #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
 #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
@@ -209,6 +214,13 @@ static inline int pmd_trans_huge_lock(pmd_t *pmd,
 {
 	return 0;
 }
+
+static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+					unsigned long addr, pmd_t pmd, pmd_t *pmdp)
+{
+	return 0;
+}
+
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 #endif /* _LINUX_HUGE_MM_H */
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 3e7fa1acf09c..0c80d3f57a5b 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -87,7 +87,7 @@ struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
 				pud_t *pud, int write);
 int pmd_huge(pmd_t pmd);
 int pud_huge(pud_t pmd);
-void hugetlb_change_protection(struct vm_area_struct *vma,
+unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
 		unsigned long address, unsigned long end, pgprot_t newprot);
 
 #else /* !CONFIG_HUGETLB_PAGE */
@@ -132,7 +132,11 @@ static inline void copy_huge_page(struct page *dst, struct page *src)
 {
 }
 
-#define hugetlb_change_protection(vma, address, end, newprot)
+static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
+		unsigned long address, unsigned long end, pgprot_t newprot)
+{
+	return 0;
+}
 
 static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb,
 			struct vm_area_struct *vma, unsigned long start,
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index dbd212723b74..9adc270de7ef 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -188,6 +188,8 @@ static inline int vma_migratable(struct vm_area_struct *vma)
 	return 1;
 }
 
+extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long);
+
 #else
 
 struct mempolicy {};
@@ -307,5 +309,11 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol,
 	return 0;
 }
 
+static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma,
+				 unsigned long address)
+{
+	return -1; /* no node preference */
+}
+
 #endif /* CONFIG_NUMA */
 #endif
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 0b5865c61efd..1e9f627967a3 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -23,6 +23,15 @@ typedef struct page *new_page_t(struct page *, unsigned long private, int **);
 #define MIGRATEPAGE_BALLOON_SUCCESS	1 /* special ret code for balloon page
 					   * sucessful migration case.
 					   */
+enum migrate_reason {
+	MR_COMPACTION,
+	MR_MEMORY_FAILURE,
+	MR_MEMORY_HOTPLUG,
+	MR_SYSCALL,		/* also applies to cpusets */
+	MR_MEMPOLICY_MBIND,
+	MR_NUMA_MISPLACED,
+	MR_CMA
+};
 
 #ifdef CONFIG_MIGRATION
 
@@ -32,7 +41,7 @@ extern int migrate_page(struct address_space *,
 			struct page *, struct page *, enum migrate_mode);
 extern int migrate_pages(struct list_head *l, new_page_t x,
 			unsigned long private, bool offlining,
-			enum migrate_mode mode);
+			enum migrate_mode mode, int reason);
 extern int migrate_huge_page(struct page *, new_page_t x,
 			unsigned long private, bool offlining,
 			enum migrate_mode mode);
@@ -54,7 +63,7 @@ static inline void putback_lru_pages(struct list_head *l) {}
 static inline void putback_movable_pages(struct list_head *l) {}
 static inline int migrate_pages(struct list_head *l, new_page_t x,
 		unsigned long private, bool offlining,
-		enum migrate_mode mode) { return -ENOSYS; }
+		enum migrate_mode mode, int reason) { return -ENOSYS; }
 static inline int migrate_huge_page(struct page *page, new_page_t x,
 		unsigned long private, bool offlining,
 		enum migrate_mode mode) { return -ENOSYS; }
@@ -83,4 +92,37 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
 #define fail_migrate_page NULL
 
 #endif /* CONFIG_MIGRATION */
+
+#ifdef CONFIG_NUMA_BALANCING
+extern int migrate_misplaced_page(struct page *page, int node);
+extern int migrate_misplaced_page(struct page *page, int node);
+extern bool migrate_ratelimited(int node);
+#else
+static inline int migrate_misplaced_page(struct page *page, int node)
+{
+	return -EAGAIN; /* can't migrate now */
+}
+static inline bool migrate_ratelimited(int node)
+{
+	return false;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
+#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
+extern int migrate_misplaced_transhuge_page(struct mm_struct *mm,
+			struct vm_area_struct *vma,
+			pmd_t *pmd, pmd_t entry,
+			unsigned long address,
+			struct page *page, int node);
+#else
+static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm,
+			struct vm_area_struct *vma,
+			pmd_t *pmd, pmd_t entry,
+			unsigned long address,
+			struct page *page, int node)
+{
+	return -EAGAIN;
+}
+#endif /* CONFIG_NUMA_BALANCING && CONFIG_TRANSPARENT_HUGEPAGE*/
+
 #endif /* _LINUX_MIGRATE_H */
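The new reason argument is threaded through every migrate_pages() caller so the tracepoint and vmstat counters added later in this series can attribute migrations. A hedged sketch of how a NUMA-balancing caller would drive it; numa_alloc_dst() and migrate_one_misplaced_page() are stand-in names, not functions from this merge:

/* Sketch: allocation callback matching the new_page_t typedef above. */
static struct page *numa_alloc_dst(struct page *page, unsigned long node, int **x)
{
	return alloc_pages_exact_node((int)node,
				      GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0);
}

static int migrate_one_misplaced_page(struct list_head *pagelist, int target_nid)
{
	int nr_failed;

	nr_failed = migrate_pages(pagelist, numa_alloc_dst, target_nid,
				  false /* offlining */, MIGRATE_ASYNC,
				  MR_NUMA_MISPLACED);
	if (nr_failed)
		putback_lru_pages(pagelist);

	return nr_failed ? -EAGAIN : 0;
}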
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 4af4f0b1be4c..7f4f906190bd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -693,6 +693,36 @@ static inline int page_to_nid(const struct page *page)
 }
 #endif
 
+#ifdef CONFIG_NUMA_BALANCING
+static inline int page_xchg_last_nid(struct page *page, int nid)
+{
+	return xchg(&page->_last_nid, nid);
+}
+
+static inline int page_last_nid(struct page *page)
+{
+	return page->_last_nid;
+}
+static inline void reset_page_last_nid(struct page *page)
+{
+	page->_last_nid = -1;
+}
+#else
+static inline int page_xchg_last_nid(struct page *page, int nid)
+{
+	return page_to_nid(page);
+}
+
+static inline int page_last_nid(struct page *page)
+{
+	return page_to_nid(page);
+}
+
+static inline void reset_page_last_nid(struct page *page)
+{
+}
+#endif
+
 static inline struct zone *page_zone(const struct page *page)
 {
 	return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
@@ -1078,6 +1108,9 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma,
 extern unsigned long do_mremap(unsigned long addr,
 			       unsigned long old_len, unsigned long new_len,
 			       unsigned long flags, unsigned long new_addr);
+extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
+			      unsigned long end, pgprot_t newprot,
+			      int dirty_accountable, int prot_numa);
 extern int mprotect_fixup(struct vm_area_struct *vma,
 			  struct vm_area_struct **pprev, unsigned long start,
 			  unsigned long end, unsigned long newflags);
@@ -1579,6 +1612,11 @@ static inline pgprot_t vm_get_page_prot(unsigned long vm_flags)
 }
 #endif
 
+#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
+unsigned long change_prot_numa(struct vm_area_struct *vma,
+			unsigned long start, unsigned long end);
+#endif
+
 struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
 int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
 			unsigned long pfn, unsigned long size, pgprot_t);
@@ -1600,6 +1638,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address,
 #define FOLL_MLOCK	0x40	/* mark page as mlocked */
 #define FOLL_SPLIT	0x80	/* don't return transhuge pages, split them */
 #define FOLL_HWPOISON	0x100	/* check page is hwpoisoned */
+#define FOLL_NUMA	0x200	/* force NUMA hinting page fault */
 
 typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
 			void *data);
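page_xchg_last_nid() is what makes the "two-stage filter" commit in the list above cheap: the policy code only treats a page as misplaced once the same node has faulted on it twice in a row. A sketch of that test, simplified from what the series' mpol_misplaced() does for MPOL_F_MORON policies (the function name here is made up):

/* Sketch: true only on the second consecutive fault from this node. */
static bool two_stage_filter_hit(struct page *page, int this_nid)
{
	int last_nid = page_xchg_last_nid(page, this_nid);

	/* First fault from this node just records it; no migration yet. */
	return last_nid == this_nid;
}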
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7ade2731b5d6..7d9ebb7cc982 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -175,6 +175,10 @@ struct page {
 	 */
 	void *shadow;
 #endif
+
+#ifdef CONFIG_NUMA_BALANCING
+	int _last_nid;
+#endif
 }
 /*
  * The struct page can be forced to be double word aligned so that atomic ops
@@ -411,9 +415,36 @@ struct mm_struct {
 #ifdef CONFIG_CPUMASK_OFFSTACK
 	struct cpumask cpumask_allocation;
 #endif
+#ifdef CONFIG_NUMA_BALANCING
+	/*
+	 * numa_next_scan is the next time when the PTEs will be marked
+	 * pte_numa to gather statistics and migrate pages to new nodes
+	 * if necessary
+	 */
+	unsigned long numa_next_scan;
+
+	/* numa_next_reset is when the PTE scanner period will be reset */
+	unsigned long numa_next_reset;
+
+	/* Restart point for scanning and setting pte_numa */
+	unsigned long numa_scan_offset;
+
+	/* numa_scan_seq prevents two threads setting pte_numa */
+	int numa_scan_seq;
+
+	/*
+	 * The first node a task was scheduled on. If a task runs on
+	 * a different node than Make PTE Scan Go Now.
+	 */
+	int first_nid;
+#endif
 	struct uprobes_state uprobes_state;
 };
 
+/* first nid will either be a valid NID or one of these values */
+#define NUMA_PTE_SCAN_INIT	-1
+#define NUMA_PTE_SCAN_ACTIVE	-2
+
 static inline void mm_init_cpumask(struct mm_struct *mm)
 {
 #ifdef CONFIG_CPUMASK_OFFSTACK
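numa_next_scan is compared against jiffies by the per-task scanner, and cmpxchg() ensures only one thread of the mm re-arms the window. A hedged sketch of that gate (the real logic lives in the series' task_numa_work() in kernel/sched/fair.c; the helper name below is made up):

/* Sketch: has this mm's next NUMA scan window arrived, and did we claim it? */
static bool numa_scan_due(struct mm_struct *mm, unsigned int period_ms)
{
	unsigned long now = jiffies;
	unsigned long cur = mm->numa_next_scan;
	unsigned long next = now + msecs_to_jiffies(period_ms);

	if (time_before(now, cur))
		return false;

	/* Only one thread may win the cmpxchg and perform this scan. */
	return cmpxchg(&mm->numa_next_scan, cur, next) == cur;
}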
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index cd55dad56aac..4bec5be82cab 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -735,6 +735,19 @@ typedef struct pglist_data {
 	struct task_struct *kswapd;	/* Protected by lock_memory_hotplug() */
 	int kswapd_max_order;
 	enum zone_type classzone_idx;
+#ifdef CONFIG_NUMA_BALANCING
+	/*
+	 * Lock serializing the per destination node AutoNUMA memory
+	 * migration rate limiting data.
+	 */
+	spinlock_t numabalancing_migrate_lock;
+
+	/* Rate limiting time interval */
+	unsigned long numabalancing_migrate_next_window;
+
+	/* Number of pages migrated during the rate limiting time interval */
+	unsigned long numabalancing_migrate_nr_pages;
+#endif
 } pg_data_t;
 
 #define node_present_pages(nid)	(NODE_DATA(nid)->node_present_pages)
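These three fields implement a simple per-destination-node rate limit: a time window plus a page counter under a spinlock. A sketch of the idea, not the mm/migrate.c code from this series; the limit and window values would come from the rate-limiting defaults the series adds:

/* Sketch: may we still migrate nr_pages to this node in the current window? */
static bool numa_migrate_within_limit(pg_data_t *pgdat, unsigned long nr_pages,
				      unsigned long limit, unsigned long window)
{
	bool ok;

	spin_lock(&pgdat->numabalancing_migrate_lock);
	if (time_after(jiffies, pgdat->numabalancing_migrate_next_window)) {
		/* New interval: reset the counter and move the window on. */
		pgdat->numabalancing_migrate_nr_pages = 0;
		pgdat->numabalancing_migrate_next_window = jiffies + window;
	}
	ok = pgdat->numabalancing_migrate_nr_pages + nr_pages <= limit;
	if (ok)
		pgdat->numabalancing_migrate_nr_pages += nr_pages;
	spin_unlock(&pgdat->numabalancing_migrate_lock);

	return ok;
}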
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index bfe1f4780644..c20635c527a9 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -7,7 +7,7 @@
 #include <linux/list.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
-#include <linux/mutex.h>
+#include <linux/rwsem.h>
 #include <linux/memcontrol.h>
 
 /*
@@ -25,8 +25,8 @@
  * pointing to this anon_vma once its vma list is empty.
  */
 struct anon_vma {
-	struct anon_vma *root;	/* Root of this anon_vma tree */
-	struct mutex mutex;	/* Serialize access to vma list */
+	struct anon_vma *root;		/* Root of this anon_vma tree */
+	struct rw_semaphore rwsem;	/* W: modification, R: walking the list */
 	/*
 	 * The refcount is taken on an anon_vma when there is no
 	 * guarantee that the vma of page tables will exist for
@@ -64,7 +64,7 @@ struct anon_vma_chain {
 	struct vm_area_struct *vma;
 	struct anon_vma *anon_vma;
 	struct list_head same_vma;   /* locked by mmap_sem & page_table_lock */
-	struct rb_node rb;			/* locked by anon_vma->mutex */
+	struct rb_node rb;			/* locked by anon_vma->rwsem */
 	unsigned long rb_subtree_last;
 #ifdef CONFIG_DEBUG_VM_RB
 	unsigned long cached_vma_start, cached_vma_last;
@@ -108,26 +108,37 @@ static inline void vma_lock_anon_vma(struct vm_area_struct *vma)
 {
 	struct anon_vma *anon_vma = vma->anon_vma;
 	if (anon_vma)
-		mutex_lock(&anon_vma->root->mutex);
+		down_write(&anon_vma->root->rwsem);
 }
 
 static inline void vma_unlock_anon_vma(struct vm_area_struct *vma)
 {
 	struct anon_vma *anon_vma = vma->anon_vma;
 	if (anon_vma)
-		mutex_unlock(&anon_vma->root->mutex);
+		up_write(&anon_vma->root->rwsem);
 }
 
-static inline void anon_vma_lock(struct anon_vma *anon_vma)
+static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
 {
-	mutex_lock(&anon_vma->root->mutex);
+	down_write(&anon_vma->root->rwsem);
 }
 
 static inline void anon_vma_unlock(struct anon_vma *anon_vma)
 {
-	mutex_unlock(&anon_vma->root->mutex);
+	up_write(&anon_vma->root->rwsem);
 }
 
+static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
+{
+	down_read(&anon_vma->root->rwsem);
+}
+
+static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
+{
+	up_read(&anon_vma->root->rwsem);
+}
+
+
 /*
  * anon_vma helper functions.
  */
@@ -220,8 +231,8 @@ int try_to_munlock(struct page *);
 /*
  * Called by memory-failure.c to kill processes.
  */
-struct anon_vma *page_lock_anon_vma(struct page *page);
-void page_unlock_anon_vma(struct anon_vma *anon_vma);
+struct anon_vma *page_lock_anon_vma_read(struct page *page);
+void page_unlock_anon_vma_read(struct anon_vma *anon_vma);
 int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
 
 /*
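The point of the mutex-to-rwsem conversion is that rmap walkers (reclaim, migration, try_to_unmap_anon()) can now hold the anon_vma lock shared, while mmap/munmap-style modifications still take it exclusively. From a walker's perspective, a sketch (the walk itself is elided):

/* Sketch: several walkers may now be inside this section concurrently. */
static void walk_one_anon_page(struct page *page)
{
	struct anon_vma *anon_vma = page_lock_anon_vma_read(page);

	if (!anon_vma)
		return;		/* not anonymous, or anon_vma already gone */

	/* ... walk the anon_vma interval tree under the shared lock ... */

	page_unlock_anon_vma_read(anon_vma);
}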
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2c2f3072beef..b089c92c609b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1527,6 +1527,14 @@ struct task_struct {
 	short il_next;
 	short pref_node_fork;
 #endif
+#ifdef CONFIG_NUMA_BALANCING
+	int numa_scan_seq;
+	int numa_migrate_seq;
+	unsigned int numa_scan_period;
+	u64 node_stamp;			/* migration stamp */
+	struct callback_head numa_work;
+#endif /* CONFIG_NUMA_BALANCING */
+
 	struct rcu_head rcu;
 
 	/*
@@ -1601,6 +1609,18 @@ struct task_struct {
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
 #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
 
+#ifdef CONFIG_NUMA_BALANCING
+extern void task_numa_fault(int node, int pages, bool migrated);
+extern void set_numabalancing_state(bool enabled);
+#else
+static inline void task_numa_fault(int node, int pages, bool migrated)
+{
+}
+static inline void set_numabalancing_state(bool enabled)
+{
+}
+#endif
+
 /*
  * Priority of a process goes from 0..MAX_PRIO-1, valid RT
  * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
@@ -2030,6 +2050,13 @@ enum sched_tunable_scaling {
 };
 extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
 
+extern unsigned int sysctl_numa_balancing_scan_delay;
+extern unsigned int sysctl_numa_balancing_scan_period_min;
+extern unsigned int sysctl_numa_balancing_scan_period_max;
+extern unsigned int sysctl_numa_balancing_scan_period_reset;
+extern unsigned int sysctl_numa_balancing_scan_size;
+extern unsigned int sysctl_numa_balancing_settle_count;
+
 #ifdef CONFIG_SCHED_DEBUG
 extern unsigned int sysctl_sched_migration_cost;
 extern unsigned int sysctl_sched_nr_migrate;
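The scan-period sysctls tie into the "Adapt the scanning rate if a NUMA hinting fault does not migrate" and "Slowly increase the scanning period" commits above: when hinting faults stop producing migrations, the per-task numa_scan_period is allowed to grow toward the max, and numa_next_reset periodically pulls it back toward the min. A hedged sketch of that backoff only (the increment below is a placeholder, not the value the series uses):

/* Sketch: back off the per-task scan rate when pages are already well placed. */
static void adapt_scan_period(struct task_struct *p, bool migrated)
{
	if (!migrated)
		p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
					  p->numa_scan_period + 10);
	/* numa_next_reset / scan_period_reset periodically undo this backoff. */
}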
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index fe786f07d2bd..fce0a2799d43 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -38,8 +38,18 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		KSWAPD_LOW_WMARK_HIT_QUICKLY, KSWAPD_HIGH_WMARK_HIT_QUICKLY,
 		KSWAPD_SKIP_CONGESTION_WAIT,
 		PAGEOUTRUN, ALLOCSTALL, PGROTATED,
+#ifdef CONFIG_NUMA_BALANCING
+		NUMA_PTE_UPDATES,
+		NUMA_HINT_FAULTS,
+		NUMA_HINT_FAULTS_LOCAL,
+		NUMA_PAGE_MIGRATE,
+#endif
+#ifdef CONFIG_MIGRATION
+		PGMIGRATE_SUCCESS, PGMIGRATE_FAIL,
+#endif
 #ifdef CONFIG_COMPACTION
-		COMPACTBLOCKS, COMPACTPAGES, COMPACTPAGEFAILED,
+		COMPACTMIGRATE_SCANNED, COMPACTFREE_SCANNED,
+		COMPACTISOLATED,
 		COMPACTSTALL, COMPACTFAIL, COMPACTSUCCESS,
 #endif
 #ifdef CONFIG_HUGETLB_PAGE
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 92a86b2cce33..a13291f7da88 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -80,6 +80,14 @@ static inline void vm_events_fold_cpu(int cpu)
 
 #endif /* CONFIG_VM_EVENT_COUNTERS */
 
+#ifdef CONFIG_NUMA_BALANCING
+#define count_vm_numa_event(x)     count_vm_event(x)
+#define count_vm_numa_events(x, y) count_vm_events(x, y)
+#else
+#define count_vm_numa_event(x) do {} while (0)
+#define count_vm_numa_events(x, y) do {} while (0)
+#endif /* CONFIG_NUMA_BALANCING */
+
 #define __count_zone_vm_events(item, zone, delta) \
 	__count_vm_events(item##_NORMAL - ZONE_NORMAL + \
 	zone_idx(zone), delta)
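count_vm_numa_event()/count_vm_numa_events() wrap the new counters from vm_event_item.h above so the accounting compiles away entirely without CONFIG_NUMA_BALANCING. The expected call pattern in the hinting-fault path, as a sketch (the helper name is made up):

/* Sketch: account one hinting fault, noting whether it was node-local. */
static void account_numa_hint_fault(struct page *page)
{
	count_vm_numa_event(NUMA_HINT_FAULTS);
	if (page_to_nid(page) == numa_node_id())
		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
}

The scanner side would count ranges in bulk with count_vm_numa_events(NUMA_PTE_UPDATES, nr), and completed migrations accumulate under NUMA_PAGE_MIGRATE, all visible through /proc/vmstat.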
diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h
new file mode 100644
index 000000000000..ec2a6ccfd7e5
--- /dev/null
+++ b/include/trace/events/migrate.h
@@ -0,0 +1,51 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM migrate
+
+#if !defined(_TRACE_MIGRATE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_MIGRATE_H
+
+#define MIGRATE_MODE						\
+	{MIGRATE_ASYNC,		"MIGRATE_ASYNC"},		\
+	{MIGRATE_SYNC_LIGHT,	"MIGRATE_SYNC_LIGHT"},		\
+	{MIGRATE_SYNC,		"MIGRATE_SYNC"}
+
+#define MIGRATE_REASON						\
+	{MR_COMPACTION,		"compaction"},			\
+	{MR_MEMORY_FAILURE,	"memory_failure"},		\
+	{MR_MEMORY_HOTPLUG,	"memory_hotplug"},		\
+	{MR_SYSCALL,		"syscall_or_cpuset"},		\
+	{MR_MEMPOLICY_MBIND,	"mempolicy_mbind"},		\
+	{MR_CMA,		"cma"}
+
+TRACE_EVENT(mm_migrate_pages,
+
+	TP_PROTO(unsigned long succeeded, unsigned long failed,
+		 enum migrate_mode mode, int reason),
+
+	TP_ARGS(succeeded, failed, mode, reason),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,		succeeded)
+		__field(	unsigned long,		failed)
+		__field(	enum migrate_mode,	mode)
+		__field(	int,			reason)
+	),
+
+	TP_fast_assign(
+		__entry->succeeded	= succeeded;
+		__entry->failed		= failed;
+		__entry->mode		= mode;
+		__entry->reason		= reason;
+	),
+
+	TP_printk("nr_succeeded=%lu nr_failed=%lu mode=%s reason=%s",
+		__entry->succeeded,
+		__entry->failed,
+		__print_symbolic(__entry->mode, MIGRATE_MODE),
+		__print_symbolic(__entry->reason, MIGRATE_REASON))
+);
+
+#endif /* _TRACE_MIGRATE_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
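The event is meant to be emitted once per migrate_pages() call with the success/failure totals and the new reason code; note that the MIGRATE_REASON string table above does not yet carry an entry for MR_NUMA_MISPLACED. Roughly what the caller side looks like after this series, combined with the PGMIGRATE_* counters from vm_event_item.h (a sketch; the wrapper name is made up):

/* Sketch: TRACE_EVENT(mm_migrate_pages, ...) generates trace_mm_migrate_pages(). */
static void report_migration_result(unsigned long nr_succeeded, unsigned long nr_failed,
				    enum migrate_mode mode, int reason)
{
	count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
	count_vm_events(PGMIGRATE_FAIL, nr_failed);
	trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);
}

Once merged, the event can be enabled at run time through
/sys/kernel/debug/tracing/events/migrate/mm_migrate_pages/enable.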
diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
index 23e62e0537e2..0d11c3dcd3a1 100644
--- a/include/uapi/linux/mempolicy.h
+++ b/include/uapi/linux/mempolicy.h
@@ -20,6 +20,7 @@ enum {
 	MPOL_PREFERRED,
 	MPOL_BIND,
 	MPOL_INTERLEAVE,
+	MPOL_LOCAL,
 	MPOL_MAX,	/* always last member of enum */
 };
 
@@ -47,9 +48,15 @@ enum mpol_rebind_step {
 
 /* Flags for mbind */
 #define MPOL_MF_STRICT	(1<<0)	/* Verify existing pages in the mapping */
-#define MPOL_MF_MOVE	(1<<1)	/* Move pages owned by this process to conform to mapping */
-#define MPOL_MF_MOVE_ALL (1<<2)	/* Move every page to conform to mapping */
-#define MPOL_MF_INTERNAL (1<<3)	/* Internal flags start here */
+#define MPOL_MF_MOVE	 (1<<1)	/* Move pages owned by this process to conform
+				   to policy */
+#define MPOL_MF_MOVE_ALL (1<<2)	/* Move every page to conform to policy */
+#define MPOL_MF_LAZY	 (1<<3)	/* Modifies '_MOVE:  lazy migrate on fault */
+#define MPOL_MF_INTERNAL (1<<4)	/* Internal flags start here */
+
+#define MPOL_MF_VALID	(MPOL_MF_STRICT   |	\
+			 MPOL_MF_MOVE     |	\
+			 MPOL_MF_MOVE_ALL)
 
 /*
  * Internal flags that share the struct mempolicy flags word with
@@ -59,6 +66,8 @@ enum mpol_rebind_step {
 #define MPOL_F_SHARED  (1 << 0)	/* identify shared policies */
 #define MPOL_F_LOCAL   (1 << 1)	/* preferred local allocation */
 #define MPOL_F_REBINDING (1 << 2)	/* identify policies in rebinding */
+#define MPOL_F_MOF	(1 << 3) /* this policy wants migrate on fault */
+#define MPOL_F_MORON	(1 << 4) /* Migrate On pte_numa Reference On Node */
 
 
 #endif /* _UAPI_LINUX_MEMPOLICY_H */
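MPOL_MF_VALID gives the mbind() path a single mask to screen user-supplied flags against; note that MPOL_MF_LAZY is not part of MPOL_MF_VALID as defined above, while the internal MPOL_F_MOF/MPOL_F_MORON bits select migrate-on-fault behaviour from inside the kernel. A sketch of the kind of check such a mask supports, simplified and not claimed to be the exact do_mbind() code:

/* Sketch: screen mbind() flags from userspace against the valid mask. */
static long check_mbind_flags(unsigned long flags)
{
	if (flags & ~(unsigned long)MPOL_MF_VALID)
		return -EINVAL;

	/* Moving pages of shared mappings needs extra privilege. */
	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
		return -EPERM;

	return 0;
}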