aboutsummaryrefslogtreecommitdiffstats
path: root/mm/mprotect.c
diff options
context:
space:
mode:
author: Linus Torvalds <torvalds@linux-foundation.org> 2012-12-16 17:33:25 -0500
committer: Linus Torvalds <torvalds@linux-foundation.org> 2012-12-16 18:18:08 -0500
commit: 3d59eebc5e137bd89c6351e4c70e90ba1d0dc234 (patch)
tree: b4ddfd0b057454a7437a3b4e3074a3b8b4b03817 /mm/mprotect.c
parent: 11520e5e7c1855fc3bf202bb3be35a39d9efa034 (diff)
parent: 4fc3f1d66b1ef0d7b8dc11f4ff1cc510f78b37d6 (diff)
Merge tag 'balancenuma-v11' of git://git.kernel.org/pub/scm/linux/kernel/git/mel/linux-balancenuma
Pull Automatic NUMA Balancing bare-bones from Mel Gorman: "There are three implementations for NUMA balancing, this tree (balancenuma), numacore which has been developed in tip/master and autonuma which is in aa.git. In almost all respects balancenuma is the dumbest of the three because its main impact is on the VM side with no attempt to be smart about scheduling. In the interest of getting the ball rolling, it would be desirable to see this much merged for 3.8 with the view to building scheduler smarts on top and adapting the VM where required for 3.9. The most recent set of comparisons available from different people are mel: https://lkml.org/lkml/2012/12/9/108 mingo: https://lkml.org/lkml/2012/12/7/331 tglx: https://lkml.org/lkml/2012/12/10/437 srikar: https://lkml.org/lkml/2012/12/10/397 The results are a mixed bag. In my own tests, balancenuma does reasonably well. It's dumb as rocks and does not regress against mainline. On the other hand, Ingo's tests show that balancenuma is incapable of converging for the workloads driven by perf which is bad but is potentially explained by the lack of scheduler smarts. Thomas' results show balancenuma improves on mainline but falls far short of numacore or autonuma. Srikar's results indicate we all suffer on a large machine with imbalanced node sizes. My own testing showed that recent numacore results have improved dramatically, particularly in the last week but not universally. We've butted heads heavily on system CPU usage and high levels of migration even when it shows that overall performance is better. There are also cases where it regresses. Of interest is that for specjbb in some configurations it will regress for lower numbers of warehouses and show gains for higher numbers which is not reported by the tool by default and sometimes missed in reports. Recently I reported for numacore that the JVM was crashing with NullPointerExceptions but currently it's unclear what the source of this problem is. 
Initially I thought it was in how numacore batch handles PTEs but I no longer think this is the case. It's possible numacore is just able to trigger it due to higher rates of migration. These reports were quite late in the cycle so I/we would like to start with this tree as it contains much of the code we can agree on and has not changed significantly over the last 2-3 weeks." * tag 'balancenuma-v11' of git://git.kernel.org/pub/scm/linux/kernel/git/mel/linux-balancenuma: (50 commits) mm/rmap, migration: Make rmap_walk_anon() and try_to_unmap_anon() more scalable mm/rmap: Convert the struct anon_vma::mutex to an rwsem mm: migrate: Account a transhuge page properly when rate limiting mm: numa: Account for failed allocations and isolations as migration failures mm: numa: Add THP migration for the NUMA working set scanning fault case build fix mm: numa: Add THP migration for the NUMA working set scanning fault case. mm: sched: numa: Delay PTE scanning until a task is scheduled on a new node mm: sched: numa: Control enabling and disabling of NUMA balancing if !SCHED_DEBUG mm: sched: numa: Control enabling and disabling of NUMA balancing mm: sched: Adapt the scanning rate if a NUMA hinting fault does not migrate mm: numa: Use a two-stage filter to restrict pages being migrated for unlikely task<->node relationships mm: numa: migrate: Set last_nid on newly allocated page mm: numa: split_huge_page: Transfer last_nid on tail page mm: numa: Introduce last_nid to the page frame sched: numa: Slowly increase the scanning period as NUMA faults are handled mm: numa: Rate limit setting of pte_numa if node is saturated mm: numa: Rate limit the amount of memory that is migrated between nodes mm: numa: Structures for Migrate On Fault per NUMA migration rate limiting mm: numa: Migrate pages handled during a pmd_numa hinting fault mm: numa: Migrate on reference policy ...
Diffstat (limited to 'mm/mprotect.c')
-rw-r--r-- mm/mprotect.c 135
1 files changed, 111 insertions, 24 deletions
diff --git a/mm/mprotect.c b/mm/mprotect.c
index e8c3938db6fa..3dca970367db 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -35,12 +35,16 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
35} 35}
36#endif 36#endif
37 37
38static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, 38static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
39 unsigned long addr, unsigned long end, pgprot_t newprot, 39 unsigned long addr, unsigned long end, pgprot_t newprot,
40 int dirty_accountable) 40 int dirty_accountable, int prot_numa, bool *ret_all_same_node)
41{ 41{
42 struct mm_struct *mm = vma->vm_mm;
42 pte_t *pte, oldpte; 43 pte_t *pte, oldpte;
43 spinlock_t *ptl; 44 spinlock_t *ptl;
45 unsigned long pages = 0;
46 bool all_same_node = true;
47 int last_nid = -1;
44 48
45 pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 49 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
46 arch_enter_lazy_mmu_mode(); 50 arch_enter_lazy_mmu_mode();
@@ -48,17 +52,43 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
48 oldpte = *pte; 52 oldpte = *pte;
49 if (pte_present(oldpte)) { 53 if (pte_present(oldpte)) {
50 pte_t ptent; 54 pte_t ptent;
55 bool updated = false;
51 56
52 ptent = ptep_modify_prot_start(mm, addr, pte); 57 ptent = ptep_modify_prot_start(mm, addr, pte);
53 ptent = pte_modify(ptent, newprot); 58 if (!prot_numa) {
59 ptent = pte_modify(ptent, newprot);
60 updated = true;
61 } else {
62 struct page *page;
63
64 page = vm_normal_page(vma, addr, oldpte);
65 if (page) {
66 int this_nid = page_to_nid(page);
67 if (last_nid == -1)
68 last_nid = this_nid;
69 if (last_nid != this_nid)
70 all_same_node = false;
71
72 /* only check non-shared pages */
73 if (!pte_numa(oldpte) &&
74 page_mapcount(page) == 1) {
75 ptent = pte_mknuma(ptent);
76 updated = true;
77 }
78 }
79 }
54 80
55 /* 81 /*
56 * Avoid taking write faults for pages we know to be 82 * Avoid taking write faults for pages we know to be
57 * dirty. 83 * dirty.
58 */ 84 */
59 if (dirty_accountable && pte_dirty(ptent)) 85 if (dirty_accountable && pte_dirty(ptent)) {
60 ptent = pte_mkwrite(ptent); 86 ptent = pte_mkwrite(ptent);
87 updated = true;
88 }
61 89
90 if (updated)
91 pages++;
62 ptep_modify_prot_commit(mm, addr, pte, ptent); 92 ptep_modify_prot_commit(mm, addr, pte, ptent);
63 } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) { 93 } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
64 swp_entry_t entry = pte_to_swp_entry(oldpte); 94 swp_entry_t entry = pte_to_swp_entry(oldpte);
@@ -72,18 +102,40 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
72 set_pte_at(mm, addr, pte, 102 set_pte_at(mm, addr, pte,
73 swp_entry_to_pte(entry)); 103 swp_entry_to_pte(entry));
74 } 104 }
105 pages++;
75 } 106 }
76 } while (pte++, addr += PAGE_SIZE, addr != end); 107 } while (pte++, addr += PAGE_SIZE, addr != end);
77 arch_leave_lazy_mmu_mode(); 108 arch_leave_lazy_mmu_mode();
78 pte_unmap_unlock(pte - 1, ptl); 109 pte_unmap_unlock(pte - 1, ptl);
110
111 *ret_all_same_node = all_same_node;
112 return pages;
79} 113}
80 114
81static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, 115#ifdef CONFIG_NUMA_BALANCING
116static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
117 pmd_t *pmd)
118{
119 spin_lock(&mm->page_table_lock);
120 set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd));
121 spin_unlock(&mm->page_table_lock);
122}
123#else
124static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
125 pmd_t *pmd)
126{
127 BUG();
128}
129#endif /* CONFIG_NUMA_BALANCING */
130
131static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *pud,
82 unsigned long addr, unsigned long end, pgprot_t newprot, 132 unsigned long addr, unsigned long end, pgprot_t newprot,
83 int dirty_accountable) 133 int dirty_accountable, int prot_numa)
84{ 134{
85 pmd_t *pmd; 135 pmd_t *pmd;
86 unsigned long next; 136 unsigned long next;
137 unsigned long pages = 0;
138 bool all_same_node;
87 139
88 pmd = pmd_offset(pud, addr); 140 pmd = pmd_offset(pud, addr);
89 do { 141 do {
@@ -91,42 +143,59 @@ static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud,
91 if (pmd_trans_huge(*pmd)) { 143 if (pmd_trans_huge(*pmd)) {
92 if (next - addr != HPAGE_PMD_SIZE) 144 if (next - addr != HPAGE_PMD_SIZE)
93 split_huge_page_pmd(vma, addr, pmd); 145 split_huge_page_pmd(vma, addr, pmd);
94 else if (change_huge_pmd(vma, pmd, addr, newprot)) 146 else if (change_huge_pmd(vma, pmd, addr, newprot, prot_numa)) {
147 pages += HPAGE_PMD_NR;
95 continue; 148 continue;
149 }
96 /* fall through */ 150 /* fall through */
97 } 151 }
98 if (pmd_none_or_clear_bad(pmd)) 152 if (pmd_none_or_clear_bad(pmd))
99 continue; 153 continue;
100 change_pte_range(vma->vm_mm, pmd, addr, next, newprot, 154 pages += change_pte_range(vma, pmd, addr, next, newprot,
101 dirty_accountable); 155 dirty_accountable, prot_numa, &all_same_node);
156
157 /*
158 * If we are changing protections for NUMA hinting faults then
159 * set pmd_numa if the examined pages were all on the same
160 * node. This allows a regular PMD to be handled as one fault
161 * and effectively batches the taking of the PTL
162 */
163 if (prot_numa && all_same_node)
164 change_pmd_protnuma(vma->vm_mm, addr, pmd);
102 } while (pmd++, addr = next, addr != end); 165 } while (pmd++, addr = next, addr != end);
166
167 return pages;
103} 168}
104 169
105static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, 170static inline unsigned long change_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
106 unsigned long addr, unsigned long end, pgprot_t newprot, 171 unsigned long addr, unsigned long end, pgprot_t newprot,
107 int dirty_accountable) 172 int dirty_accountable, int prot_numa)
108{ 173{
109 pud_t *pud; 174 pud_t *pud;
110 unsigned long next; 175 unsigned long next;
176 unsigned long pages = 0;
111 177
112 pud = pud_offset(pgd, addr); 178 pud = pud_offset(pgd, addr);
113 do { 179 do {
114 next = pud_addr_end(addr, end); 180 next = pud_addr_end(addr, end);
115 if (pud_none_or_clear_bad(pud)) 181 if (pud_none_or_clear_bad(pud))
116 continue; 182 continue;
117 change_pmd_range(vma, pud, addr, next, newprot, 183 pages += change_pmd_range(vma, pud, addr, next, newprot,
118 dirty_accountable); 184 dirty_accountable, prot_numa);
119 } while (pud++, addr = next, addr != end); 185 } while (pud++, addr = next, addr != end);
186
187 return pages;
120} 188}
121 189
122static void change_protection(struct vm_area_struct *vma, 190static unsigned long change_protection_range(struct vm_area_struct *vma,
123 unsigned long addr, unsigned long end, pgprot_t newprot, 191 unsigned long addr, unsigned long end, pgprot_t newprot,
124 int dirty_accountable) 192 int dirty_accountable, int prot_numa)
125{ 193{
126 struct mm_struct *mm = vma->vm_mm; 194 struct mm_struct *mm = vma->vm_mm;
127 pgd_t *pgd; 195 pgd_t *pgd;
128 unsigned long next; 196 unsigned long next;
129 unsigned long start = addr; 197 unsigned long start = addr;
198 unsigned long pages = 0;
130 199
131 BUG_ON(addr >= end); 200 BUG_ON(addr >= end);
132 pgd = pgd_offset(mm, addr); 201 pgd = pgd_offset(mm, addr);
@@ -135,10 +204,32 @@ static void change_protection(struct vm_area_struct *vma,
135 next = pgd_addr_end(addr, end); 204 next = pgd_addr_end(addr, end);
136 if (pgd_none_or_clear_bad(pgd)) 205 if (pgd_none_or_clear_bad(pgd))
137 continue; 206 continue;
138 change_pud_range(vma, pgd, addr, next, newprot, 207 pages += change_pud_range(vma, pgd, addr, next, newprot,
139 dirty_accountable); 208 dirty_accountable, prot_numa);
140 } while (pgd++, addr = next, addr != end); 209 } while (pgd++, addr = next, addr != end);
141 flush_tlb_range(vma, start, end); 210
211 /* Only flush the TLB if we actually modified any entries: */
212 if (pages)
213 flush_tlb_range(vma, start, end);
214
215 return pages;
216}
217
218unsigned long change_protection(struct vm_area_struct *vma, unsigned long start,
219 unsigned long end, pgprot_t newprot,
220 int dirty_accountable, int prot_numa)
221{
222 struct mm_struct *mm = vma->vm_mm;
223 unsigned long pages;
224
225 mmu_notifier_invalidate_range_start(mm, start, end);
226 if (is_vm_hugetlb_page(vma))
227 pages = hugetlb_change_protection(vma, start, end, newprot);
228 else
229 pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa);
230 mmu_notifier_invalidate_range_end(mm, start, end);
231
232 return pages;
142} 233}
143 234
144int 235int
@@ -213,12 +304,8 @@ success:
213 dirty_accountable = 1; 304 dirty_accountable = 1;
214 } 305 }
215 306
216 mmu_notifier_invalidate_range_start(mm, start, end); 307 change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable, 0);
217 if (is_vm_hugetlb_page(vma)) 308
218 hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
219 else
220 change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
221 mmu_notifier_invalidate_range_end(mm, start, end);
222 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); 309 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
223 vm_stat_account(mm, newflags, vma->vm_file, nrpages); 310 vm_stat_account(mm, newflags, vma->vm_file, nrpages);
224 perf_event_mmap(vma); 311 perf_event_mmap(vma);