path: root/mm/memory.c
author	Hugh Dickins <hugh@veritas.com>	2005-04-19 16:29:15 -0400
committer	Linus Torvalds <torvalds@ppc970.osdl.org.(none)>	2005-04-19 16:29:15 -0400
commit	e0da382c92626ad1d7f4b7527d19b80104d67a83 (patch)
tree	b3f455518c286ee14cb2755ced8808487bca7911 /mm/memory.c
parent	9f6c6fc505560465be0964eb4da1b6ca97bd3951 (diff)
[PATCH] freepgt: free_pgtables use vma list
Recent woes with some arches needing their own pgd_addr_end macro; and the 4-level clear_page_range regression since 2.6.10's clear_page_tables; and its long-standing, well-known inefficiency in searching throughout the higher-level page tables for those few entries to clear and free: all can be blamed on ignoring the list of vmas when we free page tables.

Replace exit_mmap's clear_page_range of the total user address space by free_pgtables operating on the mm's vma list; unmap_region uses it in the same way, giving floor and ceiling beyond which it may not free tables. This brings lmbench fork/exec/sh numbers back to 2.6.10 (unless preempt is enabled, in which case latency fixes spoil unmap_vmas throughput).

Beware: the do_mmap_pgoff driver failure case must now use unmap_region instead of zap_page_range, since a page table might have been allocated, and can only be freed while it is touched by some vma.

Move free_pgtables from mmap.c to memory.c, where its lower levels are adapted from the clear_page_range levels. (Most of free_pgtables' old code was actually for a non-existent case: prev not properly set up, dating from before hch gave us split_vma.)

Pass mmu_gather** in the public interfaces, since we might want to add latency lockdrops later; but no attempt to do so yet: going by vma should itself reduce latency.

But what if is_hugepage_only_range? Those ia64 and ppc64 cases need careful examination: put that off until a later patch of the series.

What of x86_64's 32-bit vdso page, which __map_syscall32 maps outside any vma? And the range to sparc64's flush_tlb_pgtables? It's less clear to me now that we need to do more than is done here: every PMD_SIZE ever occupied will be flushed, but do we really have to flush every PGDIR_SIZE ever partially occupied? A shame to complicate it unnecessarily.

Special thanks to David Miller for time spent repairing my ceilings.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'mm/memory.c')
-rw-r--r--	mm/memory.c	152
1 file changed, 115 insertions(+), 37 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index fb6e5deb873a..fee5dc8fc36c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -110,87 +110,165 @@ void pmd_clear_bad(pmd_t *pmd)
  * Note: this doesn't free the actual pages themselves. That
  * has been handled earlier when unmapping all the memory regions.
  */
-static inline void clear_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
-				unsigned long addr, unsigned long end)
+static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
 {
-	if (!((addr | end) & ~PMD_MASK)) {
-		/* Only free fully aligned ranges */
-		struct page *page = pmd_page(*pmd);
-		pmd_clear(pmd);
-		dec_page_state(nr_page_table_pages);
-		tlb->mm->nr_ptes--;
-		pte_free_tlb(tlb, page);
-	}
+	struct page *page = pmd_page(*pmd);
+	pmd_clear(pmd);
+	pte_free_tlb(tlb, page);
+	dec_page_state(nr_page_table_pages);
+	tlb->mm->nr_ptes--;
 }
 
-static inline void clear_pmd_range(struct mmu_gather *tlb, pud_t *pud,
-				unsigned long addr, unsigned long end)
+static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
+				unsigned long addr, unsigned long end,
+				unsigned long floor, unsigned long ceiling)
 {
 	pmd_t *pmd;
 	unsigned long next;
-	pmd_t *empty_pmd = NULL;
+	unsigned long start;
 
+	start = addr;
 	pmd = pmd_offset(pud, addr);
-
-	/* Only free fully aligned ranges */
-	if (!((addr | end) & ~PUD_MASK))
-		empty_pmd = pmd;
 	do {
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		clear_pte_range(tlb, pmd, addr, next);
+		free_pte_range(tlb, pmd);
 	} while (pmd++, addr = next, addr != end);
 
-	if (empty_pmd) {
-		pud_clear(pud);
-		pmd_free_tlb(tlb, empty_pmd);
+	start &= PUD_MASK;
+	if (start < floor)
+		return;
+	if (ceiling) {
+		ceiling &= PUD_MASK;
+		if (!ceiling)
+			return;
 	}
+	if (end - 1 > ceiling - 1)
+		return;
+
+	pmd = pmd_offset(pud, start);
+	pud_clear(pud);
+	pmd_free_tlb(tlb, pmd);
 }
 
-static inline void clear_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
-				unsigned long addr, unsigned long end)
+static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
+				unsigned long addr, unsigned long end,
+				unsigned long floor, unsigned long ceiling)
 {
 	pud_t *pud;
 	unsigned long next;
-	pud_t *empty_pud = NULL;
+	unsigned long start;
 
+	start = addr;
 	pud = pud_offset(pgd, addr);
-
-	/* Only free fully aligned ranges */
-	if (!((addr | end) & ~PGDIR_MASK))
-		empty_pud = pud;
 	do {
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		clear_pmd_range(tlb, pud, addr, next);
+		free_pmd_range(tlb, pud, addr, next, floor, ceiling);
 	} while (pud++, addr = next, addr != end);
 
-	if (empty_pud) {
-		pgd_clear(pgd);
-		pud_free_tlb(tlb, empty_pud);
+	start &= PGDIR_MASK;
+	if (start < floor)
+		return;
+	if (ceiling) {
+		ceiling &= PGDIR_MASK;
+		if (!ceiling)
+			return;
 	}
+	if (end - 1 > ceiling - 1)
+		return;
+
+	pud = pud_offset(pgd, start);
+	pgd_clear(pgd);
+	pud_free_tlb(tlb, pud);
 }
 
 /*
- * This function clears user-level page tables of a process.
- * Unlike other pagetable walks, some memory layouts might give end 0.
+ * This function frees user-level page tables of a process.
+ *
  * Must be called with pagetable lock held.
  */
-void clear_page_range(struct mmu_gather *tlb,
-				unsigned long addr, unsigned long end)
+static inline void free_pgd_range(struct mmu_gather *tlb,
+			unsigned long addr, unsigned long end,
+			unsigned long floor, unsigned long ceiling)
 {
 	pgd_t *pgd;
 	unsigned long next;
+	unsigned long start;
 
+	/*
+	 * The next few lines have given us lots of grief...
+	 *
+	 * Why are we testing PMD* at this top level?  Because often
+	 * there will be no work to do at all, and we'd prefer not to
+	 * go all the way down to the bottom just to discover that.
+	 *
+	 * Why all these "- 1"s?  Because 0 represents both the bottom
+	 * of the address space and the top of it (using -1 for the
+	 * top wouldn't help much: the masks would do the wrong thing).
+	 * The rule is that addr 0 and floor 0 refer to the bottom of
+	 * the address space, but end 0 and ceiling 0 refer to the top.
+	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
+	 * that end 0 case should be mythical).
+	 *
+	 * Wherever addr is brought up or ceiling brought down, we must
+	 * be careful to reject "the opposite 0" before it confuses the
+	 * subsequent tests.  But what about where end is brought down
+	 * by PMD_SIZE below?  no, end can't go down to 0 there.
+	 *
+	 * Whereas we round start (addr) and ceiling down, by different
+	 * masks at different levels, in order to test whether a table
+	 * now has no other vmas using it, so can be freed, we don't
+	 * bother to round floor or end up - the tests don't need that.
+	 */
+
+	addr &= PMD_MASK;
+	if (addr < floor) {
+		addr += PMD_SIZE;
+		if (!addr)
+			return;
+	}
+	if (ceiling) {
+		ceiling &= PMD_MASK;
+		if (!ceiling)
+			return;
+	}
+	if (end - 1 > ceiling - 1)
+		end -= PMD_SIZE;
+	if (addr > end - 1)
+		return;
+
+	start = addr;
 	pgd = pgd_offset(tlb->mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		clear_pud_range(tlb, pgd, addr, next);
+		free_pud_range(tlb, pgd, addr, next, floor, ceiling);
 	} while (pgd++, addr = next, addr != end);
+
+	if (!tlb_is_full_mm(tlb))
+		flush_tlb_pgtables(tlb->mm, start, end);
+}
+
+void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
+		unsigned long floor, unsigned long ceiling)
+{
+	while (vma) {
+		struct vm_area_struct *next = vma->vm_next;
+		unsigned long addr = vma->vm_start;
+
+		/* Optimization: gather nearby vmas into a single call down */
+		while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
+			vma = next;
+			next = vma->vm_next;
+		}
+		free_pgd_range(*tlb, addr, vma->vm_end,
+				floor, next? next->vm_start: ceiling);
+		vma = next;
+	}
 }
 
 pte_t fastcall * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
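[Editorial note: as a footnote to the "- 1" comment in free_pgd_range above, here is a tiny user-space demonstration, an illustration rather than kernel code, of why the comparisons are written as end - 1 > ceiling - 1: with unsigned arithmetic, a ceiling of 0 wraps to ULONG_MAX, so 0 correctly behaves as the top of the address space. The end value is a made-up example.]

#include <stdio.h>
#include <limits.h>

int main(void)
{
	unsigned long end = 0x8000000000UL;	/* hypothetical range end */
	unsigned long ceiling = 0;		/* 0 means top of address space */

	/* ceiling - 1 wraps to ULONG_MAX, so no end can exceed it */
	if (end - 1 > ceiling - 1)
		printf("range exceeds ceiling: may not free\n");
	else
		printf("range fits below ceiling: may free\n");	/* taken */

	printf("ceiling - 1 == %#lx (ULONG_MAX is %#lx)\n",
	       ceiling - 1, ULONG_MAX);
	return 0;
}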