author		Hugh Dickins <hugh@veritas.com>			2005-04-19 16:29:15 -0400
committer	Linus Torvalds <torvalds@ppc970.osdl.org.(none)>	2005-04-19 16:29:15 -0400
commit		e0da382c92626ad1d7f4b7527d19b80104d67a83
tree		b3f455518c286ee14cb2755ced8808487bca7911 /mm/memory.c
parent		9f6c6fc505560465be0964eb4da1b6ca97bd3951

[PATCH] freepgt: free_pgtables use vma list
Recent woes with some arches needing their own pgd_addr_end macro; the 4-level
clear_page_range regression since 2.6.10's clear_page_tables; and its
long-standing well-known inefficiency in searching throughout the higher-level
page tables for those few entries to clear and free: all can be blamed on
ignoring the list of vmas when we free page tables.
Replace exit_mmap's clear_page_range of the total user address space by
free_pgtables operating on the mm's vma list; unmap_region uses it in the same
way, giving floor and ceiling beyond which it may not free tables. This
brings lmbench fork/exec/sh numbers back to 2.6.10 (unless preempt is enabled,
in which case latency fixes spoil unmap_vmas throughput).
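For illustration only (the mmap.c side is not in this mm/memory.c diffstat, so
the helper below is a sketch of the intent, not the exact caller): floor and
ceiling come from the neighbouring vmas, with 0 meaning the bottom of the
address space for floor and the top for ceiling.

#include <linux/mm.h>

/* Sketch: how a caller like unmap_region might bound free_pgtables. */
static void unmap_region_sketch(struct mmu_gather **tlb,
		struct vm_area_struct *vma,	/* first vma being unmapped */
		struct vm_area_struct *prev,	/* vma below the range, if any */
		struct vm_area_struct *next)	/* vma above the range, if any */
{
	/* floor 0 means bottom of the address space, ceiling 0 means the top */
	unsigned long floor = prev ? prev->vm_end : 0;
	unsigned long ceiling = next ? next->vm_start : 0;

	/* page tables still needed by prev or next must not be freed */
	free_pgtables(tlb, vma, floor, ceiling);
}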
Beware: the do_mmap_pgoff driver failure case must now use unmap_region
instead of zap_page_range, since a page table might have been allocated, and
can only be freed while it is touched by some vma.
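Sketch of the shape of that error path (again outside this diffstat, and the
helper name is illustrative): zap_page_range only cleared ptes; unmap_region
can also free a page table the driver's ->mmap populated, and it must do so
now, while the vma still covers the range.

/* Sketch only: do_mmap_pgoff's unmap_and_free_vma error handling. */
static void undo_partial_driver_mapping(struct mm_struct *mm,
		struct vm_area_struct *vma, struct vm_area_struct *prev)
{
	/*
	 * Previously: zap_page_range(vma, vma->vm_start,
	 *			      vma->vm_end - vma->vm_start, NULL);
	 */
	unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
}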
Move free_pgtables from mmap.c to memory.c, where its lower levels are adapted
from the clear_page_range levels. (Most of free_pgtables' old code was
actually for a non-existent case, prev not properly set up, dating from before
hch gave us split_vma.) Pass mmu_gather** in the public interfaces, since we
might want to add latency lockdrops later; but no attempt is made to do so yet,
since going by vma should itself reduce latency.
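Why mmu_gather**? A latency lockdrop would have to finish the current gather
and start a new one, updating the caller's pointer, much as unmap_vmas already
does. Purely hypothetical sketch (nothing like this is added by this patch):

#include <linux/sched.h>
#include <asm/tlb.h>

/* Hypothetical: what a future lockdrop inside free_pgtables might look like. */
static void free_pgtables_lockbreak_sketch(struct mmu_gather **tlb)
{
	struct mm_struct *mm = (*tlb)->mm;

	if (need_resched() || need_lockbreak(&mm->page_table_lock)) {
		tlb_finish_mmu(*tlb, 0, 0);		/* flush what has been gathered */
		cond_resched_lock(&mm->page_table_lock);
		*tlb = tlb_gather_mmu(mm, 0);		/* begin a fresh gather */
	}
}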
But what if is_hugepage_only_range? Those ia64 and ppc64 cases need careful
examination: put that off until a later patch of the series.
What of x86_64's 32bit vdso page, which __map_syscall32 maps outside any vma?
And the range passed to sparc64's flush_tlb_pgtables? It's less clear to me now
that we need to do more than is done here: every PMD_SIZE ever occupied will be
flushed, so do we really have to flush every PGDIR_SIZE ever partially occupied?
A shame to complicate it unnecessarily.
Special thanks to David Miller for time spent repairing my ceilings.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'mm/memory.c')
-rw-r--r--	mm/memory.c	152
1 file changed, 115 insertions(+), 37 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index fb6e5deb873a..fee5dc8fc36c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -110,87 +110,165 @@ void pmd_clear_bad(pmd_t *pmd)
  * Note: this doesn't free the actual pages themselves. That
  * has been handled earlier when unmapping all the memory regions.
  */
-static inline void clear_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
-				unsigned long addr, unsigned long end)
+static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
 {
-	if (!((addr | end) & ~PMD_MASK)) {
-		/* Only free fully aligned ranges */
-		struct page *page = pmd_page(*pmd);
-		pmd_clear(pmd);
-		dec_page_state(nr_page_table_pages);
-		tlb->mm->nr_ptes--;
-		pte_free_tlb(tlb, page);
-	}
+	struct page *page = pmd_page(*pmd);
+	pmd_clear(pmd);
+	pte_free_tlb(tlb, page);
+	dec_page_state(nr_page_table_pages);
+	tlb->mm->nr_ptes--;
 }
 
-static inline void clear_pmd_range(struct mmu_gather *tlb, pud_t *pud,
-				unsigned long addr, unsigned long end)
+static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
+				unsigned long addr, unsigned long end,
+				unsigned long floor, unsigned long ceiling)
 {
 	pmd_t *pmd;
 	unsigned long next;
-	pmd_t *empty_pmd = NULL;
+	unsigned long start;
 
+	start = addr;
 	pmd = pmd_offset(pud, addr);
-
-	/* Only free fully aligned ranges */
-	if (!((addr | end) & ~PUD_MASK))
-		empty_pmd = pmd;
 	do {
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		clear_pte_range(tlb, pmd, addr, next);
+		free_pte_range(tlb, pmd);
 	} while (pmd++, addr = next, addr != end);
 
-	if (empty_pmd) {
-		pud_clear(pud);
-		pmd_free_tlb(tlb, empty_pmd);
+	start &= PUD_MASK;
+	if (start < floor)
+		return;
+	if (ceiling) {
+		ceiling &= PUD_MASK;
+		if (!ceiling)
+			return;
 	}
+	if (end - 1 > ceiling - 1)
+		return;
+
+	pmd = pmd_offset(pud, start);
+	pud_clear(pud);
+	pmd_free_tlb(tlb, pmd);
 }
 
-static inline void clear_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
-				unsigned long addr, unsigned long end)
+static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
+				unsigned long addr, unsigned long end,
+				unsigned long floor, unsigned long ceiling)
 {
 	pud_t *pud;
 	unsigned long next;
-	pud_t *empty_pud = NULL;
+	unsigned long start;
 
+	start = addr;
 	pud = pud_offset(pgd, addr);
-
-	/* Only free fully aligned ranges */
-	if (!((addr | end) & ~PGDIR_MASK))
-		empty_pud = pud;
 	do {
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		clear_pmd_range(tlb, pud, addr, next);
+		free_pmd_range(tlb, pud, addr, next, floor, ceiling);
 	} while (pud++, addr = next, addr != end);
 
-	if (empty_pud) {
-		pgd_clear(pgd);
-		pud_free_tlb(tlb, empty_pud);
+	start &= PGDIR_MASK;
+	if (start < floor)
+		return;
+	if (ceiling) {
+		ceiling &= PGDIR_MASK;
+		if (!ceiling)
+			return;
 	}
+	if (end - 1 > ceiling - 1)
+		return;
+
+	pud = pud_offset(pgd, start);
+	pgd_clear(pgd);
+	pud_free_tlb(tlb, pud);
 }
 
 /*
- * This function clears user-level page tables of a process.
- * Unlike other pagetable walks, some memory layouts might give end 0.
+ * This function frees user-level page tables of a process.
+ *
  * Must be called with pagetable lock held.
  */
-void clear_page_range(struct mmu_gather *tlb,
-			unsigned long addr, unsigned long end)
+static inline void free_pgd_range(struct mmu_gather *tlb,
+			unsigned long addr, unsigned long end,
+			unsigned long floor, unsigned long ceiling)
 {
 	pgd_t *pgd;
 	unsigned long next;
+	unsigned long start;
 
+	/*
+	 * The next few lines have given us lots of grief...
+	 *
+	 * Why are we testing PMD* at this top level? Because often
+	 * there will be no work to do at all, and we'd prefer not to
+	 * go all the way down to the bottom just to discover that.
+	 *
+	 * Why all these "- 1"s? Because 0 represents both the bottom
+	 * of the address space and the top of it (using -1 for the
+	 * top wouldn't help much: the masks would do the wrong thing).
+	 * The rule is that addr 0 and floor 0 refer to the bottom of
+	 * the address space, but end 0 and ceiling 0 refer to the top
+	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
+	 * that end 0 case should be mythical).
+	 *
+	 * Wherever addr is brought up or ceiling brought down, we must
+	 * be careful to reject "the opposite 0" before it confuses the
+	 * subsequent tests. But what about where end is brought down
+	 * by PMD_SIZE below? no, end can't go down to 0 there.
+	 *
+	 * Whereas we round start (addr) and ceiling down, by different
+	 * masks at different levels, in order to test whether a table
+	 * now has no other vmas using it, so can be freed, we don't
+	 * bother to round floor or end up - the tests don't need that.
+	 */
+
+	addr &= PMD_MASK;
+	if (addr < floor) {
+		addr += PMD_SIZE;
+		if (!addr)
+			return;
+	}
+	if (ceiling) {
+		ceiling &= PMD_MASK;
+		if (!ceiling)
+			return;
+	}
+	if (end - 1 > ceiling - 1)
+		end -= PMD_SIZE;
+	if (addr > end - 1)
+		return;
+
+	start = addr;
 	pgd = pgd_offset(tlb->mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		clear_pud_range(tlb, pgd, addr, next);
+		free_pud_range(tlb, pgd, addr, next, floor, ceiling);
 	} while (pgd++, addr = next, addr != end);
+
+	if (!tlb_is_full_mm(tlb))
+		flush_tlb_pgtables(tlb->mm, start, end);
+}
+
+void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
+		unsigned long floor, unsigned long ceiling)
+{
+	while (vma) {
+		struct vm_area_struct *next = vma->vm_next;
+		unsigned long addr = vma->vm_start;
+
+		/* Optimization: gather nearby vmas into a single call down */
+		while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
+			vma = next;
+			next = vma->vm_next;
+		}
+		free_pgd_range(*tlb, addr, vma->vm_end,
+				floor, next? next->vm_start: ceiling);
+		vma = next;
+	}
 }
 
 pte_t fastcall * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
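The "- 1" convention documented in free_pgd_range's comment is the subtle part
of this hunk. A self-contained userspace demonstration of that unsigned
comparison (plain C, nothing kernel-specific; the function name is made up):

#include <stdio.h>

/*
 * Address 0 and floor 0 mean the bottom of the address space, while end 0
 * and ceiling 0 mean the very top (one past the last byte wraps to 0).
 * Comparing "end - 1 > ceiling - 1" in unsigned arithmetic handles both.
 */
static int range_exceeds_ceiling(unsigned long end, unsigned long ceiling)
{
	return end - 1 > ceiling - 1;
}

int main(void)
{
	/* ceiling 0 = top of address space: nothing can exceed it */
	printf("%d\n", range_exceeds_ceiling(0x100000UL, 0));			/* 0 */
	/* ordinary case: end above a finite ceiling */
	printf("%d\n", range_exceeds_ceiling(0x300000UL, 0x200000UL));		/* 1 */
	/* end 0 = top of address space: exceeds any finite ceiling */
	printf("%d\n", range_exceeds_ceiling(0, 0x200000UL));			/* 1 */
	return 0;
}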