diff options
author | Hugh Dickins <hugh@veritas.com> | 2005-04-19 16:29:15 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@ppc970.osdl.org.(none)> | 2005-04-19 16:29:15 -0400 |
commit | e0da382c92626ad1d7f4b7527d19b80104d67a83 (patch) | |
tree | b3f455518c286ee14cb2755ced8808487bca7911 /mm | |
parent | 9f6c6fc505560465be0964eb4da1b6ca97bd3951 (diff) |
[PATCH] freepgt: free_pgtables use vma list
Recent woes with some arches needing their own pgd_addr_end macro; and 4-level
clear_page_range regression since 2.6.10's clear_page_tables; and its
long-standing well-known inefficiency in searching throughout the higher-level
page tables for those few entries to clear and free: all can be blamed on
ignoring the list of vmas when we free page tables.
Replace exit_mmap's clear_page_range of the total user address space by
free_pgtables operating on the mm's vma list; unmap_region uses it in the same
way, giving floor and ceiling beyond which it may not free tables. This
brings lmbench fork/exec/sh numbers back to 2.6.10 (unless preempt is enabled,
in which case latency fixes spoil unmap_vmas throughput).
Beware: the do_mmap_pgoff driver failure case must now use unmap_region
instead of zap_page_range, since a page table might have been allocated, and
can only be freed while it is touched by some vma.
Move free_pgtables from mmap.c to memory.c, where its lower levels are adapted
from the clear_page_range levels. (Most of free_pgtables' old code was
actually for a non-existent case, prev not properly set up, dating from before
hch gave us split_vma.) Pass mmu_gather** in the public interfaces, since we
might want to add latency lockdrops later; but no attempt to do so yet, since
going by vma should itself reduce latency.
But what if is_hugepage_only_range? Those ia64 and ppc64 cases need careful
examination: put that off until a later patch of the series.
What of x86_64's 32bit vdso page __map_syscall32 maps outside any vma?
And the range to sparc64's flush_tlb_pgtables? It's less clear to me now that
we need to do more than is done here - every PMD_SIZE ever occupied will be
flushed, do we really have to flush every PGDIR_SIZE ever partially occupied?
A shame to complicate it unnecessarily.
Special thanks to David Miller for time spent repairing my ceilings.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'mm')
-rw-r--r-- | mm/memory.c | 152 | ||||
-rw-r--r-- | mm/mmap.c | 102 |
2 files changed, 136 insertions, 118 deletions
diff --git a/mm/memory.c b/mm/memory.c index fb6e5deb873..fee5dc8fc36 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -110,87 +110,165 @@ void pmd_clear_bad(pmd_t *pmd) | |||
110 | * Note: this doesn't free the actual pages themselves. That | 110 | * Note: this doesn't free the actual pages themselves. That |
111 | * has been handled earlier when unmapping all the memory regions. | 111 | * has been handled earlier when unmapping all the memory regions. |
112 | */ | 112 | */ |
113 | static inline void clear_pte_range(struct mmu_gather *tlb, pmd_t *pmd, | 113 | static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd) |
114 | unsigned long addr, unsigned long end) | ||
115 | { | 114 | { |
116 | if (!((addr | end) & ~PMD_MASK)) { | 115 | struct page *page = pmd_page(*pmd); |
117 | /* Only free fully aligned ranges */ | 116 | pmd_clear(pmd); |
118 | struct page *page = pmd_page(*pmd); | 117 | pte_free_tlb(tlb, page); |
119 | pmd_clear(pmd); | 118 | dec_page_state(nr_page_table_pages); |
120 | dec_page_state(nr_page_table_pages); | 119 | tlb->mm->nr_ptes--; |
121 | tlb->mm->nr_ptes--; | ||
122 | pte_free_tlb(tlb, page); | ||
123 | } | ||
124 | } | 120 | } |
125 | 121 | ||
126 | static inline void clear_pmd_range(struct mmu_gather *tlb, pud_t *pud, | 122 | static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, |
127 | unsigned long addr, unsigned long end) | 123 | unsigned long addr, unsigned long end, |
124 | unsigned long floor, unsigned long ceiling) | ||
128 | { | 125 | { |
129 | pmd_t *pmd; | 126 | pmd_t *pmd; |
130 | unsigned long next; | 127 | unsigned long next; |
131 | pmd_t *empty_pmd = NULL; | 128 | unsigned long start; |
132 | 129 | ||
130 | start = addr; | ||
133 | pmd = pmd_offset(pud, addr); | 131 | pmd = pmd_offset(pud, addr); |
134 | |||
135 | /* Only free fully aligned ranges */ | ||
136 | if (!((addr | end) & ~PUD_MASK)) | ||
137 | empty_pmd = pmd; | ||
138 | do { | 132 | do { |
139 | next = pmd_addr_end(addr, end); | 133 | next = pmd_addr_end(addr, end); |
140 | if (pmd_none_or_clear_bad(pmd)) | 134 | if (pmd_none_or_clear_bad(pmd)) |
141 | continue; | 135 | continue; |
142 | clear_pte_range(tlb, pmd, addr, next); | 136 | free_pte_range(tlb, pmd); |
143 | } while (pmd++, addr = next, addr != end); | 137 | } while (pmd++, addr = next, addr != end); |
144 | 138 | ||
145 | if (empty_pmd) { | 139 | start &= PUD_MASK; |
146 | pud_clear(pud); | 140 | if (start < floor) |
147 | pmd_free_tlb(tlb, empty_pmd); | 141 | return; |
142 | if (ceiling) { | ||
143 | ceiling &= PUD_MASK; | ||
144 | if (!ceiling) | ||
145 | return; | ||
148 | } | 146 | } |
147 | if (end - 1 > ceiling - 1) | ||
148 | return; | ||
149 | |||
150 | pmd = pmd_offset(pud, start); | ||
151 | pud_clear(pud); | ||
152 | pmd_free_tlb(tlb, pmd); | ||
149 | } | 153 | } |
150 | 154 | ||
151 | static inline void clear_pud_range(struct mmu_gather *tlb, pgd_t *pgd, | 155 | static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, |
152 | unsigned long addr, unsigned long end) | 156 | unsigned long addr, unsigned long end, |
157 | unsigned long floor, unsigned long ceiling) | ||
153 | { | 158 | { |
154 | pud_t *pud; | 159 | pud_t *pud; |
155 | unsigned long next; | 160 | unsigned long next; |
156 | pud_t *empty_pud = NULL; | 161 | unsigned long start; |
157 | 162 | ||
163 | start = addr; | ||
158 | pud = pud_offset(pgd, addr); | 164 | pud = pud_offset(pgd, addr); |
159 | |||
160 | /* Only free fully aligned ranges */ | ||
161 | if (!((addr | end) & ~PGDIR_MASK)) | ||
162 | empty_pud = pud; | ||
163 | do { | 165 | do { |
164 | next = pud_addr_end(addr, end); | 166 | next = pud_addr_end(addr, end); |
165 | if (pud_none_or_clear_bad(pud)) | 167 | if (pud_none_or_clear_bad(pud)) |
166 | continue; | 168 | continue; |
167 | clear_pmd_range(tlb, pud, addr, next); | 169 | free_pmd_range(tlb, pud, addr, next, floor, ceiling); |
168 | } while (pud++, addr = next, addr != end); | 170 | } while (pud++, addr = next, addr != end); |
169 | 171 | ||
170 | if (empty_pud) { | 172 | start &= PGDIR_MASK; |
171 | pgd_clear(pgd); | 173 | if (start < floor) |
172 | pud_free_tlb(tlb, empty_pud); | 174 | return; |
175 | if (ceiling) { | ||
176 | ceiling &= PGDIR_MASK; | ||
177 | if (!ceiling) | ||
178 | return; | ||
173 | } | 179 | } |
180 | if (end - 1 > ceiling - 1) | ||
181 | return; | ||
182 | |||
183 | pud = pud_offset(pgd, start); | ||
184 | pgd_clear(pgd); | ||
185 | pud_free_tlb(tlb, pud); | ||
174 | } | 186 | } |
175 | 187 | ||
176 | /* | 188 | /* |
177 | * This function clears user-level page tables of a process. | 189 | * This function frees user-level page tables of a process. |
178 | * Unlike other pagetable walks, some memory layouts might give end 0. | 190 | * |
179 | * Must be called with pagetable lock held. | 191 | * Must be called with pagetable lock held. |
180 | */ | 192 | */ |
181 | void clear_page_range(struct mmu_gather *tlb, | 193 | static inline void free_pgd_range(struct mmu_gather *tlb, |
182 | unsigned long addr, unsigned long end) | 194 | unsigned long addr, unsigned long end, |
195 | unsigned long floor, unsigned long ceiling) | ||
183 | { | 196 | { |
184 | pgd_t *pgd; | 197 | pgd_t *pgd; |
185 | unsigned long next; | 198 | unsigned long next; |
199 | unsigned long start; | ||
186 | 200 | ||
201 | /* | ||
202 | * The next few lines have given us lots of grief... | ||
203 | * | ||
204 | * Why are we testing PMD* at this top level? Because often | ||
205 | * there will be no work to do at all, and we'd prefer not to | ||
206 | * go all the way down to the bottom just to discover that. | ||
207 | * | ||
208 | * Why all these "- 1"s? Because 0 represents both the bottom | ||
209 | * of the address space and the top of it (using -1 for the | ||
210 | * top wouldn't help much: the masks would do the wrong thing). | ||
211 | * The rule is that addr 0 and floor 0 refer to the bottom of | ||
212 | * the address space, but end 0 and ceiling 0 refer to the top | ||
213 | * Comparisons need to use "end - 1" and "ceiling - 1" (though | ||
214 | * that end 0 case should be mythical). | ||
215 | * | ||
216 | * Wherever addr is brought up or ceiling brought down, we must | ||
217 | * be careful to reject "the opposite 0" before it confuses the | ||
218 | * subsequent tests. But what about where end is brought down | ||
219 | * by PMD_SIZE below? no, end can't go down to 0 there. | ||
220 | * | ||
221 | * Whereas we round start (addr) and ceiling down, by different | ||
222 | * masks at different levels, in order to test whether a table | ||
223 | * now has no other vmas using it, so can be freed, we don't | ||
224 | * bother to round floor or end up - the tests don't need that. | ||
225 | */ | ||
226 | |||
227 | addr &= PMD_MASK; | ||
228 | if (addr < floor) { | ||
229 | addr += PMD_SIZE; | ||
230 | if (!addr) | ||
231 | return; | ||
232 | } | ||
233 | if (ceiling) { | ||
234 | ceiling &= PMD_MASK; | ||
235 | if (!ceiling) | ||
236 | return; | ||
237 | } | ||
238 | if (end - 1 > ceiling - 1) | ||
239 | end -= PMD_SIZE; | ||
240 | if (addr > end - 1) | ||
241 | return; | ||
242 | |||
243 | start = addr; | ||
187 | pgd = pgd_offset(tlb->mm, addr); | 244 | pgd = pgd_offset(tlb->mm, addr); |
188 | do { | 245 | do { |
189 | next = pgd_addr_end(addr, end); | 246 | next = pgd_addr_end(addr, end); |
190 | if (pgd_none_or_clear_bad(pgd)) | 247 | if (pgd_none_or_clear_bad(pgd)) |
191 | continue; | 248 | continue; |
192 | clear_pud_range(tlb, pgd, addr, next); | 249 | free_pud_range(tlb, pgd, addr, next, floor, ceiling); |
193 | } while (pgd++, addr = next, addr != end); | 250 | } while (pgd++, addr = next, addr != end); |
251 | |||
252 | if (!tlb_is_full_mm(tlb)) | ||
253 | flush_tlb_pgtables(tlb->mm, start, end); | ||
254 | } | ||
255 | |||
256 | void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, | ||
257 | unsigned long floor, unsigned long ceiling) | ||
258 | { | ||
259 | while (vma) { | ||
260 | struct vm_area_struct *next = vma->vm_next; | ||
261 | unsigned long addr = vma->vm_start; | ||
262 | |||
263 | /* Optimization: gather nearby vmas into a single call down */ | ||
264 | while (next && next->vm_start <= vma->vm_end + PMD_SIZE) { | ||
265 | vma = next; | ||
266 | next = vma->vm_next; | ||
267 | } | ||
268 | free_pgd_range(*tlb, addr, vma->vm_end, | ||
269 | floor, next? next->vm_start: ceiling); | ||
270 | vma = next; | ||
271 | } | ||
194 | } | 272 | } |
195 | 273 | ||
196 | pte_t fastcall * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address) | 274 | pte_t fastcall * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address) |
@@ -29,6 +29,10 @@ | |||
29 | #include <asm/cacheflush.h> | 29 | #include <asm/cacheflush.h> |
30 | #include <asm/tlb.h> | 30 | #include <asm/tlb.h> |
31 | 31 | ||
32 | static void unmap_region(struct mm_struct *mm, | ||
33 | struct vm_area_struct *vma, struct vm_area_struct *prev, | ||
34 | unsigned long start, unsigned long end); | ||
35 | |||
32 | /* | 36 | /* |
33 | * WARNING: the debugging will use recursive algorithms so never enable this | 37 | * WARNING: the debugging will use recursive algorithms so never enable this |
34 | * unless you know what you are doing. | 38 | * unless you know what you are doing. |
@@ -1129,7 +1133,8 @@ unmap_and_free_vma: | |||
1129 | fput(file); | 1133 | fput(file); |
1130 | 1134 | ||
1131 | /* Undo any partial mapping done by a device driver. */ | 1135 | /* Undo any partial mapping done by a device driver. */ |
1132 | zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL); | 1136 | unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); |
1137 | charged = 0; | ||
1133 | free_vma: | 1138 | free_vma: |
1134 | kmem_cache_free(vm_area_cachep, vma); | 1139 | kmem_cache_free(vm_area_cachep, vma); |
1135 | unacct_error: | 1140 | unacct_error: |
@@ -1572,66 +1577,6 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) | |||
1572 | } | 1577 | } |
1573 | #endif | 1578 | #endif |
1574 | 1579 | ||
1575 | /* | ||
1576 | * Try to free as many page directory entries as we can, | ||
1577 | * without having to work very hard at actually scanning | ||
1578 | * the page tables themselves. | ||
1579 | * | ||
1580 | * Right now we try to free page tables if we have a nice | ||
1581 | * PGDIR-aligned area that got free'd up. We could be more | ||
1582 | * granular if we want to, but this is fast and simple, | ||
1583 | * and covers the bad cases. | ||
1584 | * | ||
1585 | * "prev", if it exists, points to a vma before the one | ||
1586 | * we just free'd - but there's no telling how much before. | ||
1587 | */ | ||
1588 | static void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev, | ||
1589 | unsigned long start, unsigned long end) | ||
1590 | { | ||
1591 | unsigned long first = start & PGDIR_MASK; | ||
1592 | unsigned long last = end + PGDIR_SIZE - 1; | ||
1593 | struct mm_struct *mm = tlb->mm; | ||
1594 | |||
1595 | if (last > MM_VM_SIZE(mm) || last < end) | ||
1596 | last = MM_VM_SIZE(mm); | ||
1597 | |||
1598 | if (!prev) { | ||
1599 | prev = mm->mmap; | ||
1600 | if (!prev) | ||
1601 | goto no_mmaps; | ||
1602 | if (prev->vm_end > start) { | ||
1603 | if (last > prev->vm_start) | ||
1604 | last = prev->vm_start; | ||
1605 | goto no_mmaps; | ||
1606 | } | ||
1607 | } | ||
1608 | for (;;) { | ||
1609 | struct vm_area_struct *next = prev->vm_next; | ||
1610 | |||
1611 | if (next) { | ||
1612 | if (next->vm_start < start) { | ||
1613 | prev = next; | ||
1614 | continue; | ||
1615 | } | ||
1616 | if (last > next->vm_start) | ||
1617 | last = next->vm_start; | ||
1618 | } | ||
1619 | if (prev->vm_end > first) | ||
1620 | first = prev->vm_end; | ||
1621 | break; | ||
1622 | } | ||
1623 | no_mmaps: | ||
1624 | if (last < first) /* for arches with discontiguous pgd indices */ | ||
1625 | return; | ||
1626 | if (first < FIRST_USER_PGD_NR * PGDIR_SIZE) | ||
1627 | first = FIRST_USER_PGD_NR * PGDIR_SIZE; | ||
1628 | /* No point trying to free anything if we're in the same pte page */ | ||
1629 | if ((first & PMD_MASK) < (last & PMD_MASK)) { | ||
1630 | clear_page_range(tlb, first, last); | ||
1631 | flush_tlb_pgtables(mm, first, last); | ||
1632 | } | ||
1633 | } | ||
1634 | |||
1635 | /* Normal function to fix up a mapping | 1580 | /* Normal function to fix up a mapping |
1636 | * This function is the default for when an area has no specific | 1581 | * This function is the default for when an area has no specific |
1637 | * function. This may be used as part of a more specific routine. | 1582 | * function. This may be used as part of a more specific routine. |
@@ -1674,24 +1619,22 @@ static void unmap_vma_list(struct mm_struct *mm, | |||
1674 | * Called with the page table lock held. | 1619 | * Called with the page table lock held. |
1675 | */ | 1620 | */ |
1676 | static void unmap_region(struct mm_struct *mm, | 1621 | static void unmap_region(struct mm_struct *mm, |
1677 | struct vm_area_struct *vma, | 1622 | struct vm_area_struct *vma, struct vm_area_struct *prev, |
1678 | struct vm_area_struct *prev, | 1623 | unsigned long start, unsigned long end) |
1679 | unsigned long start, | ||
1680 | unsigned long end) | ||
1681 | { | 1624 | { |
1625 | struct vm_area_struct *next = prev? prev->vm_next: mm->mmap; | ||
1682 | struct mmu_gather *tlb; | 1626 | struct mmu_gather *tlb; |
1683 | unsigned long nr_accounted = 0; | 1627 | unsigned long nr_accounted = 0; |
1684 | 1628 | ||
1685 | lru_add_drain(); | 1629 | lru_add_drain(); |
1630 | spin_lock(&mm->page_table_lock); | ||
1686 | tlb = tlb_gather_mmu(mm, 0); | 1631 | tlb = tlb_gather_mmu(mm, 0); |
1687 | unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL); | 1632 | unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL); |
1688 | vm_unacct_memory(nr_accounted); | 1633 | vm_unacct_memory(nr_accounted); |
1689 | 1634 | free_pgtables(&tlb, vma, prev? prev->vm_end: 0, | |
1690 | if (is_hugepage_only_range(mm, start, end - start)) | 1635 | next? next->vm_start: 0); |
1691 | hugetlb_free_pgtables(tlb, prev, start, end); | ||
1692 | else | ||
1693 | free_pgtables(tlb, prev, start, end); | ||
1694 | tlb_finish_mmu(tlb, start, end); | 1636 | tlb_finish_mmu(tlb, start, end); |
1637 | spin_unlock(&mm->page_table_lock); | ||
1695 | } | 1638 | } |
1696 | 1639 | ||
1697 | /* | 1640 | /* |
@@ -1823,9 +1766,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) | |||
1823 | * Remove the vma's, and unmap the actual pages | 1766 | * Remove the vma's, and unmap the actual pages |
1824 | */ | 1767 | */ |
1825 | detach_vmas_to_be_unmapped(mm, mpnt, prev, end); | 1768 | detach_vmas_to_be_unmapped(mm, mpnt, prev, end); |
1826 | spin_lock(&mm->page_table_lock); | ||
1827 | unmap_region(mm, mpnt, prev, start, end); | 1769 | unmap_region(mm, mpnt, prev, start, end); |
1828 | spin_unlock(&mm->page_table_lock); | ||
1829 | 1770 | ||
1830 | /* Fix up all other VM information */ | 1771 | /* Fix up all other VM information */ |
1831 | unmap_vma_list(mm, mpnt); | 1772 | unmap_vma_list(mm, mpnt); |
@@ -1957,25 +1898,21 @@ EXPORT_SYMBOL(do_brk); | |||
1957 | void exit_mmap(struct mm_struct *mm) | 1898 | void exit_mmap(struct mm_struct *mm) |
1958 | { | 1899 | { |
1959 | struct mmu_gather *tlb; | 1900 | struct mmu_gather *tlb; |
1960 | struct vm_area_struct *vma; | 1901 | struct vm_area_struct *vma = mm->mmap; |
1961 | unsigned long nr_accounted = 0; | 1902 | unsigned long nr_accounted = 0; |
1962 | 1903 | ||
1963 | lru_add_drain(); | 1904 | lru_add_drain(); |
1964 | 1905 | ||
1965 | spin_lock(&mm->page_table_lock); | 1906 | spin_lock(&mm->page_table_lock); |
1966 | 1907 | ||
1967 | tlb = tlb_gather_mmu(mm, 1); | ||
1968 | flush_cache_mm(mm); | 1908 | flush_cache_mm(mm); |
1969 | /* Use ~0UL here to ensure all VMAs in the mm are unmapped */ | 1909 | tlb = tlb_gather_mmu(mm, 1); |
1970 | mm->map_count -= unmap_vmas(&tlb, mm, mm->mmap, 0, | 1910 | /* Use -1 here to ensure all VMAs in the mm are unmapped */ |
1971 | ~0UL, &nr_accounted, NULL); | 1911 | mm->map_count -= unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL); |
1972 | vm_unacct_memory(nr_accounted); | 1912 | vm_unacct_memory(nr_accounted); |
1973 | BUG_ON(mm->map_count); /* This is just debugging */ | 1913 | free_pgtables(&tlb, vma, 0, 0); |
1974 | clear_page_range(tlb, FIRST_USER_PGD_NR * PGDIR_SIZE, MM_VM_SIZE(mm)); | ||
1975 | |||
1976 | tlb_finish_mmu(tlb, 0, MM_VM_SIZE(mm)); | 1914 | tlb_finish_mmu(tlb, 0, MM_VM_SIZE(mm)); |
1977 | 1915 | ||
1978 | vma = mm->mmap; | ||
1979 | mm->mmap = mm->mmap_cache = NULL; | 1916 | mm->mmap = mm->mmap_cache = NULL; |
1980 | mm->mm_rb = RB_ROOT; | 1917 | mm->mm_rb = RB_ROOT; |
1981 | set_mm_counter(mm, rss, 0); | 1918 | set_mm_counter(mm, rss, 0); |
@@ -1993,6 +1930,9 @@ void exit_mmap(struct mm_struct *mm) | |||
1993 | remove_vm_struct(vma); | 1930 | remove_vm_struct(vma); |
1994 | vma = next; | 1931 | vma = next; |
1995 | } | 1932 | } |
1933 | |||
1934 | BUG_ON(mm->map_count); /* This is just debugging */ | ||
1935 | BUG_ON(mm->nr_ptes); /* This is just debugging */ | ||
1996 | } | 1936 | } |
1997 | 1937 | ||
1998 | /* Insert vm structure into process list sorted by address | 1938 | /* Insert vm structure into process list sorted by address |