aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
author Hugh Dickins <hugh@veritas.com> 2005-04-19 16:29:15 -0400
committer Linus Torvalds <torvalds@ppc970.osdl.org.(none)> 2005-04-19 16:29:15 -0400
commit e0da382c92626ad1d7f4b7527d19b80104d67a83 (patch)
tree b3f455518c286ee14cb2755ced8808487bca7911 /mm
parent 9f6c6fc505560465be0964eb4da1b6ca97bd3951 (diff)
[PATCH] freepgt: free_pgtables use vma list
Recent woes with some arches needing their own pgd_addr_end macro; and 4-level clear_page_range regression since 2.6.10's clear_page_tables; and its long-standing well-known inefficiency in searching throughout the higher-level page tables for those few entries to clear and free: all can be blamed on ignoring the list of vmas when we free page tables.

Replace exit_mmap's clear_page_range of the total user address space by free_pgtables operating on the mm's vma list; unmap_region use it in the same way, giving floor and ceiling beyond which it may not free tables. This brings lmbench fork/exec/sh numbers back to 2.6.10 (unless preempt is enabled, in which case latency fixes spoil unmap_vmas throughput).

Beware: the do_mmap_pgoff driver failure case must now use unmap_region instead of zap_page_range, since a page table might have been allocated, and can only be freed while it is touched by some vma.

Move free_pgtables from mmap.c to memory.c, where its lower levels are adapted from the clear_page_range levels. (Most of free_pgtables' old code was actually for a non-existent case, prev not properly set up, dating from before hch gave us split_vma.) Pass mmu_gather** in the public interfaces, since we might want to add latency lockdrops later; but no attempt to do so yet, going by vma should itself reduce latency.

But what if is_hugepage_only_range? Those ia64 and ppc64 cases need careful examination: put that off until a later patch of the series.

What of x86_64's 32bit vdso page __map_syscall32 maps outside any vma?

And the range to sparc64's flush_tlb_pgtables? It's less clear to me now that we need to do more than is done here - every PMD_SIZE ever occupied will be flushed, do we really have to flush every PGDIR_SIZE ever partially occupied? A shame to complicate it unnecessarily.

Special thanks to David Miller for time spent repairing my ceilings.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'mm')
-rw-r--r-- mm/memory.c 152
-rw-r--r-- mm/mmap.c 102
2 files changed, 136 insertions, 118 deletions
diff --git a/mm/memory.c b/mm/memory.c
index fb6e5deb873a..fee5dc8fc36c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -110,87 +110,165 @@ void pmd_clear_bad(pmd_t *pmd)
110 * Note: this doesn't free the actual pages themselves. That 110 * Note: this doesn't free the actual pages themselves. That
111 * has been handled earlier when unmapping all the memory regions. 111 * has been handled earlier when unmapping all the memory regions.
112 */ 112 */
113static inline void clear_pte_range(struct mmu_gather *tlb, pmd_t *pmd, 113static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
114 unsigned long addr, unsigned long end)
115{ 114{
116 if (!((addr | end) & ~PMD_MASK)) { 115 struct page *page = pmd_page(*pmd);
117 /* Only free fully aligned ranges */ 116 pmd_clear(pmd);
118 struct page *page = pmd_page(*pmd); 117 pte_free_tlb(tlb, page);
119 pmd_clear(pmd); 118 dec_page_state(nr_page_table_pages);
120 dec_page_state(nr_page_table_pages); 119 tlb->mm->nr_ptes--;
121 tlb->mm->nr_ptes--;
122 pte_free_tlb(tlb, page);
123 }
124} 120}
125 121
126static inline void clear_pmd_range(struct mmu_gather *tlb, pud_t *pud, 122static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
127 unsigned long addr, unsigned long end) 123 unsigned long addr, unsigned long end,
124 unsigned long floor, unsigned long ceiling)
128{ 125{
129 pmd_t *pmd; 126 pmd_t *pmd;
130 unsigned long next; 127 unsigned long next;
131 pmd_t *empty_pmd = NULL; 128 unsigned long start;
132 129
130 start = addr;
133 pmd = pmd_offset(pud, addr); 131 pmd = pmd_offset(pud, addr);
134
135 /* Only free fully aligned ranges */
136 if (!((addr | end) & ~PUD_MASK))
137 empty_pmd = pmd;
138 do { 132 do {
139 next = pmd_addr_end(addr, end); 133 next = pmd_addr_end(addr, end);
140 if (pmd_none_or_clear_bad(pmd)) 134 if (pmd_none_or_clear_bad(pmd))
141 continue; 135 continue;
142 clear_pte_range(tlb, pmd, addr, next); 136 free_pte_range(tlb, pmd);
143 } while (pmd++, addr = next, addr != end); 137 } while (pmd++, addr = next, addr != end);
144 138
145 if (empty_pmd) { 139 start &= PUD_MASK;
146 pud_clear(pud); 140 if (start < floor)
147 pmd_free_tlb(tlb, empty_pmd); 141 return;
142 if (ceiling) {
143 ceiling &= PUD_MASK;
144 if (!ceiling)
145 return;
148 } 146 }
147 if (end - 1 > ceiling - 1)
148 return;
149
150 pmd = pmd_offset(pud, start);
151 pud_clear(pud);
152 pmd_free_tlb(tlb, pmd);
149} 153}
150 154
151static inline void clear_pud_range(struct mmu_gather *tlb, pgd_t *pgd, 155static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
152 unsigned long addr, unsigned long end) 156 unsigned long addr, unsigned long end,
157 unsigned long floor, unsigned long ceiling)
153{ 158{
154 pud_t *pud; 159 pud_t *pud;
155 unsigned long next; 160 unsigned long next;
156 pud_t *empty_pud = NULL; 161 unsigned long start;
157 162
163 start = addr;
158 pud = pud_offset(pgd, addr); 164 pud = pud_offset(pgd, addr);
159
160 /* Only free fully aligned ranges */
161 if (!((addr | end) & ~PGDIR_MASK))
162 empty_pud = pud;
163 do { 165 do {
164 next = pud_addr_end(addr, end); 166 next = pud_addr_end(addr, end);
165 if (pud_none_or_clear_bad(pud)) 167 if (pud_none_or_clear_bad(pud))
166 continue; 168 continue;
167 clear_pmd_range(tlb, pud, addr, next); 169 free_pmd_range(tlb, pud, addr, next, floor, ceiling);
168 } while (pud++, addr = next, addr != end); 170 } while (pud++, addr = next, addr != end);
169 171
170 if (empty_pud) { 172 start &= PGDIR_MASK;
171 pgd_clear(pgd); 173 if (start < floor)
172 pud_free_tlb(tlb, empty_pud); 174 return;
175 if (ceiling) {
176 ceiling &= PGDIR_MASK;
177 if (!ceiling)
178 return;
173 } 179 }
180 if (end - 1 > ceiling - 1)
181 return;
182
183 pud = pud_offset(pgd, start);
184 pgd_clear(pgd);
185 pud_free_tlb(tlb, pud);
174} 186}
175 187
176/* 188/*
177 * This function clears user-level page tables of a process. 189 * This function frees user-level page tables of a process.
178 * Unlike other pagetable walks, some memory layouts might give end 0. 190 *
179 * Must be called with pagetable lock held. 191 * Must be called with pagetable lock held.
180 */ 192 */
181void clear_page_range(struct mmu_gather *tlb, 193static inline void free_pgd_range(struct mmu_gather *tlb,
182 unsigned long addr, unsigned long end) 194 unsigned long addr, unsigned long end,
195 unsigned long floor, unsigned long ceiling)
183{ 196{
184 pgd_t *pgd; 197 pgd_t *pgd;
185 unsigned long next; 198 unsigned long next;
199 unsigned long start;
186 200
201 /*
202 * The next few lines have given us lots of grief...
203 *
204 * Why are we testing PMD* at this top level? Because often
205 * there will be no work to do at all, and we'd prefer not to
206 * go all the way down to the bottom just to discover that.
207 *
208 * Why all these "- 1"s? Because 0 represents both the bottom
209 * of the address space and the top of it (using -1 for the
210 * top wouldn't help much: the masks would do the wrong thing).
211 * The rule is that addr 0 and floor 0 refer to the bottom of
212 * the address space, but end 0 and ceiling 0 refer to the top
213 * Comparisons need to use "end - 1" and "ceiling - 1" (though
214 * that end 0 case should be mythical).
215 *
216 * Wherever addr is brought up or ceiling brought down, we must
217 * be careful to reject "the opposite 0" before it confuses the
218 * subsequent tests. But what about where end is brought down
219 * by PMD_SIZE below? no, end can't go down to 0 there.
220 *
221 * Whereas we round start (addr) and ceiling down, by different
222 * masks at different levels, in order to test whether a table
223 * now has no other vmas using it, so can be freed, we don't
224 * bother to round floor or end up - the tests don't need that.
225 */
226
227 addr &= PMD_MASK;
228 if (addr < floor) {
229 addr += PMD_SIZE;
230 if (!addr)
231 return;
232 }
233 if (ceiling) {
234 ceiling &= PMD_MASK;
235 if (!ceiling)
236 return;
237 }
238 if (end - 1 > ceiling - 1)
239 end -= PMD_SIZE;
240 if (addr > end - 1)
241 return;
242
243 start = addr;
187 pgd = pgd_offset(tlb->mm, addr); 244 pgd = pgd_offset(tlb->mm, addr);
188 do { 245 do {
189 next = pgd_addr_end(addr, end); 246 next = pgd_addr_end(addr, end);
190 if (pgd_none_or_clear_bad(pgd)) 247 if (pgd_none_or_clear_bad(pgd))
191 continue; 248 continue;
192 clear_pud_range(tlb, pgd, addr, next); 249 free_pud_range(tlb, pgd, addr, next, floor, ceiling);
193 } while (pgd++, addr = next, addr != end); 250 } while (pgd++, addr = next, addr != end);
251
252 if (!tlb_is_full_mm(tlb))
253 flush_tlb_pgtables(tlb->mm, start, end);
254}
255
256void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
257 unsigned long floor, unsigned long ceiling)
258{
259 while (vma) {
260 struct vm_area_struct *next = vma->vm_next;
261 unsigned long addr = vma->vm_start;
262
263 /* Optimization: gather nearby vmas into a single call down */
264 while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
265 vma = next;
266 next = vma->vm_next;
267 }
268 free_pgd_range(*tlb, addr, vma->vm_end,
269 floor, next? next->vm_start: ceiling);
270 vma = next;
271 }
194} 272}
195 273
196pte_t fastcall * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address) 274pte_t fastcall * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
diff --git a/mm/mmap.c b/mm/mmap.c
index a95ebda27446..926d03015471 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -29,6 +29,10 @@
29#include <asm/cacheflush.h> 29#include <asm/cacheflush.h>
30#include <asm/tlb.h> 30#include <asm/tlb.h>
31 31
32static void unmap_region(struct mm_struct *mm,
33 struct vm_area_struct *vma, struct vm_area_struct *prev,
34 unsigned long start, unsigned long end);
35
32/* 36/*
33 * WARNING: the debugging will use recursive algorithms so never enable this 37 * WARNING: the debugging will use recursive algorithms so never enable this
34 * unless you know what you are doing. 38 * unless you know what you are doing.
@@ -1129,7 +1133,8 @@ unmap_and_free_vma:
1129 fput(file); 1133 fput(file);
1130 1134
1131 /* Undo any partial mapping done by a device driver. */ 1135 /* Undo any partial mapping done by a device driver. */
1132 zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL); 1136 unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
1137 charged = 0;
1133free_vma: 1138free_vma:
1134 kmem_cache_free(vm_area_cachep, vma); 1139 kmem_cache_free(vm_area_cachep, vma);
1135unacct_error: 1140unacct_error:
@@ -1572,66 +1577,6 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
1572} 1577}
1573#endif 1578#endif
1574 1579
1575/*
1576 * Try to free as many page directory entries as we can,
1577 * without having to work very hard at actually scanning
1578 * the page tables themselves.
1579 *
1580 * Right now we try to free page tables if we have a nice
1581 * PGDIR-aligned area that got free'd up. We could be more
1582 * granular if we want to, but this is fast and simple,
1583 * and covers the bad cases.
1584 *
1585 * "prev", if it exists, points to a vma before the one
1586 * we just free'd - but there's no telling how much before.
1587 */
1588static void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
1589 unsigned long start, unsigned long end)
1590{
1591 unsigned long first = start & PGDIR_MASK;
1592 unsigned long last = end + PGDIR_SIZE - 1;
1593 struct mm_struct *mm = tlb->mm;
1594
1595 if (last > MM_VM_SIZE(mm) || last < end)
1596 last = MM_VM_SIZE(mm);
1597
1598 if (!prev) {
1599 prev = mm->mmap;
1600 if (!prev)
1601 goto no_mmaps;
1602 if (prev->vm_end > start) {
1603 if (last > prev->vm_start)
1604 last = prev->vm_start;
1605 goto no_mmaps;
1606 }
1607 }
1608 for (;;) {
1609 struct vm_area_struct *next = prev->vm_next;
1610
1611 if (next) {
1612 if (next->vm_start < start) {
1613 prev = next;
1614 continue;
1615 }
1616 if (last > next->vm_start)
1617 last = next->vm_start;
1618 }
1619 if (prev->vm_end > first)
1620 first = prev->vm_end;
1621 break;
1622 }
1623no_mmaps:
1624 if (last < first) /* for arches with discontiguous pgd indices */
1625 return;
1626 if (first < FIRST_USER_PGD_NR * PGDIR_SIZE)
1627 first = FIRST_USER_PGD_NR * PGDIR_SIZE;
1628 /* No point trying to free anything if we're in the same pte page */
1629 if ((first & PMD_MASK) < (last & PMD_MASK)) {
1630 clear_page_range(tlb, first, last);
1631 flush_tlb_pgtables(mm, first, last);
1632 }
1633}
1634
1635/* Normal function to fix up a mapping 1580/* Normal function to fix up a mapping
1636 * This function is the default for when an area has no specific 1581 * This function is the default for when an area has no specific
1637 * function. This may be used as part of a more specific routine. 1582 * function. This may be used as part of a more specific routine.
@@ -1674,24 +1619,22 @@ static void unmap_vma_list(struct mm_struct *mm,
1674 * Called with the page table lock held. 1619 * Called with the page table lock held.
1675 */ 1620 */
1676static void unmap_region(struct mm_struct *mm, 1621static void unmap_region(struct mm_struct *mm,
1677 struct vm_area_struct *vma, 1622 struct vm_area_struct *vma, struct vm_area_struct *prev,
1678 struct vm_area_struct *prev, 1623 unsigned long start, unsigned long end)
1679 unsigned long start,
1680 unsigned long end)
1681{ 1624{
1625 struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
1682 struct mmu_gather *tlb; 1626 struct mmu_gather *tlb;
1683 unsigned long nr_accounted = 0; 1627 unsigned long nr_accounted = 0;
1684 1628
1685 lru_add_drain(); 1629 lru_add_drain();
1630 spin_lock(&mm->page_table_lock);
1686 tlb = tlb_gather_mmu(mm, 0); 1631 tlb = tlb_gather_mmu(mm, 0);
1687 unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL); 1632 unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL);
1688 vm_unacct_memory(nr_accounted); 1633 vm_unacct_memory(nr_accounted);
1689 1634 free_pgtables(&tlb, vma, prev? prev->vm_end: 0,
1690 if (is_hugepage_only_range(mm, start, end - start)) 1635 next? next->vm_start: 0);
1691 hugetlb_free_pgtables(tlb, prev, start, end);
1692 else
1693 free_pgtables(tlb, prev, start, end);
1694 tlb_finish_mmu(tlb, start, end); 1636 tlb_finish_mmu(tlb, start, end);
1637 spin_unlock(&mm->page_table_lock);
1695} 1638}
1696 1639
1697/* 1640/*
@@ -1823,9 +1766,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1823 * Remove the vma's, and unmap the actual pages 1766 * Remove the vma's, and unmap the actual pages
1824 */ 1767 */
1825 detach_vmas_to_be_unmapped(mm, mpnt, prev, end); 1768 detach_vmas_to_be_unmapped(mm, mpnt, prev, end);
1826 spin_lock(&mm->page_table_lock);
1827 unmap_region(mm, mpnt, prev, start, end); 1769 unmap_region(mm, mpnt, prev, start, end);
1828 spin_unlock(&mm->page_table_lock);
1829 1770
1830 /* Fix up all other VM information */ 1771 /* Fix up all other VM information */
1831 unmap_vma_list(mm, mpnt); 1772 unmap_vma_list(mm, mpnt);
@@ -1957,25 +1898,21 @@ EXPORT_SYMBOL(do_brk);
1957void exit_mmap(struct mm_struct *mm) 1898void exit_mmap(struct mm_struct *mm)
1958{ 1899{
1959 struct mmu_gather *tlb; 1900 struct mmu_gather *tlb;
1960 struct vm_area_struct *vma; 1901 struct vm_area_struct *vma = mm->mmap;
1961 unsigned long nr_accounted = 0; 1902 unsigned long nr_accounted = 0;
1962 1903
1963 lru_add_drain(); 1904 lru_add_drain();
1964 1905
1965 spin_lock(&mm->page_table_lock); 1906 spin_lock(&mm->page_table_lock);
1966 1907
1967 tlb = tlb_gather_mmu(mm, 1);
1968 flush_cache_mm(mm); 1908 flush_cache_mm(mm);
1969 /* Use ~0UL here to ensure all VMAs in the mm are unmapped */ 1909 tlb = tlb_gather_mmu(mm, 1);
1970 mm->map_count -= unmap_vmas(&tlb, mm, mm->mmap, 0, 1910 /* Use -1 here to ensure all VMAs in the mm are unmapped */
1971 ~0UL, &nr_accounted, NULL); 1911 mm->map_count -= unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL);
1972 vm_unacct_memory(nr_accounted); 1912 vm_unacct_memory(nr_accounted);
1973 BUG_ON(mm->map_count); /* This is just debugging */ 1913 free_pgtables(&tlb, vma, 0, 0);
1974 clear_page_range(tlb, FIRST_USER_PGD_NR * PGDIR_SIZE, MM_VM_SIZE(mm));
1975
1976 tlb_finish_mmu(tlb, 0, MM_VM_SIZE(mm)); 1914 tlb_finish_mmu(tlb, 0, MM_VM_SIZE(mm));
1977 1915
1978 vma = mm->mmap;
1979 mm->mmap = mm->mmap_cache = NULL; 1916 mm->mmap = mm->mmap_cache = NULL;
1980 mm->mm_rb = RB_ROOT; 1917 mm->mm_rb = RB_ROOT;
1981 set_mm_counter(mm, rss, 0); 1918 set_mm_counter(mm, rss, 0);
@@ -1993,6 +1930,9 @@ void exit_mmap(struct mm_struct *mm)
1993 remove_vm_struct(vma); 1930 remove_vm_struct(vma);
1994 vma = next; 1931 vma = next;
1995 } 1932 }
1933
1934 BUG_ON(mm->map_count); /* This is just debugging */
1935 BUG_ON(mm->nr_ptes); /* This is just debugging */
1996} 1936}
1997 1937
1998/* Insert vm structure into process list sorted by address 1938/* Insert vm structure into process list sorted by address