[PATCH] freepgt: free_pgtables use vma list

Recent woes with some arches needing their own pgd_addr_end macro; and 4-level clear_page_range regression since 2.6.10's clear_page_tables; and its long-standing well-known inefficiency in searching throughout the higher-level page tables for those few entries to clear and free: all can be blamed on ignoring the list of vmas when we free page tables.

Replace exit_mmap's clear_page_range of the total user address space by free_pgtables operating on the mm's vma list; unmap_region use it in the same way, giving floor and ceiling beyond which it may not free tables. This brings lmbench fork/exec/sh numbers back to 2.6.10 (unless preempt is enabled, in which case latency fixes spoil unmap_vmas throughput).

Beware: the do_mmap_pgoff driver failure case must now use unmap_region instead of zap_page_range, since a page table might have been allocated, and can only be freed while it is touched by some vma.

Move free_pgtables from mmap.c to memory.c, where its lower levels are adapted from the clear_page_range levels. (Most of free_pgtables' old code was actually for a non-existent case, prev not properly set up, dating from before hch gave us split_vma.) Pass mmu_gather** in the public interfaces, since we might want to add latency lockdrops later; but no attempt to do so yet, going by vma should itself reduce latency.

But what if is_hugepage_only_range? Those ia64 and ppc64 cases need careful examination: put that off until a later patch of the series.

What of x86_64's 32bit vdso page __map_syscall32 maps outside any vma?

And the range to sparc64's flush_tlb_pgtables? It's less clear to me now that we need to do more than is done here - every PMD_SIZE ever occupied will be flushed, do we really have to flush every PGDIR_SIZE ever partially occupied? A shame to complicate it unnecessarily.

Special thanks to David Miller for time spent repairing my ceilings.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
author: Hugh Dickins <hugh@veritas.com> 2005-04-19 16:29:15 -0400
committer: Linus Torvalds <torvalds@ppc970.osdl.org.(none)> 2005-04-19 16:29:15 -0400
commit: e0da382c92626ad1d7f4b7527d19b80104d67a83 (patch)
tree: b3f455518c286ee14cb2755ced8808487bca7911
parent: 9f6c6fc505560465be0964eb4da1b6ca97bd3951 (diff)
5 files changed, 141 insertions, 155 deletions
diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c
index 0742d54f8bb0..dd81479ff88a 100644
--- a/arch/i386/mm/pgtable.c
+++ b/arch/i386/mm/pgtable.c
@@ -255,6 +255,6 @@ void pgd_free(pgd_t *pgd)
        if (PTRS_PER_PMD > 1)
                for (i = 0; i < USER_PTRS_PER_PGD; ++i)
                        kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
-        /* in the non-PAE case, clear_page_range() clears user pgd entries */
+        /* in the non-PAE case, free_pgtables() clears user pgd entries */
        kmem_cache_free(pgd_cache, pgd);
 }
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index 40ad8328ffd5..626258ae9742 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -187,45 +187,12 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int wri
 }
 /*
- * Same as generic free_pgtables(), except constant PGDIR_* and pgd_offset
+ * Do nothing, until we've worked out what to do!  To allow build, we
- * are hugetlb region specific.
+ * must remove reference to clear_page_range since it no longer exists.
 */
 void hugetlb_free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
        unsigned long start, unsigned long end)
 {
-        unsigned long first = start & HUGETLB_PGDIR_MASK;
-        unsigned long last = end + HUGETLB_PGDIR_SIZE - 1;
-        struct mm_struct *mm = tlb->mm;
-        if (!prev) {
-                prev = mm->mmap;
-                if (!prev)
-                        goto no_mmaps;
-                if (prev->vm_end > start) {
-                        if (last > prev->vm_start)
-                                last = prev->vm_start;
-                        goto no_mmaps;
-                }
-        }
-        for (;;) {
-                struct vm_area_struct *next = prev->vm_next;
-                if (next) {
-                        if (next->vm_start < start) {
-                                prev = next;
-                                continue;
-                        }
-                        if (last > next->vm_start)
-                                last = next->vm_start;
-                }
-                if (prev->vm_end > first)
-                        first = prev->vm_end;
-                break;
-        }
-no_mmaps:
-        if (last < first)       /* for arches with discontiguous pgd indices */
-                return;
-        clear_page_range(tlb, first, last);
 }
 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 85f7d1bea937..c3f6c39d41d0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -592,7 +592,8 @@ int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
                struct vm_area_struct *start_vma, unsigned long start_addr,
                unsigned long end_addr, unsigned long *nr_accounted,
                struct zap_details *);
-void clear_page_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end);
+void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
+                unsigned long floor, unsigned long ceiling);
 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
                        struct vm_area_struct *vma);
 int zeromap_page_range(struct vm_area_struct *vma, unsigned long from,
diff --git a/mm/memory.c b/mm/memory.c
index fb6e5deb873a..fee5dc8fc36c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -110,87 +110,165 @@ void pmd_clear_bad(pmd_t *pmd)
 * Note: this doesn't free the actual pages themselves. That
 * has been handled earlier when unmapping all the memory regions.
 */
-static inline void clear_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
+static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
-                                unsigned long addr, unsigned long end)
 {
-        if (!((addr | end) & ~PMD_MASK)) {
+        struct page *page = pmd_page(*pmd);
-                /* Only free fully aligned ranges */
+        pmd_clear(pmd);
-                struct page *page = pmd_page(*pmd);
+        pte_free_tlb(tlb, page);
-                pmd_clear(pmd);
+        dec_page_state(nr_page_table_pages);
-                dec_page_state(nr_page_table_pages);
+        tlb->mm->nr_ptes--;
-                tlb->mm->nr_ptes--;
-                pte_free_tlb(tlb, page);
-        }
 }
-static inline void clear_pmd_range(struct mmu_gather *tlb, pud_t *pud,
+static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
-                                unsigned long addr, unsigned long end)
+                                unsigned long addr, unsigned long end,
+                                unsigned long floor, unsigned long ceiling)
 {
        pmd_t *pmd;
        unsigned long next;
-        pmd_t *empty_pmd = NULL;
+        unsigned long start;
+        start = addr;
        pmd = pmd_offset(pud, addr);
-        /* Only free fully aligned ranges */
-        if (!((addr | end) & ~PUD_MASK))
-                empty_pmd = pmd;
        do {
                next = pmd_addr_end(addr, end);
                if (pmd_none_or_clear_bad(pmd))
                        continue;
-                clear_pte_range(tlb, pmd, addr, next);
+                free_pte_range(tlb, pmd);
        } while (pmd++, addr = next, addr != end);
-        if (empty_pmd) {
+        start &= PUD_MASK;
-                pud_clear(pud);
+        if (start < floor)
-                pmd_free_tlb(tlb, empty_pmd);
+                return;
+        if (ceiling) {
+                ceiling &= PUD_MASK;
+                if (!ceiling)
+                        return;
        }
+        if (end - 1 > ceiling - 1)
+                return;
+        pmd = pmd_offset(pud, start);
+        pud_clear(pud);
+        pmd_free_tlb(tlb, pmd);
 }
-static inline void clear_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
+static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
-                                unsigned long addr, unsigned long end)
+                                unsigned long addr, unsigned long end,
+                                unsigned long floor, unsigned long ceiling)
 {
        pud_t *pud;
        unsigned long next;
-        pud_t *empty_pud = NULL;
+        unsigned long start;
+        start = addr;
        pud = pud_offset(pgd, addr);
-        /* Only free fully aligned ranges */
-        if (!((addr | end) & ~PGDIR_MASK))
-                empty_pud = pud;
        do {
                next = pud_addr_end(addr, end);
                if (pud_none_or_clear_bad(pud))
                        continue;
-                clear_pmd_range(tlb, pud, addr, next);
+                free_pmd_range(tlb, pud, addr, next, floor, ceiling);
        } while (pud++, addr = next, addr != end);
-        if (empty_pud) {
+        start &= PGDIR_MASK;
-                pgd_clear(pgd);
+        if (start < floor)
-                pud_free_tlb(tlb, empty_pud);
+                return;
+        if (ceiling) {
+                ceiling &= PGDIR_MASK;
+                if (!ceiling)
+                        return;
        }
+        if (end - 1 > ceiling - 1)
+                return;
+        pud = pud_offset(pgd, start);
+        pgd_clear(pgd);
+        pud_free_tlb(tlb, pud);
 }
 /*
- * This function clears user-level page tables of a process.
+ * This function frees user-level page tables of a process.
- * Unlike other pagetable walks, some memory layouts might give end 0.
+ *
 * Must be called with pagetable lock held.
 */
-void clear_page_range(struct mmu_gather *tlb,
+static inline void free_pgd_range(struct mmu_gather *tlb,
-                                unsigned long addr, unsigned long end)
+                        unsigned long addr, unsigned long end,
+                        unsigned long floor, unsigned long ceiling)
 {
        pgd_t *pgd;
        unsigned long next;
+        unsigned long start;
+        /*
+         * The next few lines have given us lots of grief...
+         *
+         * Why are we testing PMD* at this top level?  Because often
+         * there will be no work to do at all, and we'd prefer not to
+         * go all the way down to the bottom just to discover that.
+         *
+         * Why all these "- 1"s?  Because 0 represents both the bottom
+         * of the address space and the top of it (using -1 for the
+         * top wouldn't help much: the masks would do the wrong thing).
+         * The rule is that addr 0 and floor 0 refer to the bottom of
+         * the address space, but end 0 and ceiling 0 refer to the top
+         * Comparisons need to use "end - 1" and "ceiling - 1" (though
+         * that end 0 case should be mythical).
+         *
+         * Wherever addr is brought up or ceiling brought down, we must
+         * be careful to reject "the opposite 0" before it confuses the
+         * subsequent tests.  But what about where end is brought down
+         * by PMD_SIZE below? no, end can't go down to 0 there.
+         *
+         * Whereas we round start (addr) and ceiling down, by different
+         * masks at different levels, in order to test whether a table
+         * now has no other vmas using it, so can be freed, we don't
+         * bother to round floor or end up - the tests don't need that.
+         */
+        addr &= PMD_MASK;
+        if (addr < floor) {
+                addr += PMD_SIZE;
+                if (!addr)
+                        return;
+        }
+        if (ceiling) {
+                ceiling &= PMD_MASK;
+                if (!ceiling)
+                        return;
+        }
+        if (end - 1 > ceiling - 1)
+                end -= PMD_SIZE;
+        if (addr > end - 1)
+                return;
+        start = addr;
        pgd = pgd_offset(tlb->mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd))
                        continue;
-                clear_pud_range(tlb, pgd, addr, next);
+                free_pud_range(tlb, pgd, addr, next, floor, ceiling);
        } while (pgd++, addr = next, addr != end);
+        if (!tlb_is_full_mm(tlb))
+                flush_tlb_pgtables(tlb->mm, start, end);
+}
+void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
+                                unsigned long floor, unsigned long ceiling)
+{
+        while (vma) {
+                struct vm_area_struct *next = vma->vm_next;
+                unsigned long addr = vma->vm_start;
+                /* Optimization: gather nearby vmas into a single call down */
+                while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
+                        vma = next;
+                        next = vma->vm_next;
+                }
+                free_pgd_range(*tlb, addr, vma->vm_end,
+                                floor, next? next->vm_start: ceiling);
+                vma = next;
+        }
 }
 pte_t fastcall * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
diff --git a/mm/mmap.c b/mm/mmap.c
index a95ebda27446..926d03015471 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -29,6 +29,10 @@
 #include <asm/cacheflush.h>
 #include <asm/tlb.h>
+static void unmap_region(struct mm_struct *mm,
+                struct vm_area_struct *vma, struct vm_area_struct *prev,
+                unsigned long start, unsigned long end);
 /*
 * WARNING: the debugging will use recursive algorithms so never enable this
 * unless you know what you are doing.
@@ -1129,7 +1133,8 @@ unmap_and_free_vma:
        fput(file);
        /* Undo any partial mapping done by a device driver. */
-        zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL);
+        unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
+        charged = 0;
 free_vma:
        kmem_cache_free(vm_area_cachep, vma);
 unacct_error:
@@ -1572,66 +1577,6 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
 }
 #endif
-/*
- * Try to free as many page directory entries as we can,
- * without having to work very hard at actually scanning
- * the page tables themselves.
- *
- * Right now we try to free page tables if we have a nice
- * PGDIR-aligned area that got free'd up. We could be more
- * granular if we want to, but this is fast and simple,
- * and covers the bad cases.
- *
- * "prev", if it exists, points to a vma before the one
- * we just free'd - but there's no telling how much before.
- */
-static void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
-        unsigned long start, unsigned long end)
-{
-        unsigned long first = start & PGDIR_MASK;
-        unsigned long last = end + PGDIR_SIZE - 1;
-        struct mm_struct *mm = tlb->mm;
-        if (last > MM_VM_SIZE(mm) || last < end)
-                last = MM_VM_SIZE(mm);
-        if (!prev) {
-                prev = mm->mmap;
-                if (!prev)
-                        goto no_mmaps;
-                if (prev->vm_end > start) {
-                        if (last > prev->vm_start)
-                                last = prev->vm_start;
-                        goto no_mmaps;
-                }
-        }
-        for (;;) {
-                struct vm_area_struct *next = prev->vm_next;
-                if (next) {
-                        if (next->vm_start < start) {
-                                prev = next;
-                                continue;
-                        }
-                        if (last > next->vm_start)
-                                last = next->vm_start;
-                }
-                if (prev->vm_end > first)
-                        first = prev->vm_end;
-                break;
-        }
-no_mmaps:
-        if (last < first)       /* for arches with discontiguous pgd indices */
-                return;
-        if (first < FIRST_USER_PGD_NR * PGDIR_SIZE)
-                first = FIRST_USER_PGD_NR * PGDIR_SIZE;
-        /* No point trying to free anything if we're in the same pte page */
-        if ((first & PMD_MASK) < (last & PMD_MASK)) {
-                clear_page_range(tlb, first, last);
-                flush_tlb_pgtables(mm, first, last);
-        }
-}
 /* Normal function to fix up a mapping
 * This function is the default for when an area has no specific
 * function.  This may be used as part of a more specific routine.
@@ -1674,24 +1619,22 @@ static void unmap_vma_list(struct mm_struct *mm,
 * Called with the page table lock held.
 */
 static void unmap_region(struct mm_struct *mm,
-        struct vm_area_struct *vma,
+                struct vm_area_struct *vma, struct vm_area_struct *prev,
-        struct vm_area_struct *prev,
+                unsigned long start, unsigned long end)
-        unsigned long start,
-        unsigned long end)
 {
+        struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
        struct mmu_gather *tlb;
        unsigned long nr_accounted = 0;
        lru_add_drain();
+        spin_lock(&mm->page_table_lock);
        tlb = tlb_gather_mmu(mm, 0);
        unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL);
        vm_unacct_memory(nr_accounted);
+        free_pgtables(&tlb, vma, prev? prev->vm_end: 0,
-        if (is_hugepage_only_range(mm, start, end - start))
+                                 next? next->vm_start: 0);
-                hugetlb_free_pgtables(tlb, prev, start, end);
-        else
-                free_pgtables(tlb, prev, start, end);
        tlb_finish_mmu(tlb, start, end);
+        spin_unlock(&mm->page_table_lock);
 }
 /*
@@ -1823,9 +1766,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
         * Remove the vma's, and unmap the actual pages
         */
        detach_vmas_to_be_unmapped(mm, mpnt, prev, end);
-        spin_lock(&mm->page_table_lock);
        unmap_region(mm, mpnt, prev, start, end);
-        spin_unlock(&mm->page_table_lock);
        /* Fix up all other VM information */
        unmap_vma_list(mm, mpnt);
@@ -1957,25 +1898,21 @@ EXPORT_SYMBOL(do_brk);
 void exit_mmap(struct mm_struct *mm)
 {
        struct mmu_gather *tlb;
-        struct vm_area_struct *vma;
+        struct vm_area_struct *vma = mm->mmap;
        unsigned long nr_accounted = 0;
        lru_add_drain();
        spin_lock(&mm->page_table_lock);
-        tlb = tlb_gather_mmu(mm, 1);
        flush_cache_mm(mm);
-        /* Use ~0UL here to ensure all VMAs in the mm are unmapped */
+        tlb = tlb_gather_mmu(mm, 1);
-        mm->map_count -= unmap_vmas(&tlb, mm, mm->mmap, 0,
+        /* Use -1 here to ensure all VMAs in the mm are unmapped */
-                                        ~0UL, &nr_accounted, NULL);
+        mm->map_count -= unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL);
        vm_unacct_memory(nr_accounted);
-        BUG_ON(mm->map_count);  /* This is just debugging */
+        free_pgtables(&tlb, vma, 0, 0);
-        clear_page_range(tlb, FIRST_USER_PGD_NR * PGDIR_SIZE, MM_VM_SIZE(mm));
-        
        tlb_finish_mmu(tlb, 0, MM_VM_SIZE(mm));
-        vma = mm->mmap;
        mm->mmap = mm->mmap_cache = NULL;
        mm->mm_rb = RB_ROOT;
        set_mm_counter(mm, rss, 0);
@@ -1993,6 +1930,9 @@ void exit_mmap(struct mm_struct *mm)
                remove_vm_struct(vma);
                vma = next;
        }
+        BUG_ON(mm->map_count);  /* This is just debugging */
+        BUG_ON(mm->nr_ptes);    /* This is just debugging */
 }
 /* Insert vm structure into process list sorted by address
author	Hugh Dickins <hugh@veritas.com>	2005-04-19 16:29:15 -0400
committer	Linus Torvalds <torvalds@ppc970.osdl.org.(none)>	2005-04-19 16:29:15 -0400
commit	e0da382c92626ad1d7f4b7527d19b80104d67a83 (patch)
tree	b3f455518c286ee14cb2755ced8808487bca7911
parent	9f6c6fc505560465be0964eb4da1b6ca97bd3951 (diff)

diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c index 0742d54f8bb0..dd81479ff88a 100644 --- a/arch/i386/mm/pgtable.c +++ b/arch/i386/mm/pgtable.c
@@ -255,6 +255,6 @@ void pgd_free(pgd_t *pgd)
255	if (PTRS_PER_PMD > 1)	255	if (PTRS_PER_PMD > 1)
256	for (i = 0; i < USER_PTRS_PER_PGD; ++i)	256	for (i = 0; i < USER_PTRS_PER_PGD; ++i)
257	kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));	257	kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
258	/* in the non-PAE case, clear_page_range() clears user pgd entries */	258	/* in the non-PAE case, free_pgtables() clears user pgd entries */
259	kmem_cache_free(pgd_cache, pgd);	259	kmem_cache_free(pgd_cache, pgd);
260	}	260	}


diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index 40ad8328ffd5..626258ae9742 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c
@@ -187,45 +187,12 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int wri
187	}	187	}
188		188
189	/*	189	/*
190	* Same as generic free_pgtables(), except constant PGDIR_* and pgd_offset	190	* Do nothing, until we've worked out what to do! To allow build, we
191	* are hugetlb region specific.	191	* must remove reference to clear_page_range since it no longer exists.
192	*/	192	*/
193	void hugetlb_free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,	193	void hugetlb_free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
194	unsigned long start, unsigned long end)	194	unsigned long start, unsigned long end)
195	{	195	{
196	unsigned long first = start & HUGETLB_PGDIR_MASK;
197	unsigned long last = end + HUGETLB_PGDIR_SIZE - 1;
198	struct mm_struct *mm = tlb->mm;
199
200	if (!prev) {
201	prev = mm->mmap;
202	if (!prev)
203	goto no_mmaps;
204	if (prev->vm_end > start) {
205	if (last > prev->vm_start)
206	last = prev->vm_start;
207	goto no_mmaps;
208	}
209	}
210	for (;;) {
211	struct vm_area_struct *next = prev->vm_next;
212
213	if (next) {
214	if (next->vm_start < start) {
215	prev = next;
216	continue;
217	}
218	if (last > next->vm_start)
219	last = next->vm_start;
220	}
221	if (prev->vm_end > first)
222	first = prev->vm_end;
223	break;
224	}
225	no_mmaps:
226	if (last < first) /* for arches with discontiguous pgd indices */
227	return;
228	clear_page_range(tlb, first, last);
229	}	196	}
230		197
231	void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end)	198	void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end)


diff --git a/include/linux/mm.h b/include/linux/mm.h index 85f7d1bea937..c3f6c39d41d0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h
@@ -592,7 +592,8 @@ int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
592	struct vm_area_struct *start_vma, unsigned long start_addr,	592	struct vm_area_struct *start_vma, unsigned long start_addr,
593	unsigned long end_addr, unsigned long *nr_accounted,	593	unsigned long end_addr, unsigned long *nr_accounted,
594	struct zap_details *);	594	struct zap_details *);
595	void clear_page_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end);	595	void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
		596	unsigned long floor, unsigned long ceiling);
596	int copy_page_range(struct mm_struct *dst, struct mm_struct *src,	597	int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
597	struct vm_area_struct *vma);	598	struct vm_area_struct *vma);
598	int zeromap_page_range(struct vm_area_struct *vma, unsigned long from,	599	int zeromap_page_range(struct vm_area_struct *vma, unsigned long from,


diff --git a/mm/memory.c b/mm/memory.c index fb6e5deb873a..fee5dc8fc36c 100644 --- a/mm/memory.c +++ b/mm/memory.c
@@ -110,87 +110,165 @@ void pmd_clear_bad(pmd_t *pmd)
110	* Note: this doesn't free the actual pages themselves. That	110	* Note: this doesn't free the actual pages themselves. That
111	* has been handled earlier when unmapping all the memory regions.	111	* has been handled earlier when unmapping all the memory regions.
112	*/	112	*/
113	static inline void clear_pte_range(struct mmu_gather *tlb, pmd_t *pmd,	113	static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
114	unsigned long addr, unsigned long end)
115	{	114	{
116	if (!((addr | end) & ~PMD_MASK)) {	115	struct page *page = pmd_page(*pmd);
117	/* Only free fully aligned ranges */	116	pmd_clear(pmd);
118	struct page *page = pmd_page(*pmd);	117	pte_free_tlb(tlb, page);
119	pmd_clear(pmd);	118	dec_page_state(nr_page_table_pages);
120	dec_page_state(nr_page_table_pages);	119	tlb->mm->nr_ptes--;
121	tlb->mm->nr_ptes--;
122	pte_free_tlb(tlb, page);
123	}
124	}	120	}
125		121
126	static inline void clear_pmd_range(struct mmu_gather *tlb, pud_t *pud,	122	static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
127	unsigned long addr, unsigned long end)	123	unsigned long addr, unsigned long end,
		124	unsigned long floor, unsigned long ceiling)
128	{	125	{
129	pmd_t *pmd;	126	pmd_t *pmd;
130	unsigned long next;	127	unsigned long next;
131	pmd_t *empty_pmd = NULL;	128	unsigned long start;
132		129
		130	start = addr;
133	pmd = pmd_offset(pud, addr);	131	pmd = pmd_offset(pud, addr);
134
135	/* Only free fully aligned ranges */
136	if (!((addr | end) & ~PUD_MASK))
137	empty_pmd = pmd;
138	do {	132	do {
139	next = pmd_addr_end(addr, end);	133	next = pmd_addr_end(addr, end);
140	if (pmd_none_or_clear_bad(pmd))	134	if (pmd_none_or_clear_bad(pmd))
141	continue;	135	continue;
142	clear_pte_range(tlb, pmd, addr, next);	136	free_pte_range(tlb, pmd);
143	} while (pmd++, addr = next, addr != end);	137	} while (pmd++, addr = next, addr != end);
144		138
145	if (empty_pmd) {	139	start &= PUD_MASK;
146	pud_clear(pud);	140	if (start < floor)
147	pmd_free_tlb(tlb, empty_pmd);	141	return;
		142	if (ceiling) {
		143	ceiling &= PUD_MASK;
		144	if (!ceiling)
		145	return;
148	}	146	}
		147	if (end - 1 > ceiling - 1)
		148	return;
		149
		150	pmd = pmd_offset(pud, start);
		151	pud_clear(pud);
		152	pmd_free_tlb(tlb, pmd);
149	}	153	}
150		154
151	static inline void clear_pud_range(struct mmu_gather *tlb, pgd_t *pgd,	155	static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
152	unsigned long addr, unsigned long end)	156	unsigned long addr, unsigned long end,
		157	unsigned long floor, unsigned long ceiling)
153	{	158	{
154	pud_t *pud;	159	pud_t *pud;
155	unsigned long next;	160	unsigned long next;
156	pud_t *empty_pud = NULL;	161	unsigned long start;
157		162
		163	start = addr;
158	pud = pud_offset(pgd, addr);	164	pud = pud_offset(pgd, addr);
159
160	/* Only free fully aligned ranges */
161	if (!((addr | end) & ~PGDIR_MASK))
162	empty_pud = pud;
163	do {	165	do {
164	next = pud_addr_end(addr, end);	166	next = pud_addr_end(addr, end);
165	if (pud_none_or_clear_bad(pud))	167	if (pud_none_or_clear_bad(pud))
166	continue;	168	continue;
167	clear_pmd_range(tlb, pud, addr, next);	169	free_pmd_range(tlb, pud, addr, next, floor, ceiling);
168	} while (pud++, addr = next, addr != end);	170	} while (pud++, addr = next, addr != end);
169		171
170	if (empty_pud) {	172	start &= PGDIR_MASK;
171	pgd_clear(pgd);	173	if (start < floor)
172	pud_free_tlb(tlb, empty_pud);	174	return;
		175	if (ceiling) {
		176	ceiling &= PGDIR_MASK;
		177	if (!ceiling)
		178	return;
173	}	179	}
		180	if (end - 1 > ceiling - 1)
		181	return;
		182
		183	pud = pud_offset(pgd, start);
		184	pgd_clear(pgd);
		185	pud_free_tlb(tlb, pud);
174	}	186	}
175		187
176	/*	188	/*
177	* This function clears user-level page tables of a process.	189	* This function frees user-level page tables of a process.
178	* Unlike other pagetable walks, some memory layouts might give end 0.	190	*
179	* Must be called with pagetable lock held.	191	* Must be called with pagetable lock held.
180	*/	192	*/
181	void clear_page_range(struct mmu_gather *tlb,	193	static inline void free_pgd_range(struct mmu_gather *tlb,
182	unsigned long addr, unsigned long end)	194	unsigned long addr, unsigned long end,
		195	unsigned long floor, unsigned long ceiling)
183	{	196	{
184	pgd_t *pgd;	197	pgd_t *pgd;
185	unsigned long next;	198	unsigned long next;
		199	unsigned long start;
186		200
		201	/*
		202	* The next few lines have given us lots of grief...
		203	*
		204	* Why are we testing PMD* at this top level? Because often
		205	* there will be no work to do at all, and we'd prefer not to
		206	* go all the way down to the bottom just to discover that.
		207	*
		208	* Why all these "- 1"s? Because 0 represents both the bottom
		209	* of the address space and the top of it (using -1 for the
		210	* top wouldn't help much: the masks would do the wrong thing).
		211	* The rule is that addr 0 and floor 0 refer to the bottom of
		212	* the address space, but end 0 and ceiling 0 refer to the top
		213	* Comparisons need to use "end - 1" and "ceiling - 1" (though
		214	* that end 0 case should be mythical).
		215	*
		216	* Wherever addr is brought up or ceiling brought down, we must
		217	* be careful to reject "the opposite 0" before it confuses the
		218	* subsequent tests. But what about where end is brought down
		219	* by PMD_SIZE below? no, end can't go down to 0 there.
		220	*
		221	* Whereas we round start (addr) and ceiling down, by different
		222	* masks at different levels, in order to test whether a table
		223	* now has no other vmas using it, so can be freed, we don't
		224	* bother to round floor or end up - the tests don't need that.
		225	*/
		226
		227	addr &= PMD_MASK;
		228	if (addr < floor) {
		229	addr += PMD_SIZE;
		230	if (!addr)
		231	return;
		232	}
		233	if (ceiling) {
		234	ceiling &= PMD_MASK;
		235	if (!ceiling)
		236	return;
		237	}
		238	if (end - 1 > ceiling - 1)
		239	end -= PMD_SIZE;
		240	if (addr > end - 1)
		241	return;
		242
		243	start = addr;
187	pgd = pgd_offset(tlb->mm, addr);	244	pgd = pgd_offset(tlb->mm, addr);
188	do {	245	do {
189	next = pgd_addr_end(addr, end);	246	next = pgd_addr_end(addr, end);
190	if (pgd_none_or_clear_bad(pgd))	247	if (pgd_none_or_clear_bad(pgd))
191	continue;	248	continue;
192	clear_pud_range(tlb, pgd, addr, next);	249	free_pud_range(tlb, pgd, addr, next, floor, ceiling);
193	} while (pgd++, addr = next, addr != end);	250	} while (pgd++, addr = next, addr != end);
		251
		252	if (!tlb_is_full_mm(tlb))
		253	flush_tlb_pgtables(tlb->mm, start, end);
		254	}
		255
		256	void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
		257	unsigned long floor, unsigned long ceiling)
		258	{
		259	while (vma) {
		260	struct vm_area_struct *next = vma->vm_next;
		261	unsigned long addr = vma->vm_start;
		262
		263	/* Optimization: gather nearby vmas into a single call down */
		264	while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
		265	vma = next;
		266	next = vma->vm_next;
		267	}
		268	free_pgd_range(*tlb, addr, vma->vm_end,
		269	floor, next? next->vm_start: ceiling);
		270	vma = next;
		271	}
194	}	272	}
195		273
196	pte_t fastcall * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)	274	pte_t fastcall * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)


diff --git a/mm/mmap.c b/mm/mmap.c index a95ebda27446..926d03015471 100644 --- a/mm/mmap.c +++ b/mm/mmap.c
@@ -29,6 +29,10 @@
29	#include <asm/cacheflush.h>	29	#include <asm/cacheflush.h>
30	#include <asm/tlb.h>	30	#include <asm/tlb.h>
31		31
		32	static void unmap_region(struct mm_struct *mm,
		33	struct vm_area_struct *vma, struct vm_area_struct *prev,
		34	unsigned long start, unsigned long end);
		35
32	/*	36	/*
33	* WARNING: the debugging will use recursive algorithms so never enable this	37	* WARNING: the debugging will use recursive algorithms so never enable this
34	* unless you know what you are doing.	38	* unless you know what you are doing.
@@ -1129,7 +1133,8 @@ unmap_and_free_vma:
1129	fput(file);	1133	fput(file);
1130		1134
1131	/* Undo any partial mapping done by a device driver. */	1135	/* Undo any partial mapping done by a device driver. */
1132	zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL);	1136	unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
		1137	charged = 0;
1133	free_vma:	1138	free_vma:
1134	kmem_cache_free(vm_area_cachep, vma);	1139	kmem_cache_free(vm_area_cachep, vma);
1135	unacct_error:	1140	unacct_error:
@@ -1572,66 +1577,6 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
1572	}	1577	}
1573	#endif	1578	#endif
1574		1579
1575	/*
1576	* Try to free as many page directory entries as we can,
1577	* without having to work very hard at actually scanning
1578	* the page tables themselves.
1579	*
1580	* Right now we try to free page tables if we have a nice
1581	* PGDIR-aligned area that got free'd up. We could be more
1582	* granular if we want to, but this is fast and simple,
1583	* and covers the bad cases.
1584	*
1585	* "prev", if it exists, points to a vma before the one
1586	* we just free'd - but there's no telling how much before.
1587	*/
1588	static void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
1589	unsigned long start, unsigned long end)
1590	{
1591	unsigned long first = start & PGDIR_MASK;
1592	unsigned long last = end + PGDIR_SIZE - 1;
1593	struct mm_struct *mm = tlb->mm;
1594
1595	if (last > MM_VM_SIZE(mm) || last < end)
1596	last = MM_VM_SIZE(mm);
1597
1598	if (!prev) {
1599	prev = mm->mmap;
1600	if (!prev)
1601	goto no_mmaps;
1602	if (prev->vm_end > start) {
1603	if (last > prev->vm_start)
1604	last = prev->vm_start;
1605	goto no_mmaps;
1606	}
1607	}
1608	for (;;) {
1609	struct vm_area_struct *next = prev->vm_next;
1610
1611	if (next) {
1612	if (next->vm_start < start) {
1613	prev = next;
1614	continue;
1615	}
1616	if (last > next->vm_start)
1617	last = next->vm_start;
1618	}
1619	if (prev->vm_end > first)
1620	first = prev->vm_end;
1621	break;
1622	}
1623	no_mmaps:
1624	if (last < first) /* for arches with discontiguous pgd indices */
1625	return;
1626	if (first < FIRST_USER_PGD_NR * PGDIR_SIZE)
1627	first = FIRST_USER_PGD_NR * PGDIR_SIZE;
1628	/* No point trying to free anything if we're in the same pte page */
1629	if ((first & PMD_MASK) < (last & PMD_MASK)) {
1630	clear_page_range(tlb, first, last);
1631	flush_tlb_pgtables(mm, first, last);
1632	}
1633	}
1634
1635	/* Normal function to fix up a mapping	1580	/* Normal function to fix up a mapping
1636	* This function is the default for when an area has no specific	1581	* This function is the default for when an area has no specific
1637	* function. This may be used as part of a more specific routine.	1582	* function. This may be used as part of a more specific routine.
@@ -1674,24 +1619,22 @@ static void unmap_vma_list(struct mm_struct *mm,
1674	* Called with the page table lock held.	1619	* Called with the page table lock held.
1675	*/	1620	*/
1676	static void unmap_region(struct mm_struct *mm,	1621	static void unmap_region(struct mm_struct *mm,
1677	struct vm_area_struct *vma,	1622	struct vm_area_struct *vma, struct vm_area_struct *prev,
1678	struct vm_area_struct *prev,	1623	unsigned long start, unsigned long end)
1679	unsigned long start,
1680	unsigned long end)
1681	{	1624	{
		1625	struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
1682	struct mmu_gather *tlb;	1626	struct mmu_gather *tlb;
1683	unsigned long nr_accounted = 0;	1627	unsigned long nr_accounted = 0;
1684		1628
1685	lru_add_drain();	1629	lru_add_drain();
		1630	spin_lock(&mm->page_table_lock);
1686	tlb = tlb_gather_mmu(mm, 0);	1631	tlb = tlb_gather_mmu(mm, 0);
1687	unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL);	1632	unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL);
1688	vm_unacct_memory(nr_accounted);	1633	vm_unacct_memory(nr_accounted);
1689		1634	free_pgtables(&tlb, vma, prev? prev->vm_end: 0,
1690	if (is_hugepage_only_range(mm, start, end - start))	1635	next? next->vm_start: 0);
1691	hugetlb_free_pgtables(tlb, prev, start, end);
1692	else
1693	free_pgtables(tlb, prev, start, end);
1694	tlb_finish_mmu(tlb, start, end);	1636	tlb_finish_mmu(tlb, start, end);
		1637	spin_unlock(&mm->page_table_lock);
1695	}	1638	}
1696		1639
1697	/*	1640	/*
@@ -1823,9 +1766,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1823	* Remove the vma's, and unmap the actual pages	1766	* Remove the vma's, and unmap the actual pages
1824	*/	1767	*/
1825	detach_vmas_to_be_unmapped(mm, mpnt, prev, end);	1768	detach_vmas_to_be_unmapped(mm, mpnt, prev, end);
1826	spin_lock(&mm->page_table_lock);
1827	unmap_region(mm, mpnt, prev, start, end);	1769	unmap_region(mm, mpnt, prev, start, end);
1828	spin_unlock(&mm->page_table_lock);
1829		1770
1830	/* Fix up all other VM information */	1771	/* Fix up all other VM information */
1831	unmap_vma_list(mm, mpnt);	1772	unmap_vma_list(mm, mpnt);
@@ -1957,25 +1898,21 @@ EXPORT_SYMBOL(do_brk);
1957	void exit_mmap(struct mm_struct *mm)	1898	void exit_mmap(struct mm_struct *mm)
1958	{	1899	{
1959	struct mmu_gather *tlb;	1900	struct mmu_gather *tlb;
1960	struct vm_area_struct *vma;	1901	struct vm_area_struct *vma = mm->mmap;
1961	unsigned long nr_accounted = 0;	1902	unsigned long nr_accounted = 0;
1962		1903
1963	lru_add_drain();	1904	lru_add_drain();
1964		1905
1965	spin_lock(&mm->page_table_lock);	1906	spin_lock(&mm->page_table_lock);
1966		1907
1967	tlb = tlb_gather_mmu(mm, 1);
1968	flush_cache_mm(mm);	1908	flush_cache_mm(mm);
1969	/* Use ~0UL here to ensure all VMAs in the mm are unmapped */	1909	tlb = tlb_gather_mmu(mm, 1);
1970	mm->map_count -= unmap_vmas(&tlb, mm, mm->mmap, 0,	1910	/* Use -1 here to ensure all VMAs in the mm are unmapped */
1971	~0UL, &nr_accounted, NULL);	1911	mm->map_count -= unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL);
1972	vm_unacct_memory(nr_accounted);	1912	vm_unacct_memory(nr_accounted);
1973	BUG_ON(mm->map_count); /* This is just debugging */	1913	free_pgtables(&tlb, vma, 0, 0);
1974	clear_page_range(tlb, FIRST_USER_PGD_NR * PGDIR_SIZE, MM_VM_SIZE(mm));
1975
1976	tlb_finish_mmu(tlb, 0, MM_VM_SIZE(mm));	1914	tlb_finish_mmu(tlb, 0, MM_VM_SIZE(mm));
1977		1915
1978	vma = mm->mmap;
1979	mm->mmap = mm->mmap_cache = NULL;	1916	mm->mmap = mm->mmap_cache = NULL;
1980	mm->mm_rb = RB_ROOT;	1917	mm->mm_rb = RB_ROOT;
1981	set_mm_counter(mm, rss, 0);	1918	set_mm_counter(mm, rss, 0);
@@ -1993,6 +1930,9 @@ void exit_mmap(struct mm_struct *mm)
1993	remove_vm_struct(vma);	1930	remove_vm_struct(vma);
1994	vma = next;	1931	vma = next;
1995	}	1932	}
		1933
		1934	BUG_ON(mm->map_count); /* This is just debugging */
		1935	BUG_ON(mm->nr_ptes); /* This is just debugging */
1996	}	1936	}
1997		1937
1998	/* Insert vm structure into process list sorted by address	1938	/* Insert vm structure into process list sorted by address