author | Hugh Dickins <hugh@veritas.com> | 2005-04-19 16:29:15 -0400
---|---|---
committer | Linus Torvalds <torvalds@ppc970.osdl.org.(none)> | 2005-04-19 16:29:15 -0400
commit | e0da382c92626ad1d7f4b7527d19b80104d67a83 (patch) |
tree | b3f455518c286ee14cb2755ced8808487bca7911 /mm/mmap.c |
parent | 9f6c6fc505560465be0964eb4da1b6ca97bd3951 (diff) |
[PATCH] freepgt: free_pgtables use vma list
The recent woes with some arches needing their own pgd_addr_end macro; the 4-level
clear_page_range regression since 2.6.10's clear_page_tables; and that code's
long-standing, well-known inefficiency in searching throughout the higher-level
page tables for the few entries to clear and free: all can be blamed on
ignoring the list of vmas when we free page tables.
Replace exit_mmap's clear_page_range over the total user address space with
free_pgtables operating on the mm's vma list; unmap_region now uses it in the same
way, supplying the floor and ceiling beyond which it may not free tables. This
brings the lmbench fork/exec/sh numbers back to 2.6.10 (unless preempt is enabled,
in which case the latency fixes spoil unmap_vmas throughput).
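Concretely, the floor and ceiling come from the vmas bordering the region being unmapped. The fragment below restates the new unmap_region call site from the mm/mmap.c hunk further down (identifiers as in the patch; not a standalone function, just the bounding logic):

```c
/* Neighbouring vmas bound how far page tables may be freed. */
struct vm_area_struct *next = prev ? prev->vm_next : mm->mmap;

unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL);
free_pgtables(&tlb, vma,
	      prev ? prev->vm_end : 0,		/* floor: no freeing below the previous vma */
	      next ? next->vm_start : 0);	/* ceiling: no freeing above the next vma */
```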
Beware: the do_mmap_pgoff driver failure case must now use unmap_region
instead of zap_page_range, since a page table might have been allocated, and a
page table can only be freed while some vma still touches it.
Move free_pgtables from mmap.c to memory.c, where its lower levels are adapted
from the clear_page_range levels. (Most of free_pgtables' old code was
actually for a case that no longer exists, prev not properly set up, dating from
before hch gave us split_vma.) Pass mmu_gather** in the public interfaces, since
we might want to add latency lockdrops later; no attempt is made to do so yet, since
going by vma should itself reduce latency.
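The memory.c side is not part of this mm/mmap.c diff; judging by the call sites here and the mmu_gather** remark above, the relocated interface is presumably along these lines (a sketch, not the patch itself):

```c
/* mm/memory.c (companion patch, sketch): walk the vma list and free only
 * the page tables covered by those vmas, clamped to floor/ceiling
 * (0 meaning "no limit" on that side). */
void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
		   unsigned long floor, unsigned long ceiling);
```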
But what if is_hugepage_only_range? Those ia64 and ppc64 cases need careful
examination: put that off until a later patch of the series.
What of x86_64's 32-bit vdso page, which __map_syscall32 maps outside any vma?
And what range should be passed to sparc64's flush_tlb_pgtables? It's less clear
to me now that we need to do more than is done here: every PMD_SIZE ever occupied
will be flushed, but do we really have to flush every PGDIR_SIZE ever partially
occupied? It would be a shame to complicate it unnecessarily.
Special thanks to David Miller for time spent repairing my ceilings.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'mm/mmap.c')
-rw-r--r-- | mm/mmap.c | 102 |
1 file changed, 21 insertions, 81 deletions
```diff
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -29,6 +29,10 @@
 #include <asm/cacheflush.h>
 #include <asm/tlb.h>
 
+static void unmap_region(struct mm_struct *mm,
+		struct vm_area_struct *vma, struct vm_area_struct *prev,
+		unsigned long start, unsigned long end);
+
 /*
  * WARNING: the debugging will use recursive algorithms so never enable this
  * unless you know what you are doing.
@@ -1129,7 +1133,8 @@ unmap_and_free_vma:
 	fput(file);
 
 	/* Undo any partial mapping done by a device driver. */
-	zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL);
+	unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
+	charged = 0;
 free_vma:
 	kmem_cache_free(vm_area_cachep, vma);
 unacct_error:
@@ -1572,66 +1577,6 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
 }
 #endif
 
-/*
- * Try to free as many page directory entries as we can,
- * without having to work very hard at actually scanning
- * the page tables themselves.
- *
- * Right now we try to free page tables if we have a nice
- * PGDIR-aligned area that got free'd up. We could be more
- * granular if we want to, but this is fast and simple,
- * and covers the bad cases.
- *
- * "prev", if it exists, points to a vma before the one
- * we just free'd - but there's no telling how much before.
- */
-static void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
-	unsigned long start, unsigned long end)
-{
-	unsigned long first = start & PGDIR_MASK;
-	unsigned long last = end + PGDIR_SIZE - 1;
-	struct mm_struct *mm = tlb->mm;
-
-	if (last > MM_VM_SIZE(mm) || last < end)
-		last = MM_VM_SIZE(mm);
-
-	if (!prev) {
-		prev = mm->mmap;
-		if (!prev)
-			goto no_mmaps;
-		if (prev->vm_end > start) {
-			if (last > prev->vm_start)
-				last = prev->vm_start;
-			goto no_mmaps;
-		}
-	}
-	for (;;) {
-		struct vm_area_struct *next = prev->vm_next;
-
-		if (next) {
-			if (next->vm_start < start) {
-				prev = next;
-				continue;
-			}
-			if (last > next->vm_start)
-				last = next->vm_start;
-		}
-		if (prev->vm_end > first)
-			first = prev->vm_end;
-		break;
-	}
-no_mmaps:
-	if (last < first)	/* for arches with discontiguous pgd indices */
-		return;
-	if (first < FIRST_USER_PGD_NR * PGDIR_SIZE)
-		first = FIRST_USER_PGD_NR * PGDIR_SIZE;
-	/* No point trying to free anything if we're in the same pte page */
-	if ((first & PMD_MASK) < (last & PMD_MASK)) {
-		clear_page_range(tlb, first, last);
-		flush_tlb_pgtables(mm, first, last);
-	}
-}
-
 /* Normal function to fix up a mapping
  * This function is the default for when an area has no specific
  * function. This may be used as part of a more specific routine.
@@ -1674,24 +1619,22 @@ static void unmap_vma_list(struct mm_struct *mm,
  * Called with the page table lock held.
  */
 static void unmap_region(struct mm_struct *mm,
-	struct vm_area_struct *vma,
-	struct vm_area_struct *prev,
-	unsigned long start,
-	unsigned long end)
+		struct vm_area_struct *vma, struct vm_area_struct *prev,
+		unsigned long start, unsigned long end)
 {
+	struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
 	struct mmu_gather *tlb;
 	unsigned long nr_accounted = 0;
 
 	lru_add_drain();
+	spin_lock(&mm->page_table_lock);
 	tlb = tlb_gather_mmu(mm, 0);
 	unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL);
 	vm_unacct_memory(nr_accounted);
-
-	if (is_hugepage_only_range(mm, start, end - start))
-		hugetlb_free_pgtables(tlb, prev, start, end);
-	else
-		free_pgtables(tlb, prev, start, end);
+	free_pgtables(&tlb, vma, prev? prev->vm_end: 0,
+				 next? next->vm_start: 0);
 	tlb_finish_mmu(tlb, start, end);
+	spin_unlock(&mm->page_table_lock);
 }
 
 /*
@@ -1823,9 +1766,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
 	 * Remove the vma's, and unmap the actual pages
 	 */
 	detach_vmas_to_be_unmapped(mm, mpnt, prev, end);
-	spin_lock(&mm->page_table_lock);
 	unmap_region(mm, mpnt, prev, start, end);
-	spin_unlock(&mm->page_table_lock);
 
 	/* Fix up all other VM information */
 	unmap_vma_list(mm, mpnt);
@@ -1957,25 +1898,21 @@ EXPORT_SYMBOL(do_brk);
 void exit_mmap(struct mm_struct *mm)
 {
 	struct mmu_gather *tlb;
-	struct vm_area_struct *vma;
+	struct vm_area_struct *vma = mm->mmap;
 	unsigned long nr_accounted = 0;
 
 	lru_add_drain();
 
 	spin_lock(&mm->page_table_lock);
 
-	tlb = tlb_gather_mmu(mm, 1);
 	flush_cache_mm(mm);
-	/* Use ~0UL here to ensure all VMAs in the mm are unmapped */
-	mm->map_count -= unmap_vmas(&tlb, mm, mm->mmap, 0,
-					~0UL, &nr_accounted, NULL);
+	tlb = tlb_gather_mmu(mm, 1);
+	/* Use -1 here to ensure all VMAs in the mm are unmapped */
+	mm->map_count -= unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL);
 	vm_unacct_memory(nr_accounted);
-	BUG_ON(mm->map_count);	/* This is just debugging */
-	clear_page_range(tlb, FIRST_USER_PGD_NR * PGDIR_SIZE, MM_VM_SIZE(mm));
-
+	free_pgtables(&tlb, vma, 0, 0);
 	tlb_finish_mmu(tlb, 0, MM_VM_SIZE(mm));
 
-	vma = mm->mmap;
 	mm->mmap = mm->mmap_cache = NULL;
 	mm->mm_rb = RB_ROOT;
 	set_mm_counter(mm, rss, 0);
@@ -1993,6 +1930,9 @@ void exit_mmap(struct mm_struct *mm)
 		remove_vm_struct(vma);
 		vma = next;
 	}
+
+	BUG_ON(mm->map_count);	/* This is just debugging */
+	BUG_ON(mm->nr_ptes);	/* This is just debugging */
 }
 
 /* Insert vm structure into process list sorted by address
```