about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Hugh Dickins <hugh@veritas.com> 2005-10-29 21:16:29 -0400
committer: Linus Torvalds <torvalds@g5.osdl.org> 2005-10-30 00:40:40 -0400
commit: 8f4f8c164cb4af1432cc25eda82928ea4519ba72 (patch)
tree: 49cd3c62069df1f8d6c863b9806923de16c10e8b
parent: 663b97f7efd001b0c56bd5fce059c5272725b86f (diff)
[PATCH] mm: unlink vma before pagetables
In most places the descent from pgd to pud to pmd to pte holds mmap_sem (exclusively or not), which ensures that free_pgtables cannot be freeing page tables from any level at the same time. But truncation and reverse mapping descend without mmap_sem.

No problem: just make sure that a vma is unlinked from its prio_tree (or nonlinear list) and from its anon_vma list, after zapping the vma, but before freeing its page tables. Then neither vmtruncate nor rmap can reach that vma whose page tables are now volatile (nor do they need to reach it, since all its page entries have been zapped by this stage).

The i_mmap_lock and anon_vma->lock already serialize this correctly; but the locking hierarchy is such that we cannot take them while holding page_table_lock. Well, we're trying to push that down anyway. So in this patch, move anon_vma_unlink and unlink_file_vma into free_pgtables, at the same time as moving page_table_lock around calls to unmap_vmas.

tlb_gather_mmu and tlb_finish_mmu then fall outside the page_table_lock, but we made them preempt_disable and preempt_enable earlier; and a long source audit of all the architectures has shown no problem with removing page_table_lock from them. free_pgtables doesn't need page_table_lock for itself, nor for what it calls; tlb->mm->nr_ptes is usually protected by page_table_lock, but partly by non-exclusive mmap_sem - here it's decremented with exclusive mmap_sem, or mm_users 0. update_hiwater_rss and vm_unacct_memory don't need page_table_lock either.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r-- mm/memory.c | 12
-rw-r--r-- mm/mmap.c | 23
2 files changed, 16 insertions, 19 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 24ba688876d6..4ea89a2e3a83 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -260,6 +260,12 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
260 struct vm_area_struct *next = vma->vm_next; 260 struct vm_area_struct *next = vma->vm_next;
261 unsigned long addr = vma->vm_start; 261 unsigned long addr = vma->vm_start;
262 262
263 /*
264 * Hide vma from rmap and vmtruncate before freeing pgtables
265 */
266 anon_vma_unlink(vma);
267 unlink_file_vma(vma);
268
263 if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) { 269 if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) {
264 hugetlb_free_pgd_range(tlb, addr, vma->vm_end, 270 hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
265 floor, next? next->vm_start: ceiling); 271 floor, next? next->vm_start: ceiling);
@@ -272,6 +278,8 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
272 HPAGE_SIZE)) { 278 HPAGE_SIZE)) {
273 vma = next; 279 vma = next;
274 next = vma->vm_next; 280 next = vma->vm_next;
281 anon_vma_unlink(vma);
282 unlink_file_vma(vma);
275 } 283 }
276 free_pgd_range(tlb, addr, vma->vm_end, 284 free_pgd_range(tlb, addr, vma->vm_end,
277 floor, next? next->vm_start: ceiling); 285 floor, next? next->vm_start: ceiling);
@@ -798,12 +806,12 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
798 } 806 }
799 807
800 lru_add_drain(); 808 lru_add_drain();
801 spin_lock(&mm->page_table_lock);
802 tlb = tlb_gather_mmu(mm, 0); 809 tlb = tlb_gather_mmu(mm, 0);
803 update_hiwater_rss(mm); 810 update_hiwater_rss(mm);
811 spin_lock(&mm->page_table_lock);
804 end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details); 812 end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details);
805 tlb_finish_mmu(tlb, address, end);
806 spin_unlock(&mm->page_table_lock); 813 spin_unlock(&mm->page_table_lock);
814 tlb_finish_mmu(tlb, address, end);
807 return end; 815 return end;
808} 816}
809 817
diff --git a/mm/mmap.c b/mm/mmap.c
index d931d7e49ac9..fa35323a3c5b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -203,14 +203,6 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
203{ 203{
204 struct vm_area_struct *next = vma->vm_next; 204 struct vm_area_struct *next = vma->vm_next;
205 205
206 /*
207 * Hide vma from rmap and vmtruncate before freeing page tables:
208 * to be moved into free_pgtables once page_table_lock is lifted
209 * from it, but until then lock ordering forbids that move.
210 */
211 anon_vma_unlink(vma);
212 unlink_file_vma(vma);
213
214 might_sleep(); 206 might_sleep();
215 if (vma->vm_ops && vma->vm_ops->close) 207 if (vma->vm_ops && vma->vm_ops->close)
216 vma->vm_ops->close(vma); 208 vma->vm_ops->close(vma);
@@ -1679,15 +1671,15 @@ static void unmap_region(struct mm_struct *mm,
1679 unsigned long nr_accounted = 0; 1671 unsigned long nr_accounted = 0;
1680 1672
1681 lru_add_drain(); 1673 lru_add_drain();
1682 spin_lock(&mm->page_table_lock);
1683 tlb = tlb_gather_mmu(mm, 0); 1674 tlb = tlb_gather_mmu(mm, 0);
1684 update_hiwater_rss(mm); 1675 update_hiwater_rss(mm);
1676 spin_lock(&mm->page_table_lock);
1685 unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL); 1677 unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL);
1678 spin_unlock(&mm->page_table_lock);
1686 vm_unacct_memory(nr_accounted); 1679 vm_unacct_memory(nr_accounted);
1687 free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, 1680 free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
1688 next? next->vm_start: 0); 1681 next? next->vm_start: 0);
1689 tlb_finish_mmu(tlb, start, end); 1682 tlb_finish_mmu(tlb, start, end);
1690 spin_unlock(&mm->page_table_lock);
1691} 1683}
1692 1684
1693/* 1685/*
@@ -1962,23 +1954,20 @@ void exit_mmap(struct mm_struct *mm)
1962 unsigned long end; 1954 unsigned long end;
1963 1955
1964 lru_add_drain(); 1956 lru_add_drain();
1965
1966 spin_lock(&mm->page_table_lock);
1967
1968 flush_cache_mm(mm); 1957 flush_cache_mm(mm);
1969 tlb = tlb_gather_mmu(mm, 1); 1958 tlb = tlb_gather_mmu(mm, 1);
1970 /* Don't update_hiwater_rss(mm) here, do_exit already did */ 1959 /* Don't update_hiwater_rss(mm) here, do_exit already did */
1971 /* Use -1 here to ensure all VMAs in the mm are unmapped */ 1960 /* Use -1 here to ensure all VMAs in the mm are unmapped */
1961 spin_lock(&mm->page_table_lock);
1972 end = unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL); 1962 end = unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL);
1963 spin_unlock(&mm->page_table_lock);
1973 vm_unacct_memory(nr_accounted); 1964 vm_unacct_memory(nr_accounted);
1974 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); 1965 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
1975 tlb_finish_mmu(tlb, 0, end); 1966 tlb_finish_mmu(tlb, 0, end);
1976 1967
1977 spin_unlock(&mm->page_table_lock);
1978
1979 /* 1968 /*
1980 * Walk the list again, actually closing and freeing it 1969 * Walk the list again, actually closing and freeing it,
1981 * without holding any MM locks. 1970 * with preemption enabled, without holding any MM locks.
1982 */ 1971 */
1983 while (vma) 1972 while (vma)
1984 vma = remove_vma(vma); 1973 vma = remove_vma(vma);