about | summary | refs | log | tree | commit | diff | stats
path: root/mm/memory.c
diff options
context:
space:
mode:
author: Hugh Dickins <hugh@veritas.com> 2005-10-29 21:16:23 -0400
committer: Linus Torvalds <torvalds@g5.osdl.org> 2005-10-30 00:40:40 -0400
commit: c74df32c724a1652ad8399b4891bb02c9d43743a (patch)
tree: 5a79d56fdcf7dc2053a277dbf6db7c3b339e9659 /mm/memory.c
parent: 1bb3630e89cb8a7b3d3807629c20c5bad88290ff (diff)
[PATCH] mm: ptd_alloc take ptlock
Second step in pushing down the page_table_lock. Remove the temporary bridging hack from __pud_alloc, __pmd_alloc, __pte_alloc: expect callers not to hold page_table_lock, whether it's on init_mm or a user mm; take page_table_lock internally to check if a racing task already allocated.

Convert their callers from common code. But avoid coming back to change them again later: instead of moving the spin_lock(&mm->page_table_lock) down, switch over to new macros pte_alloc_map_lock and pte_unmap_unlock, which encapsulate the mapping+locking and unlocking+unmapping together, and in the end may use alternatives to the mm page_table_lock itself.

These callers all hold mmap_sem (some exclusively, some not), so at no level can a page table be whipped away from beneath them; and pte_alloc uses the "atomic" pmd_present to test whether it needs to allocate. It appears that on all arches we can safely descend without page_table_lock.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'mm/memory.c')
-rw-r--r-- mm/memory.c 104
1 files changed, 32 insertions, 72 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 4bdd1186b43b..a40e4b1cee4f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -282,14 +282,11 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
282 282
283int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) 283int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
284{ 284{
285 struct page *new; 285 struct page *new = pte_alloc_one(mm, address);
286
287 spin_unlock(&mm->page_table_lock);
288 new = pte_alloc_one(mm, address);
289 spin_lock(&mm->page_table_lock);
290 if (!new) 286 if (!new)
291 return -ENOMEM; 287 return -ENOMEM;
292 288
289 spin_lock(&mm->page_table_lock);
293 if (pmd_present(*pmd)) /* Another has populated it */ 290 if (pmd_present(*pmd)) /* Another has populated it */
294 pte_free(new); 291 pte_free(new);
295 else { 292 else {
@@ -297,6 +294,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
297 inc_page_state(nr_page_table_pages); 294 inc_page_state(nr_page_table_pages);
298 pmd_populate(mm, pmd, new); 295 pmd_populate(mm, pmd, new);
299 } 296 }
297 spin_unlock(&mm->page_table_lock);
300 return 0; 298 return 0;
301} 299}
302 300
@@ -344,9 +342,6 @@ void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
344 * copy one vm_area from one task to the other. Assumes the page tables 342 * copy one vm_area from one task to the other. Assumes the page tables
345 * already present in the new task to be cleared in the whole range 343 * already present in the new task to be cleared in the whole range
346 * covered by this vma. 344 * covered by this vma.
347 *
348 * dst->page_table_lock is held on entry and exit,
349 * but may be dropped within p[mg]d_alloc() and pte_alloc_map().
350 */ 345 */
351 346
352static inline void 347static inline void
@@ -419,17 +414,19 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
419 unsigned long addr, unsigned long end) 414 unsigned long addr, unsigned long end)
420{ 415{
421 pte_t *src_pte, *dst_pte; 416 pte_t *src_pte, *dst_pte;
417 spinlock_t *src_ptl, *dst_ptl;
422 int progress = 0; 418 int progress = 0;
423 int rss[2]; 419 int rss[2];
424 420
425again: 421again:
426 rss[1] = rss[0] = 0; 422 rss[1] = rss[0] = 0;
427 dst_pte = pte_alloc_map(dst_mm, dst_pmd, addr); 423 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
428 if (!dst_pte) 424 if (!dst_pte)
429 return -ENOMEM; 425 return -ENOMEM;
430 src_pte = pte_offset_map_nested(src_pmd, addr); 426 src_pte = pte_offset_map_nested(src_pmd, addr);
427 src_ptl = &src_mm->page_table_lock;
428 spin_lock(src_ptl);
431 429
432 spin_lock(&src_mm->page_table_lock);
433 do { 430 do {
434 /* 431 /*
435 * We are holding two locks at this point - either of them 432 * We are holding two locks at this point - either of them
@@ -438,8 +435,8 @@ again:
438 if (progress >= 32) { 435 if (progress >= 32) {
439 progress = 0; 436 progress = 0;
440 if (need_resched() || 437 if (need_resched() ||
441 need_lockbreak(&src_mm->page_table_lock) || 438 need_lockbreak(src_ptl) ||
442 need_lockbreak(&dst_mm->page_table_lock)) 439 need_lockbreak(dst_ptl))
443 break; 440 break;
444 } 441 }
445 if (pte_none(*src_pte)) { 442 if (pte_none(*src_pte)) {
@@ -449,12 +446,12 @@ again:
449 copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss); 446 copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
450 progress += 8; 447 progress += 8;
451 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); 448 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
452 spin_unlock(&src_mm->page_table_lock);
453 449
450 spin_unlock(src_ptl);
454 pte_unmap_nested(src_pte - 1); 451 pte_unmap_nested(src_pte - 1);
455 pte_unmap(dst_pte - 1);
456 add_mm_rss(dst_mm, rss[0], rss[1]); 452 add_mm_rss(dst_mm, rss[0], rss[1]);
457 cond_resched_lock(&dst_mm->page_table_lock); 453 pte_unmap_unlock(dst_pte - 1, dst_ptl);
454 cond_resched();
458 if (addr != end) 455 if (addr != end)
459 goto again; 456 goto again;
460 return 0; 457 return 0;
@@ -1049,8 +1046,9 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1049 unsigned long addr, unsigned long end, pgprot_t prot) 1046 unsigned long addr, unsigned long end, pgprot_t prot)
1050{ 1047{
1051 pte_t *pte; 1048 pte_t *pte;
1049 spinlock_t *ptl;
1052 1050
1053 pte = pte_alloc_map(mm, pmd, addr); 1051 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1054 if (!pte) 1052 if (!pte)
1055 return -ENOMEM; 1053 return -ENOMEM;
1056 do { 1054 do {
@@ -1062,7 +1060,7 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1062 BUG_ON(!pte_none(*pte)); 1060 BUG_ON(!pte_none(*pte));
1063 set_pte_at(mm, addr, pte, zero_pte); 1061 set_pte_at(mm, addr, pte, zero_pte);
1064 } while (pte++, addr += PAGE_SIZE, addr != end); 1062 } while (pte++, addr += PAGE_SIZE, addr != end);
1065 pte_unmap(pte - 1); 1063 pte_unmap_unlock(pte - 1, ptl);
1066 return 0; 1064 return 0;
1067} 1065}
1068 1066
@@ -1112,14 +1110,12 @@ int zeromap_page_range(struct vm_area_struct *vma,
1112 BUG_ON(addr >= end); 1110 BUG_ON(addr >= end);
1113 pgd = pgd_offset(mm, addr); 1111 pgd = pgd_offset(mm, addr);
1114 flush_cache_range(vma, addr, end); 1112 flush_cache_range(vma, addr, end);
1115 spin_lock(&mm->page_table_lock);
1116 do { 1113 do {
1117 next = pgd_addr_end(addr, end); 1114 next = pgd_addr_end(addr, end);
1118 err = zeromap_pud_range(mm, pgd, addr, next, prot); 1115 err = zeromap_pud_range(mm, pgd, addr, next, prot);
1119 if (err) 1116 if (err)
1120 break; 1117 break;
1121 } while (pgd++, addr = next, addr != end); 1118 } while (pgd++, addr = next, addr != end);
1122 spin_unlock(&mm->page_table_lock);
1123 return err; 1119 return err;
1124} 1120}
1125 1121
@@ -1133,8 +1129,9 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1133 unsigned long pfn, pgprot_t prot) 1129 unsigned long pfn, pgprot_t prot)
1134{ 1130{
1135 pte_t *pte; 1131 pte_t *pte;
1132 spinlock_t *ptl;
1136 1133
1137 pte = pte_alloc_map(mm, pmd, addr); 1134 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1138 if (!pte) 1135 if (!pte)
1139 return -ENOMEM; 1136 return -ENOMEM;
1140 do { 1137 do {
@@ -1142,7 +1139,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1142 set_pte_at(mm, addr, pte, pfn_pte(pfn, prot)); 1139 set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
1143 pfn++; 1140 pfn++;
1144 } while (pte++, addr += PAGE_SIZE, addr != end); 1141 } while (pte++, addr += PAGE_SIZE, addr != end);
1145 pte_unmap(pte - 1); 1142 pte_unmap_unlock(pte - 1, ptl);
1146 return 0; 1143 return 0;
1147} 1144}
1148 1145
@@ -1210,7 +1207,6 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1210 pfn -= addr >> PAGE_SHIFT; 1207 pfn -= addr >> PAGE_SHIFT;
1211 pgd = pgd_offset(mm, addr); 1208 pgd = pgd_offset(mm, addr);
1212 flush_cache_range(vma, addr, end); 1209 flush_cache_range(vma, addr, end);
1213 spin_lock(&mm->page_table_lock);
1214 do { 1210 do {
1215 next = pgd_addr_end(addr, end); 1211 next = pgd_addr_end(addr, end);
1216 err = remap_pud_range(mm, pgd, addr, next, 1212 err = remap_pud_range(mm, pgd, addr, next,
@@ -1218,7 +1214,6 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1218 if (err) 1214 if (err)
1219 break; 1215 break;
1220 } while (pgd++, addr = next, addr != end); 1216 } while (pgd++, addr = next, addr != end);
1221 spin_unlock(&mm->page_table_lock);
1222 return err; 1217 return err;
1223} 1218}
1224EXPORT_SYMBOL(remap_pfn_range); 1219EXPORT_SYMBOL(remap_pfn_range);
@@ -1985,17 +1980,9 @@ static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
1985 * with external mmu caches can use to update those (ie the Sparc or 1980 * with external mmu caches can use to update those (ie the Sparc or
1986 * PowerPC hashed page tables that act as extended TLBs). 1981 * PowerPC hashed page tables that act as extended TLBs).
1987 * 1982 *
1988 * Note the "page_table_lock". It is to protect against kswapd removing 1983 * We enter with non-exclusive mmap_sem (to exclude vma changes,
1989 * pages from under us. Note that kswapd only ever _removes_ pages, never 1984 * but allow concurrent faults), and pte mapped but not yet locked.
1990 * adds them. As such, once we have noticed that the page is not present, 1985 * We return with mmap_sem still held, but pte unmapped and unlocked.
1991 * we can drop the lock early.
1992 *
1993 * The adding of pages is protected by the MM semaphore (which we hold),
1994 * so we don't need to worry about a page being suddenly been added into
1995 * our VM.
1996 *
1997 * We enter with the pagetable spinlock held, we are supposed to
1998 * release it when done.
1999 */ 1986 */
2000static inline int handle_pte_fault(struct mm_struct *mm, 1987static inline int handle_pte_fault(struct mm_struct *mm,
2001 struct vm_area_struct *vma, unsigned long address, 1988 struct vm_area_struct *vma, unsigned long address,
@@ -2003,6 +1990,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
2003{ 1990{
2004 pte_t entry; 1991 pte_t entry;
2005 1992
1993 spin_lock(&mm->page_table_lock);
2006 entry = *pte; 1994 entry = *pte;
2007 if (!pte_present(entry)) { 1995 if (!pte_present(entry)) {
2008 if (pte_none(entry)) { 1996 if (pte_none(entry)) {
@@ -2051,30 +2039,18 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2051 if (unlikely(is_vm_hugetlb_page(vma))) 2039 if (unlikely(is_vm_hugetlb_page(vma)))
2052 return hugetlb_fault(mm, vma, address, write_access); 2040 return hugetlb_fault(mm, vma, address, write_access);
2053 2041
2054 /*
2055 * We need the page table lock to synchronize with kswapd
2056 * and the SMP-safe atomic PTE updates.
2057 */
2058 pgd = pgd_offset(mm, address); 2042 pgd = pgd_offset(mm, address);
2059 spin_lock(&mm->page_table_lock);
2060
2061 pud = pud_alloc(mm, pgd, address); 2043 pud = pud_alloc(mm, pgd, address);
2062 if (!pud) 2044 if (!pud)
2063 goto oom; 2045 return VM_FAULT_OOM;
2064
2065 pmd = pmd_alloc(mm, pud, address); 2046 pmd = pmd_alloc(mm, pud, address);
2066 if (!pmd) 2047 if (!pmd)
2067 goto oom; 2048 return VM_FAULT_OOM;
2068
2069 pte = pte_alloc_map(mm, pmd, address); 2049 pte = pte_alloc_map(mm, pmd, address);
2070 if (!pte) 2050 if (!pte)
2071 goto oom; 2051 return VM_FAULT_OOM;
2072
2073 return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
2074 2052
2075 oom: 2053 return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
2076 spin_unlock(&mm->page_table_lock);
2077 return VM_FAULT_OOM;
2078} 2054}
2079 2055
2080#ifndef __PAGETABLE_PUD_FOLDED 2056#ifndef __PAGETABLE_PUD_FOLDED
@@ -2084,24 +2060,16 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2084 */ 2060 */
2085int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) 2061int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
2086{ 2062{
2087 pud_t *new; 2063 pud_t *new = pud_alloc_one(mm, address);
2088 2064 if (!new)
2089 if (mm != &init_mm) /* Temporary bridging hack */
2090 spin_unlock(&mm->page_table_lock);
2091 new = pud_alloc_one(mm, address);
2092 if (!new) {
2093 if (mm != &init_mm) /* Temporary bridging hack */
2094 spin_lock(&mm->page_table_lock);
2095 return -ENOMEM; 2065 return -ENOMEM;
2096 }
2097 2066
2098 spin_lock(&mm->page_table_lock); 2067 spin_lock(&mm->page_table_lock);
2099 if (pgd_present(*pgd)) /* Another has populated it */ 2068 if (pgd_present(*pgd)) /* Another has populated it */
2100 pud_free(new); 2069 pud_free(new);
2101 else 2070 else
2102 pgd_populate(mm, pgd, new); 2071 pgd_populate(mm, pgd, new);
2103 if (mm == &init_mm) /* Temporary bridging hack */ 2072 spin_unlock(&mm->page_table_lock);
2104 spin_unlock(&mm->page_table_lock);
2105 return 0; 2073 return 0;
2106} 2074}
2107#endif /* __PAGETABLE_PUD_FOLDED */ 2075#endif /* __PAGETABLE_PUD_FOLDED */
@@ -2113,16 +2081,9 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
2113 */ 2081 */
2114int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) 2082int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
2115{ 2083{
2116 pmd_t *new; 2084 pmd_t *new = pmd_alloc_one(mm, address);
2117 2085 if (!new)
2118 if (mm != &init_mm) /* Temporary bridging hack */
2119 spin_unlock(&mm->page_table_lock);
2120 new = pmd_alloc_one(mm, address);
2121 if (!new) {
2122 if (mm != &init_mm) /* Temporary bridging hack */
2123 spin_lock(&mm->page_table_lock);
2124 return -ENOMEM; 2086 return -ENOMEM;
2125 }
2126 2087
2127 spin_lock(&mm->page_table_lock); 2088 spin_lock(&mm->page_table_lock);
2128#ifndef __ARCH_HAS_4LEVEL_HACK 2089#ifndef __ARCH_HAS_4LEVEL_HACK
@@ -2136,8 +2097,7 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
2136 else 2097 else
2137 pgd_populate(mm, pud, new); 2098 pgd_populate(mm, pud, new);
2138#endif /* __ARCH_HAS_4LEVEL_HACK */ 2099#endif /* __ARCH_HAS_4LEVEL_HACK */
2139 if (mm == &init_mm) /* Temporary bridging hack */ 2100 spin_unlock(&mm->page_table_lock);
2140 spin_unlock(&mm->page_table_lock);
2141 return 0; 2101 return 0;
2142} 2102}
2143#endif /* __PAGETABLE_PMD_FOLDED */ 2103#endif /* __PAGETABLE_PMD_FOLDED */