about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author: Hugh Dickins <hugh@veritas.com> 2005-10-29 21:16:23 -0400
committer: Linus Torvalds <torvalds@g5.osdl.org> 2005-10-30 00:40:40 -0400
commit: c74df32c724a1652ad8399b4891bb02c9d43743a (patch)
tree: 5a79d56fdcf7dc2053a277dbf6db7c3b339e9659
parent: 1bb3630e89cb8a7b3d3807629c20c5bad88290ff (diff)
[PATCH] mm: ptd_alloc take ptlock
Second step in pushing down the page_table_lock. Remove the temporary bridging hack from __pud_alloc, __pmd_alloc, __pte_alloc: expect callers not to hold page_table_lock, whether it's on init_mm or a user mm; take page_table_lock internally to check if a racing task already allocated.

Convert their callers from common code. But avoid coming back to change them again later: instead of moving the spin_lock(&mm->page_table_lock) down, switch over to new macros pte_alloc_map_lock and pte_unmap_unlock, which encapsulate the mapping+locking and unlocking+unmapping together, and in the end may use alternatives to the mm page_table_lock itself.

These callers all hold mmap_sem (some exclusively, some not), so at no level can a page table be whipped away from beneath them; and pte_alloc uses the "atomic" pmd_present to test whether it needs to allocate. It appears that on all arches we can safely descend without page_table_lock.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r-- fs/exec.c 14
-rw-r--r-- include/linux/mm.h 18
-rw-r--r-- kernel/fork.c 2
-rw-r--r-- mm/fremap.c 48
-rw-r--r-- mm/hugetlb.c 12
-rw-r--r-- mm/memory.c 104
-rw-r--r-- mm/mremap.c 27
7 files changed, 90 insertions, 135 deletions
diff --git a/fs/exec.c b/fs/exec.c
index 9bb55c8cf224..ba73797eb4cb 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -309,25 +309,24 @@ void install_arg_page(struct vm_area_struct *vma,
309 pud_t * pud; 309 pud_t * pud;
310 pmd_t * pmd; 310 pmd_t * pmd;
311 pte_t * pte; 311 pte_t * pte;
312 spinlock_t *ptl;
312 313
313 if (unlikely(anon_vma_prepare(vma))) 314 if (unlikely(anon_vma_prepare(vma)))
314 goto out_sig; 315 goto out;
315 316
316 flush_dcache_page(page); 317 flush_dcache_page(page);
317 pgd = pgd_offset(mm, address); 318 pgd = pgd_offset(mm, address);
318
319 spin_lock(&mm->page_table_lock);
320 pud = pud_alloc(mm, pgd, address); 319 pud = pud_alloc(mm, pgd, address);
321 if (!pud) 320 if (!pud)
322 goto out; 321 goto out;
323 pmd = pmd_alloc(mm, pud, address); 322 pmd = pmd_alloc(mm, pud, address);
324 if (!pmd) 323 if (!pmd)
325 goto out; 324 goto out;
326 pte = pte_alloc_map(mm, pmd, address); 325 pte = pte_alloc_map_lock(mm, pmd, address, &ptl);
327 if (!pte) 326 if (!pte)
328 goto out; 327 goto out;
329 if (!pte_none(*pte)) { 328 if (!pte_none(*pte)) {
330 pte_unmap(pte); 329 pte_unmap_unlock(pte, ptl);
331 goto out; 330 goto out;
332 } 331 }
333 inc_mm_counter(mm, anon_rss); 332 inc_mm_counter(mm, anon_rss);
@@ -335,14 +334,11 @@ void install_arg_page(struct vm_area_struct *vma,
335 set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte( 334 set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte(
336 page, vma->vm_page_prot)))); 335 page, vma->vm_page_prot))));
337 page_add_anon_rmap(page, vma, address); 336 page_add_anon_rmap(page, vma, address);
338 pte_unmap(pte); 337 pte_unmap_unlock(pte, ptl);
339 spin_unlock(&mm->page_table_lock);
340 338
341 /* no need for flush_tlb */ 339 /* no need for flush_tlb */
342 return; 340 return;
343out: 341out:
344 spin_unlock(&mm->page_table_lock);
345out_sig:
346 __free_page(page); 342 __free_page(page);
347 force_sig(SIGKILL, current); 343 force_sig(SIGKILL, current);
348} 344}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 22c2d6922c0e..d4c3512e7db4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -779,10 +779,28 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a
779} 779}
780#endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */ 780#endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */
781 781
782#define pte_offset_map_lock(mm, pmd, address, ptlp) \
783({ \
784 spinlock_t *__ptl = &(mm)->page_table_lock; \
785 pte_t *__pte = pte_offset_map(pmd, address); \
786 *(ptlp) = __ptl; \
787 spin_lock(__ptl); \
788 __pte; \
789})
790
791#define pte_unmap_unlock(pte, ptl) do { \
792 spin_unlock(ptl); \
793 pte_unmap(pte); \
794} while (0)
795
782#define pte_alloc_map(mm, pmd, address) \ 796#define pte_alloc_map(mm, pmd, address) \
783 ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \ 797 ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \
784 NULL: pte_offset_map(pmd, address)) 798 NULL: pte_offset_map(pmd, address))
785 799
800#define pte_alloc_map_lock(mm, pmd, address, ptlp) \
801 ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \
802 NULL: pte_offset_map_lock(mm, pmd, address, ptlp))
803
786#define pte_alloc_kernel(pmd, address) \ 804#define pte_alloc_kernel(pmd, address) \
787 ((unlikely(!pmd_present(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ 805 ((unlikely(!pmd_present(*(pmd))) && __pte_alloc_kernel(pmd, address))? \
788 NULL: pte_offset_kernel(pmd, address)) 806 NULL: pte_offset_kernel(pmd, address))
diff --git a/kernel/fork.c b/kernel/fork.c
index 2a587b3224e3..8a069612eac3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -255,7 +255,6 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
255 /* 255 /*
256 * Link in the new vma and copy the page table entries. 256 * Link in the new vma and copy the page table entries.
257 */ 257 */
258 spin_lock(&mm->page_table_lock);
259 *pprev = tmp; 258 *pprev = tmp;
260 pprev = &tmp->vm_next; 259 pprev = &tmp->vm_next;
261 260
@@ -265,7 +264,6 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
265 264
266 mm->map_count++; 265 mm->map_count++;
267 retval = copy_page_range(mm, oldmm, tmp); 266 retval = copy_page_range(mm, oldmm, tmp);
268 spin_unlock(&mm->page_table_lock);
269 267
270 if (tmp->vm_ops && tmp->vm_ops->open) 268 if (tmp->vm_ops && tmp->vm_ops->open)
271 tmp->vm_ops->open(tmp); 269 tmp->vm_ops->open(tmp);
diff --git a/mm/fremap.c b/mm/fremap.c
index 49719a35769a..d862be3bc3e3 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -63,23 +63,20 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
63 pud_t *pud; 63 pud_t *pud;
64 pgd_t *pgd; 64 pgd_t *pgd;
65 pte_t pte_val; 65 pte_t pte_val;
66 spinlock_t *ptl;
66 67
67 BUG_ON(vma->vm_flags & VM_RESERVED); 68 BUG_ON(vma->vm_flags & VM_RESERVED);
68 69
69 pgd = pgd_offset(mm, addr); 70 pgd = pgd_offset(mm, addr);
70 spin_lock(&mm->page_table_lock);
71
72 pud = pud_alloc(mm, pgd, addr); 71 pud = pud_alloc(mm, pgd, addr);
73 if (!pud) 72 if (!pud)
74 goto err_unlock; 73 goto out;
75
76 pmd = pmd_alloc(mm, pud, addr); 74 pmd = pmd_alloc(mm, pud, addr);
77 if (!pmd) 75 if (!pmd)
78 goto err_unlock; 76 goto out;
79 77 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
80 pte = pte_alloc_map(mm, pmd, addr);
81 if (!pte) 78 if (!pte)
82 goto err_unlock; 79 goto out;
83 80
84 /* 81 /*
85 * This page may have been truncated. Tell the 82 * This page may have been truncated. Tell the
@@ -89,10 +86,10 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
89 inode = vma->vm_file->f_mapping->host; 86 inode = vma->vm_file->f_mapping->host;
90 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 87 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
91 if (!page->mapping || page->index >= size) 88 if (!page->mapping || page->index >= size)
92 goto err_unlock; 89 goto unlock;
93 err = -ENOMEM; 90 err = -ENOMEM;
94 if (page_mapcount(page) > INT_MAX/2) 91 if (page_mapcount(page) > INT_MAX/2)
95 goto err_unlock; 92 goto unlock;
96 93
97 if (pte_none(*pte) || !zap_pte(mm, vma, addr, pte)) 94 if (pte_none(*pte) || !zap_pte(mm, vma, addr, pte))
98 inc_mm_counter(mm, file_rss); 95 inc_mm_counter(mm, file_rss);
@@ -101,17 +98,15 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
101 set_pte_at(mm, addr, pte, mk_pte(page, prot)); 98 set_pte_at(mm, addr, pte, mk_pte(page, prot));
102 page_add_file_rmap(page); 99 page_add_file_rmap(page);
103 pte_val = *pte; 100 pte_val = *pte;
104 pte_unmap(pte);
105 update_mmu_cache(vma, addr, pte_val); 101 update_mmu_cache(vma, addr, pte_val);
106
107 err = 0; 102 err = 0;
108err_unlock: 103unlock:
109 spin_unlock(&mm->page_table_lock); 104 pte_unmap_unlock(pte, ptl);
105out:
110 return err; 106 return err;
111} 107}
112EXPORT_SYMBOL(install_page); 108EXPORT_SYMBOL(install_page);
113 109
114
115/* 110/*
116 * Install a file pte to a given virtual memory address, release any 111 * Install a file pte to a given virtual memory address, release any
117 * previously existing mapping. 112 * previously existing mapping.
@@ -125,23 +120,20 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
125 pud_t *pud; 120 pud_t *pud;
126 pgd_t *pgd; 121 pgd_t *pgd;
127 pte_t pte_val; 122 pte_t pte_val;
123 spinlock_t *ptl;
128 124
129 BUG_ON(vma->vm_flags & VM_RESERVED); 125 BUG_ON(vma->vm_flags & VM_RESERVED);
130 126
131 pgd = pgd_offset(mm, addr); 127 pgd = pgd_offset(mm, addr);
132 spin_lock(&mm->page_table_lock);
133
134 pud = pud_alloc(mm, pgd, addr); 128 pud = pud_alloc(mm, pgd, addr);
135 if (!pud) 129 if (!pud)
136 goto err_unlock; 130 goto out;
137
138 pmd = pmd_alloc(mm, pud, addr); 131 pmd = pmd_alloc(mm, pud, addr);
139 if (!pmd) 132 if (!pmd)
140 goto err_unlock; 133 goto out;
141 134 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
142 pte = pte_alloc_map(mm, pmd, addr);
143 if (!pte) 135 if (!pte)
144 goto err_unlock; 136 goto out;
145 137
146 if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) { 138 if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) {
147 update_hiwater_rss(mm); 139 update_hiwater_rss(mm);
@@ -150,17 +142,13 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
150 142
151 set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); 143 set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
152 pte_val = *pte; 144 pte_val = *pte;
153 pte_unmap(pte);
154 update_mmu_cache(vma, addr, pte_val); 145 update_mmu_cache(vma, addr, pte_val);
155 spin_unlock(&mm->page_table_lock); 146 pte_unmap_unlock(pte, ptl);
156 return 0; 147 err = 0;
157 148out:
158err_unlock:
159 spin_unlock(&mm->page_table_lock);
160 return err; 149 return err;
161} 150}
162 151
163
164/*** 152/***
165 * sys_remap_file_pages - remap arbitrary pages of a shared backing store 153 * sys_remap_file_pages - remap arbitrary pages of a shared backing store
166 * file within an existing vma. 154 * file within an existing vma.
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ac5f044bf514..ea0826ff2663 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -277,12 +277,15 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
277 unsigned long addr; 277 unsigned long addr;
278 278
279 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { 279 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
280 src_pte = huge_pte_offset(src, addr);
281 if (!src_pte)
282 continue;
280 dst_pte = huge_pte_alloc(dst, addr); 283 dst_pte = huge_pte_alloc(dst, addr);
281 if (!dst_pte) 284 if (!dst_pte)
282 goto nomem; 285 goto nomem;
286 spin_lock(&dst->page_table_lock);
283 spin_lock(&src->page_table_lock); 287 spin_lock(&src->page_table_lock);
284 src_pte = huge_pte_offset(src, addr); 288 if (!pte_none(*src_pte)) {
285 if (src_pte && !pte_none(*src_pte)) {
286 entry = *src_pte; 289 entry = *src_pte;
287 ptepage = pte_page(entry); 290 ptepage = pte_page(entry);
288 get_page(ptepage); 291 get_page(ptepage);
@@ -290,6 +293,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
290 set_huge_pte_at(dst, addr, dst_pte, entry); 293 set_huge_pte_at(dst, addr, dst_pte, entry);
291 } 294 }
292 spin_unlock(&src->page_table_lock); 295 spin_unlock(&src->page_table_lock);
296 spin_unlock(&dst->page_table_lock);
293 } 297 }
294 return 0; 298 return 0;
295 299
@@ -354,7 +358,6 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
354 358
355 hugetlb_prefault_arch_hook(mm); 359 hugetlb_prefault_arch_hook(mm);
356 360
357 spin_lock(&mm->page_table_lock);
358 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { 361 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
359 unsigned long idx; 362 unsigned long idx;
360 pte_t *pte = huge_pte_alloc(mm, addr); 363 pte_t *pte = huge_pte_alloc(mm, addr);
@@ -389,11 +392,12 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
389 goto out; 392 goto out;
390 } 393 }
391 } 394 }
395 spin_lock(&mm->page_table_lock);
392 add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE); 396 add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
393 set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page)); 397 set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page));
398 spin_unlock(&mm->page_table_lock);
394 } 399 }
395out: 400out:
396 spin_unlock(&mm->page_table_lock);
397 return ret; 401 return ret;
398} 402}
399 403
diff --git a/mm/memory.c b/mm/memory.c
index 4bdd1186b43b..a40e4b1cee4f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -282,14 +282,11 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
282 282
283int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) 283int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
284{ 284{
285 struct page *new; 285 struct page *new = pte_alloc_one(mm, address);
286
287 spin_unlock(&mm->page_table_lock);
288 new = pte_alloc_one(mm, address);
289 spin_lock(&mm->page_table_lock);
290 if (!new) 286 if (!new)
291 return -ENOMEM; 287 return -ENOMEM;
292 288
289 spin_lock(&mm->page_table_lock);
293 if (pmd_present(*pmd)) /* Another has populated it */ 290 if (pmd_present(*pmd)) /* Another has populated it */
294 pte_free(new); 291 pte_free(new);
295 else { 292 else {
@@ -297,6 +294,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
297 inc_page_state(nr_page_table_pages); 294 inc_page_state(nr_page_table_pages);
298 pmd_populate(mm, pmd, new); 295 pmd_populate(mm, pmd, new);
299 } 296 }
297 spin_unlock(&mm->page_table_lock);
300 return 0; 298 return 0;
301} 299}
302 300
@@ -344,9 +342,6 @@ void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
344 * copy one vm_area from one task to the other. Assumes the page tables 342 * copy one vm_area from one task to the other. Assumes the page tables
345 * already present in the new task to be cleared in the whole range 343 * already present in the new task to be cleared in the whole range
346 * covered by this vma. 344 * covered by this vma.
347 *
348 * dst->page_table_lock is held on entry and exit,
349 * but may be dropped within p[mg]d_alloc() and pte_alloc_map().
350 */ 345 */
351 346
352static inline void 347static inline void
@@ -419,17 +414,19 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
419 unsigned long addr, unsigned long end) 414 unsigned long addr, unsigned long end)
420{ 415{
421 pte_t *src_pte, *dst_pte; 416 pte_t *src_pte, *dst_pte;
417 spinlock_t *src_ptl, *dst_ptl;
422 int progress = 0; 418 int progress = 0;
423 int rss[2]; 419 int rss[2];
424 420
425again: 421again:
426 rss[1] = rss[0] = 0; 422 rss[1] = rss[0] = 0;
427 dst_pte = pte_alloc_map(dst_mm, dst_pmd, addr); 423 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
428 if (!dst_pte) 424 if (!dst_pte)
429 return -ENOMEM; 425 return -ENOMEM;
430 src_pte = pte_offset_map_nested(src_pmd, addr); 426 src_pte = pte_offset_map_nested(src_pmd, addr);
427 src_ptl = &src_mm->page_table_lock;
428 spin_lock(src_ptl);
431 429
432 spin_lock(&src_mm->page_table_lock);
433 do { 430 do {
434 /* 431 /*
435 * We are holding two locks at this point - either of them 432 * We are holding two locks at this point - either of them
@@ -438,8 +435,8 @@ again:
438 if (progress >= 32) { 435 if (progress >= 32) {
439 progress = 0; 436 progress = 0;
440 if (need_resched() || 437 if (need_resched() ||
441 need_lockbreak(&src_mm->page_table_lock) || 438 need_lockbreak(src_ptl) ||
442 need_lockbreak(&dst_mm->page_table_lock)) 439 need_lockbreak(dst_ptl))
443 break; 440 break;
444 } 441 }
445 if (pte_none(*src_pte)) { 442 if (pte_none(*src_pte)) {
@@ -449,12 +446,12 @@ again:
449 copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss); 446 copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
450 progress += 8; 447 progress += 8;
451 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); 448 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
452 spin_unlock(&src_mm->page_table_lock);
453 449
450 spin_unlock(src_ptl);
454 pte_unmap_nested(src_pte - 1); 451 pte_unmap_nested(src_pte - 1);
455 pte_unmap(dst_pte - 1);
456 add_mm_rss(dst_mm, rss[0], rss[1]); 452 add_mm_rss(dst_mm, rss[0], rss[1]);
457 cond_resched_lock(&dst_mm->page_table_lock); 453 pte_unmap_unlock(dst_pte - 1, dst_ptl);
454 cond_resched();
458 if (addr != end) 455 if (addr != end)
459 goto again; 456 goto again;
460 return 0; 457 return 0;
@@ -1049,8 +1046,9 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1049 unsigned long addr, unsigned long end, pgprot_t prot) 1046 unsigned long addr, unsigned long end, pgprot_t prot)
1050{ 1047{
1051 pte_t *pte; 1048 pte_t *pte;
1049 spinlock_t *ptl;
1052 1050
1053 pte = pte_alloc_map(mm, pmd, addr); 1051 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1054 if (!pte) 1052 if (!pte)
1055 return -ENOMEM; 1053 return -ENOMEM;
1056 do { 1054 do {
@@ -1062,7 +1060,7 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1062 BUG_ON(!pte_none(*pte)); 1060 BUG_ON(!pte_none(*pte));
1063 set_pte_at(mm, addr, pte, zero_pte); 1061 set_pte_at(mm, addr, pte, zero_pte);
1064 } while (pte++, addr += PAGE_SIZE, addr != end); 1062 } while (pte++, addr += PAGE_SIZE, addr != end);
1065 pte_unmap(pte - 1); 1063 pte_unmap_unlock(pte - 1, ptl);
1066 return 0; 1064 return 0;
1067} 1065}
1068 1066
@@ -1112,14 +1110,12 @@ int zeromap_page_range(struct vm_area_struct *vma,
1112 BUG_ON(addr >= end); 1110 BUG_ON(addr >= end);
1113 pgd = pgd_offset(mm, addr); 1111 pgd = pgd_offset(mm, addr);
1114 flush_cache_range(vma, addr, end); 1112 flush_cache_range(vma, addr, end);
1115 spin_lock(&mm->page_table_lock);
1116 do { 1113 do {
1117 next = pgd_addr_end(addr, end); 1114 next = pgd_addr_end(addr, end);
1118 err = zeromap_pud_range(mm, pgd, addr, next, prot); 1115 err = zeromap_pud_range(mm, pgd, addr, next, prot);
1119 if (err) 1116 if (err)
1120 break; 1117 break;
1121 } while (pgd++, addr = next, addr != end); 1118 } while (pgd++, addr = next, addr != end);
1122 spin_unlock(&mm->page_table_lock);
1123 return err; 1119 return err;
1124} 1120}
1125 1121
@@ -1133,8 +1129,9 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1133 unsigned long pfn, pgprot_t prot) 1129 unsigned long pfn, pgprot_t prot)
1134{ 1130{
1135 pte_t *pte; 1131 pte_t *pte;
1132 spinlock_t *ptl;
1136 1133
1137 pte = pte_alloc_map(mm, pmd, addr); 1134 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1138 if (!pte) 1135 if (!pte)
1139 return -ENOMEM; 1136 return -ENOMEM;
1140 do { 1137 do {
@@ -1142,7 +1139,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1142 set_pte_at(mm, addr, pte, pfn_pte(pfn, prot)); 1139 set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
1143 pfn++; 1140 pfn++;
1144 } while (pte++, addr += PAGE_SIZE, addr != end); 1141 } while (pte++, addr += PAGE_SIZE, addr != end);
1145 pte_unmap(pte - 1); 1142 pte_unmap_unlock(pte - 1, ptl);
1146 return 0; 1143 return 0;
1147} 1144}
1148 1145
@@ -1210,7 +1207,6 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1210 pfn -= addr >> PAGE_SHIFT; 1207 pfn -= addr >> PAGE_SHIFT;
1211 pgd = pgd_offset(mm, addr); 1208 pgd = pgd_offset(mm, addr);
1212 flush_cache_range(vma, addr, end); 1209 flush_cache_range(vma, addr, end);
1213 spin_lock(&mm->page_table_lock);
1214 do { 1210 do {
1215 next = pgd_addr_end(addr, end); 1211 next = pgd_addr_end(addr, end);
1216 err = remap_pud_range(mm, pgd, addr, next, 1212 err = remap_pud_range(mm, pgd, addr, next,
@@ -1218,7 +1214,6 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1218 if (err) 1214 if (err)
1219 break; 1215 break;
1220 } while (pgd++, addr = next, addr != end); 1216 } while (pgd++, addr = next, addr != end);
1221 spin_unlock(&mm->page_table_lock);
1222 return err; 1217 return err;
1223} 1218}
1224EXPORT_SYMBOL(remap_pfn_range); 1219EXPORT_SYMBOL(remap_pfn_range);
@@ -1985,17 +1980,9 @@ static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
1985 * with external mmu caches can use to update those (ie the Sparc or 1980 * with external mmu caches can use to update those (ie the Sparc or
1986 * PowerPC hashed page tables that act as extended TLBs). 1981 * PowerPC hashed page tables that act as extended TLBs).
1987 * 1982 *
1988 * Note the "page_table_lock". It is to protect against kswapd removing 1983 * We enter with non-exclusive mmap_sem (to exclude vma changes,
1989 * pages from under us. Note that kswapd only ever _removes_ pages, never 1984 * but allow concurrent faults), and pte mapped but not yet locked.
1990 * adds them. As such, once we have noticed that the page is not present, 1985 * We return with mmap_sem still held, but pte unmapped and unlocked.
1991 * we can drop the lock early.
1992 *
1993 * The adding of pages is protected by the MM semaphore (which we hold),
1994 * so we don't need to worry about a page being suddenly been added into
1995 * our VM.
1996 *
1997 * We enter with the pagetable spinlock held, we are supposed to
1998 * release it when done.
1999 */ 1986 */
2000static inline int handle_pte_fault(struct mm_struct *mm, 1987static inline int handle_pte_fault(struct mm_struct *mm,
2001 struct vm_area_struct *vma, unsigned long address, 1988 struct vm_area_struct *vma, unsigned long address,
@@ -2003,6 +1990,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
2003{ 1990{
2004 pte_t entry; 1991 pte_t entry;
2005 1992
1993 spin_lock(&mm->page_table_lock);
2006 entry = *pte; 1994 entry = *pte;
2007 if (!pte_present(entry)) { 1995 if (!pte_present(entry)) {
2008 if (pte_none(entry)) { 1996 if (pte_none(entry)) {
@@ -2051,30 +2039,18 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2051 if (unlikely(is_vm_hugetlb_page(vma))) 2039 if (unlikely(is_vm_hugetlb_page(vma)))
2052 return hugetlb_fault(mm, vma, address, write_access); 2040 return hugetlb_fault(mm, vma, address, write_access);
2053 2041
2054 /*
2055 * We need the page table lock to synchronize with kswapd
2056 * and the SMP-safe atomic PTE updates.
2057 */
2058 pgd = pgd_offset(mm, address); 2042 pgd = pgd_offset(mm, address);
2059 spin_lock(&mm->page_table_lock);
2060
2061 pud = pud_alloc(mm, pgd, address); 2043 pud = pud_alloc(mm, pgd, address);
2062 if (!pud) 2044 if (!pud)
2063 goto oom; 2045 return VM_FAULT_OOM;
2064
2065 pmd = pmd_alloc(mm, pud, address); 2046 pmd = pmd_alloc(mm, pud, address);
2066 if (!pmd) 2047 if (!pmd)
2067 goto oom; 2048 return VM_FAULT_OOM;
2068
2069 pte = pte_alloc_map(mm, pmd, address); 2049 pte = pte_alloc_map(mm, pmd, address);
2070 if (!pte) 2050 if (!pte)
2071 goto oom; 2051 return VM_FAULT_OOM;
2072
2073 return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
2074 2052
2075 oom: 2053 return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
2076 spin_unlock(&mm->page_table_lock);
2077 return VM_FAULT_OOM;
2078} 2054}
2079 2055
2080#ifndef __PAGETABLE_PUD_FOLDED 2056#ifndef __PAGETABLE_PUD_FOLDED
@@ -2084,24 +2060,16 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2084 */ 2060 */
2085int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) 2061int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
2086{ 2062{
2087 pud_t *new; 2063 pud_t *new = pud_alloc_one(mm, address);
2088 2064 if (!new)
2089 if (mm != &init_mm) /* Temporary bridging hack */
2090 spin_unlock(&mm->page_table_lock);
2091 new = pud_alloc_one(mm, address);
2092 if (!new) {
2093 if (mm != &init_mm) /* Temporary bridging hack */
2094 spin_lock(&mm->page_table_lock);
2095 return -ENOMEM; 2065 return -ENOMEM;
2096 }
2097 2066
2098 spin_lock(&mm->page_table_lock); 2067 spin_lock(&mm->page_table_lock);
2099 if (pgd_present(*pgd)) /* Another has populated it */ 2068 if (pgd_present(*pgd)) /* Another has populated it */
2100 pud_free(new); 2069 pud_free(new);
2101 else 2070 else
2102 pgd_populate(mm, pgd, new); 2071 pgd_populate(mm, pgd, new);
2103 if (mm == &init_mm) /* Temporary bridging hack */ 2072 spin_unlock(&mm->page_table_lock);
2104 spin_unlock(&mm->page_table_lock);
2105 return 0; 2073 return 0;
2106} 2074}
2107#endif /* __PAGETABLE_PUD_FOLDED */ 2075#endif /* __PAGETABLE_PUD_FOLDED */
@@ -2113,16 +2081,9 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
2113 */ 2081 */
2114int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) 2082int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
2115{ 2083{
2116 pmd_t *new; 2084 pmd_t *new = pmd_alloc_one(mm, address);
2117 2085 if (!new)
2118 if (mm != &init_mm) /* Temporary bridging hack */
2119 spin_unlock(&mm->page_table_lock);
2120 new = pmd_alloc_one(mm, address);
2121 if (!new) {
2122 if (mm != &init_mm) /* Temporary bridging hack */
2123 spin_lock(&mm->page_table_lock);
2124 return -ENOMEM; 2086 return -ENOMEM;
2125 }
2126 2087
2127 spin_lock(&mm->page_table_lock); 2088 spin_lock(&mm->page_table_lock);
2128#ifndef __ARCH_HAS_4LEVEL_HACK 2089#ifndef __ARCH_HAS_4LEVEL_HACK
@@ -2136,8 +2097,7 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
2136 else 2097 else
2137 pgd_populate(mm, pud, new); 2098 pgd_populate(mm, pud, new);
2138#endif /* __ARCH_HAS_4LEVEL_HACK */ 2099#endif /* __ARCH_HAS_4LEVEL_HACK */
2139 if (mm == &init_mm) /* Temporary bridging hack */ 2100 spin_unlock(&mm->page_table_lock);
2140 spin_unlock(&mm->page_table_lock);
2141 return 0; 2101 return 0;
2142} 2102}
2143#endif /* __PAGETABLE_PMD_FOLDED */ 2103#endif /* __PAGETABLE_PMD_FOLDED */
diff --git a/mm/mremap.c b/mm/mremap.c
index 616facc3d28a..8de77b632a20 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -28,9 +28,6 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
28 pud_t *pud; 28 pud_t *pud;
29 pmd_t *pmd; 29 pmd_t *pmd;
30 30
31 /*
32 * We don't need page_table_lock: we have mmap_sem exclusively.
33 */
34 pgd = pgd_offset(mm, addr); 31 pgd = pgd_offset(mm, addr);
35 if (pgd_none_or_clear_bad(pgd)) 32 if (pgd_none_or_clear_bad(pgd))
36 return NULL; 33 return NULL;
@@ -50,25 +47,20 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
50{ 47{
51 pgd_t *pgd; 48 pgd_t *pgd;
52 pud_t *pud; 49 pud_t *pud;
53 pmd_t *pmd = NULL; 50 pmd_t *pmd;
54 51
55 /*
56 * We do need page_table_lock: because allocators expect that.
57 */
58 spin_lock(&mm->page_table_lock);
59 pgd = pgd_offset(mm, addr); 52 pgd = pgd_offset(mm, addr);
60 pud = pud_alloc(mm, pgd, addr); 53 pud = pud_alloc(mm, pgd, addr);
61 if (!pud) 54 if (!pud)
62 goto out; 55 return NULL;
63 56
64 pmd = pmd_alloc(mm, pud, addr); 57 pmd = pmd_alloc(mm, pud, addr);
65 if (!pmd) 58 if (!pmd)
66 goto out; 59 return NULL;
67 60
68 if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr)) 61 if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr))
69 pmd = NULL; 62 return NULL;
70out: 63
71 spin_unlock(&mm->page_table_lock);
72 return pmd; 64 return pmd;
73} 65}
74 66
@@ -80,6 +72,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
80 struct address_space *mapping = NULL; 72 struct address_space *mapping = NULL;
81 struct mm_struct *mm = vma->vm_mm; 73 struct mm_struct *mm = vma->vm_mm;
82 pte_t *old_pte, *new_pte, pte; 74 pte_t *old_pte, *new_pte, pte;
75 spinlock_t *old_ptl;
83 76
84 if (vma->vm_file) { 77 if (vma->vm_file) {
85 /* 78 /*
@@ -95,9 +88,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
95 new_vma->vm_truncate_count = 0; 88 new_vma->vm_truncate_count = 0;
96 } 89 }
97 90
98 spin_lock(&mm->page_table_lock); 91 old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
99 old_pte = pte_offset_map(old_pmd, old_addr); 92 new_pte = pte_offset_map_nested(new_pmd, new_addr);
100 new_pte = pte_offset_map_nested(new_pmd, new_addr);
101 93
102 for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE, 94 for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
103 new_pte++, new_addr += PAGE_SIZE) { 95 new_pte++, new_addr += PAGE_SIZE) {
@@ -110,8 +102,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
110 } 102 }
111 103
112 pte_unmap_nested(new_pte - 1); 104 pte_unmap_nested(new_pte - 1);
113 pte_unmap(old_pte - 1); 105 pte_unmap_unlock(old_pte - 1, old_ptl);
114 spin_unlock(&mm->page_table_lock);
115 if (mapping) 106 if (mapping)
116 spin_unlock(&mapping->i_mmap_lock); 107 spin_unlock(&mapping->i_mmap_lock);
117} 108}