Diffstat (limited to 'mm/memory.c')
-rw-r--r--	mm/memory.c	993
1 file changed, 471 insertions, 522 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 1db40e935e55..0f60baf6f69b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -114,6 +114,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
114{ 114{
115 struct page *page = pmd_page(*pmd); 115 struct page *page = pmd_page(*pmd);
116 pmd_clear(pmd); 116 pmd_clear(pmd);
117 pte_lock_deinit(page);
117 pte_free_tlb(tlb, page); 118 pte_free_tlb(tlb, page);
118 dec_page_state(nr_page_table_pages); 119 dec_page_state(nr_page_table_pages);
119 tlb->mm->nr_ptes--; 120 tlb->mm->nr_ptes--;
@@ -249,7 +250,7 @@ void free_pgd_range(struct mmu_gather **tlb,
249 free_pud_range(*tlb, pgd, addr, next, floor, ceiling); 250 free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
250 } while (pgd++, addr = next, addr != end); 251 } while (pgd++, addr = next, addr != end);
251 252
252 if (!tlb_is_full_mm(*tlb)) 253 if (!(*tlb)->fullmm)
253 flush_tlb_pgtables((*tlb)->mm, start, end); 254 flush_tlb_pgtables((*tlb)->mm, start, end);
254} 255}
255 256
@@ -260,6 +261,12 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
260 struct vm_area_struct *next = vma->vm_next; 261 struct vm_area_struct *next = vma->vm_next;
261 unsigned long addr = vma->vm_start; 262 unsigned long addr = vma->vm_start;
262 263
264 /*
265 * Hide vma from rmap and vmtruncate before freeing pgtables
266 */
267 anon_vma_unlink(vma);
268 unlink_file_vma(vma);
269
263 if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) { 270 if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) {
264 hugetlb_free_pgd_range(tlb, addr, vma->vm_end, 271 hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
265 floor, next? next->vm_start: ceiling); 272 floor, next? next->vm_start: ceiling);
@@ -272,6 +279,8 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
272 HPAGE_SIZE)) { 279 HPAGE_SIZE)) {
273 vma = next; 280 vma = next;
274 next = vma->vm_next; 281 next = vma->vm_next;
282 anon_vma_unlink(vma);
283 unlink_file_vma(vma);
275 } 284 }
276 free_pgd_range(tlb, addr, vma->vm_end, 285 free_pgd_range(tlb, addr, vma->vm_end,
277 floor, next? next->vm_start: ceiling); 286 floor, next? next->vm_start: ceiling);
@@ -280,72 +289,78 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
280 } 289 }
281} 290}
282 291
283pte_t fastcall *pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, 292int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
284 unsigned long address)
285{ 293{
286 if (!pmd_present(*pmd)) { 294 struct page *new = pte_alloc_one(mm, address);
287 struct page *new; 295 if (!new)
288 296 return -ENOMEM;
289 spin_unlock(&mm->page_table_lock); 297
290 new = pte_alloc_one(mm, address); 298 pte_lock_init(new);
291 spin_lock(&mm->page_table_lock); 299 spin_lock(&mm->page_table_lock);
292 if (!new) 300 if (pmd_present(*pmd)) { /* Another has populated it */
293 return NULL; 301 pte_lock_deinit(new);
294 /* 302 pte_free(new);
295 * Because we dropped the lock, we should re-check the 303 } else {
296 * entry, as somebody else could have populated it..
297 */
298 if (pmd_present(*pmd)) {
299 pte_free(new);
300 goto out;
301 }
302 mm->nr_ptes++; 304 mm->nr_ptes++;
303 inc_page_state(nr_page_table_pages); 305 inc_page_state(nr_page_table_pages);
304 pmd_populate(mm, pmd, new); 306 pmd_populate(mm, pmd, new);
305 } 307 }
306out: 308 spin_unlock(&mm->page_table_lock);
307 return pte_offset_map(pmd, address); 309 return 0;
308} 310}
309 311
310pte_t fastcall * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address) 312int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
311{ 313{
312 if (!pmd_present(*pmd)) { 314 pte_t *new = pte_alloc_one_kernel(&init_mm, address);
313 pte_t *new; 315 if (!new)
316 return -ENOMEM;
314 317
315 spin_unlock(&mm->page_table_lock); 318 spin_lock(&init_mm.page_table_lock);
316 new = pte_alloc_one_kernel(mm, address); 319 if (pmd_present(*pmd)) /* Another has populated it */
317 spin_lock(&mm->page_table_lock); 320 pte_free_kernel(new);
318 if (!new) 321 else
319 return NULL; 322 pmd_populate_kernel(&init_mm, pmd, new);
323 spin_unlock(&init_mm.page_table_lock);
324 return 0;
325}
320 326
321 /* 327static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
322 * Because we dropped the lock, we should re-check the 328{
323 * entry, as somebody else could have populated it.. 329 if (file_rss)
324 */ 330 add_mm_counter(mm, file_rss, file_rss);
325 if (pmd_present(*pmd)) { 331 if (anon_rss)
326 pte_free_kernel(new); 332 add_mm_counter(mm, anon_rss, anon_rss);
327 goto out; 333}
328 } 334
329 pmd_populate_kernel(mm, pmd, new); 335/*
330 } 336 * This function is called to print an error when a pte in a
331out: 337 * !VM_RESERVED region is found pointing to an invalid pfn (which
332 return pte_offset_kernel(pmd, address); 338 * is an error.
339 *
340 * The calling function must still handle the error.
341 */
342void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
343{
344 printk(KERN_ERR "Bad pte = %08llx, process = %s, "
345 "vm_flags = %lx, vaddr = %lx\n",
346 (long long)pte_val(pte),
347 (vma->vm_mm == current->mm ? current->comm : "???"),
348 vma->vm_flags, vaddr);
349 dump_stack();
333} 350}
334 351
335/* 352/*
336 * copy one vm_area from one task to the other. Assumes the page tables 353 * copy one vm_area from one task to the other. Assumes the page tables
337 * already present in the new task to be cleared in the whole range 354 * already present in the new task to be cleared in the whole range
338 * covered by this vma. 355 * covered by this vma.
339 *
340 * dst->page_table_lock is held on entry and exit,
341 * but may be dropped within p[mg]d_alloc() and pte_alloc_map().
342 */ 356 */
343 357
344static inline void 358static inline void
345copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, 359copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
346 pte_t *dst_pte, pte_t *src_pte, unsigned long vm_flags, 360 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
347 unsigned long addr) 361 unsigned long addr, int *rss)
348{ 362{
363 unsigned long vm_flags = vma->vm_flags;
349 pte_t pte = *src_pte; 364 pte_t pte = *src_pte;
350 struct page *page; 365 struct page *page;
351 unsigned long pfn; 366 unsigned long pfn;
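
The rewritten allocators above keep the old "allocate outside the lock, re-check under it" pattern, but __pte_alloc() now also initialises the new page table's own spinlock with pte_lock_init() (paired with the pte_lock_deinit() added to free_pte_range() in the first hunk) and reports -ENOMEM instead of returning a mapped pte. Below is a minimal caller sketch; it assumes the pte_alloc_map_lock() helper used elsewhere in this patch (in copy_pte_range() and remap_pte_range(), for example) is defined in include/linux/mm.h to call __pte_alloc() only when *pmd is empty and then map and lock the pte page. The function name is illustrative, not part of the patch.

        #include <linux/mm.h>

        /* Illustrative only: ensure a pte page exists for addr, then walk it
         * under its own pte lock rather than mm->page_table_lock. */
        static int example_touch_pte(struct mm_struct *mm, pmd_t *pmd,
                                     unsigned long addr)
        {
                spinlock_t *ptl;
                pte_t *pte;

                pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
                if (!pte)
                        return -ENOMEM;         /* __pte_alloc() failed */

                if (pte_none(*pte)) {
                        /* install an entry here while the pte lock is held */
                }

                pte_unmap_unlock(pte, ptl);
                return 0;
        }
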
@@ -357,29 +372,32 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
357 /* make sure dst_mm is on swapoff's mmlist. */ 372 /* make sure dst_mm is on swapoff's mmlist. */
358 if (unlikely(list_empty(&dst_mm->mmlist))) { 373 if (unlikely(list_empty(&dst_mm->mmlist))) {
359 spin_lock(&mmlist_lock); 374 spin_lock(&mmlist_lock);
360 list_add(&dst_mm->mmlist, &src_mm->mmlist); 375 if (list_empty(&dst_mm->mmlist))
376 list_add(&dst_mm->mmlist,
377 &src_mm->mmlist);
361 spin_unlock(&mmlist_lock); 378 spin_unlock(&mmlist_lock);
362 } 379 }
363 } 380 }
364 set_pte_at(dst_mm, addr, dst_pte, pte); 381 goto out_set_pte;
365 return;
366 } 382 }
367 383
368 pfn = pte_pfn(pte); 384 /* If the region is VM_RESERVED, the mapping is not
369 /* the pte points outside of valid memory, the 385 * mapped via rmap - duplicate the pte as is.
370 * mapping is assumed to be good, meaningful
371 * and not mapped via rmap - duplicate the
372 * mapping as is.
373 */ 386 */
374 page = NULL; 387 if (vm_flags & VM_RESERVED)
375 if (pfn_valid(pfn)) 388 goto out_set_pte;
376 page = pfn_to_page(pfn);
377 389
378 if (!page || PageReserved(page)) { 390 pfn = pte_pfn(pte);
379 set_pte_at(dst_mm, addr, dst_pte, pte); 391 /* If the pte points outside of valid memory but
380 return; 392 * the region is not VM_RESERVED, we have a problem.
393 */
394 if (unlikely(!pfn_valid(pfn))) {
395 print_bad_pte(vma, pte, addr);
396 goto out_set_pte; /* try to do something sane */
381 } 397 }
382 398
399 page = pfn_to_page(pfn);
400
383 /* 401 /*
384 * If it's a COW mapping, write protect it both 402 * If it's a COW mapping, write protect it both
385 * in the parent and the child 403 * in the parent and the child
@@ -397,11 +415,11 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
397 pte = pte_mkclean(pte); 415 pte = pte_mkclean(pte);
398 pte = pte_mkold(pte); 416 pte = pte_mkold(pte);
399 get_page(page); 417 get_page(page);
400 inc_mm_counter(dst_mm, rss);
401 if (PageAnon(page))
402 inc_mm_counter(dst_mm, anon_rss);
403 set_pte_at(dst_mm, addr, dst_pte, pte);
404 page_dup_rmap(page); 418 page_dup_rmap(page);
419 rss[!!PageAnon(page)]++;
420
421out_set_pte:
422 set_pte_at(dst_mm, addr, dst_pte, pte);
405} 423}
406 424
407static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 425static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -409,38 +427,44 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
409 unsigned long addr, unsigned long end) 427 unsigned long addr, unsigned long end)
410{ 428{
411 pte_t *src_pte, *dst_pte; 429 pte_t *src_pte, *dst_pte;
412 unsigned long vm_flags = vma->vm_flags; 430 spinlock_t *src_ptl, *dst_ptl;
413 int progress; 431 int progress = 0;
432 int rss[2];
414 433
415again: 434again:
416 dst_pte = pte_alloc_map(dst_mm, dst_pmd, addr); 435 rss[1] = rss[0] = 0;
436 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
417 if (!dst_pte) 437 if (!dst_pte)
418 return -ENOMEM; 438 return -ENOMEM;
419 src_pte = pte_offset_map_nested(src_pmd, addr); 439 src_pte = pte_offset_map_nested(src_pmd, addr);
440 src_ptl = pte_lockptr(src_mm, src_pmd);
441 spin_lock(src_ptl);
420 442
421 progress = 0;
422 spin_lock(&src_mm->page_table_lock);
423 do { 443 do {
424 /* 444 /*
425 * We are holding two locks at this point - either of them 445 * We are holding two locks at this point - either of them
426 * could generate latencies in another task on another CPU. 446 * could generate latencies in another task on another CPU.
427 */ 447 */
428 if (progress >= 32 && (need_resched() || 448 if (progress >= 32) {
429 need_lockbreak(&src_mm->page_table_lock) || 449 progress = 0;
430 need_lockbreak(&dst_mm->page_table_lock))) 450 if (need_resched() ||
431 break; 451 need_lockbreak(src_ptl) ||
452 need_lockbreak(dst_ptl))
453 break;
454 }
432 if (pte_none(*src_pte)) { 455 if (pte_none(*src_pte)) {
433 progress++; 456 progress++;
434 continue; 457 continue;
435 } 458 }
436 copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vm_flags, addr); 459 copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
437 progress += 8; 460 progress += 8;
438 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); 461 } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
439 spin_unlock(&src_mm->page_table_lock);
440 462
463 spin_unlock(src_ptl);
441 pte_unmap_nested(src_pte - 1); 464 pte_unmap_nested(src_pte - 1);
442 pte_unmap(dst_pte - 1); 465 add_mm_rss(dst_mm, rss[0], rss[1]);
443 cond_resched_lock(&dst_mm->page_table_lock); 466 pte_unmap_unlock(dst_pte - 1, dst_ptl);
467 cond_resched();
444 if (addr != end) 468 if (addr != end)
445 goto again; 469 goto again;
446 return 0; 470 return 0;
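
copy_pte_range() now takes the destination pte lock via pte_alloc_map_lock() and nests the source pte lock inside it via pte_lockptr(), instead of holding src_mm->page_table_lock under dst_mm's; the progress counter still bounds how long both locks are held before checking need_resched()/need_lockbreak(). A condensed sketch of that locking shape, distilled from the hunk above with the copy loop elided (the real code unmaps pte - 1 after the loop has advanced the pointers):

        /* Illustrative only: the lock nesting used by copy_pte_range(). */
        static int example_copy_range(struct mm_struct *dst_mm, pmd_t *dst_pmd,
                                      struct mm_struct *src_mm, pmd_t *src_pmd,
                                      unsigned long addr)
        {
                spinlock_t *src_ptl, *dst_ptl;
                pte_t *src_pte, *dst_pte;

                dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
                if (!dst_pte)
                        return -ENOMEM;
                src_pte = pte_offset_map_nested(src_pmd, addr);
                src_ptl = pte_lockptr(src_mm, src_pmd);
                spin_lock(src_ptl);             /* nested inside dst_ptl */

                /* ... copy entries, breaking out every 32 ptes when a resched
                 *     or lockbreak is pending on either pte lock ... */

                spin_unlock(src_ptl);
                pte_unmap_nested(src_pte);
                pte_unmap_unlock(dst_pte, dst_ptl);
                return 0;
        }
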
@@ -525,24 +549,30 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
525 return 0; 549 return 0;
526} 550}
527 551
528static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd, 552static void zap_pte_range(struct mmu_gather *tlb,
553 struct vm_area_struct *vma, pmd_t *pmd,
529 unsigned long addr, unsigned long end, 554 unsigned long addr, unsigned long end,
530 struct zap_details *details) 555 struct zap_details *details)
531{ 556{
557 struct mm_struct *mm = tlb->mm;
532 pte_t *pte; 558 pte_t *pte;
559 spinlock_t *ptl;
560 int file_rss = 0;
561 int anon_rss = 0;
533 562
534 pte = pte_offset_map(pmd, addr); 563 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
535 do { 564 do {
536 pte_t ptent = *pte; 565 pte_t ptent = *pte;
537 if (pte_none(ptent)) 566 if (pte_none(ptent))
538 continue; 567 continue;
539 if (pte_present(ptent)) { 568 if (pte_present(ptent)) {
540 struct page *page = NULL; 569 struct page *page = NULL;
541 unsigned long pfn = pte_pfn(ptent); 570 if (!(vma->vm_flags & VM_RESERVED)) {
542 if (pfn_valid(pfn)) { 571 unsigned long pfn = pte_pfn(ptent);
543 page = pfn_to_page(pfn); 572 if (unlikely(!pfn_valid(pfn)))
544 if (PageReserved(page)) 573 print_bad_pte(vma, ptent, addr);
545 page = NULL; 574 else
575 page = pfn_to_page(pfn);
546 } 576 }
547 if (unlikely(details) && page) { 577 if (unlikely(details) && page) {
548 /* 578 /*
@@ -562,7 +592,7 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
562 page->index > details->last_index)) 592 page->index > details->last_index))
563 continue; 593 continue;
564 } 594 }
565 ptent = ptep_get_and_clear_full(tlb->mm, addr, pte, 595 ptent = ptep_get_and_clear_full(mm, addr, pte,
566 tlb->fullmm); 596 tlb->fullmm);
567 tlb_remove_tlb_entry(tlb, pte, addr); 597 tlb_remove_tlb_entry(tlb, pte, addr);
568 if (unlikely(!page)) 598 if (unlikely(!page))
@@ -570,15 +600,17 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
570 if (unlikely(details) && details->nonlinear_vma 600 if (unlikely(details) && details->nonlinear_vma
571 && linear_page_index(details->nonlinear_vma, 601 && linear_page_index(details->nonlinear_vma,
572 addr) != page->index) 602 addr) != page->index)
573 set_pte_at(tlb->mm, addr, pte, 603 set_pte_at(mm, addr, pte,
574 pgoff_to_pte(page->index)); 604 pgoff_to_pte(page->index));
575 if (pte_dirty(ptent))
576 set_page_dirty(page);
577 if (PageAnon(page)) 605 if (PageAnon(page))
578 dec_mm_counter(tlb->mm, anon_rss); 606 anon_rss--;
579 else if (pte_young(ptent)) 607 else {
580 mark_page_accessed(page); 608 if (pte_dirty(ptent))
581 tlb->freed++; 609 set_page_dirty(page);
610 if (pte_young(ptent))
611 mark_page_accessed(page);
612 file_rss--;
613 }
582 page_remove_rmap(page); 614 page_remove_rmap(page);
583 tlb_remove_page(tlb, page); 615 tlb_remove_page(tlb, page);
584 continue; 616 continue;
@@ -591,12 +623,15 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
591 continue; 623 continue;
592 if (!pte_file(ptent)) 624 if (!pte_file(ptent))
593 free_swap_and_cache(pte_to_swp_entry(ptent)); 625 free_swap_and_cache(pte_to_swp_entry(ptent));
594 pte_clear_full(tlb->mm, addr, pte, tlb->fullmm); 626 pte_clear_full(mm, addr, pte, tlb->fullmm);
595 } while (pte++, addr += PAGE_SIZE, addr != end); 627 } while (pte++, addr += PAGE_SIZE, addr != end);
596 pte_unmap(pte - 1); 628
629 add_mm_rss(mm, file_rss, anon_rss);
630 pte_unmap_unlock(pte - 1, ptl);
597} 631}
598 632
599static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud, 633static inline void zap_pmd_range(struct mmu_gather *tlb,
634 struct vm_area_struct *vma, pud_t *pud,
600 unsigned long addr, unsigned long end, 635 unsigned long addr, unsigned long end,
601 struct zap_details *details) 636 struct zap_details *details)
602{ 637{
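
The zap path now mirrors the copy path's accounting: rather than touching the mm counters once per pte while a lock is held, it gathers signed deltas in local file_rss/anon_rss variables and folds them in with one add_mm_rss() call (defined near the top of this diff) before unlocking. A tiny illustration of the convention, assuming it lives in mm/memory.c next to add_mm_rss():

        /* Illustrative only: the signed-delta accounting used above. */
        static void example_account_unmap(struct mm_struct *mm, struct page *page)
        {
                int file_rss = 0, anon_rss = 0;

                if (PageAnon(page))
                        anon_rss--;     /* unmapping an anonymous page */
                else
                        file_rss--;     /* unmapping a file-backed page */

                /* a single update per counter; no-op when a delta is zero */
                add_mm_rss(mm, file_rss, anon_rss);
        }
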
@@ -608,11 +643,12 @@ static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud,
608 next = pmd_addr_end(addr, end); 643 next = pmd_addr_end(addr, end);
609 if (pmd_none_or_clear_bad(pmd)) 644 if (pmd_none_or_clear_bad(pmd))
610 continue; 645 continue;
611 zap_pte_range(tlb, pmd, addr, next, details); 646 zap_pte_range(tlb, vma, pmd, addr, next, details);
612 } while (pmd++, addr = next, addr != end); 647 } while (pmd++, addr = next, addr != end);
613} 648}
614 649
615static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd, 650static inline void zap_pud_range(struct mmu_gather *tlb,
651 struct vm_area_struct *vma, pgd_t *pgd,
616 unsigned long addr, unsigned long end, 652 unsigned long addr, unsigned long end,
617 struct zap_details *details) 653 struct zap_details *details)
618{ 654{
@@ -624,7 +660,7 @@ static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
624 next = pud_addr_end(addr, end); 660 next = pud_addr_end(addr, end);
625 if (pud_none_or_clear_bad(pud)) 661 if (pud_none_or_clear_bad(pud))
626 continue; 662 continue;
627 zap_pmd_range(tlb, pud, addr, next, details); 663 zap_pmd_range(tlb, vma, pud, addr, next, details);
628 } while (pud++, addr = next, addr != end); 664 } while (pud++, addr = next, addr != end);
629} 665}
630 666
@@ -645,7 +681,7 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
645 next = pgd_addr_end(addr, end); 681 next = pgd_addr_end(addr, end);
646 if (pgd_none_or_clear_bad(pgd)) 682 if (pgd_none_or_clear_bad(pgd))
647 continue; 683 continue;
648 zap_pud_range(tlb, pgd, addr, next, details); 684 zap_pud_range(tlb, vma, pgd, addr, next, details);
649 } while (pgd++, addr = next, addr != end); 685 } while (pgd++, addr = next, addr != end);
650 tlb_end_vma(tlb, vma); 686 tlb_end_vma(tlb, vma);
651} 687}
@@ -660,7 +696,6 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
660/** 696/**
661 * unmap_vmas - unmap a range of memory covered by a list of vma's 697 * unmap_vmas - unmap a range of memory covered by a list of vma's
662 * @tlbp: address of the caller's struct mmu_gather 698 * @tlbp: address of the caller's struct mmu_gather
663 * @mm: the controlling mm_struct
664 * @vma: the starting vma 699 * @vma: the starting vma
665 * @start_addr: virtual address at which to start unmapping 700 * @start_addr: virtual address at which to start unmapping
666 * @end_addr: virtual address at which to end unmapping 701 * @end_addr: virtual address at which to end unmapping
@@ -669,10 +704,10 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
669 * 704 *
670 * Returns the end address of the unmapping (restart addr if interrupted). 705 * Returns the end address of the unmapping (restart addr if interrupted).
671 * 706 *
672 * Unmap all pages in the vma list. Called under page_table_lock. 707 * Unmap all pages in the vma list.
673 * 708 *
674 * We aim to not hold page_table_lock for too long (for scheduling latency 709 * We aim to not hold locks for too long (for scheduling latency reasons).
675 * reasons). So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to 710 * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to
676 * return the ending mmu_gather to the caller. 711 * return the ending mmu_gather to the caller.
677 * 712 *
678 * Only addresses between `start' and `end' will be unmapped. 713 * Only addresses between `start' and `end' will be unmapped.
@@ -684,7 +719,7 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
684 * ensure that any thus-far unmapped pages are flushed before unmap_vmas() 719 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
685 * drops the lock and schedules. 720 * drops the lock and schedules.
686 */ 721 */
687unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, 722unsigned long unmap_vmas(struct mmu_gather **tlbp,
688 struct vm_area_struct *vma, unsigned long start_addr, 723 struct vm_area_struct *vma, unsigned long start_addr,
689 unsigned long end_addr, unsigned long *nr_accounted, 724 unsigned long end_addr, unsigned long *nr_accounted,
690 struct zap_details *details) 725 struct zap_details *details)
@@ -694,7 +729,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
694 int tlb_start_valid = 0; 729 int tlb_start_valid = 0;
695 unsigned long start = start_addr; 730 unsigned long start = start_addr;
696 spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; 731 spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
697 int fullmm = tlb_is_full_mm(*tlbp); 732 int fullmm = (*tlbp)->fullmm;
698 733
699 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { 734 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
700 unsigned long end; 735 unsigned long end;
@@ -734,19 +769,15 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
734 tlb_finish_mmu(*tlbp, tlb_start, start); 769 tlb_finish_mmu(*tlbp, tlb_start, start);
735 770
736 if (need_resched() || 771 if (need_resched() ||
737 need_lockbreak(&mm->page_table_lock) ||
738 (i_mmap_lock && need_lockbreak(i_mmap_lock))) { 772 (i_mmap_lock && need_lockbreak(i_mmap_lock))) {
739 if (i_mmap_lock) { 773 if (i_mmap_lock) {
740 /* must reset count of rss freed */ 774 *tlbp = NULL;
741 *tlbp = tlb_gather_mmu(mm, fullmm);
742 goto out; 775 goto out;
743 } 776 }
744 spin_unlock(&mm->page_table_lock);
745 cond_resched(); 777 cond_resched();
746 spin_lock(&mm->page_table_lock);
747 } 778 }
748 779
749 *tlbp = tlb_gather_mmu(mm, fullmm); 780 *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
750 tlb_start_valid = 0; 781 tlb_start_valid = 0;
751 zap_bytes = ZAP_BLOCK_SIZE; 782 zap_bytes = ZAP_BLOCK_SIZE;
752 } 783 }
@@ -770,123 +801,93 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
770 unsigned long end = address + size; 801 unsigned long end = address + size;
771 unsigned long nr_accounted = 0; 802 unsigned long nr_accounted = 0;
772 803
773 if (is_vm_hugetlb_page(vma)) {
774 zap_hugepage_range(vma, address, size);
775 return end;
776 }
777
778 lru_add_drain(); 804 lru_add_drain();
779 spin_lock(&mm->page_table_lock);
780 tlb = tlb_gather_mmu(mm, 0); 805 tlb = tlb_gather_mmu(mm, 0);
781 end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details); 806 update_hiwater_rss(mm);
782 tlb_finish_mmu(tlb, address, end); 807 end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
783 spin_unlock(&mm->page_table_lock); 808 if (tlb)
809 tlb_finish_mmu(tlb, address, end);
784 return end; 810 return end;
785} 811}
786 812
787/* 813/*
788 * Do a quick page-table lookup for a single page. 814 * Do a quick page-table lookup for a single page.
789 * mm->page_table_lock must be held.
790 */ 815 */
791static struct page *__follow_page(struct mm_struct *mm, unsigned long address, 816struct page *follow_page(struct mm_struct *mm, unsigned long address,
792 int read, int write, int accessed) 817 unsigned int flags)
793{ 818{
794 pgd_t *pgd; 819 pgd_t *pgd;
795 pud_t *pud; 820 pud_t *pud;
796 pmd_t *pmd; 821 pmd_t *pmd;
797 pte_t *ptep, pte; 822 pte_t *ptep, pte;
823 spinlock_t *ptl;
798 unsigned long pfn; 824 unsigned long pfn;
799 struct page *page; 825 struct page *page;
800 826
801 page = follow_huge_addr(mm, address, write); 827 page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
802 if (! IS_ERR(page)) 828 if (!IS_ERR(page)) {
803 return page; 829 BUG_ON(flags & FOLL_GET);
830 goto out;
831 }
804 832
833 page = NULL;
805 pgd = pgd_offset(mm, address); 834 pgd = pgd_offset(mm, address);
806 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) 835 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
807 goto out; 836 goto no_page_table;
808 837
809 pud = pud_offset(pgd, address); 838 pud = pud_offset(pgd, address);
810 if (pud_none(*pud) || unlikely(pud_bad(*pud))) 839 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
811 goto out; 840 goto no_page_table;
812 841
813 pmd = pmd_offset(pud, address); 842 pmd = pmd_offset(pud, address);
814 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) 843 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
844 goto no_page_table;
845
846 if (pmd_huge(*pmd)) {
847 BUG_ON(flags & FOLL_GET);
848 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
815 goto out; 849 goto out;
816 if (pmd_huge(*pmd)) 850 }
817 return follow_huge_pmd(mm, address, pmd, write);
818 851
819 ptep = pte_offset_map(pmd, address); 852 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
820 if (!ptep) 853 if (!ptep)
821 goto out; 854 goto out;
822 855
823 pte = *ptep; 856 pte = *ptep;
824 pte_unmap(ptep); 857 if (!pte_present(pte))
825 if (pte_present(pte)) { 858 goto unlock;
826 if (write && !pte_write(pte)) 859 if ((flags & FOLL_WRITE) && !pte_write(pte))
827 goto out; 860 goto unlock;
828 if (read && !pte_read(pte)) 861 pfn = pte_pfn(pte);
829 goto out; 862 if (!pfn_valid(pfn))
830 pfn = pte_pfn(pte); 863 goto unlock;
831 if (pfn_valid(pfn)) { 864
832 page = pfn_to_page(pfn); 865 page = pfn_to_page(pfn);
833 if (accessed) { 866 if (flags & FOLL_GET)
834 if (write && !pte_dirty(pte) &&!PageDirty(page)) 867 get_page(page);
835 set_page_dirty(page); 868 if (flags & FOLL_TOUCH) {
836 mark_page_accessed(page); 869 if ((flags & FOLL_WRITE) &&
837 } 870 !pte_dirty(pte) && !PageDirty(page))
838 return page; 871 set_page_dirty(page);
839 } 872 mark_page_accessed(page);
840 } 873 }
841 874unlock:
875 pte_unmap_unlock(ptep, ptl);
842out: 876out:
843 return NULL; 877 return page;
844}
845
846inline struct page *
847follow_page(struct mm_struct *mm, unsigned long address, int write)
848{
849 return __follow_page(mm, address, 0, write, 1);
850}
851
852/*
853 * check_user_page_readable() can be called frm niterrupt context by oprofile,
854 * so we need to avoid taking any non-irq-safe locks
855 */
856int check_user_page_readable(struct mm_struct *mm, unsigned long address)
857{
858 return __follow_page(mm, address, 1, 0, 0) != NULL;
859}
860EXPORT_SYMBOL(check_user_page_readable);
861
862static inline int
863untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma,
864 unsigned long address)
865{
866 pgd_t *pgd;
867 pud_t *pud;
868 pmd_t *pmd;
869
870 /* Check if the vma is for an anonymous mapping. */
871 if (vma->vm_ops && vma->vm_ops->nopage)
872 return 0;
873
874 /* Check if page directory entry exists. */
875 pgd = pgd_offset(mm, address);
876 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
877 return 1;
878
879 pud = pud_offset(pgd, address);
880 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
881 return 1;
882
883 /* Check if page middle directory entry exists. */
884 pmd = pmd_offset(pud, address);
885 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
886 return 1;
887 878
888 /* There is a pte slot for 'address' in 'mm'. */ 879no_page_table:
889 return 0; 880 /*
881 * When core dumping an enormous anonymous area that nobody
882 * has touched so far, we don't want to allocate page tables.
883 */
884 if (flags & FOLL_ANON) {
885 page = ZERO_PAGE(address);
886 if (flags & FOLL_GET)
887 get_page(page);
888 BUG_ON(flags & FOLL_WRITE);
889 }
890 return page;
890} 891}
891 892
892int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 893int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
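
follow_page() above replaces the separate read/write/accessed arguments with a flags word (FOLL_WRITE, FOLL_GET, FOLL_TOUCH, FOLL_ANON) and performs the lookup under the pte lock obtained with pte_offset_map_lock() rather than under mm->page_table_lock. A hedged caller sketch; the FOLL_* values are assumed to be defined in include/linux/mm.h as part of this same change, and the function name is illustrative:

        /* Illustrative only: called with mm's mmap_sem held, as
         * get_user_pages() below does. */
        static void example_touch_user_page(struct mm_struct *mm,
                                            unsigned long address)
        {
                struct page *page;

                page = follow_page(mm, address,
                                   FOLL_GET | FOLL_TOUCH | FOLL_WRITE);
                if (!page)
                        return;         /* not present, not writable, or bad pfn */

                /* the page carries a reference, ZERO_PAGE included */
                page_cache_release(page);
        }
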
@@ -894,18 +895,19 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
894 struct page **pages, struct vm_area_struct **vmas) 895 struct page **pages, struct vm_area_struct **vmas)
895{ 896{
896 int i; 897 int i;
897 unsigned int flags; 898 unsigned int vm_flags;
898 899
899 /* 900 /*
900 * Require read or write permissions. 901 * Require read or write permissions.
901 * If 'force' is set, we only require the "MAY" flags. 902 * If 'force' is set, we only require the "MAY" flags.
902 */ 903 */
903 flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); 904 vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
904 flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); 905 vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
905 i = 0; 906 i = 0;
906 907
907 do { 908 do {
908 struct vm_area_struct * vma; 909 struct vm_area_struct *vma;
910 unsigned int foll_flags;
909 911
910 vma = find_extend_vma(mm, start); 912 vma = find_extend_vma(mm, start);
911 if (!vma && in_gate_area(tsk, start)) { 913 if (!vma && in_gate_area(tsk, start)) {
@@ -945,8 +947,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
945 continue; 947 continue;
946 } 948 }
947 949
948 if (!vma || (vma->vm_flags & VM_IO) 950 if (!vma || (vma->vm_flags & (VM_IO | VM_RESERVED))
949 || !(flags & vma->vm_flags)) 951 || !(vm_flags & vma->vm_flags))
950 return i ? : -EFAULT; 952 return i ? : -EFAULT;
951 953
952 if (is_vm_hugetlb_page(vma)) { 954 if (is_vm_hugetlb_page(vma)) {
@@ -954,29 +956,25 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
954 &start, &len, i); 956 &start, &len, i);
955 continue; 957 continue;
956 } 958 }
957 spin_lock(&mm->page_table_lock); 959
960 foll_flags = FOLL_TOUCH;
961 if (pages)
962 foll_flags |= FOLL_GET;
963 if (!write && !(vma->vm_flags & VM_LOCKED) &&
964 (!vma->vm_ops || !vma->vm_ops->nopage))
965 foll_flags |= FOLL_ANON;
966
958 do { 967 do {
959 int write_access = write;
960 struct page *page; 968 struct page *page;
961 969
962 cond_resched_lock(&mm->page_table_lock); 970 if (write)
963 while (!(page = follow_page(mm, start, write_access))) { 971 foll_flags |= FOLL_WRITE;
964 int ret;
965
966 /*
967 * Shortcut for anonymous pages. We don't want
968 * to force the creation of pages tables for
969 * insanely big anonymously mapped areas that
970 * nobody touched so far. This is important
971 * for doing a core dump for these mappings.
972 */
973 if (!write && untouched_anonymous_page(mm,vma,start)) {
974 page = ZERO_PAGE(start);
975 break;
976 }
977 spin_unlock(&mm->page_table_lock);
978 ret = __handle_mm_fault(mm, vma, start, write_access);
979 972
973 cond_resched();
974 while (!(page = follow_page(mm, start, foll_flags))) {
975 int ret;
976 ret = __handle_mm_fault(mm, vma, start,
977 foll_flags & FOLL_WRITE);
980 /* 978 /*
981 * The VM_FAULT_WRITE bit tells us that do_wp_page has 979 * The VM_FAULT_WRITE bit tells us that do_wp_page has
982 * broken COW when necessary, even if maybe_mkwrite 980 * broken COW when necessary, even if maybe_mkwrite
@@ -984,7 +982,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
984 * subsequent page lookups as if they were reads. 982 * subsequent page lookups as if they were reads.
985 */ 983 */
986 if (ret & VM_FAULT_WRITE) 984 if (ret & VM_FAULT_WRITE)
987 write_access = 0; 985 foll_flags &= ~FOLL_WRITE;
988 986
989 switch (ret & ~VM_FAULT_WRITE) { 987 switch (ret & ~VM_FAULT_WRITE) {
990 case VM_FAULT_MINOR: 988 case VM_FAULT_MINOR:
@@ -1000,13 +998,10 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1000 default: 998 default:
1001 BUG(); 999 BUG();
1002 } 1000 }
1003 spin_lock(&mm->page_table_lock);
1004 } 1001 }
1005 if (pages) { 1002 if (pages) {
1006 pages[i] = page; 1003 pages[i] = page;
1007 flush_dcache_page(page); 1004 flush_dcache_page(page);
1008 if (!PageReserved(page))
1009 page_cache_get(page);
1010 } 1005 }
1011 if (vmas) 1006 if (vmas)
1012 vmas[i] = vma; 1007 vmas[i] = vma;
@@ -1014,7 +1009,6 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1014 start += PAGE_SIZE; 1009 start += PAGE_SIZE;
1015 len--; 1010 len--;
1016 } while (len && start < vma->vm_end); 1011 } while (len && start < vma->vm_end);
1017 spin_unlock(&mm->page_table_lock);
1018 } while (len); 1012 } while (len);
1019 return i; 1013 return i;
1020} 1014}
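
With follow_page() doing its own pte locking, get_user_pages() no longer takes mm->page_table_lock at all, and the PageReserved special case on returned pages is gone: every page handed back in pages[] now carries a reference. A sketch of the usual calling pattern, which this patch leaves unchanged apart from those guarantees (the wrapper function name is illustrative):

        #include <linux/mm.h>
        #include <linux/pagemap.h>
        #include <linux/sched.h>

        /* Illustrative only: pin one user page of the current process
         * for writing, then release it. */
        static int example_pin_one_page(unsigned long uaddr)
        {
                struct page *page;
                int ret;

                down_read(&current->mm->mmap_sem);
                ret = get_user_pages(current, current->mm, uaddr & PAGE_MASK,
                                     1, 1 /* write */, 0 /* force */,
                                     &page, NULL);
                up_read(&current->mm->mmap_sem);

                if (ret != 1)
                        return ret < 0 ? ret : -EFAULT;

                /* ... access the pinned page ... */
                page_cache_release(page);       /* drop the reference gup took */
                return 0;
        }
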
@@ -1024,16 +1018,21 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1024 unsigned long addr, unsigned long end, pgprot_t prot) 1018 unsigned long addr, unsigned long end, pgprot_t prot)
1025{ 1019{
1026 pte_t *pte; 1020 pte_t *pte;
1021 spinlock_t *ptl;
1027 1022
1028 pte = pte_alloc_map(mm, pmd, addr); 1023 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1029 if (!pte) 1024 if (!pte)
1030 return -ENOMEM; 1025 return -ENOMEM;
1031 do { 1026 do {
1032 pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(addr), prot)); 1027 struct page *page = ZERO_PAGE(addr);
1028 pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
1029 page_cache_get(page);
1030 page_add_file_rmap(page);
1031 inc_mm_counter(mm, file_rss);
1033 BUG_ON(!pte_none(*pte)); 1032 BUG_ON(!pte_none(*pte));
1034 set_pte_at(mm, addr, pte, zero_pte); 1033 set_pte_at(mm, addr, pte, zero_pte);
1035 } while (pte++, addr += PAGE_SIZE, addr != end); 1034 } while (pte++, addr += PAGE_SIZE, addr != end);
1036 pte_unmap(pte - 1); 1035 pte_unmap_unlock(pte - 1, ptl);
1037 return 0; 1036 return 0;
1038} 1037}
1039 1038
@@ -1083,14 +1082,12 @@ int zeromap_page_range(struct vm_area_struct *vma,
1083 BUG_ON(addr >= end); 1082 BUG_ON(addr >= end);
1084 pgd = pgd_offset(mm, addr); 1083 pgd = pgd_offset(mm, addr);
1085 flush_cache_range(vma, addr, end); 1084 flush_cache_range(vma, addr, end);
1086 spin_lock(&mm->page_table_lock);
1087 do { 1085 do {
1088 next = pgd_addr_end(addr, end); 1086 next = pgd_addr_end(addr, end);
1089 err = zeromap_pud_range(mm, pgd, addr, next, prot); 1087 err = zeromap_pud_range(mm, pgd, addr, next, prot);
1090 if (err) 1088 if (err)
1091 break; 1089 break;
1092 } while (pgd++, addr = next, addr != end); 1090 } while (pgd++, addr = next, addr != end);
1093 spin_unlock(&mm->page_table_lock);
1094 return err; 1091 return err;
1095} 1092}
1096 1093
@@ -1104,17 +1101,17 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1104 unsigned long pfn, pgprot_t prot) 1101 unsigned long pfn, pgprot_t prot)
1105{ 1102{
1106 pte_t *pte; 1103 pte_t *pte;
1104 spinlock_t *ptl;
1107 1105
1108 pte = pte_alloc_map(mm, pmd, addr); 1106 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1109 if (!pte) 1107 if (!pte)
1110 return -ENOMEM; 1108 return -ENOMEM;
1111 do { 1109 do {
1112 BUG_ON(!pte_none(*pte)); 1110 BUG_ON(!pte_none(*pte));
1113 if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn))) 1111 set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
1114 set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
1115 pfn++; 1112 pfn++;
1116 } while (pte++, addr += PAGE_SIZE, addr != end); 1113 } while (pte++, addr += PAGE_SIZE, addr != end);
1117 pte_unmap(pte - 1); 1114 pte_unmap_unlock(pte - 1, ptl);
1118 return 0; 1115 return 0;
1119} 1116}
1120 1117
@@ -1173,8 +1170,8 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1173 * rest of the world about it: 1170 * rest of the world about it:
1174 * VM_IO tells people not to look at these pages 1171 * VM_IO tells people not to look at these pages
1175 * (accesses can have side effects). 1172 * (accesses can have side effects).
1176 * VM_RESERVED tells swapout not to try to touch 1173 * VM_RESERVED tells the core MM not to "manage" these pages
1177 * this region. 1174 * (e.g. refcount, mapcount, try to swap them out).
1178 */ 1175 */
1179 vma->vm_flags |= VM_IO | VM_RESERVED; 1176 vma->vm_flags |= VM_IO | VM_RESERVED;
1180 1177
@@ -1182,7 +1179,6 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1182 pfn -= addr >> PAGE_SHIFT; 1179 pfn -= addr >> PAGE_SHIFT;
1183 pgd = pgd_offset(mm, addr); 1180 pgd = pgd_offset(mm, addr);
1184 flush_cache_range(vma, addr, end); 1181 flush_cache_range(vma, addr, end);
1185 spin_lock(&mm->page_table_lock);
1186 do { 1182 do {
1187 next = pgd_addr_end(addr, end); 1183 next = pgd_addr_end(addr, end);
1188 err = remap_pud_range(mm, pgd, addr, next, 1184 err = remap_pud_range(mm, pgd, addr, next,
@@ -1190,12 +1186,36 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1190 if (err) 1186 if (err)
1191 break; 1187 break;
1192 } while (pgd++, addr = next, addr != end); 1188 } while (pgd++, addr = next, addr != end);
1193 spin_unlock(&mm->page_table_lock);
1194 return err; 1189 return err;
1195} 1190}
1196EXPORT_SYMBOL(remap_pfn_range); 1191EXPORT_SYMBOL(remap_pfn_range);
1197 1192
1198/* 1193/*
1194 * handle_pte_fault chooses page fault handler according to an entry
1195 * which was read non-atomically. Before making any commitment, on
1196 * those architectures or configurations (e.g. i386 with PAE) which
1197 * might give a mix of unmatched parts, do_swap_page and do_file_page
1198 * must check under lock before unmapping the pte and proceeding
1199 * (but do_wp_page is only called after already making such a check;
1200 * and do_anonymous_page and do_no_page can safely check later on).
1201 */
1202static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
1203 pte_t *page_table, pte_t orig_pte)
1204{
1205 int same = 1;
1206#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
1207 if (sizeof(pte_t) > sizeof(unsigned long)) {
1208 spinlock_t *ptl = pte_lockptr(mm, pmd);
1209 spin_lock(ptl);
1210 same = pte_same(*page_table, orig_pte);
1211 spin_unlock(ptl);
1212 }
1213#endif
1214 pte_unmap(page_table);
1215 return same;
1216}
1217
1218/*
1199 * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when 1219 * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when
1200 * servicing faults for write access. In the normal case, do always want 1220 * servicing faults for write access. In the normal case, do always want
1201 * pte_mkwrite. But get_user_pages can cause write faults for mappings 1221 * pte_mkwrite. But get_user_pages can cause write faults for mappings
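
pte_unmap_same(), added in the hunk above, exists because handle_pte_fault() reads the pte without any lock; on configurations where sizeof(pte_t) > sizeof(unsigned long) (i386 PAE, for instance) that read can observe a torn mix of two updates. A hedged skeleton of how the swap/file fault paths below use it, written as an illustrative function in mm/memory.c rather than a copy of any handler:

        /* Illustrative only: filter a possibly torn pte, sleep, then re-check. */
        static int example_fault(struct mm_struct *mm, unsigned long address,
                                 pte_t *page_table, pmd_t *pmd, pte_t orig_pte)
        {
                spinlock_t *ptl;

                if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
                        return VM_FAULT_MINOR;  /* raced; fault will be retried */

                /* ... may sleep here: allocation, swap or file I/O ... */

                page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
                if (unlikely(!pte_same(*page_table, orig_pte))) {
                        /* someone else resolved the fault while we slept */
                        pte_unmap_unlock(page_table, ptl);
                        return VM_FAULT_MINOR;
                }
                /* ... safe to update the pte with set_pte_at() here ... */
                pte_unmap_unlock(page_table, ptl);
                return VM_FAULT_MINOR;
        }
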
@@ -1209,28 +1229,10 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
1209} 1229}
1210 1230
1211/* 1231/*
1212 * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock
1213 */
1214static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address,
1215 pte_t *page_table)
1216{
1217 pte_t entry;
1218
1219 entry = maybe_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)),
1220 vma);
1221 ptep_establish(vma, address, page_table, entry);
1222 update_mmu_cache(vma, address, entry);
1223 lazy_mmu_prot_update(entry);
1224}
1225
1226/*
1227 * This routine handles present pages, when users try to write 1232 * This routine handles present pages, when users try to write
1228 * to a shared page. It is done by copying the page to a new address 1233 * to a shared page. It is done by copying the page to a new address
1229 * and decrementing the shared-page counter for the old page. 1234 * and decrementing the shared-page counter for the old page.
1230 * 1235 *
1231 * Goto-purists beware: the only reason for goto's here is that it results
1232 * in better assembly code.. The "default" path will see no jumps at all.
1233 *
1234 * Note that this routine assumes that the protection checks have been 1236 * Note that this routine assumes that the protection checks have been
1235 * done by the caller (the low-level page fault routine in most cases). 1237 * done by the caller (the low-level page fault routine in most cases).
1236 * Thus we can safely just mark it writable once we've done any necessary 1238 * Thus we can safely just mark it writable once we've done any necessary
@@ -1240,28 +1242,28 @@ static inline void break_cow(struct vm_area_struct * vma, struct page * new_page
1240 * change only once the write actually happens. This avoids a few races, 1242 * change only once the write actually happens. This avoids a few races,
1241 * and potentially makes it more efficient. 1243 * and potentially makes it more efficient.
1242 * 1244 *
1243 * We hold the mm semaphore and the page_table_lock on entry and exit 1245 * We enter with non-exclusive mmap_sem (to exclude vma changes,
1244 * with the page_table_lock released. 1246 * but allow concurrent faults), with pte both mapped and locked.
1247 * We return with mmap_sem still held, but pte unmapped and unlocked.
1245 */ 1248 */
1246static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, 1249static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1247 unsigned long address, pte_t *page_table, pmd_t *pmd, pte_t pte) 1250 unsigned long address, pte_t *page_table, pmd_t *pmd,
1251 spinlock_t *ptl, pte_t orig_pte)
1248{ 1252{
1249 struct page *old_page, *new_page; 1253 struct page *old_page, *new_page;
1250 unsigned long pfn = pte_pfn(pte); 1254 unsigned long pfn = pte_pfn(orig_pte);
1251 pte_t entry; 1255 pte_t entry;
1252 int ret; 1256 int ret = VM_FAULT_MINOR;
1257
1258 BUG_ON(vma->vm_flags & VM_RESERVED);
1253 1259
1254 if (unlikely(!pfn_valid(pfn))) { 1260 if (unlikely(!pfn_valid(pfn))) {
1255 /* 1261 /*
1256 * This should really halt the system so it can be debugged or 1262 * Page table corrupted: show pte and kill process.
1257 * at least the kernel stops what it's doing before it corrupts
1258 * data, but for the moment just pretend this is OOM.
1259 */ 1263 */
1260 pte_unmap(page_table); 1264 print_bad_pte(vma, orig_pte, address);
1261 printk(KERN_ERR "do_wp_page: bogus page at address %08lx\n", 1265 ret = VM_FAULT_OOM;
1262 address); 1266 goto unlock;
1263 spin_unlock(&mm->page_table_lock);
1264 return VM_FAULT_OOM;
1265 } 1267 }
1266 old_page = pfn_to_page(pfn); 1268 old_page = pfn_to_page(pfn);
1267 1269
@@ -1270,52 +1272,51 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
1270 unlock_page(old_page); 1272 unlock_page(old_page);
1271 if (reuse) { 1273 if (reuse) {
1272 flush_cache_page(vma, address, pfn); 1274 flush_cache_page(vma, address, pfn);
1273 entry = maybe_mkwrite(pte_mkyoung(pte_mkdirty(pte)), 1275 entry = pte_mkyoung(orig_pte);
1274 vma); 1276 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1275 ptep_set_access_flags(vma, address, page_table, entry, 1); 1277 ptep_set_access_flags(vma, address, page_table, entry, 1);
1276 update_mmu_cache(vma, address, entry); 1278 update_mmu_cache(vma, address, entry);
1277 lazy_mmu_prot_update(entry); 1279 lazy_mmu_prot_update(entry);
1278 pte_unmap(page_table); 1280 ret |= VM_FAULT_WRITE;
1279 spin_unlock(&mm->page_table_lock); 1281 goto unlock;
1280 return VM_FAULT_MINOR|VM_FAULT_WRITE;
1281 } 1282 }
1282 } 1283 }
1283 pte_unmap(page_table);
1284 1284
1285 /* 1285 /*
1286 * Ok, we need to copy. Oh, well.. 1286 * Ok, we need to copy. Oh, well..
1287 */ 1287 */
1288 if (!PageReserved(old_page)) 1288 page_cache_get(old_page);
1289 page_cache_get(old_page); 1289 pte_unmap_unlock(page_table, ptl);
1290 spin_unlock(&mm->page_table_lock);
1291 1290
1292 if (unlikely(anon_vma_prepare(vma))) 1291 if (unlikely(anon_vma_prepare(vma)))
1293 goto no_new_page; 1292 goto oom;
1294 if (old_page == ZERO_PAGE(address)) { 1293 if (old_page == ZERO_PAGE(address)) {
1295 new_page = alloc_zeroed_user_highpage(vma, address); 1294 new_page = alloc_zeroed_user_highpage(vma, address);
1296 if (!new_page) 1295 if (!new_page)
1297 goto no_new_page; 1296 goto oom;
1298 } else { 1297 } else {
1299 new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); 1298 new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
1300 if (!new_page) 1299 if (!new_page)
1301 goto no_new_page; 1300 goto oom;
1302 copy_user_highpage(new_page, old_page, address); 1301 copy_user_highpage(new_page, old_page, address);
1303 } 1302 }
1303
1304 /* 1304 /*
1305 * Re-check the pte - we dropped the lock 1305 * Re-check the pte - we dropped the lock
1306 */ 1306 */
1307 ret = VM_FAULT_MINOR; 1307 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
1308 spin_lock(&mm->page_table_lock); 1308 if (likely(pte_same(*page_table, orig_pte))) {
1309 page_table = pte_offset_map(pmd, address); 1309 page_remove_rmap(old_page);
1310 if (likely(pte_same(*page_table, pte))) { 1310 if (!PageAnon(old_page)) {
1311 if (PageAnon(old_page)) 1311 inc_mm_counter(mm, anon_rss);
1312 dec_mm_counter(mm, anon_rss); 1312 dec_mm_counter(mm, file_rss);
1313 if (PageReserved(old_page)) 1313 }
1314 inc_mm_counter(mm, rss);
1315 else
1316 page_remove_rmap(old_page);
1317 flush_cache_page(vma, address, pfn); 1314 flush_cache_page(vma, address, pfn);
1318 break_cow(vma, new_page, address, page_table); 1315 entry = mk_pte(new_page, vma->vm_page_prot);
1316 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1317 ptep_establish(vma, address, page_table, entry);
1318 update_mmu_cache(vma, address, entry);
1319 lazy_mmu_prot_update(entry);
1319 lru_cache_add_active(new_page); 1320 lru_cache_add_active(new_page);
1320 page_add_anon_rmap(new_page, vma, address); 1321 page_add_anon_rmap(new_page, vma, address);
1321 1322
@@ -1323,13 +1324,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
1323 new_page = old_page; 1324 new_page = old_page;
1324 ret |= VM_FAULT_WRITE; 1325 ret |= VM_FAULT_WRITE;
1325 } 1326 }
1326 pte_unmap(page_table);
1327 page_cache_release(new_page); 1327 page_cache_release(new_page);
1328 page_cache_release(old_page); 1328 page_cache_release(old_page);
1329 spin_unlock(&mm->page_table_lock); 1329unlock:
1330 pte_unmap_unlock(page_table, ptl);
1330 return ret; 1331 return ret;
1331 1332oom:
1332no_new_page:
1333 page_cache_release(old_page); 1333 page_cache_release(old_page);
1334 return VM_FAULT_OOM; 1334 return VM_FAULT_OOM;
1335} 1335}
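
The reworked comment in do_wp_page() states the locking contract shared by all the handlers below: entered under the non-exclusive mmap_sem with the pte mapped and locked (do_wp_page() receives the ptl explicitly), and returning with the pte unmapped and unlocked. A hypothetical caller honouring that contract; handle_pte_fault() itself is outside the hunks shown here, so this is an assumption about its shape, not a copy:

        /* Hypothetical: dispatch a write fault on a present, read-only pte. */
        static int example_write_fault(struct mm_struct *mm,
                                       struct vm_area_struct *vma,
                                       unsigned long address, pmd_t *pmd)
        {
                spinlock_t *ptl;
                pte_t *page_table, entry;

                page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
                entry = *page_table;
                if (pte_present(entry) && !pte_write(entry))
                        /* do_wp_page() unmaps and unlocks before returning */
                        return do_wp_page(mm, vma, address, page_table, pmd,
                                          ptl, entry);

                pte_unmap_unlock(page_table, ptl);
                return VM_FAULT_MINOR;
        }
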
@@ -1399,13 +1399,6 @@ again:
1399 1399
1400 restart_addr = zap_page_range(vma, start_addr, 1400 restart_addr = zap_page_range(vma, start_addr,
1401 end_addr - start_addr, details); 1401 end_addr - start_addr, details);
1402
1403 /*
1404 * We cannot rely on the break test in unmap_vmas:
1405 * on the one hand, we don't want to restart our loop
1406 * just because that broke out for the page_table_lock;
1407 * on the other hand, it does no test when vma is small.
1408 */
1409 need_break = need_resched() || 1402 need_break = need_resched() ||
1410 need_lockbreak(details->i_mmap_lock); 1403 need_lockbreak(details->i_mmap_lock);
1411 1404
@@ -1654,38 +1647,37 @@ void swapin_readahead(swp_entry_t entry, unsigned long addr,struct vm_area_struc
1654} 1647}
1655 1648
1656/* 1649/*
1657 * We hold the mm semaphore and the page_table_lock on entry and 1650 * We enter with non-exclusive mmap_sem (to exclude vma changes,
1658 * should release the pagetable lock on exit.. 1651 * but allow concurrent faults), and pte mapped but not yet locked.
1652 * We return with mmap_sem still held, but pte unmapped and unlocked.
1659 */ 1653 */
1660static int do_swap_page(struct mm_struct * mm, 1654static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
1661 struct vm_area_struct * vma, unsigned long address, 1655 unsigned long address, pte_t *page_table, pmd_t *pmd,
1662 pte_t *page_table, pmd_t *pmd, pte_t orig_pte, int write_access) 1656 int write_access, pte_t orig_pte)
1663{ 1657{
1658 spinlock_t *ptl;
1664 struct page *page; 1659 struct page *page;
1665 swp_entry_t entry = pte_to_swp_entry(orig_pte); 1660 swp_entry_t entry;
1666 pte_t pte; 1661 pte_t pte;
1667 int ret = VM_FAULT_MINOR; 1662 int ret = VM_FAULT_MINOR;
1668 1663
1669 pte_unmap(page_table); 1664 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
1670 spin_unlock(&mm->page_table_lock); 1665 goto out;
1666
1667 entry = pte_to_swp_entry(orig_pte);
1671 page = lookup_swap_cache(entry); 1668 page = lookup_swap_cache(entry);
1672 if (!page) { 1669 if (!page) {
1673 swapin_readahead(entry, address, vma); 1670 swapin_readahead(entry, address, vma);
1674 page = read_swap_cache_async(entry, vma, address); 1671 page = read_swap_cache_async(entry, vma, address);
1675 if (!page) { 1672 if (!page) {
1676 /* 1673 /*
1677 * Back out if somebody else faulted in this pte while 1674 * Back out if somebody else faulted in this pte
1678 * we released the page table lock. 1675 * while we released the pte lock.
1679 */ 1676 */
1680 spin_lock(&mm->page_table_lock); 1677 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
1681 page_table = pte_offset_map(pmd, address);
1682 if (likely(pte_same(*page_table, orig_pte))) 1678 if (likely(pte_same(*page_table, orig_pte)))
1683 ret = VM_FAULT_OOM; 1679 ret = VM_FAULT_OOM;
1684 else 1680 goto unlock;
1685 ret = VM_FAULT_MINOR;
1686 pte_unmap(page_table);
1687 spin_unlock(&mm->page_table_lock);
1688 goto out;
1689 } 1681 }
1690 1682
1691 /* Had to read the page from swap area: Major fault */ 1683 /* Had to read the page from swap area: Major fault */
@@ -1698,15 +1690,11 @@ static int do_swap_page(struct mm_struct * mm,
1698 lock_page(page); 1690 lock_page(page);
1699 1691
1700 /* 1692 /*
1701 * Back out if somebody else faulted in this pte while we 1693 * Back out if somebody else already faulted in this pte.
1702 * released the page table lock.
1703 */ 1694 */
1704 spin_lock(&mm->page_table_lock); 1695 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
1705 page_table = pte_offset_map(pmd, address); 1696 if (unlikely(!pte_same(*page_table, orig_pte)))
1706 if (unlikely(!pte_same(*page_table, orig_pte))) {
1707 ret = VM_FAULT_MINOR;
1708 goto out_nomap; 1697 goto out_nomap;
1709 }
1710 1698
1711 if (unlikely(!PageUptodate(page))) { 1699 if (unlikely(!PageUptodate(page))) {
1712 ret = VM_FAULT_SIGBUS; 1700 ret = VM_FAULT_SIGBUS;
@@ -1715,7 +1703,7 @@ static int do_swap_page(struct mm_struct * mm,
1715 1703
1716 /* The page isn't present yet, go ahead with the fault. */ 1704 /* The page isn't present yet, go ahead with the fault. */
1717 1705
1718 inc_mm_counter(mm, rss); 1706 inc_mm_counter(mm, anon_rss);
1719 pte = mk_pte(page, vma->vm_page_prot); 1707 pte = mk_pte(page, vma->vm_page_prot);
1720 if (write_access && can_share_swap_page(page)) { 1708 if (write_access && can_share_swap_page(page)) {
1721 pte = maybe_mkwrite(pte_mkdirty(pte), vma); 1709 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
@@ -1733,7 +1721,7 @@ static int do_swap_page(struct mm_struct * mm,
1733 1721
1734 if (write_access) { 1722 if (write_access) {
1735 if (do_wp_page(mm, vma, address, 1723 if (do_wp_page(mm, vma, address,
1736 page_table, pmd, pte) == VM_FAULT_OOM) 1724 page_table, pmd, ptl, pte) == VM_FAULT_OOM)
1737 ret = VM_FAULT_OOM; 1725 ret = VM_FAULT_OOM;
1738 goto out; 1726 goto out;
1739 } 1727 }
@@ -1741,74 +1729,76 @@ static int do_swap_page(struct mm_struct * mm,
1741 /* No need to invalidate - it was non-present before */ 1729 /* No need to invalidate - it was non-present before */
1742 update_mmu_cache(vma, address, pte); 1730 update_mmu_cache(vma, address, pte);
1743 lazy_mmu_prot_update(pte); 1731 lazy_mmu_prot_update(pte);
1744 pte_unmap(page_table); 1732unlock:
1745 spin_unlock(&mm->page_table_lock); 1733 pte_unmap_unlock(page_table, ptl);
1746out: 1734out:
1747 return ret; 1735 return ret;
1748out_nomap: 1736out_nomap:
1749 pte_unmap(page_table); 1737 pte_unmap_unlock(page_table, ptl);
1750 spin_unlock(&mm->page_table_lock);
1751 unlock_page(page); 1738 unlock_page(page);
1752 page_cache_release(page); 1739 page_cache_release(page);
1753 goto out; 1740 return ret;
1754} 1741}
1755 1742
1756/* 1743/*
1757 * We are called with the MM semaphore and page_table_lock 1744 * We enter with non-exclusive mmap_sem (to exclude vma changes,
1758 * spinlock held to protect against concurrent faults in 1745 * but allow concurrent faults), and pte mapped but not yet locked.
1759 * multithreaded programs. 1746 * We return with mmap_sem still held, but pte unmapped and unlocked.
1760 */ 1747 */
1761static int 1748static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
1762do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, 1749 unsigned long address, pte_t *page_table, pmd_t *pmd,
1763 pte_t *page_table, pmd_t *pmd, int write_access, 1750 int write_access)
1764 unsigned long addr)
1765{ 1751{
1752 struct page *page;
1753 spinlock_t *ptl;
1766 pte_t entry; 1754 pte_t entry;
1767 struct page * page = ZERO_PAGE(addr);
1768
1769 /* Read-only mapping of ZERO_PAGE. */
1770 entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot));
1771 1755
1772 /* ..except if it's a write access */
1773 if (write_access) { 1756 if (write_access) {
1774 /* Allocate our own private page. */ 1757 /* Allocate our own private page. */
1775 pte_unmap(page_table); 1758 pte_unmap(page_table);
1776 spin_unlock(&mm->page_table_lock);
1777 1759
1778 if (unlikely(anon_vma_prepare(vma))) 1760 if (unlikely(anon_vma_prepare(vma)))
1779 goto no_mem; 1761 goto oom;
1780 page = alloc_zeroed_user_highpage(vma, addr); 1762 page = alloc_zeroed_user_highpage(vma, address);
1781 if (!page) 1763 if (!page)
1782 goto no_mem; 1764 goto oom;
1783 1765
1784 spin_lock(&mm->page_table_lock); 1766 entry = mk_pte(page, vma->vm_page_prot);
1785 page_table = pte_offset_map(pmd, addr); 1767 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1786 1768
1787 if (!pte_none(*page_table)) { 1769 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
1788 pte_unmap(page_table); 1770 if (!pte_none(*page_table))
1789 page_cache_release(page); 1771 goto release;
1790 spin_unlock(&mm->page_table_lock); 1772 inc_mm_counter(mm, anon_rss);
1791 goto out;
1792 }
1793 inc_mm_counter(mm, rss);
1794 entry = maybe_mkwrite(pte_mkdirty(mk_pte(page,
1795 vma->vm_page_prot)),
1796 vma);
1797 lru_cache_add_active(page); 1773 lru_cache_add_active(page);
1798 SetPageReferenced(page); 1774 SetPageReferenced(page);
1799 page_add_anon_rmap(page, vma, addr); 1775 page_add_anon_rmap(page, vma, address);
1776 } else {
1777 /* Map the ZERO_PAGE - vm_page_prot is readonly */
1778 page = ZERO_PAGE(address);
1779 page_cache_get(page);
1780 entry = mk_pte(page, vma->vm_page_prot);
1781
1782 ptl = pte_lockptr(mm, pmd);
1783 spin_lock(ptl);
1784 if (!pte_none(*page_table))
1785 goto release;
1786 inc_mm_counter(mm, file_rss);
1787 page_add_file_rmap(page);
1800 } 1788 }
1801 1789
1802 set_pte_at(mm, addr, page_table, entry); 1790 set_pte_at(mm, address, page_table, entry);
1803 pte_unmap(page_table);
1804 1791
1805 /* No need to invalidate - it was non-present before */ 1792 /* No need to invalidate - it was non-present before */
1806 update_mmu_cache(vma, addr, entry); 1793 update_mmu_cache(vma, address, entry);
1807 lazy_mmu_prot_update(entry); 1794 lazy_mmu_prot_update(entry);
1808 spin_unlock(&mm->page_table_lock); 1795unlock:
1809out: 1796 pte_unmap_unlock(page_table, ptl);
1810 return VM_FAULT_MINOR; 1797 return VM_FAULT_MINOR;
1811no_mem: 1798release:
1799 page_cache_release(page);
1800 goto unlock;
1801oom:
1812 return VM_FAULT_OOM; 1802 return VM_FAULT_OOM;
1813} 1803}
1814 1804
@@ -1821,25 +1811,23 @@ no_mem:
1821 * As this is called only for pages that do not currently exist, we 1811 * As this is called only for pages that do not currently exist, we
1822 * do not need to flush old virtual caches or the TLB. 1812 * do not need to flush old virtual caches or the TLB.
1823 * 1813 *
1824 * This is called with the MM semaphore held and the page table 1814 * We enter with non-exclusive mmap_sem (to exclude vma changes,
1825 * spinlock held. Exit with the spinlock released. 1815 * but allow concurrent faults), and pte mapped but not yet locked.
1816 * We return with mmap_sem still held, but pte unmapped and unlocked.
1826 */ 1817 */
1827static int 1818static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
1828do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, 1819 unsigned long address, pte_t *page_table, pmd_t *pmd,
1829 unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd) 1820 int write_access)
1830{ 1821{
1831 struct page * new_page; 1822 spinlock_t *ptl;
1823 struct page *new_page;
1832 struct address_space *mapping = NULL; 1824 struct address_space *mapping = NULL;
1833 pte_t entry; 1825 pte_t entry;
1834 unsigned int sequence = 0; 1826 unsigned int sequence = 0;
1835 int ret = VM_FAULT_MINOR; 1827 int ret = VM_FAULT_MINOR;
1836 int anon = 0; 1828 int anon = 0;
1837 1829
1838 if (!vma->vm_ops || !vma->vm_ops->nopage)
1839 return do_anonymous_page(mm, vma, page_table,
1840 pmd, write_access, address);
1841 pte_unmap(page_table); 1830 pte_unmap(page_table);
1842 spin_unlock(&mm->page_table_lock);
1843 1831
1844 if (vma->vm_file) { 1832 if (vma->vm_file) {
1845 mapping = vma->vm_file->f_mapping; 1833 mapping = vma->vm_file->f_mapping;
@@ -1847,7 +1835,6 @@ do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
1847 smp_rmb(); /* serializes i_size against truncate_count */ 1835 smp_rmb(); /* serializes i_size against truncate_count */
1848 } 1836 }
1849retry: 1837retry:
1850 cond_resched();
1851 new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret); 1838 new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
1852 /* 1839 /*
1853 * No smp_rmb is needed here as long as there's a full 1840 * No smp_rmb is needed here as long as there's a full
@@ -1880,19 +1867,20 @@ retry:
1880 anon = 1; 1867 anon = 1;
1881 } 1868 }
1882 1869
1883 spin_lock(&mm->page_table_lock); 1870 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
1884 /* 1871 /*
1885 * For a file-backed vma, someone could have truncated or otherwise 1872 * For a file-backed vma, someone could have truncated or otherwise
1886 * invalidated this page. If unmap_mapping_range got called, 1873 * invalidated this page. If unmap_mapping_range got called,
1887 * retry getting the page. 1874 * retry getting the page.
1888 */ 1875 */
1889 if (mapping && unlikely(sequence != mapping->truncate_count)) { 1876 if (mapping && unlikely(sequence != mapping->truncate_count)) {
1890 sequence = mapping->truncate_count; 1877 pte_unmap_unlock(page_table, ptl);
1891 spin_unlock(&mm->page_table_lock);
1892 page_cache_release(new_page); 1878 page_cache_release(new_page);
1879 cond_resched();
1880 sequence = mapping->truncate_count;
1881 smp_rmb();
1893 goto retry; 1882 goto retry;
1894 } 1883 }
1895 page_table = pte_offset_map(pmd, address);
1896 1884
1897 /* 1885 /*
1898 * This silly early PAGE_DIRTY setting removes a race 1886 * This silly early PAGE_DIRTY setting removes a race
@@ -1906,68 +1894,67 @@ retry:
1906 */ 1894 */
1907 /* Only go through if we didn't race with anybody else... */ 1895 /* Only go through if we didn't race with anybody else... */
1908 if (pte_none(*page_table)) { 1896 if (pte_none(*page_table)) {
1909 if (!PageReserved(new_page))
1910 inc_mm_counter(mm, rss);
1911
1912 flush_icache_page(vma, new_page); 1897 flush_icache_page(vma, new_page);
1913 entry = mk_pte(new_page, vma->vm_page_prot); 1898 entry = mk_pte(new_page, vma->vm_page_prot);
1914 if (write_access) 1899 if (write_access)
1915 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1900 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1916 set_pte_at(mm, address, page_table, entry); 1901 set_pte_at(mm, address, page_table, entry);
1917 if (anon) { 1902 if (anon) {
1903 inc_mm_counter(mm, anon_rss);
1918 lru_cache_add_active(new_page); 1904 lru_cache_add_active(new_page);
1919 page_add_anon_rmap(new_page, vma, address); 1905 page_add_anon_rmap(new_page, vma, address);
1920 } else 1906 } else if (!(vma->vm_flags & VM_RESERVED)) {
1907 inc_mm_counter(mm, file_rss);
1921 page_add_file_rmap(new_page); 1908 page_add_file_rmap(new_page);
1922 pte_unmap(page_table); 1909 }
1923 } else { 1910 } else {
1924 /* One of our sibling threads was faster, back out. */ 1911 /* One of our sibling threads was faster, back out. */
1925 pte_unmap(page_table);
1926 page_cache_release(new_page); 1912 page_cache_release(new_page);
1927 spin_unlock(&mm->page_table_lock); 1913 goto unlock;
1928 goto out;
1929 } 1914 }
1930 1915
1931 /* no need to invalidate: a not-present page shouldn't be cached */ 1916 /* no need to invalidate: a not-present page shouldn't be cached */
1932 update_mmu_cache(vma, address, entry); 1917 update_mmu_cache(vma, address, entry);
1933 lazy_mmu_prot_update(entry); 1918 lazy_mmu_prot_update(entry);
1934 spin_unlock(&mm->page_table_lock); 1919unlock:
1935out: 1920 pte_unmap_unlock(page_table, ptl);
1936 return ret; 1921 return ret;
1937oom: 1922oom:
1938 page_cache_release(new_page); 1923 page_cache_release(new_page);
1939 ret = VM_FAULT_OOM; 1924 return VM_FAULT_OOM;
1940 goto out;
1941} 1925}
1942 1926
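The accounting in do_no_page above reflects the new split of the single rss counter: anonymous pages bump anon_rss, file-backed pages bump file_rss, and pages in VM_RESERVED vmas are not counted at all (which is also why the gate vma is flagged VM_RESERVED at the end of this diff). The total is presumably reassembled elsewhere in the series by a helper along these lines (a sketch of the header side, not part of this file):

	#define get_mm_rss(mm)	\
		(get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss))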
1943/* 1927/*
1944 * Fault of a previously existing named mapping. Repopulate the pte 1928 * Fault of a previously existing named mapping. Repopulate the pte
1945 * from the encoded file_pte if possible. This enables swappable 1929 * from the encoded file_pte if possible. This enables swappable
1946 * nonlinear vmas. 1930 * nonlinear vmas.
1931 *
1932 * We enter with non-exclusive mmap_sem (to exclude vma changes,
1933 * but allow concurrent faults), and pte mapped but not yet locked.
1934 * We return with mmap_sem still held, but pte unmapped and unlocked.
1947 */ 1935 */
1948static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma, 1936static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
1949 unsigned long address, int write_access, pte_t *pte, pmd_t *pmd) 1937 unsigned long address, pte_t *page_table, pmd_t *pmd,
1938 int write_access, pte_t orig_pte)
1950{ 1939{
1951 unsigned long pgoff; 1940 pgoff_t pgoff;
1952 int err; 1941 int err;
1953 1942
1954 BUG_ON(!vma->vm_ops || !vma->vm_ops->nopage); 1943 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
1955 /* 1944 return VM_FAULT_MINOR;
1956 * Fall back to the linear mapping if the fs does not support
1957 * ->populate:
1958 */
1959 if (!vma->vm_ops->populate ||
1960 (write_access && !(vma->vm_flags & VM_SHARED))) {
1961 pte_clear(mm, address, pte);
1962 return do_no_page(mm, vma, address, write_access, pte, pmd);
1963 }
1964
1965 pgoff = pte_to_pgoff(*pte);
1966 1945
1967 pte_unmap(pte); 1946 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
1968 spin_unlock(&mm->page_table_lock); 1947 /*
1948 * Page table corrupted: show pte and kill process.
1949 */
1950 print_bad_pte(vma, orig_pte, address);
1951 return VM_FAULT_OOM;
1952 }
1953 /* We can then assume vm->vm_ops && vma->vm_ops->populate */
1969 1954
1970 err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0); 1955 pgoff = pte_to_pgoff(orig_pte);
1956 err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE,
1957 vma->vm_page_prot, pgoff, 0);
1971 if (err == -ENOMEM) 1958 if (err == -ENOMEM)
1972 return VM_FAULT_OOM; 1959 return VM_FAULT_OOM;
1973 if (err) 1960 if (err)
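do_file_page now starts with pte_unmap_same(), which drops the pte mapping while verifying that the entry still matches the orig_pte that handle_pte_fault sampled without the lock; if it does not, the fault is simply reported as handled and will be retaken if still needed. The helper is introduced elsewhere in this series; its likely shape is sketched below (on configurations where a pte fits in one word the unlocked compare is already atomic and the lock can be skipped):

	static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
					pte_t *page_table, pte_t orig_pte)
	{
		int same = 1;
	#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
		if (sizeof(pte_t) > sizeof(unsigned long)) {
			spinlock_t *ptl = pte_lockptr(mm, pmd);
			spin_lock(ptl);
			same = pte_same(*page_table, orig_pte);
			spin_unlock(ptl);
		}
	#endif
		pte_unmap(page_table);
		return same;
	}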
@@ -1984,56 +1971,68 @@ static int do_file_page(struct mm_struct * mm, struct vm_area_struct * vma,
1984 * with external mmu caches can use to update those (ie the Sparc or 1971 * with external mmu caches can use to update those (ie the Sparc or
1985 * PowerPC hashed page tables that act as extended TLBs). 1972 * PowerPC hashed page tables that act as extended TLBs).
1986 * 1973 *
1987 * Note the "page_table_lock". It is to protect against kswapd removing 1974 * We enter with non-exclusive mmap_sem (to exclude vma changes,
1988 * pages from under us. Note that kswapd only ever _removes_ pages, never 1975 * but allow concurrent faults), and pte mapped but not yet locked.
1989 * adds them. As such, once we have noticed that the page is not present, 1976 * We return with mmap_sem still held, but pte unmapped and unlocked.
1990 * we can drop the lock early.
1991 *
1992 * The adding of pages is protected by the MM semaphore (which we hold),
1993 * so we don't need to worry about a page being suddenly been added into
1994 * our VM.
1995 *
1996 * We enter with the pagetable spinlock held, we are supposed to
1997 * release it when done.
1998 */ 1977 */
1999static inline int handle_pte_fault(struct mm_struct *mm, 1978static inline int handle_pte_fault(struct mm_struct *mm,
2000 struct vm_area_struct * vma, unsigned long address, 1979 struct vm_area_struct *vma, unsigned long address,
2001 int write_access, pte_t *pte, pmd_t *pmd) 1980 pte_t *pte, pmd_t *pmd, int write_access)
2002{ 1981{
2003 pte_t entry; 1982 pte_t entry;
1983 pte_t old_entry;
1984 spinlock_t *ptl;
2004 1985
2005 entry = *pte; 1986 old_entry = entry = *pte;
2006 if (!pte_present(entry)) { 1987 if (!pte_present(entry)) {
2007 /* 1988 if (pte_none(entry)) {
2008 * If it truly wasn't present, we know that kswapd 1989 if (!vma->vm_ops || !vma->vm_ops->nopage)
2009 * and the PTE updates will not touch it later. So 1990 return do_anonymous_page(mm, vma, address,
2010 * drop the lock. 1991 pte, pmd, write_access);
2011 */ 1992 return do_no_page(mm, vma, address,
2012 if (pte_none(entry)) 1993 pte, pmd, write_access);
2013 return do_no_page(mm, vma, address, write_access, pte, pmd); 1994 }
2014 if (pte_file(entry)) 1995 if (pte_file(entry))
2015 return do_file_page(mm, vma, address, write_access, pte, pmd); 1996 return do_file_page(mm, vma, address,
2016 return do_swap_page(mm, vma, address, pte, pmd, entry, write_access); 1997 pte, pmd, write_access, entry);
1998 return do_swap_page(mm, vma, address,
1999 pte, pmd, write_access, entry);
2017 } 2000 }
2018 2001
2002 ptl = pte_lockptr(mm, pmd);
2003 spin_lock(ptl);
2004 if (unlikely(!pte_same(*pte, entry)))
2005 goto unlock;
2019 if (write_access) { 2006 if (write_access) {
2020 if (!pte_write(entry)) 2007 if (!pte_write(entry))
2021 return do_wp_page(mm, vma, address, pte, pmd, entry); 2008 return do_wp_page(mm, vma, address,
2009 pte, pmd, ptl, entry);
2022 entry = pte_mkdirty(entry); 2010 entry = pte_mkdirty(entry);
2023 } 2011 }
2024 entry = pte_mkyoung(entry); 2012 entry = pte_mkyoung(entry);
2025 ptep_set_access_flags(vma, address, pte, entry, write_access); 2013 if (!pte_same(old_entry, entry)) {
2026 update_mmu_cache(vma, address, entry); 2014 ptep_set_access_flags(vma, address, pte, entry, write_access);
2027 lazy_mmu_prot_update(entry); 2015 update_mmu_cache(vma, address, entry);
2028 pte_unmap(pte); 2016 lazy_mmu_prot_update(entry);
2029 spin_unlock(&mm->page_table_lock); 2017 } else {
2018 /*
2019 * This is needed only for protection faults but the arch code
2020 * is not yet telling us if this is a protection fault or not.
2021 * This still avoids useless tlb flushes for .text page faults
2022 * with threads.
2023 */
2024 if (write_access)
2025 flush_tlb_page(vma, address);
2026 }
2027unlock:
2028 pte_unmap_unlock(pte, ptl);
2030 return VM_FAULT_MINOR; 2029 return VM_FAULT_MINOR;
2031} 2030}
2032 2031
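handle_pte_fault now reads the pte with no lock held, dispatches the not-present cases immediately, and only takes pte_lockptr(mm, pmd) plus a pte_same() recheck for the present-pte cases. With the split page table lock configuration this series introduces, that lock lives in the struct page of the page-table page rather than in mm->page_table_lock; the selection is roughly of this shape (a sketch of the header side, assuming the CONFIG_SPLIT_PTLOCK_CPUS threshold; details may differ):

	#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
	/* one spinlock per page-table page, tucked into its struct page */
	#define pte_lockptr(mm, pmd)	({ (void)(mm); &pmd_page(*(pmd))->ptl; })
	#else
	/* few CPUs: keep using the single per-mm page_table_lock */
	#define pte_lockptr(mm, pmd)	({ (void)(pmd); &(mm)->page_table_lock; })
	#endif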
2033/* 2032/*
2034 * By the time we get here, we already hold the mm semaphore 2033 * By the time we get here, we already hold the mm semaphore
2035 */ 2034 */
2036int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, 2035int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2037 unsigned long address, int write_access) 2036 unsigned long address, int write_access)
2038{ 2037{
2039 pgd_t *pgd; 2038 pgd_t *pgd;
@@ -2048,100 +2047,66 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
2048 if (unlikely(is_vm_hugetlb_page(vma))) 2047 if (unlikely(is_vm_hugetlb_page(vma)))
2049 return hugetlb_fault(mm, vma, address, write_access); 2048 return hugetlb_fault(mm, vma, address, write_access);
2050 2049
2051 /*
2052 * We need the page table lock to synchronize with kswapd
2053 * and the SMP-safe atomic PTE updates.
2054 */
2055 pgd = pgd_offset(mm, address); 2050 pgd = pgd_offset(mm, address);
2056 spin_lock(&mm->page_table_lock);
2057
2058 pud = pud_alloc(mm, pgd, address); 2051 pud = pud_alloc(mm, pgd, address);
2059 if (!pud) 2052 if (!pud)
2060 goto oom; 2053 return VM_FAULT_OOM;
2061
2062 pmd = pmd_alloc(mm, pud, address); 2054 pmd = pmd_alloc(mm, pud, address);
2063 if (!pmd) 2055 if (!pmd)
2064 goto oom; 2056 return VM_FAULT_OOM;
2065
2066 pte = pte_alloc_map(mm, pmd, address); 2057 pte = pte_alloc_map(mm, pmd, address);
2067 if (!pte) 2058 if (!pte)
2068 goto oom; 2059 return VM_FAULT_OOM;
2069
2070 return handle_pte_fault(mm, vma, address, write_access, pte, pmd);
2071 2060
2072 oom: 2061 return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
2073 spin_unlock(&mm->page_table_lock);
2074 return VM_FAULT_OOM;
2075} 2062}
2076 2063
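__handle_mm_fault no longer takes mm->page_table_lock around the allocation calls: pud_alloc, pmd_alloc and pte_alloc_map become thin fast paths that only fall into the __pud_alloc/__pmd_alloc slow paths below when the upper entry is still empty, which is why those slow paths now return 0 or -ENOMEM instead of a pointer. The pud-level wrapper presumably looks something like this (a sketch of the header side, not shown in this diff):

	static inline pud_t *pud_alloc(struct mm_struct *mm, pgd_t *pgd,
					unsigned long address)
	{
		/* slow path only if the pgd entry is empty; plain offset otherwise */
		return (unlikely(pgd_none(*pgd)) && __pud_alloc(mm, pgd, address)) ?
			NULL : pud_offset(pgd, address);
	}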
2077#ifndef __PAGETABLE_PUD_FOLDED 2064#ifndef __PAGETABLE_PUD_FOLDED
2078/* 2065/*
2079 * Allocate page upper directory. 2066 * Allocate page upper directory.
2080 * 2067 * We've already handled the fast-path in-line.
2081 * We've already handled the fast-path in-line, and we own the
2082 * page table lock.
2083 */ 2068 */
2084pud_t fastcall *__pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) 2069int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
2085{ 2070{
2086 pud_t *new; 2071 pud_t *new = pud_alloc_one(mm, address);
2087
2088 spin_unlock(&mm->page_table_lock);
2089 new = pud_alloc_one(mm, address);
2090 spin_lock(&mm->page_table_lock);
2091 if (!new) 2072 if (!new)
2092 return NULL; 2073 return -ENOMEM;
2093 2074
2094 /* 2075 spin_lock(&mm->page_table_lock);
2095 * Because we dropped the lock, we should re-check the 2076 if (pgd_present(*pgd)) /* Another has populated it */
2096 * entry, as somebody else could have populated it..
2097 */
2098 if (pgd_present(*pgd)) {
2099 pud_free(new); 2077 pud_free(new);
2100 goto out; 2078 else
2101 } 2079 pgd_populate(mm, pgd, new);
2102 pgd_populate(mm, pgd, new); 2080 spin_unlock(&mm->page_table_lock);
2103 out: 2081 return 0;
2104 return pud_offset(pgd, address);
2105} 2082}
2106#endif /* __PAGETABLE_PUD_FOLDED */ 2083#endif /* __PAGETABLE_PUD_FOLDED */
2107 2084
2108#ifndef __PAGETABLE_PMD_FOLDED 2085#ifndef __PAGETABLE_PMD_FOLDED
2109/* 2086/*
2110 * Allocate page middle directory. 2087 * Allocate page middle directory.
2111 * 2088 * We've already handled the fast-path in-line.
2112 * We've already handled the fast-path in-line, and we own the
2113 * page table lock.
2114 */ 2089 */
2115pmd_t fastcall *__pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) 2090int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
2116{ 2091{
2117 pmd_t *new; 2092 pmd_t *new = pmd_alloc_one(mm, address);
2118
2119 spin_unlock(&mm->page_table_lock);
2120 new = pmd_alloc_one(mm, address);
2121 spin_lock(&mm->page_table_lock);
2122 if (!new) 2093 if (!new)
2123 return NULL; 2094 return -ENOMEM;
2124 2095
2125 /* 2096 spin_lock(&mm->page_table_lock);
2126 * Because we dropped the lock, we should re-check the
2127 * entry, as somebody else could have populated it..
2128 */
2129#ifndef __ARCH_HAS_4LEVEL_HACK 2097#ifndef __ARCH_HAS_4LEVEL_HACK
2130 if (pud_present(*pud)) { 2098 if (pud_present(*pud)) /* Another has populated it */
2131 pmd_free(new); 2099 pmd_free(new);
2132 goto out; 2100 else
2133 } 2101 pud_populate(mm, pud, new);
2134 pud_populate(mm, pud, new);
2135#else 2102#else
2136 if (pgd_present(*pud)) { 2103 if (pgd_present(*pud)) /* Another has populated it */
2137 pmd_free(new); 2104 pmd_free(new);
2138 goto out; 2105 else
2139 } 2106 pgd_populate(mm, pud, new);
2140 pgd_populate(mm, pud, new);
2141#endif /* __ARCH_HAS_4LEVEL_HACK */ 2107#endif /* __ARCH_HAS_4LEVEL_HACK */
2142 2108 spin_unlock(&mm->page_table_lock);
2143 out: 2109 return 0;
2144 return pmd_offset(pud, address);
2145} 2110}
2146#endif /* __PAGETABLE_PMD_FOLDED */ 2111#endif /* __PAGETABLE_PMD_FOLDED */
2147 2112
@@ -2206,22 +2171,6 @@ unsigned long vmalloc_to_pfn(void * vmalloc_addr)
2206 2171
2207EXPORT_SYMBOL(vmalloc_to_pfn); 2172EXPORT_SYMBOL(vmalloc_to_pfn);
2208 2173
2209/*
2210 * update_mem_hiwater
2211 * - update per process rss and vm high water data
2212 */
2213void update_mem_hiwater(struct task_struct *tsk)
2214{
2215 if (tsk->mm) {
2216 unsigned long rss = get_mm_counter(tsk->mm, rss);
2217
2218 if (tsk->mm->hiwater_rss < rss)
2219 tsk->mm->hiwater_rss = rss;
2220 if (tsk->mm->hiwater_vm < tsk->mm->total_vm)
2221 tsk->mm->hiwater_vm = tsk->mm->total_vm;
2222 }
2223}
2224
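update_mem_hiwater() goes away here; elsewhere in this series the high-water marks are presumably maintained just before rss or total_vm can drop, rather than being resampled after the fact. Judging by the code removed above, the replacement helpers would look roughly like this (a sketch, not part of this file; get_mm_rss as sketched earlier):

	#define update_hiwater_rss(mm)	do {			\
		unsigned long _rss = get_mm_rss(mm);		\
		if ((mm)->hiwater_rss < _rss)			\
			(mm)->hiwater_rss = _rss;		\
	} while (0)

	#define update_hiwater_vm(mm)	do {			\
		if ((mm)->hiwater_vm < (mm)->total_vm)		\
			(mm)->hiwater_vm = (mm)->total_vm;	\
	} while (0)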
2225#if !defined(__HAVE_ARCH_GATE_AREA) 2174#if !defined(__HAVE_ARCH_GATE_AREA)
2226 2175
2227#if defined(AT_SYSINFO_EHDR) 2176#if defined(AT_SYSINFO_EHDR)
@@ -2233,7 +2182,7 @@ static int __init gate_vma_init(void)
2233 gate_vma.vm_start = FIXADDR_USER_START; 2182 gate_vma.vm_start = FIXADDR_USER_START;
2234 gate_vma.vm_end = FIXADDR_USER_END; 2183 gate_vma.vm_end = FIXADDR_USER_END;
2235 gate_vma.vm_page_prot = PAGE_READONLY; 2184 gate_vma.vm_page_prot = PAGE_READONLY;
2236 gate_vma.vm_flags = 0; 2185 gate_vma.vm_flags = VM_RESERVED;
2237 return 0; 2186 return 0;
2238} 2187}
2239__initcall(gate_vma_init); 2188__initcall(gate_vma_init);