author		Linus Torvalds <torvalds@g5.osdl.org>	2005-11-28 17:34:23 -0500
committer	Linus Torvalds <torvalds@g5.osdl.org>	2005-11-28 17:34:23 -0500
commit		6aab341e0a28aff100a09831c5300a2994b8b986 (patch)
tree		1af3908275aa5e1b16e80efee554a9a7504c56d4
parent		458af5439fe7ae7d95ca14106844e61f0795166c (diff)
mm: re-architect the VM_UNPAGED logic
This replaces the (in my opinion horrible) VM_UNPAGED logic with very explicit support for a "remapped page range" aka VM_PFNMAP. It allows a VM area to contain an arbitrary range of page table entries that the VM never touches, and never considers to be normal pages.

Any user of "remap_pfn_range()" automatically gets this new functionality, and doesn't even have to mark the pages reserved or indeed mark them any other way. It just works. As a side effect, doing mmap() on /dev/mem works for arbitrary ranges.

Sparc update from David in the next commit.

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
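For illustration only (not part of this commit): a minimal sketch of how a driver mmap handler might hand a physical range to userspace with remap_pfn_range() after this change. The driver, example_dev_mmap() and EXAMPLE_PHYS_BASE are hypothetical names; the point is that remap_pfn_range() itself now sets VM_PFNMAP and records the base PFN in vma->vm_pgoff, so the driver no longer needs to mark the pages reserved.

/* Hypothetical example driver mmap handler (not from this patch). */
static int example_dev_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;
	/* vm_pgoff is still the file offset (in pages) at this point. */
	unsigned long pfn = (EXAMPLE_PHYS_BASE >> PAGE_SHIFT) + vma->vm_pgoff;

	/*
	 * remap_pfn_range() sets VM_IO | VM_RESERVED | VM_PFNMAP and
	 * stores the first PFN in vma->vm_pgoff, so every pte in this
	 * range is treated as a raw PFN mapping (vm_normal_page()
	 * returns NULL for it) -- no SetPageReserved() needed.
	 */
	return remap_pfn_range(vma, vma->vm_start, pfn, size,
			       vma->vm_page_prot);
}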
-rw-r--r--	arch/powerpc/kernel/vdso.c	|   6
-rw-r--r--	drivers/char/mem.c		|   2
-rw-r--r--	fs/proc/task_mmu.c		|   7
-rw-r--r--	include/linux/mm.h		|   5
-rw-r--r--	mm/fremap.c			|  22
-rw-r--r--	mm/madvise.c			|   2
-rw-r--r--	mm/memory.c			| 189
-rw-r--r--	mm/mempolicy.c			|  12
-rw-r--r--	mm/msync.c			|  12
-rw-r--r--	mm/nommu.c			|   2
-rw-r--r--	mm/rmap.c			|  14
11 files changed, 127 insertions(+), 146 deletions(-)
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index b44b36e0c293..f0c47dab0903 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -145,8 +145,7 @@ static void dump_vdso_pages(struct vm_area_struct * vma)
 			struct page *pg = virt_to_page(vdso32_kbase +
 					  i*PAGE_SIZE);
 			struct page *upg = (vma && vma->vm_mm) ?
-				follow_page(vma->vm_mm, vma->vm_start +
-						i*PAGE_SIZE, 0)
+				follow_page(vma, vma->vm_start + i*PAGE_SIZE, 0)
 				: NULL;
 			dump_one_vdso_page(pg, upg);
 		}
@@ -157,8 +156,7 @@ static void dump_vdso_pages(struct vm_area_struct * vma)
 			struct page *pg = virt_to_page(vdso64_kbase +
 					  i*PAGE_SIZE);
 			struct page *upg = (vma && vma->vm_mm) ?
-				follow_page(vma->vm_mm, vma->vm_start +
-						i*PAGE_SIZE, 0)
+				follow_page(vma, vma->vm_start + i*PAGE_SIZE, 0)
 				: NULL;
 			dump_one_vdso_page(pg, upg);
 		}
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 29c3b631445a..91dd669273e0 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -591,7 +591,7 @@ static inline size_t read_zero_pagealigned(char __user * buf, size_t size)
 
 		if (vma->vm_start > addr || (vma->vm_flags & VM_WRITE) == 0)
 			goto out_up;
-		if (vma->vm_flags & (VM_SHARED | VM_HUGETLB | VM_UNPAGED))
+		if (vma->vm_flags & (VM_SHARED | VM_HUGETLB))
 			break;
 		count = vma->vm_end - addr;
 		if (count > size)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 9ab97cef0daa..50bd5a8f0446 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -402,12 +402,11 @@ struct numa_maps {
 /*
  * Calculate numa node maps for a vma
  */
-static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma)
+static struct numa_maps *get_numa_maps(struct vm_area_struct *vma)
 {
+	int i;
 	struct page *page;
 	unsigned long vaddr;
-	struct mm_struct *mm = vma->vm_mm;
-	int i;
 	struct numa_maps *md = kmalloc(sizeof(struct numa_maps), GFP_KERNEL);
 
 	if (!md)
@@ -420,7 +419,7 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma)
 		md->node[i] =0;
 
 	for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) {
-		page = follow_page(mm, vaddr, 0);
+		page = follow_page(vma, vaddr, 0);
 		if (page) {
 			int count = page_mapcount(page);
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f0cdfd18db55..6a75a7a78bf1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -145,7 +145,7 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_GROWSDOWN	0x00000100	/* general info on the segment */
 #define VM_GROWSUP	0x00000200
 #define VM_SHM		0x00000000	/* Means nothing: delete it later */
-#define VM_UNPAGED	0x00000400	/* Pages managed without map count */
+#define VM_PFNMAP	0x00000400	/* Page-ranges managed without "struct page", just pure PFN */
 #define VM_DENYWRITE	0x00000800	/* ETXTBSY on write attempts.. */
 
 #define VM_EXECUTABLE	0x00001000
@@ -664,6 +664,7 @@ struct zap_details {
 	unsigned long truncate_count;	/* Compare vm_truncate_count */
 };
 
+struct page *vm_normal_page(struct vm_area_struct *, unsigned long, pte_t);
 unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
 		unsigned long size, struct zap_details *);
 unsigned long unmap_vmas(struct mmu_gather **tlb,
@@ -953,7 +954,7 @@ unsigned long vmalloc_to_pfn(void *addr);
 int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
 			unsigned long pfn, unsigned long size, pgprot_t);
 
-struct page *follow_page(struct mm_struct *, unsigned long address,
+struct page *follow_page(struct vm_area_struct *, unsigned long address,
 			unsigned int foll_flags);
 #define FOLL_WRITE	0x01	/* check pte is writable */
 #define FOLL_TOUCH	0x02	/* mark page accessed */
diff --git a/mm/fremap.c b/mm/fremap.c
index 007cbad9331e..f851775e09c2 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -27,24 +27,20 @@ static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 	struct page *page = NULL;
 
 	if (pte_present(pte)) {
-		unsigned long pfn = pte_pfn(pte);
-		flush_cache_page(vma, addr, pfn);
+		flush_cache_page(vma, addr, pte_pfn(pte));
 		pte = ptep_clear_flush(vma, addr, ptep);
-		if (unlikely(!pfn_valid(pfn))) {
-			print_bad_pte(vma, pte, addr);
-			goto out;
+		page = vm_normal_page(vma, addr, pte);
+		if (page) {
+			if (pte_dirty(pte))
+				set_page_dirty(page);
+			page_remove_rmap(page);
+			page_cache_release(page);
 		}
-		page = pfn_to_page(pfn);
-		if (pte_dirty(pte))
-			set_page_dirty(page);
-		page_remove_rmap(page);
-		page_cache_release(page);
 	} else {
 		if (!pte_file(pte))
 			free_swap_and_cache(pte_to_swp_entry(pte));
 		pte_clear(mm, addr, ptep);
 	}
-out:
 	return !!page;
 }
 
@@ -65,8 +61,6 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_t pte_val;
 	spinlock_t *ptl;
 
-	BUG_ON(vma->vm_flags & VM_UNPAGED);
-
 	pgd = pgd_offset(mm, addr);
 	pud = pud_alloc(mm, pgd, addr);
 	if (!pud)
@@ -122,8 +116,6 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_t pte_val;
 	spinlock_t *ptl;
 
-	BUG_ON(vma->vm_flags & VM_UNPAGED);
-
 	pgd = pgd_offset(mm, addr);
 	pud = pud_alloc(mm, pgd, addr);
 	if (!pud)
diff --git a/mm/madvise.c b/mm/madvise.c
index 328a3bcce527..2b7cf0400a21 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -126,7 +126,7 @@ static long madvise_dontneed(struct vm_area_struct * vma,
 			     unsigned long start, unsigned long end)
 {
 	*prev = vma;
-	if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_UNPAGED))
+	if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
 		return -EINVAL;
 
 	if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
diff --git a/mm/memory.c b/mm/memory.c
index d1f46f4e4c8a..b57fbc636058 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -333,9 +333,9 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
 }
 
 /*
- * This function is called to print an error when a pte in a
- * !VM_UNPAGED region is found pointing to an invalid pfn (which
- * is an error.
+ * This function is called to print an error when a bad pte
+ * is found. For example, we might have a PFN-mapped pte in
+ * a region that doesn't allow it.
  *
  * The calling function must still handle the error.
  */
@@ -350,19 +350,56 @@ void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
 }
 
 /*
- * page_is_anon applies strict checks for an anonymous page belonging to
- * this vma at this address. It is used on VM_UNPAGED vmas, which are
- * usually populated with shared originals (which must not be counted),
- * but occasionally contain private COWed copies (when !VM_SHARED, or
- * perhaps via ptrace when VM_SHARED). An mmap of /dev/mem might window
- * free pages, pages from other processes, or from other parts of this:
- * it's tricky, but try not to be deceived by foreign anonymous pages.
+ * This function gets the "struct page" associated with a pte.
+ *
+ * NOTE! Some mappings do not have "struct pages". A raw PFN mapping
+ * will have each page table entry just pointing to a raw page frame
+ * number, and as far as the VM layer is concerned, those do not have
+ * pages associated with them - even if the PFN might point to memory
+ * that otherwise is perfectly fine and has a "struct page".
+ *
+ * The way we recognize those mappings is through the rules set up
+ * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set,
+ * and the vm_pgoff will point to the first PFN mapped: thus every
+ * page that is a raw mapping will always honor the rule
+ *
+ *	pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
+ *
+ * and if that isn't true, the page has been COW'ed (in which case it
+ * _does_ have a "struct page" associated with it even if it is in a
+ * VM_PFNMAP range).
  */
-static inline int page_is_anon(struct page *page,
-			struct vm_area_struct *vma, unsigned long addr)
+struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
 {
-	return page && PageAnon(page) && page_mapped(page) &&
-		page_address_in_vma(page, vma) == addr;
+	unsigned long pfn = pte_pfn(pte);
+
+	if (vma->vm_flags & VM_PFNMAP) {
+		unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
+		if (pfn == vma->vm_pgoff + off)
+			return NULL;
+	}
+
+	/*
+	 * Add some anal sanity checks for now. Eventually,
+	 * we should just do "return pfn_to_page(pfn)", but
+	 * in the meantime we check that we get a valid pfn,
+	 * and that the resulting page looks ok.
+	 *
+	 * Remove this test eventually!
+	 */
+	if (unlikely(!pfn_valid(pfn))) {
+		print_bad_pte(vma, pte, addr);
+		return NULL;
+	}
+
+	/*
+	 * NOTE! We still have PageReserved() pages in the page
+	 * tables.
+	 *
+	 * The PAGE_ZERO() pages and various VDSO mappings can
+	 * cause them to exist.
+	 */
+	return pfn_to_page(pfn);
 }
 
 /*
@@ -379,7 +416,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	unsigned long vm_flags = vma->vm_flags;
 	pte_t pte = *src_pte;
 	struct page *page;
-	unsigned long pfn;
 
 	/* pte contains position in swap or file, so copy. */
 	if (unlikely(!pte_present(pte))) {
@@ -397,22 +433,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		goto out_set_pte;
 	}
 
-	pfn = pte_pfn(pte);
-	page = pfn_valid(pfn)? pfn_to_page(pfn): NULL;
-
-	if (unlikely(vm_flags & VM_UNPAGED))
-		if (!page_is_anon(page, vma, addr))
-			goto out_set_pte;
-
-	/*
-	 * If the pte points outside of valid memory but
-	 * the region is not VM_UNPAGED, we have a problem.
-	 */
-	if (unlikely(!page)) {
-		print_bad_pte(vma, pte, addr);
-		goto out_set_pte; /* try to do something sane */
-	}
-
 	/*
 	 * If it's a COW mapping, write protect it both
 	 * in the parent and the child
@@ -429,9 +449,13 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	if (vm_flags & VM_SHARED)
 		pte = pte_mkclean(pte);
 	pte = pte_mkold(pte);
-	get_page(page);
-	page_dup_rmap(page);
-	rss[!!PageAnon(page)]++;
+
+	page = vm_normal_page(vma, addr, pte);
+	if (page) {
+		get_page(page);
+		page_dup_rmap(page);
+		rss[!!PageAnon(page)]++;
+	}
 
 out_set_pte:
 	set_pte_at(dst_mm, addr, dst_pte, pte);
@@ -543,7 +567,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	 * readonly mappings. The tradeoff is that copy_page_range is more
 	 * efficient than faulting.
 	 */
-	if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_UNPAGED))) {
+	if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP))) {
 		if (!vma->anon_vma)
 			return 0;
 	}
@@ -584,19 +608,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 		}
 		if (pte_present(ptent)) {
 			struct page *page;
-			unsigned long pfn;
 
 			(*zap_work) -= PAGE_SIZE;
 
-			pfn = pte_pfn(ptent);
-			page = pfn_valid(pfn)? pfn_to_page(pfn): NULL;
-
-			if (unlikely(vma->vm_flags & VM_UNPAGED)) {
-				if (!page_is_anon(page, vma, addr))
-					page = NULL;
-			} else if (unlikely(!page))
-				print_bad_pte(vma, ptent, addr);
-
+			page = vm_normal_page(vma, addr, ptent);
 			if (unlikely(details) && page) {
 				/*
 				 * unmap_shared_mapping_pages() wants to
@@ -852,7 +867,7 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
 /*
  * Do a quick page-table lookup for a single page.
  */
-struct page *follow_page(struct mm_struct *mm, unsigned long address,
+struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 			unsigned int flags)
 {
 	pgd_t *pgd;
@@ -860,8 +875,8 @@ struct page *follow_page(struct mm_struct *mm, unsigned long address,
 	pmd_t *pmd;
 	pte_t *ptep, pte;
 	spinlock_t *ptl;
-	unsigned long pfn;
 	struct page *page;
+	struct mm_struct *mm = vma->vm_mm;
 
 	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
 	if (!IS_ERR(page)) {
@@ -897,11 +912,10 @@ struct page *follow_page(struct mm_struct *mm, unsigned long address,
 		goto unlock;
 	if ((flags & FOLL_WRITE) && !pte_write(pte))
 		goto unlock;
-	pfn = pte_pfn(pte);
-	if (!pfn_valid(pfn))
+	page = vm_normal_page(vma, address, pte);
+	if (unlikely(!page))
 		goto unlock;
 
-	page = pfn_to_page(pfn);
 	if (flags & FOLL_GET)
 		get_page(page);
 	if (flags & FOLL_TOUCH) {
@@ -974,8 +988,10 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 				return i ? : -EFAULT;
 			}
 			if (pages) {
-				pages[i] = pte_page(*pte);
-				get_page(pages[i]);
+				struct page *page = vm_normal_page(vma, start, *pte);
+				pages[i] = page;
+				if (page)
+					get_page(page);
 			}
 			pte_unmap(pte);
 			if (vmas)
@@ -1010,7 +1026,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			foll_flags |= FOLL_WRITE;
 
 		cond_resched();
-		while (!(page = follow_page(mm, start, foll_flags))) {
+		while (!(page = follow_page(vma, start, foll_flags))) {
 			int ret;
 			ret = __handle_mm_fault(mm, vma, start,
 					foll_flags & FOLL_WRITE);
@@ -1214,11 +1230,12 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 	 * in 2.6 the LRU scan won't even find its pages, so this
 	 * flag means no more than count its pages in reserved_vm,
 	 * and omit it from core dump, even when VM_IO turned off.
-	 * VM_UNPAGED tells the core MM not to "manage" these pages
-	 * (e.g. refcount, mapcount, try to swap them out): in
-	 * particular, zap_pte_range does not try to free them.
+	 * VM_PFNMAP tells the core MM that the base pages are just
+	 * raw PFN mappings, and do not have a "struct page" associated
+	 * with them.
 	 */
-	vma->vm_flags |= VM_IO | VM_RESERVED | VM_UNPAGED;
+	vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
+	vma->vm_pgoff = pfn;
 
 	BUG_ON(addr >= end);
 	pfn -= addr >> PAGE_SHIFT;
@@ -1273,6 +1290,26 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 	return pte;
 }
 
+static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va)
+{
+	/*
+	 * If the source page was a PFN mapping, we don't have
+	 * a "struct page" for it. We do a best-effort copy by
+	 * just copying from the original user address. If that
+	 * fails, we just zero-fill it. Live with it.
+	 */
+	if (unlikely(!src)) {
+		void *kaddr = kmap_atomic(dst, KM_USER0);
+		unsigned long left = __copy_from_user_inatomic(kaddr, (void __user *)va, PAGE_SIZE);
+		if (left)
+			memset(kaddr, 0, PAGE_SIZE);
+		kunmap_atomic(kaddr, KM_USER0);
+		return;
+
+	}
+	copy_user_highpage(dst, src, va);
+}
+
 /*
  * This routine handles present pages, when users try to write
  * to a shared page. It is done by copying the page to a new address
@@ -1296,28 +1333,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		spinlock_t *ptl, pte_t orig_pte)
 {
 	struct page *old_page, *src_page, *new_page;
-	unsigned long pfn = pte_pfn(orig_pte);
 	pte_t entry;
 	int ret = VM_FAULT_MINOR;
 
-	if (unlikely(!pfn_valid(pfn))) {
-		/*
-		 * Page table corrupted: show pte and kill process.
-		 * Or it's an attempt to COW an out-of-map VM_UNPAGED
-		 * entry, which copy_user_highpage does not support.
-		 */
-		print_bad_pte(vma, orig_pte, address);
-		ret = VM_FAULT_OOM;
-		goto unlock;
-	}
-	old_page = pfn_to_page(pfn);
+	old_page = vm_normal_page(vma, address, orig_pte);
 	src_page = old_page;
-
-	if (unlikely(vma->vm_flags & VM_UNPAGED))
-		if (!page_is_anon(old_page, vma, address)) {
-			old_page = NULL;
-			goto gotten;
-		}
+	if (!old_page)
+		goto gotten;
 
 	if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
 		int reuse = can_share_swap_page(old_page);
@@ -1351,7 +1373,7 @@ gotten:
 		new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 		if (!new_page)
 			goto oom;
-		copy_user_highpage(new_page, src_page, address);
+		cow_user_page(new_page, src_page, address);
 	}
 
 	/*
@@ -1812,16 +1834,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	spinlock_t *ptl;
 	pte_t entry;
 
-	/*
-	 * A VM_UNPAGED vma will normally be filled with present ptes
-	 * by remap_pfn_range, and never arrive here; but it might have
-	 * holes, or if !VM_DONTEXPAND, mremap might have expanded it.
-	 * It's weird enough handling anon pages in unpaged vmas, we do
-	 * not want to worry about ZERO_PAGEs too (it may or may not
-	 * matter if their counts wrap): just give them anon pages.
-	 */
-
-	if (write_access || (vma->vm_flags & VM_UNPAGED)) {
+	if (write_access) {
 		/* Allocate our own private page. */
 		pte_unmap(page_table);
 
@@ -1896,8 +1909,6 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	int anon = 0;
 
 	pte_unmap(page_table);
-	BUG_ON(vma->vm_flags & VM_UNPAGED);
-
 	if (vma->vm_file) {
 		mapping = vma->vm_file->f_mapping;
 		sequence = mapping->truncate_count;
@@ -1930,7 +1941,7 @@ retry:
 		page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 		if (!page)
 			goto oom;
-		copy_user_highpage(page, new_page, address);
+		cow_user_page(page, new_page, address);
 		page_cache_release(new_page);
 		new_page = page;
 		anon = 1;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 5609a31bdf22..bec88c81244e 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -189,17 +189,15 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 
 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	do {
-		unsigned long pfn;
+		struct page *page;
 		unsigned int nid;
 
 		if (!pte_present(*pte))
 			continue;
-		pfn = pte_pfn(*pte);
-		if (!pfn_valid(pfn)) {
-			print_bad_pte(vma, *pte, addr);
+		page = vm_normal_page(vma, addr, *pte);
+		if (!page)
 			continue;
-		}
-		nid = pfn_to_nid(pfn);
+		nid = page_to_nid(page);
 		if (!node_isset(nid, *nodes))
 			break;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
@@ -269,8 +267,6 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 	first = find_vma(mm, start);
 	if (!first)
 		return ERR_PTR(-EFAULT);
-	if (first->vm_flags & VM_UNPAGED)
-		return ERR_PTR(-EACCES);
 	prev = NULL;
 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 		if (!vma->vm_next && vma->vm_end < end)
diff --git a/mm/msync.c b/mm/msync.c
index b3f4caf3010b..1b5b6f662dcf 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -27,7 +27,6 @@ static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 again:
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	do {
-		unsigned long pfn;
 		struct page *page;
 
 		if (progress >= 64) {
@@ -40,13 +39,9 @@ again:
 			continue;
 		if (!pte_maybe_dirty(*pte))
 			continue;
-		pfn = pte_pfn(*pte);
-		if (unlikely(!pfn_valid(pfn))) {
-			print_bad_pte(vma, *pte, addr);
+		page = vm_normal_page(vma, addr, *pte);
+		if (!page)
 			continue;
-		}
-		page = pfn_to_page(pfn);
-
 		if (ptep_clear_flush_dirty(vma, addr, pte) ||
 		    page_test_and_clear_dirty(page))
 			set_page_dirty(page);
@@ -97,9 +92,8 @@ static void msync_page_range(struct vm_area_struct *vma,
 	/* For hugepages we can't go walking the page table normally,
 	 * but that's ok, hugetlbfs is memory based, so we don't need
 	 * to do anything more on an msync().
-	 * Can't do anything with VM_UNPAGED regions either.
 	 */
-	if (vma->vm_flags & (VM_HUGETLB|VM_UNPAGED))
+	if (vma->vm_flags & VM_HUGETLB)
 		return;
 
 	BUG_ON(addr >= end);
diff --git a/mm/nommu.c b/mm/nommu.c
index 6deb6ab3d6ad..c1196812876b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1045,7 +1045,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
 
 EXPORT_SYMBOL(find_vma);
 
-struct page *follow_page(struct mm_struct *mm, unsigned long address,
+struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 			unsigned int foll_flags)
 {
 	return NULL;
diff --git a/mm/rmap.c b/mm/rmap.c
index 2e034a0b89ab..6389cda02a20 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -226,8 +226,6 @@ vma_address(struct page *page, struct vm_area_struct *vma)
 /*
  * At what user virtual address is page expected in vma? checking that the
  * page matches the vma: currently only used on anon pages, by unuse_vma;
- * and by extraordinary checks on anon pages in VM_UNPAGED vmas, taking
- * care that an mmap of /dev/mem might window free and foreign pages.
  */
 unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
 {
@@ -614,7 +612,6 @@ static void try_to_unmap_cluster(unsigned long cursor,
 	struct page *page;
 	unsigned long address;
 	unsigned long end;
-	unsigned long pfn;
 
 	address = (vma->vm_start + cursor) & CLUSTER_MASK;
 	end = address + CLUSTER_SIZE;
@@ -643,15 +640,8 @@ static void try_to_unmap_cluster(unsigned long cursor,
 	for (; address < end; pte++, address += PAGE_SIZE) {
 		if (!pte_present(*pte))
 			continue;
-
-		pfn = pte_pfn(*pte);
-		if (unlikely(!pfn_valid(pfn))) {
-			print_bad_pte(vma, *pte, address);
-			continue;
-		}
-
-		page = pfn_to_page(pfn);
-		BUG_ON(PageAnon(page));
+		page = vm_normal_page(vma, address, *pte);
+		BUG_ON(!page || PageAnon(page));
 
 		if (ptep_clear_flush_young(vma, address, pte))
 			continue;