author		Linus Torvalds <torvalds@g5.osdl.org>	2005-11-28 17:34:23 -0500
committer	Linus Torvalds <torvalds@g5.osdl.org>	2005-11-28 17:34:23 -0500
commit		6aab341e0a28aff100a09831c5300a2994b8b986
tree		1af3908275aa5e1b16e80efee554a9a7504c56d4
parent		458af5439fe7ae7d95ca14106844e61f0795166c
mm: re-architect the VM_UNPAGED logic
This replaces the (in my opinion horrible) VM_UNPAGED logic with very
explicit support for a "remapped page range" aka VM_PFNMAP. It allows a
VM area to contain an arbitrary range of page table entries that the VM
never touches, and never considers to be normal pages.
Any user of "remap_pfn_range()" automatically gets this new
functionality, and doesn't even have to mark the pages reserved or
indeed mark them any other way. It just works. As a side effect, doing
mmap() on /dev/mem works for arbitrary ranges.
Sparc update from David in the next commit.
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
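
For illustration, a hypothetical character-driver mmap handler is sketched below. It is not part of this patch, but any handler shaped like this now picks up the new behaviour automatically from remap_pfn_range(), with no PageReserved games; the device name and function are invented for the example.

/*
 * Hypothetical example, not from this patch: a driver's .mmap handler.
 * After this change, the single remap_pfn_range() call below is enough:
 * the vma gets VM_IO | VM_RESERVED | VM_PFNMAP and vm_pgoff is set to
 * the first mapped PFN, so the core VM skips these ptes entirely.
 */
#include <linux/fs.h>
#include <linux/mm.h>

static int exampledev_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;

	/* map the physical range selected by the mmap() file offset */
	if (remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
			    size, vma->vm_page_prot))
		return -EAGAIN;
	return 0;
}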
 arch/powerpc/kernel/vdso.c |   6
 drivers/char/mem.c         |   2
 fs/proc/task_mmu.c         |   7
 include/linux/mm.h         |   5
 mm/fremap.c                |  22
 mm/madvise.c               |   2
 mm/memory.c                | 189
 mm/mempolicy.c             |  12
 mm/msync.c                 |  12
 mm/nommu.c                 |   2
 mm/rmap.c                  |  14
 11 files changed, 127 insertions(+), 146 deletions(-)
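
Before the per-file hunks, here is a condensed restatement of the rule the patch keys on, simplified from the new vm_normal_page() added in mm/memory.c below; the helper name and the omission of the pfn_valid() sanity check are for illustration only.

/*
 * Simplified sketch: in a VM_PFNMAP vma set up by remap_pfn_range(),
 * vm_pgoff holds the first mapped PFN, so a pte whose pfn still
 * matches that linear layout has no "struct page" for the VM to
 * manage; anything else (e.g. a private COW copy) does.  The real
 * vm_normal_page() below also sanity-checks pfn_valid().
 */
static struct page *normal_page_sketch(struct vm_area_struct *vma,
				       unsigned long addr, pte_t pte)
{
	unsigned long pfn = pte_pfn(pte);

	if (vma->vm_flags & VM_PFNMAP) {
		unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
		if (pfn == vma->vm_pgoff + off)
			return NULL;		/* raw PFN mapping */
	}
	return pfn_to_page(pfn);		/* normal page */
}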
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index b44b36e0c293..f0c47dab0903 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -145,8 +145,7 @@ static void dump_vdso_pages(struct vm_area_struct * vma)
 		struct page *pg = virt_to_page(vdso32_kbase +
 					       i*PAGE_SIZE);
 		struct page *upg = (vma && vma->vm_mm) ?
-			follow_page(vma->vm_mm, vma->vm_start +
-					i*PAGE_SIZE, 0)
+			follow_page(vma, vma->vm_start + i*PAGE_SIZE, 0)
 			: NULL;
 		dump_one_vdso_page(pg, upg);
 	}
@@ -157,8 +156,7 @@ static void dump_vdso_pages(struct vm_area_struct * vma)
 		struct page *pg = virt_to_page(vdso64_kbase +
 					       i*PAGE_SIZE);
 		struct page *upg = (vma && vma->vm_mm) ?
-			follow_page(vma->vm_mm, vma->vm_start +
-					i*PAGE_SIZE, 0)
+			follow_page(vma, vma->vm_start + i*PAGE_SIZE, 0)
 			: NULL;
 		dump_one_vdso_page(pg, upg);
 	}
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 29c3b631445a..91dd669273e0 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -591,7 +591,7 @@ static inline size_t read_zero_pagealigned(char __user * buf, size_t size)
 
 		if (vma->vm_start > addr || (vma->vm_flags & VM_WRITE) == 0)
 			goto out_up;
-		if (vma->vm_flags & (VM_SHARED | VM_HUGETLB | VM_UNPAGED))
+		if (vma->vm_flags & (VM_SHARED | VM_HUGETLB))
 			break;
 		count = vma->vm_end - addr;
 		if (count > size)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 9ab97cef0daa..50bd5a8f0446 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -402,12 +402,11 @@ struct numa_maps {
 /*
  * Calculate numa node maps for a vma
  */
-static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma)
+static struct numa_maps *get_numa_maps(struct vm_area_struct *vma)
 {
+	int i;
 	struct page *page;
 	unsigned long vaddr;
-	struct mm_struct *mm = vma->vm_mm;
-	int i;
 	struct numa_maps *md = kmalloc(sizeof(struct numa_maps), GFP_KERNEL);
 
 	if (!md)
@@ -420,7 +419,7 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma)
 		md->node[i] =0;
 
  	for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) {
-		page = follow_page(mm, vaddr, 0);
+		page = follow_page(vma, vaddr, 0);
 		if (page) {
 			int count = page_mapcount(page);
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f0cdfd18db55..6a75a7a78bf1 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -145,7 +145,7 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_GROWSDOWN	0x00000100	/* general info on the segment */
 #define VM_GROWSUP	0x00000200
 #define VM_SHM		0x00000000	/* Means nothing: delete it later */
-#define VM_UNPAGED	0x00000400	/* Pages managed without map count */
+#define VM_PFNMAP	0x00000400	/* Page-ranges managed without "struct page", just pure PFN */
 #define VM_DENYWRITE	0x00000800	/* ETXTBSY on write attempts.. */
 
 #define VM_EXECUTABLE	0x00001000
@@ -664,6 +664,7 @@ struct zap_details {
 	unsigned long truncate_count;		/* Compare vm_truncate_count */
 };
 
+struct page *vm_normal_page(struct vm_area_struct *, unsigned long, pte_t);
 unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
 		unsigned long size, struct zap_details *);
 unsigned long unmap_vmas(struct mmu_gather **tlb,
@@ -953,7 +954,7 @@ unsigned long vmalloc_to_pfn(void *addr);
 int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
 		unsigned long pfn, unsigned long size, pgprot_t);
 
-struct page *follow_page(struct mm_struct *, unsigned long address,
+struct page *follow_page(struct vm_area_struct *, unsigned long address,
 			unsigned int foll_flags);
 #define FOLL_WRITE	0x01	/* check pte is writable */
 #define FOLL_TOUCH	0x02	/* mark page accessed */
diff --git a/mm/fremap.c b/mm/fremap.c
index 007cbad9331e..f851775e09c2 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -27,24 +27,20 @@ static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 	struct page *page = NULL;
 
 	if (pte_present(pte)) {
-		unsigned long pfn = pte_pfn(pte);
-		flush_cache_page(vma, addr, pfn);
+		flush_cache_page(vma, addr, pte_pfn(pte));
 		pte = ptep_clear_flush(vma, addr, ptep);
-		if (unlikely(!pfn_valid(pfn))) {
-			print_bad_pte(vma, pte, addr);
-			goto out;
+		page = vm_normal_page(vma, addr, pte);
+		if (page) {
+			if (pte_dirty(pte))
+				set_page_dirty(page);
+			page_remove_rmap(page);
+			page_cache_release(page);
 		}
-		page = pfn_to_page(pfn);
-		if (pte_dirty(pte))
-			set_page_dirty(page);
-		page_remove_rmap(page);
-		page_cache_release(page);
 	} else {
 		if (!pte_file(pte))
 			free_swap_and_cache(pte_to_swp_entry(pte));
 		pte_clear(mm, addr, ptep);
 	}
-out:
 	return !!page;
 }
 
@@ -65,8 +61,6 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_t pte_val;
 	spinlock_t *ptl;
 
-	BUG_ON(vma->vm_flags & VM_UNPAGED);
-
 	pgd = pgd_offset(mm, addr);
 	pud = pud_alloc(mm, pgd, addr);
 	if (!pud)
@@ -122,8 +116,6 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_t pte_val;
 	spinlock_t *ptl;
 
-	BUG_ON(vma->vm_flags & VM_UNPAGED);
-
 	pgd = pgd_offset(mm, addr);
 	pud = pud_alloc(mm, pgd, addr);
 	if (!pud)
diff --git a/mm/madvise.c b/mm/madvise.c
index 328a3bcce527..2b7cf0400a21 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -126,7 +126,7 @@ static long madvise_dontneed(struct vm_area_struct * vma,
 			     unsigned long start, unsigned long end)
 {
 	*prev = vma;
-	if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_UNPAGED))
+	if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
 		return -EINVAL;
 
 	if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
diff --git a/mm/memory.c b/mm/memory.c
index d1f46f4e4c8a..b57fbc636058 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -333,9 +333,9 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
 }
 
 /*
- * This function is called to print an error when a pte in a
- * !VM_UNPAGED region is found pointing to an invalid pfn (which
- * is an error.
+ * This function is called to print an error when a bad pte
+ * is found. For example, we might have a PFN-mapped pte in
+ * a region that doesn't allow it.
 *
 * The calling function must still handle the error.
 */
@@ -350,19 +350,56 @@ void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
 }
 
 /*
- * page_is_anon applies strict checks for an anonymous page belonging to
- * this vma at this address.  It is used on VM_UNPAGED vmas, which are
- * usually populated with shared originals (which must not be counted),
- * but occasionally contain private COWed copies (when !VM_SHARED, or
- * perhaps via ptrace when VM_SHARED).  An mmap of /dev/mem might window
- * free pages, pages from other processes, or from other parts of this:
- * it's tricky, but try not to be deceived by foreign anonymous pages.
+ * This function gets the "struct page" associated with a pte.
+ *
+ * NOTE! Some mappings do not have "struct pages". A raw PFN mapping
+ * will have each page table entry just pointing to a raw page frame
+ * number, and as far as the VM layer is concerned, those do not have
+ * pages associated with them - even if the PFN might point to memory
+ * that otherwise is perfectly fine and has a "struct page".
+ *
+ * The way we recognize those mappings is through the rules set up
+ * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set,
+ * and the vm_pgoff will point to the first PFN mapped: thus every
+ * page that is a raw mapping will always honor the rule
+ *
+ *	pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
+ *
+ * and if that isn't true, the page has been COW'ed (in which case it
+ * _does_ have a "struct page" associated with it even if it is in a
+ * VM_PFNMAP range).
 */
-static inline int page_is_anon(struct page *page,
-			struct vm_area_struct *vma, unsigned long addr)
+struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
 {
-	return page && PageAnon(page) && page_mapped(page) &&
-		page_address_in_vma(page, vma) == addr;
+	unsigned long pfn = pte_pfn(pte);
+
+	if (vma->vm_flags & VM_PFNMAP) {
+		unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
+		if (pfn == vma->vm_pgoff + off)
+			return NULL;
+	}
+
+	/*
+	 * Add some anal sanity checks for now. Eventually,
+	 * we should just do "return pfn_to_page(pfn)", but
+	 * in the meantime we check that we get a valid pfn,
+	 * and that the resulting page looks ok.
+	 *
+	 * Remove this test eventually!
+	 */
+	if (unlikely(!pfn_valid(pfn))) {
+		print_bad_pte(vma, pte, addr);
+		return NULL;
+	}
+
+	/*
+	 * NOTE! We still have PageReserved() pages in the page
+	 * tables.
+	 *
+	 * The PAGE_ZERO() pages and various VDSO mappings can
+	 * cause them to exist.
+	 */
+	return pfn_to_page(pfn);
 }
 
 /*
@@ -379,7 +416,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	unsigned long vm_flags = vma->vm_flags;
 	pte_t pte = *src_pte;
 	struct page *page;
-	unsigned long pfn;
 
 	/* pte contains position in swap or file, so copy. */
 	if (unlikely(!pte_present(pte))) {
@@ -397,22 +433,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		goto out_set_pte;
 	}
 
-	pfn = pte_pfn(pte);
-	page = pfn_valid(pfn)? pfn_to_page(pfn): NULL;
-
-	if (unlikely(vm_flags & VM_UNPAGED))
-		if (!page_is_anon(page, vma, addr))
-			goto out_set_pte;
-
-	/*
-	 * If the pte points outside of valid memory but
-	 * the region is not VM_UNPAGED, we have a problem.
-	 */
-	if (unlikely(!page)) {
-		print_bad_pte(vma, pte, addr);
-		goto out_set_pte; /* try to do something sane */
-	}
-
 	/*
 	 * If it's a COW mapping, write protect it both
 	 * in the parent and the child
@@ -429,9 +449,13 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	if (vm_flags & VM_SHARED)
 		pte = pte_mkclean(pte);
 	pte = pte_mkold(pte);
-	get_page(page);
-	page_dup_rmap(page);
-	rss[!!PageAnon(page)]++;
+
+	page = vm_normal_page(vma, addr, pte);
+	if (page) {
+		get_page(page);
+		page_dup_rmap(page);
+		rss[!!PageAnon(page)]++;
+	}
 
 out_set_pte:
 	set_pte_at(dst_mm, addr, dst_pte, pte);
@@ -543,7 +567,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	 * readonly mappings. The tradeoff is that copy_page_range is more
 	 * efficient than faulting.
 	 */
-	if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_UNPAGED))) {
+	if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP))) {
 		if (!vma->anon_vma)
 			return 0;
 	}
@@ -584,19 +608,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 		}
 		if (pte_present(ptent)) {
 			struct page *page;
-			unsigned long pfn;
 
 			(*zap_work) -= PAGE_SIZE;
 
-			pfn = pte_pfn(ptent);
-			page = pfn_valid(pfn)? pfn_to_page(pfn): NULL;
-
-			if (unlikely(vma->vm_flags & VM_UNPAGED)) {
-				if (!page_is_anon(page, vma, addr))
-					page = NULL;
-			} else if (unlikely(!page))
-				print_bad_pte(vma, ptent, addr);
-
+			page = vm_normal_page(vma, addr, ptent);
 			if (unlikely(details) && page) {
 				/*
 				 * unmap_shared_mapping_pages() wants to
@@ -852,7 +867,7 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
 /*
  * Do a quick page-table lookup for a single page.
 */
-struct page *follow_page(struct mm_struct *mm, unsigned long address,
+struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 			unsigned int flags)
 {
 	pgd_t *pgd;
@@ -860,8 +875,8 @@ struct page *follow_page(struct mm_struct *mm, unsigned long address,
 	pmd_t *pmd;
 	pte_t *ptep, pte;
 	spinlock_t *ptl;
-	unsigned long pfn;
 	struct page *page;
+	struct mm_struct *mm = vma->vm_mm;
 
 	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
 	if (!IS_ERR(page)) {
@@ -897,11 +912,10 @@ struct page *follow_page(struct mm_struct *mm, unsigned long address,
 		goto unlock;
 	if ((flags & FOLL_WRITE) && !pte_write(pte))
 		goto unlock;
-	pfn = pte_pfn(pte);
-	if (!pfn_valid(pfn))
+	page = vm_normal_page(vma, address, pte);
+	if (unlikely(!page))
 		goto unlock;
 
-	page = pfn_to_page(pfn);
 	if (flags & FOLL_GET)
 		get_page(page);
 	if (flags & FOLL_TOUCH) {
@@ -974,8 +988,10 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 				return i ? : -EFAULT;
 			}
 			if (pages) {
-				pages[i] = pte_page(*pte);
-				get_page(pages[i]);
+				struct page *page = vm_normal_page(vma, start, *pte);
+				pages[i] = page;
+				if (page)
+					get_page(page);
 			}
 			pte_unmap(pte);
 			if (vmas)
@@ -1010,7 +1026,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			foll_flags |= FOLL_WRITE;
 
 		cond_resched();
-		while (!(page = follow_page(mm, start, foll_flags))) {
+		while (!(page = follow_page(vma, start, foll_flags))) {
 			int ret;
 			ret = __handle_mm_fault(mm, vma, start,
 					foll_flags & FOLL_WRITE);
@@ -1214,11 +1230,12 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 	 * in 2.6 the LRU scan won't even find its pages, so this
 	 * flag means no more than count its pages in reserved_vm,
 	 * and omit it from core dump, even when VM_IO turned off.
-	 * VM_UNPAGED tells the core MM not to "manage" these pages
-	 * (e.g. refcount, mapcount, try to swap them out): in
-	 * particular, zap_pte_range does not try to free them.
+	 * VM_PFNMAP tells the core MM that the base pages are just
+	 * raw PFN mappings, and do not have a "struct page" associated
+	 * with them.
 	 */
-	vma->vm_flags |= VM_IO | VM_RESERVED | VM_UNPAGED;
+	vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
+	vma->vm_pgoff = pfn;
 
 	BUG_ON(addr >= end);
 	pfn -= addr >> PAGE_SHIFT;
@@ -1273,6 +1290,26 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 	return pte;
 }
 
+static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va)
+{
+	/*
+	 * If the source page was a PFN mapping, we don't have
+	 * a "struct page" for it. We do a best-effort copy by
+	 * just copying from the original user address. If that
+	 * fails, we just zero-fill it. Live with it.
+	 */
+	if (unlikely(!src)) {
+		void *kaddr = kmap_atomic(dst, KM_USER0);
+		unsigned long left = __copy_from_user_inatomic(kaddr, (void __user *)va, PAGE_SIZE);
+		if (left)
+			memset(kaddr, 0, PAGE_SIZE);
+		kunmap_atomic(kaddr, KM_USER0);
+		return;
+
+	}
+	copy_user_highpage(dst, src, va);
+}
+
 /*
  * This routine handles present pages, when users try to write
  * to a shared page. It is done by copying the page to a new address
@@ -1296,28 +1333,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		spinlock_t *ptl, pte_t orig_pte)
 {
 	struct page *old_page, *src_page, *new_page;
-	unsigned long pfn = pte_pfn(orig_pte);
 	pte_t entry;
 	int ret = VM_FAULT_MINOR;
 
-	if (unlikely(!pfn_valid(pfn))) {
-		/*
-		 * Page table corrupted: show pte and kill process.
-		 * Or it's an attempt to COW an out-of-map VM_UNPAGED
-		 * entry, which copy_user_highpage does not support.
-		 */
-		print_bad_pte(vma, orig_pte, address);
-		ret = VM_FAULT_OOM;
-		goto unlock;
-	}
-	old_page = pfn_to_page(pfn);
+	old_page = vm_normal_page(vma, address, orig_pte);
 	src_page = old_page;
-
-	if (unlikely(vma->vm_flags & VM_UNPAGED))
-		if (!page_is_anon(old_page, vma, address)) {
-			old_page = NULL;
-			goto gotten;
-		}
+	if (!old_page)
+		goto gotten;
 
 	if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
 		int reuse = can_share_swap_page(old_page);
@@ -1351,7 +1373,7 @@ gotten:
 		new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 		if (!new_page)
 			goto oom;
-		copy_user_highpage(new_page, src_page, address);
+		cow_user_page(new_page, src_page, address);
 	}
 
 	/*
@@ -1812,16 +1834,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	spinlock_t *ptl;
 	pte_t entry;
 
-	/*
-	 * A VM_UNPAGED vma will normally be filled with present ptes
-	 * by remap_pfn_range, and never arrive here; but it might have
-	 * holes, or if !VM_DONTEXPAND, mremap might have expanded it.
-	 * It's weird enough handling anon pages in unpaged vmas, we do
-	 * not want to worry about ZERO_PAGEs too (it may or may not
-	 * matter if their counts wrap): just give them anon pages.
-	 */
-
-	if (write_access || (vma->vm_flags & VM_UNPAGED)) {
+	if (write_access) {
 		/* Allocate our own private page. */
 		pte_unmap(page_table);
 
@@ -1896,8 +1909,6 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	int anon = 0;
 
 	pte_unmap(page_table);
-	BUG_ON(vma->vm_flags & VM_UNPAGED);
-
 	if (vma->vm_file) {
 		mapping = vma->vm_file->f_mapping;
 		sequence = mapping->truncate_count;
@@ -1930,7 +1941,7 @@ retry:
 		page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 		if (!page)
 			goto oom;
-		copy_user_highpage(page, new_page, address);
+		cow_user_page(page, new_page, address);
 		page_cache_release(new_page);
 		new_page = page;
 		anon = 1;
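
The remaining files below (mempolicy, msync, rmap and friends) all convert their pte walkers to the same shape; an illustrative, condensed loop, not lifted verbatim from any single file and with an invented function name, looks like this:

/*
 * Illustrative only: the common caller pattern adopted below.
 * Raw PFN mappings make vm_normal_page() return NULL and are
 * simply skipped instead of being refcounted or dirtied.
 */
static void walk_ptes_sketch(struct vm_area_struct *vma, pte_t *pte,
			     unsigned long addr, unsigned long end)
{
	do {
		struct page *page;

		if (!pte_present(*pte))
			continue;
		page = vm_normal_page(vma, addr, *pte);
		if (!page)
			continue;	/* nothing for the VM to manage */
		/* ... operate on the normal page ... */
	} while (pte++, addr += PAGE_SIZE, addr != end);
}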
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 5609a31bdf22..bec88c81244e 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -189,17 +189,15 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 
 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	do {
-		unsigned long pfn;
+		struct page *page;
 		unsigned int nid;
 
 		if (!pte_present(*pte))
 			continue;
-		pfn = pte_pfn(*pte);
-		if (!pfn_valid(pfn)) {
-			print_bad_pte(vma, *pte, addr);
+		page = vm_normal_page(vma, addr, *pte);
+		if (!page)
 			continue;
-		}
-		nid = pfn_to_nid(pfn);
+		nid = page_to_nid(page);
 		if (!node_isset(nid, *nodes))
 			break;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
@@ -269,8 +267,6 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 	first = find_vma(mm, start);
 	if (!first)
 		return ERR_PTR(-EFAULT);
-	if (first->vm_flags & VM_UNPAGED)
-		return ERR_PTR(-EACCES);
 	prev = NULL;
 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 		if (!vma->vm_next && vma->vm_end < end)
diff --git a/mm/msync.c b/mm/msync.c
index b3f4caf3010b..1b5b6f662dcf 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -27,7 +27,6 @@ static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 again:
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	do {
-		unsigned long pfn;
 		struct page *page;
 
 		if (progress >= 64) {
@@ -40,13 +39,9 @@ again:
 			continue;
 		if (!pte_maybe_dirty(*pte))
 			continue;
-		pfn = pte_pfn(*pte);
-		if (unlikely(!pfn_valid(pfn))) {
-			print_bad_pte(vma, *pte, addr);
+		page = vm_normal_page(vma, addr, *pte);
+		if (!page)
 			continue;
-		}
-		page = pfn_to_page(pfn);
-
 		if (ptep_clear_flush_dirty(vma, addr, pte) ||
 		    page_test_and_clear_dirty(page))
 			set_page_dirty(page);
@@ -97,9 +92,8 @@ static void msync_page_range(struct vm_area_struct *vma,
 	/* For hugepages we can't go walking the page table normally,
 	 * but that's ok, hugetlbfs is memory based, so we don't need
 	 * to do anything more on an msync().
-	 * Can't do anything with VM_UNPAGED regions either.
 	 */
-	if (vma->vm_flags & (VM_HUGETLB|VM_UNPAGED))
+	if (vma->vm_flags & VM_HUGETLB)
 		return;
 
 	BUG_ON(addr >= end);
diff --git a/mm/nommu.c b/mm/nommu.c
index 6deb6ab3d6ad..c1196812876b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1045,7 +1045,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
 
 EXPORT_SYMBOL(find_vma);
 
-struct page *follow_page(struct mm_struct *mm, unsigned long address,
+struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 			unsigned int foll_flags)
 {
 	return NULL;
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -226,8 +226,6 @@ vma_address(struct page *page, struct vm_area_struct *vma)
 /*
  * At what user virtual address is page expected in vma? checking that the
  * page matches the vma: currently only used on anon pages, by unuse_vma;
- * and by extraordinary checks on anon pages in VM_UNPAGED vmas, taking
- * care that an mmap of /dev/mem might window free and foreign pages.
 */
 unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
 {
@@ -614,7 +612,6 @@ static void try_to_unmap_cluster(unsigned long cursor,
 	struct page *page;
 	unsigned long address;
 	unsigned long end;
-	unsigned long pfn;
 
 	address = (vma->vm_start + cursor) & CLUSTER_MASK;
 	end = address + CLUSTER_SIZE;
@@ -643,15 +640,8 @@ static void try_to_unmap_cluster(unsigned long cursor,
 	for (; address < end; pte++, address += PAGE_SIZE) {
 		if (!pte_present(*pte))
 			continue;
-
-		pfn = pte_pfn(*pte);
-		if (unlikely(!pfn_valid(pfn))) {
-			print_bad_pte(vma, *pte, address);
-			continue;
-		}
-
-		page = pfn_to_page(pfn);
-		BUG_ON(PageAnon(page));
+		page = vm_normal_page(vma, address, *pte);
+		BUG_ON(!page || PageAnon(page));
 
 		if (ptep_clear_flush_young(vma, address, pte))
 			continue;