Diffstat (limited to 'mm/memory.c')

-rw-r--r--	mm/memory.c	246
1 files changed, 156 insertions, 90 deletions

diff --git a/mm/memory.c b/mm/memory.c
index 2302d228fe04..a8ca04faaea6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -61,6 +61,8 @@
 #include <linux/swapops.h>
 #include <linux/elf.h>
 
+#include "internal.h"
+
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 /* use the per-pgdat data instead for discontigmem - mbligh */
 unsigned long max_mapnr;
@@ -211,7 +213,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
  *
  * Must be called with pagetable lock held.
  */
-void free_pgd_range(struct mmu_gather **tlb,
+void free_pgd_range(struct mmu_gather *tlb,
 			unsigned long addr, unsigned long end,
 			unsigned long floor, unsigned long ceiling)
 {
@@ -262,16 +264,16 @@ void free_pgd_range(struct mmu_gather **tlb,
 		return;
 
 	start = addr;
-	pgd = pgd_offset((*tlb)->mm, addr);
+	pgd = pgd_offset(tlb->mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
+		free_pud_range(tlb, pgd, addr, next, floor, ceiling);
 	} while (pgd++, addr = next, addr != end);
 }
 
-void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
+void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		unsigned long floor, unsigned long ceiling)
 {
 	while (vma) {
@@ -372,7 +374,8 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
  *
  * The calling function must still handle the error.
  */
-void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
+static void print_bad_pte(struct vm_area_struct *vma, pte_t pte,
+			  unsigned long vaddr)
 {
 	printk(KERN_ERR "Bad pte = %08llx, process = %s, "
 			"vm_flags = %lx, vaddr = %lx\n",
@@ -899,9 +902,23 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
 		}
 
 		if (unlikely(is_vm_hugetlb_page(vma))) {
-			unmap_hugepage_range(vma, start, end);
-			zap_work -= (end - start) /
-					(HPAGE_SIZE / PAGE_SIZE);
+			/*
+			 * It is undesirable to test vma->vm_file as it
+			 * should be non-null for valid hugetlb area.
+			 * However, vm_file will be NULL in the error
+			 * cleanup path of do_mmap_pgoff. When
+			 * hugetlbfs ->mmap method fails,
+			 * do_mmap_pgoff() nullifies vma->vm_file
+			 * before calling this function to clean up.
+			 * Since no pte has actually been setup, it is
+			 * safe to do nothing in this case.
+			 */
+			if (vma->vm_file) {
+				unmap_hugepage_range(vma, start, end, NULL);
+				zap_work -= (end - start) /
+					pages_per_huge_page(hstate_vma(vma));
+			}
+
 			start = end;
 		} else
 			start = unmap_page_range(*tlbp, vma,
@@ -982,19 +999,24 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 		goto no_page_table;
 
 	pud = pud_offset(pgd, address);
-	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
+	if (pud_none(*pud))
 		goto no_page_table;
+	if (pud_huge(*pud)) {
+		BUG_ON(flags & FOLL_GET);
+		page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
+		goto out;
+	}
+	if (unlikely(pud_bad(*pud)))
+		goto no_page_table;
 
 	pmd = pmd_offset(pud, address);
 	if (pmd_none(*pmd))
 		goto no_page_table;
-
 	if (pmd_huge(*pmd)) {
 		BUG_ON(flags & FOLL_GET);
 		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
 		goto out;
 	}
-
 	if (unlikely(pmd_bad(*pmd)))
 		goto no_page_table;
 
@@ -1058,11 +1080,9 @@ static inline int use_zero_page(struct vm_area_struct *vma)
 	if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
 		return 0;
 	/*
-	 * And if we have a fault or a nopfn routine, it's not an
-	 * anonymous region.
+	 * And if we have a fault routine, it's not an anonymous region.
 	 */
-	return !vma->vm_ops ||
-		(!vma->vm_ops->fault && !vma->vm_ops->nopfn);
+	return !vma->vm_ops || !vma->vm_ops->fault;
 }
 
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
@@ -1338,6 +1358,11 @@ out:
  *
  * This function should only be called from a vm_ops->fault handler, and
  * in that case the handler should return NULL.
+ *
+ * vma cannot be a COW mapping.
+ *
+ * As this is called only for pages that do not currently exist, we
+ * do not need to flush old virtual caches or the TLB.
  */
 int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 			unsigned long pfn)
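(Editor's aside, not part of the commit.) The comment block added above says vm_insert_pfn() should only be called from a vm_ops->fault handler, and later hunks in this diff remove the old ->nopfn path entirely. A minimal sketch of what that looks like on the driver side, assuming the 2.6.26-era struct vm_fault layout; the mydrv_fault() and mydrv_offset_to_pfn() names are hypothetical, not anything from this patch:

static int mydrv_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	unsigned long addr = (unsigned long)vmf->virtual_address;
	unsigned long pfn;
	int ret;

	/* Driver-specific translation of the faulting offset to a device
	 * pfn (hypothetical helper). */
	pfn = mydrv_offset_to_pfn(vma, vmf->pgoff);
	if (!pfn)
		return VM_FAULT_SIGBUS;

	/* Install the pte directly; there is no struct page to hand back. */
	ret = vm_insert_pfn(vma, addr, pfn);
	if (ret == -ENOMEM)
		return VM_FAULT_OOM;
	if (ret && ret != -EBUSY)	/* -EBUSY: lost a race, pte already set */
		return VM_FAULT_SIGBUS;

	return VM_FAULT_NOPAGE;
}

static struct vm_operations_struct mydrv_vm_ops = {
	.fault	= mydrv_fault,
};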
@@ -1548,6 +1573,8 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
 	unsigned long next;
 	int err;
 
+	BUG_ON(pud_huge(*pud));
+
 	pmd = pmd_alloc(mm, pud, addr);
 	if (!pmd)
 		return -ENOMEM;
@@ -2501,59 +2528,6 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
 }
 
-
-/*
- * do_no_pfn() tries to create a new page mapping for a page without
- * a struct_page backing it
- *
- * As this is called only for pages that do not currently exist, we
- * do not need to flush old virtual caches or the TLB.
- *
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), and pte mapped but not yet locked.
- * We return with mmap_sem still held, but pte unmapped and unlocked.
- *
- * It is expected that the ->nopfn handler always returns the same pfn
- * for a given virtual mapping.
- *
- * Mark this `noinline' to prevent it from bloating the main pagefault code.
- */
-static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
-		     unsigned long address, pte_t *page_table, pmd_t *pmd,
-		     int write_access)
-{
-	spinlock_t *ptl;
-	pte_t entry;
-	unsigned long pfn;
-
-	pte_unmap(page_table);
-	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
-	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
-
-	pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK);
-
-	BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
-
-	if (unlikely(pfn == NOPFN_OOM))
-		return VM_FAULT_OOM;
-	else if (unlikely(pfn == NOPFN_SIGBUS))
-		return VM_FAULT_SIGBUS;
-	else if (unlikely(pfn == NOPFN_REFAULT))
-		return 0;
-
-	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
-
-	/* Only go through if we didn't race with anybody else... */
-	if (pte_none(*page_table)) {
-		entry = pfn_pte(pfn, vma->vm_page_prot);
-		if (write_access)
-			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-		set_pte_at(mm, address, page_table, entry);
-	}
-	pte_unmap_unlock(page_table, ptl);
-	return 0;
-}
-
 /*
  * Fault of a previously existing named mapping. Repopulate the pte
  * from the encoded file_pte if possible. This enables swappable
@@ -2614,9 +2588,6 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 		if (likely(vma->vm_ops->fault))
 			return do_linear_fault(mm, vma, address,
 				pte, pmd, write_access, entry);
-		if (unlikely(vma->vm_ops->nopfn))
-			return do_no_pfn(mm, vma, address, pte,
-					 pmd, write_access);
 	}
 	return do_anonymous_page(mm, vma, address,
 				 pte, pmd, write_access);
@@ -2804,6 +2775,86 @@ int in_gate_area_no_task(unsigned long addr)
 
 #endif	/* __HAVE_ARCH_GATE_AREA */
 
+#ifdef CONFIG_HAVE_IOREMAP_PROT
+static resource_size_t follow_phys(struct vm_area_struct *vma,
+			unsigned long address, unsigned int flags,
+			unsigned long *prot)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *ptep, pte;
+	spinlock_t *ptl;
+	resource_size_t phys_addr = 0;
+	struct mm_struct *mm = vma->vm_mm;
+
+	VM_BUG_ON(!(vma->vm_flags & (VM_IO | VM_PFNMAP)));
+
+	pgd = pgd_offset(mm, address);
+	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+		goto no_page_table;
+
+	pud = pud_offset(pgd, address);
+	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
+		goto no_page_table;
+
+	pmd = pmd_offset(pud, address);
+	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
+		goto no_page_table;
+
+	/* We cannot handle huge page PFN maps. Luckily they don't exist. */
+	if (pmd_huge(*pmd))
+		goto no_page_table;
+
+	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+	if (!ptep)
+		goto out;
+
+	pte = *ptep;
+	if (!pte_present(pte))
+		goto unlock;
+	if ((flags & FOLL_WRITE) && !pte_write(pte))
+		goto unlock;
+	phys_addr = pte_pfn(pte);
+	phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */
+
+	*prot = pgprot_val(pte_pgprot(pte));
+
+unlock:
+	pte_unmap_unlock(ptep, ptl);
+out:
+	return phys_addr;
+no_page_table:
+	return 0;
+}
+
+int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
+			void *buf, int len, int write)
+{
+	resource_size_t phys_addr;
+	unsigned long prot = 0;
+	void *maddr;
+	int offset = addr & (PAGE_SIZE-1);
+
+	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+		return -EINVAL;
+
+	phys_addr = follow_phys(vma, addr, write, &prot);
+
+	if (!phys_addr)
+		return -EINVAL;
+
+	maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
+	if (write)
+		memcpy_toio(maddr + offset, buf, len);
+	else
+		memcpy_fromio(buf, maddr + offset, len);
+	iounmap(maddr);
+
+	return len;
+}
+#endif
+
 /*
  * Access another process' address space.
  * Source/target buffer must be kernel space,
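(Editor's aside, not part of the commit.) generic_access_phys() above is intended to be wired into a driver's vm_operations_struct as the ->access hook, so that the access_process_vm() fallback added in the following hunks can reach a VM_IO | VM_PFNMAP mapping. A minimal sketch under those assumptions; the mydrv_* names and mydrv_base_pfn are hypothetical, though /dev/mem received essentially this hookup when the helper was introduced:

#ifdef CONFIG_HAVE_IOREMAP_PROT
static struct vm_operations_struct mydrv_io_vm_ops = {
	.access = generic_access_phys,	/* lets ptrace/gdb peek at the mapping */
};
#endif

static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
#ifdef CONFIG_HAVE_IOREMAP_PROT
	vma->vm_ops = &mydrv_io_vm_ops;
#endif
	/* remap_pfn_range() marks the vma VM_IO | VM_PFNMAP, which is what
	 * both follow_phys() and generic_access_phys() insist on. */
	return remap_pfn_range(vma, vma->vm_start, mydrv_base_pfn,
			       vma->vm_end - vma->vm_start,
			       vma->vm_page_prot);
}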
@@ -2813,7 +2864,6 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
 {
 	struct mm_struct *mm;
 	struct vm_area_struct *vma;
-	struct page *page;
 	void *old_buf = buf;
 
 	mm = get_task_mm(tsk);
@@ -2825,28 +2875,44 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
 	while (len) {
 		int bytes, ret, offset;
 		void *maddr;
+		struct page *page = NULL;
 
 		ret = get_user_pages(tsk, mm, addr, 1,
 				write, 1, &page, &vma);
-		if (ret <= 0)
-			break;
-
-		bytes = len;
-		offset = addr & (PAGE_SIZE-1);
-		if (bytes > PAGE_SIZE-offset)
-			bytes = PAGE_SIZE-offset;
-
-		maddr = kmap(page);
-		if (write) {
-			copy_to_user_page(vma, page, addr,
-					  maddr + offset, buf, bytes);
-			set_page_dirty_lock(page);
+		if (ret <= 0) {
+			/*
+			 * Check if this is a VM_IO | VM_PFNMAP VMA, which
+			 * we can access using slightly different code.
+			 */
+#ifdef CONFIG_HAVE_IOREMAP_PROT
+			vma = find_vma(mm, addr);
+			if (!vma)
+				break;
+			if (vma->vm_ops && vma->vm_ops->access)
+				ret = vma->vm_ops->access(vma, addr, buf,
+							  len, write);
+			if (ret <= 0)
+#endif
+				break;
+			bytes = ret;
 		} else {
-			copy_from_user_page(vma, page, addr,
-					    buf, maddr + offset, bytes);
+			bytes = len;
+			offset = addr & (PAGE_SIZE-1);
+			if (bytes > PAGE_SIZE-offset)
+				bytes = PAGE_SIZE-offset;
+
+			maddr = kmap(page);
+			if (write) {
+				copy_to_user_page(vma, page, addr,
+						  maddr + offset, buf, bytes);
+				set_page_dirty_lock(page);
+			} else {
+				copy_from_user_page(vma, page, addr,
+						    buf, maddr + offset, bytes);
+			}
+			kunmap(page);
+			page_cache_release(page);
 		}
-		kunmap(page);
-		page_cache_release(page);
 		len -= bytes;
 		buf += bytes;
 		addr += bytes;