Diffstat (limited to 'mm/memory.c')
-rw-r--r--   mm/memory.c   243
1 files changed, 154 insertions, 89 deletions

diff --git a/mm/memory.c b/mm/memory.c
index 2302d228fe04..262e3eb6601a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -61,6 +61,8 @@
 #include <linux/swapops.h>
 #include <linux/elf.h>
 
+#include "internal.h"
+
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 /* use the per-pgdat data instead for discontigmem - mbligh */
 unsigned long max_mapnr;
@@ -211,7 +213,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
  *
  * Must be called with pagetable lock held.
  */
-void free_pgd_range(struct mmu_gather **tlb,
+void free_pgd_range(struct mmu_gather *tlb,
 			unsigned long addr, unsigned long end,
 			unsigned long floor, unsigned long ceiling)
 {
@@ -262,16 +264,16 @@ void free_pgd_range(struct mmu_gather **tlb,
 		return;
 
 	start = addr;
-	pgd = pgd_offset((*tlb)->mm, addr);
+	pgd = pgd_offset(tlb->mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
+		free_pud_range(tlb, pgd, addr, next, floor, ceiling);
 	} while (pgd++, addr = next, addr != end);
 }
 
-void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
+void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		unsigned long floor, unsigned long ceiling)
 {
 	while (vma) {
@@ -899,9 +901,23 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
 			}
 
 			if (unlikely(is_vm_hugetlb_page(vma))) {
-				unmap_hugepage_range(vma, start, end);
-				zap_work -= (end - start) /
-						(HPAGE_SIZE / PAGE_SIZE);
+				/*
+				 * It is undesirable to test vma->vm_file as it
+				 * should be non-null for valid hugetlb area.
+				 * However, vm_file will be NULL in the error
+				 * cleanup path of do_mmap_pgoff. When
+				 * hugetlbfs ->mmap method fails,
+				 * do_mmap_pgoff() nullifies vma->vm_file
+				 * before calling this function to clean up.
+				 * Since no pte has actually been setup, it is
+				 * safe to do nothing in this case.
+				 */
+				if (vma->vm_file) {
+					unmap_hugepage_range(vma, start, end, NULL);
+					zap_work -= (end - start) /
+						pages_per_huge_page(hstate_vma(vma));
+				}
+
 				start = end;
 			} else
 				start = unmap_page_range(*tlbp, vma,
@@ -982,19 +998,24 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 		goto no_page_table;
 
 	pud = pud_offset(pgd, address);
-	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
+	if (pud_none(*pud))
 		goto no_page_table;
-
+	if (pud_huge(*pud)) {
+		BUG_ON(flags & FOLL_GET);
+		page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
+		goto out;
+	}
+	if (unlikely(pud_bad(*pud)))
+		goto no_page_table;
+
 	pmd = pmd_offset(pud, address);
 	if (pmd_none(*pmd))
 		goto no_page_table;
-
 	if (pmd_huge(*pmd)) {
 		BUG_ON(flags & FOLL_GET);
 		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
 		goto out;
 	}
-
 	if (unlikely(pmd_bad(*pmd)))
 		goto no_page_table;
 
@@ -1058,11 +1079,9 @@ static inline int use_zero_page(struct vm_area_struct *vma)
 	if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
 		return 0;
 	/*
-	 * And if we have a fault or a nopfn routine, it's not an
-	 * anonymous region.
+	 * And if we have a fault routine, it's not an anonymous region.
 	 */
-	return !vma->vm_ops ||
-		(!vma->vm_ops->fault && !vma->vm_ops->nopfn);
+	return !vma->vm_ops || !vma->vm_ops->fault;
 }
 
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
@@ -1338,6 +1357,11 @@ out:
  *
  * This function should only be called from a vm_ops->fault handler, and
  * in that case the handler should return NULL.
+ *
+ * vma cannot be a COW mapping.
+ *
+ * As this is called only for pages that do not currently exist, we
+ * do not need to flush old virtual caches or the TLB.
+ */
 int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 			unsigned long pfn)
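
The added kernel-doc lines spell out the expected calling context for vm_insert_pfn(). For illustration only, a minimal sketch of a driver ->fault handler for a VM_PFNMAP area might look like the following; mydev_vm_fault() and mydev_base_pfn are invented names and are not part of this diff.

/* illustrative only: names below are not from this patch */
static int mydev_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	/* translate the faulting page offset into a device pfn */
	unsigned long pfn = mydev_base_pfn + vmf->pgoff;
	int err;

	err = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, pfn);
	if (err == -ENOMEM)
		return VM_FAULT_OOM;
	if (err < 0 && err != -EBUSY)
		return VM_FAULT_SIGBUS;

	/* pte was installed by vm_insert_pfn(); there is no struct page to return */
	return VM_FAULT_NOPAGE;
}

Returning VM_FAULT_NOPAGE tells the generic fault path that the pte has already been set up, which is the pfn-mapping convention the new comment refers to.
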
@@ -1548,6 +1572,8 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
 	unsigned long next;
 	int err;
 
+	BUG_ON(pud_huge(*pud));
+
 	pmd = pmd_alloc(mm, pud, addr);
 	if (!pmd)
 		return -ENOMEM;
@@ -2501,59 +2527,6 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
 }
 
-
-/*
- * do_no_pfn() tries to create a new page mapping for a page without
- * a struct_page backing it
- *
- * As this is called only for pages that do not currently exist, we
- * do not need to flush old virtual caches or the TLB.
- *
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), and pte mapped but not yet locked.
- * We return with mmap_sem still held, but pte unmapped and unlocked.
- *
- * It is expected that the ->nopfn handler always returns the same pfn
- * for a given virtual mapping.
- *
- * Mark this `noinline' to prevent it from bloating the main pagefault code.
- */
-static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
-		     unsigned long address, pte_t *page_table, pmd_t *pmd,
-		     int write_access)
-{
-	spinlock_t *ptl;
-	pte_t entry;
-	unsigned long pfn;
-
-	pte_unmap(page_table);
-	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
-	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
-
-	pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK);
-
-	BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
-
-	if (unlikely(pfn == NOPFN_OOM))
-		return VM_FAULT_OOM;
-	else if (unlikely(pfn == NOPFN_SIGBUS))
-		return VM_FAULT_SIGBUS;
-	else if (unlikely(pfn == NOPFN_REFAULT))
-		return 0;
-
-	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
-
-	/* Only go through if we didn't race with anybody else... */
-	if (pte_none(*page_table)) {
-		entry = pfn_pte(pfn, vma->vm_page_prot);
-		if (write_access)
-			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-		set_pte_at(mm, address, page_table, entry);
-	}
-	pte_unmap_unlock(page_table, ptl);
-	return 0;
-}
-
 /*
  * Fault of a previously existing named mapping. Repopulate the pte
  * from the encoded file_pte if possible. This enables swappable
@@ -2614,9 +2587,6 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 			if (likely(vma->vm_ops->fault))
 				return do_linear_fault(mm, vma, address,
 					pte, pmd, write_access, entry);
-			if (unlikely(vma->vm_ops->nopfn))
-				return do_no_pfn(mm, vma, address, pte,
-						 pmd, write_access);
 		}
 		return do_anonymous_page(mm, vma, address,
 					 pte, pmd, write_access);
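
With do_no_pfn() and the ->nopfn branch in handle_pte_fault() removed, pfn-based drivers are expected to provide a ->fault handler that installs the pte via vm_insert_pfn(), as sketched after the vm_insert_pfn hunk above. A hypothetical before/after of such a driver's vm_operations_struct (mydev_* names are invented, not from this diff) would be:

/* illustrative only: names below are not from this patch */

/* before this series: ->nopfn returned a raw pfn or a NOPFN_* code */
static struct vm_operations_struct mydev_vm_ops_old = {
	.nopfn	= mydev_nopfn,
};

/* after: the same work moves into ->fault, which calls vm_insert_pfn() */
static struct vm_operations_struct mydev_vm_ops = {
	.fault	= mydev_vm_fault,
};
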
@@ -2804,6 +2774,86 @@ int in_gate_area_no_task(unsigned long addr)
 
 #endif	/* __HAVE_ARCH_GATE_AREA */
 
+#ifdef CONFIG_HAVE_IOREMAP_PROT
+static resource_size_t follow_phys(struct vm_area_struct *vma,
+			unsigned long address, unsigned int flags,
+			unsigned long *prot)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *ptep, pte;
+	spinlock_t *ptl;
+	resource_size_t phys_addr = 0;
+	struct mm_struct *mm = vma->vm_mm;
+
+	VM_BUG_ON(!(vma->vm_flags & (VM_IO | VM_PFNMAP)));
+
+	pgd = pgd_offset(mm, address);
+	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+		goto no_page_table;
+
+	pud = pud_offset(pgd, address);
+	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
+		goto no_page_table;
+
+	pmd = pmd_offset(pud, address);
+	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
+		goto no_page_table;
+
+	/* We cannot handle huge page PFN maps. Luckily they don't exist. */
+	if (pmd_huge(*pmd))
+		goto no_page_table;
+
+	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+	if (!ptep)
+		goto out;
+
+	pte = *ptep;
+	if (!pte_present(pte))
+		goto unlock;
+	if ((flags & FOLL_WRITE) && !pte_write(pte))
+		goto unlock;
+	phys_addr = pte_pfn(pte);
+	phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */
+
+	*prot = pgprot_val(pte_pgprot(pte));
+
+unlock:
+	pte_unmap_unlock(ptep, ptl);
+out:
+	return phys_addr;
+no_page_table:
+	return 0;
+}
+
+int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
+			void *buf, int len, int write)
+{
+	resource_size_t phys_addr;
+	unsigned long prot = 0;
+	void *maddr;
+	int offset = addr & (PAGE_SIZE-1);
+
+	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+		return -EINVAL;
+
+	phys_addr = follow_phys(vma, addr, write, &prot);
+
+	if (!phys_addr)
+		return -EINVAL;
+
+	maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
+	if (write)
+		memcpy_toio(maddr + offset, buf, len);
+	else
+		memcpy_fromio(buf, maddr + offset, len);
+	iounmap(maddr);
+
+	return len;
+}
+#endif
+
 /*
  * Access another process' address space.
  * Source/target buffer must be kernel space,
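
The new generic_access_phys() helper lets a driver whose mmap() sets up a VM_IO | VM_PFNMAP mapping make that mapping reachable through access_process_vm(), and thus ptrace/gdb, by plugging the helper into its vm_operations_struct. A hedged sketch with invented names (mydev_vm_ops, mydev_mmap) follows; it assumes the usual io_remap_pfn_range()-based mmap pattern rather than any specific driver in this diff.

/* illustrative only: names below are not from this patch */
static struct vm_operations_struct mydev_vm_ops = {
#ifdef CONFIG_HAVE_IOREMAP_PROT
	.access	= generic_access_phys,	/* ioremaps the pfn and copies through it */
#endif
};

static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
	vma->vm_ops = &mydev_vm_ops;
	/* establishes a VM_IO | VM_PFNMAP mapping of the device memory */
	return io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
				  vma->vm_end - vma->vm_start,
				  vma->vm_page_prot);
}

The final hunk below then falls back to this ->access method when get_user_pages() fails on such a mapping.
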
@@ -2813,7 +2863,6 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
 {
 	struct mm_struct *mm;
 	struct vm_area_struct *vma;
-	struct page *page;
 	void *old_buf = buf;
 
 	mm = get_task_mm(tsk);
@@ -2825,28 +2874,44 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
 	while (len) {
 		int bytes, ret, offset;
 		void *maddr;
+		struct page *page = NULL;
 
 		ret = get_user_pages(tsk, mm, addr, 1,
 				write, 1, &page, &vma);
-		if (ret <= 0)
-			break;
-
-		bytes = len;
-		offset = addr & (PAGE_SIZE-1);
-		if (bytes > PAGE_SIZE-offset)
-			bytes = PAGE_SIZE-offset;
-
-		maddr = kmap(page);
-		if (write) {
-			copy_to_user_page(vma, page, addr,
-					  maddr + offset, buf, bytes);
-			set_page_dirty_lock(page);
+		if (ret <= 0) {
+			/*
+			 * Check if this is a VM_IO | VM_PFNMAP VMA, which
+			 * we can access using slightly different code.
+			 */
+#ifdef CONFIG_HAVE_IOREMAP_PROT
+			vma = find_vma(mm, addr);
+			if (!vma)
+				break;
+			if (vma->vm_ops && vma->vm_ops->access)
+				ret = vma->vm_ops->access(vma, addr, buf,
+							  len, write);
+			if (ret <= 0)
+#endif
+				break;
+			bytes = ret;
 		} else {
-			copy_from_user_page(vma, page, addr,
-					    buf, maddr + offset, bytes);
+			bytes = len;
+			offset = addr & (PAGE_SIZE-1);
+			if (bytes > PAGE_SIZE-offset)
+				bytes = PAGE_SIZE-offset;
+
+			maddr = kmap(page);
+			if (write) {
+				copy_to_user_page(vma, page, addr,
+						  maddr + offset, buf, bytes);
+				set_page_dirty_lock(page);
+			} else {
+				copy_from_user_page(vma, page, addr,
+						    buf, maddr + offset, bytes);
+			}
+			kunmap(page);
+			page_cache_release(page);
 		}
-		kunmap(page);
-		page_cache_release(page);
 		len -= bytes;
 		buf += bytes;
 		addr += bytes;