Diffstat (limited to 'mm/memory.c')
-rw-r--r--	mm/memory.c	243
1 files changed, 154 insertions, 89 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 2302d228fe04..262e3eb6601a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -61,6 +61,8 @@
 #include <linux/swapops.h>
 #include <linux/elf.h>
 
+#include "internal.h"
+
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 /* use the per-pgdat data instead for discontigmem - mbligh */
 unsigned long max_mapnr;
@@ -211,7 +213,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
  *
  * Must be called with pagetable lock held.
  */
-void free_pgd_range(struct mmu_gather **tlb,
+void free_pgd_range(struct mmu_gather *tlb,
 			unsigned long addr, unsigned long end,
 			unsigned long floor, unsigned long ceiling)
 {
@@ -262,16 +264,16 @@ void free_pgd_range(struct mmu_gather **tlb,
 		return;
 
 	start = addr;
-	pgd = pgd_offset((*tlb)->mm, addr);
+	pgd = pgd_offset(tlb->mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
+		free_pud_range(tlb, pgd, addr, next, floor, ceiling);
 	} while (pgd++, addr = next, addr != end);
 }
 
-void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
+void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		unsigned long floor, unsigned long ceiling)
 {
 	while (vma) {
@@ -899,9 +901,23 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
 		}
 
 		if (unlikely(is_vm_hugetlb_page(vma))) {
-			unmap_hugepage_range(vma, start, end);
-			zap_work -= (end - start) /
-					(HPAGE_SIZE / PAGE_SIZE);
+			/*
+			 * It is undesirable to test vma->vm_file as it
+			 * should be non-null for valid hugetlb area.
+			 * However, vm_file will be NULL in the error
+			 * cleanup path of do_mmap_pgoff. When
+			 * hugetlbfs ->mmap method fails,
+			 * do_mmap_pgoff() nullifies vma->vm_file
+			 * before calling this function to clean up.
+			 * Since no pte has actually been setup, it is
+			 * safe to do nothing in this case.
+			 */
+			if (vma->vm_file) {
+				unmap_hugepage_range(vma, start, end, NULL);
+				zap_work -= (end - start) /
+					pages_per_huge_page(hstate_vma(vma));
+			}
+
 			start = end;
 		} else
 			start = unmap_page_range(*tlbp, vma,
@@ -982,19 +998,24 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 		goto no_page_table;
 
 	pud = pud_offset(pgd, address);
-	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
+	if (pud_none(*pud))
 		goto no_page_table;
-
+	if (pud_huge(*pud)) {
+		BUG_ON(flags & FOLL_GET);
+		page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
+		goto out;
+	}
+	if (unlikely(pud_bad(*pud)))
+		goto no_page_table;
+
 	pmd = pmd_offset(pud, address);
 	if (pmd_none(*pmd))
 		goto no_page_table;
-
 	if (pmd_huge(*pmd)) {
 		BUG_ON(flags & FOLL_GET);
 		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
 		goto out;
 	}
-
 	if (unlikely(pmd_bad(*pmd)))
 		goto no_page_table;
 
@@ -1058,11 +1079,9 @@ static inline int use_zero_page(struct vm_area_struct *vma)
 	if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
 		return 0;
 	/*
-	 * And if we have a fault or a nopfn routine, it's not an
-	 * anonymous region.
+	 * And if we have a fault routine, it's not an anonymous region.
 	 */
-	return !vma->vm_ops ||
-		(!vma->vm_ops->fault && !vma->vm_ops->nopfn);
+	return !vma->vm_ops || !vma->vm_ops->fault;
 }
 
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
@@ -1338,6 +1357,11 @@ out:
  *
  * This function should only be called from a vm_ops->fault handler, and
  * in that case the handler should return NULL.
+ *
+ * vma cannot be a COW mapping.
+ *
+ * As this is called only for pages that do not currently exist, we
+ * do not need to flush old virtual caches or the TLB.
  */
 int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 			unsigned long pfn)
@@ -1548,6 +1572,8 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
 	unsigned long next;
 	int err;
 
+	BUG_ON(pud_huge(*pud));
+
 	pmd = pmd_alloc(mm, pud, addr);
 	if (!pmd)
 		return -ENOMEM;
@@ -2501,59 +2527,6 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
 }
 
-
-/*
- * do_no_pfn() tries to create a new page mapping for a page without
- * a struct_page backing it
- *
- * As this is called only for pages that do not currently exist, we
- * do not need to flush old virtual caches or the TLB.
- *
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), and pte mapped but not yet locked.
- * We return with mmap_sem still held, but pte unmapped and unlocked.
- *
- * It is expected that the ->nopfn handler always returns the same pfn
- * for a given virtual mapping.
- *
- * Mark this `noinline' to prevent it from bloating the main pagefault code.
- */
-static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
-		     unsigned long address, pte_t *page_table, pmd_t *pmd,
-		     int write_access)
-{
-	spinlock_t *ptl;
-	pte_t entry;
-	unsigned long pfn;
-
-	pte_unmap(page_table);
-	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
-	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
-
-	pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK);
-
-	BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
-
-	if (unlikely(pfn == NOPFN_OOM))
-		return VM_FAULT_OOM;
-	else if (unlikely(pfn == NOPFN_SIGBUS))
-		return VM_FAULT_SIGBUS;
-	else if (unlikely(pfn == NOPFN_REFAULT))
-		return 0;
-
-	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
-
-	/* Only go through if we didn't race with anybody else... */
-	if (pte_none(*page_table)) {
-		entry = pfn_pte(pfn, vma->vm_page_prot);
-		if (write_access)
-			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-		set_pte_at(mm, address, page_table, entry);
-	}
-	pte_unmap_unlock(page_table, ptl);
-	return 0;
-}
-
 /*
  * Fault of a previously existing named mapping. Repopulate the pte
  * from the encoded file_pte if possible. This enables swappable
@@ -2614,9 +2587,6 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 			if (likely(vma->vm_ops->fault))
 				return do_linear_fault(mm, vma, address,
 						pte, pmd, write_access, entry);
-			if (unlikely(vma->vm_ops->nopfn))
-				return do_no_pfn(mm, vma, address, pte,
-						 pmd, write_access);
 		}
 		return do_anonymous_page(mm, vma, address,
 					 pte, pmd, write_access);
@@ -2804,6 +2774,86 @@ int in_gate_area_no_task(unsigned long addr)
 
 #endif	/* __HAVE_ARCH_GATE_AREA */
 
+#ifdef CONFIG_HAVE_IOREMAP_PROT
+static resource_size_t follow_phys(struct vm_area_struct *vma,
+			unsigned long address, unsigned int flags,
+			unsigned long *prot)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *ptep, pte;
+	spinlock_t *ptl;
+	resource_size_t phys_addr = 0;
+	struct mm_struct *mm = vma->vm_mm;
+
+	VM_BUG_ON(!(vma->vm_flags & (VM_IO | VM_PFNMAP)));
+
+	pgd = pgd_offset(mm, address);
+	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+		goto no_page_table;
+
+	pud = pud_offset(pgd, address);
+	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
+		goto no_page_table;
+
+	pmd = pmd_offset(pud, address);
+	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
+		goto no_page_table;
+
+	/* We cannot handle huge page PFN maps. Luckily they don't exist. */
+	if (pmd_huge(*pmd))
+		goto no_page_table;
+
+	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+	if (!ptep)
+		goto out;
+
+	pte = *ptep;
+	if (!pte_present(pte))
+		goto unlock;
+	if ((flags & FOLL_WRITE) && !pte_write(pte))
+		goto unlock;
+	phys_addr = pte_pfn(pte);
+	phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */
+
+	*prot = pgprot_val(pte_pgprot(pte));
+
+unlock:
+	pte_unmap_unlock(ptep, ptl);
+out:
+	return phys_addr;
+no_page_table:
+	return 0;
+}
+
+int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
+			void *buf, int len, int write)
+{
+	resource_size_t phys_addr;
+	unsigned long prot = 0;
+	void *maddr;
+	int offset = addr & (PAGE_SIZE-1);
+
+	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+		return -EINVAL;
+
+	phys_addr = follow_phys(vma, addr, write, &prot);
+
+	if (!phys_addr)
+		return -EINVAL;
+
+	maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
+	if (write)
+		memcpy_toio(maddr + offset, buf, len);
+	else
+		memcpy_fromio(buf, maddr + offset, len);
+	iounmap(maddr);
+
+	return len;
+}
+#endif
+
 /*
  * Access another process' address space.
  * Source/target buffer must be kernel space,
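[Editorial note, not part of the diff: generic_access_phys() is intended to be wired up as the ->access method of a VM_IO/VM_PFNMAP mapping, so that access_process_vm() (and thus ptrace/gdb) can reach device memory through the fallback path added below. A hedged sketch of such a driver hookup; the mydev_* names and mydev_base_pfn are hypothetical.]

static struct vm_operations_struct mydev_vm_ops = {
	.access = generic_access_phys,	/* used by access_process_vm() fallback */
};

static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
{
	/* remap_pfn_range() marks the vma VM_IO | VM_PFNMAP */
	if (remap_pfn_range(vma, vma->vm_start, mydev_base_pfn,
			    vma->vm_end - vma->vm_start, vma->vm_page_prot))
		return -EAGAIN;
	vma->vm_ops = &mydev_vm_ops;
	return 0;
}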
@@ -2813,7 +2863,6 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
 {
 	struct mm_struct *mm;
 	struct vm_area_struct *vma;
-	struct page *page;
 	void *old_buf = buf;
 
 	mm = get_task_mm(tsk);
@@ -2825,28 +2874,44 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
 	while (len) {
 		int bytes, ret, offset;
 		void *maddr;
+		struct page *page = NULL;
 
 		ret = get_user_pages(tsk, mm, addr, 1,
 				write, 1, &page, &vma);
-		if (ret <= 0)
-			break;
-
-		bytes = len;
-		offset = addr & (PAGE_SIZE-1);
-		if (bytes > PAGE_SIZE-offset)
-			bytes = PAGE_SIZE-offset;
-
-		maddr = kmap(page);
-		if (write) {
-			copy_to_user_page(vma, page, addr,
-					  maddr + offset, buf, bytes);
-			set_page_dirty_lock(page);
+		if (ret <= 0) {
+			/*
+			 * Check if this is a VM_IO | VM_PFNMAP VMA, which
+			 * we can access using slightly different code.
+			 */
+#ifdef CONFIG_HAVE_IOREMAP_PROT
+			vma = find_vma(mm, addr);
+			if (!vma)
+				break;
+			if (vma->vm_ops && vma->vm_ops->access)
+				ret = vma->vm_ops->access(vma, addr, buf,
+							  len, write);
+			if (ret <= 0)
+#endif
+				break;
+			bytes = ret;
 		} else {
-			copy_from_user_page(vma, page, addr,
-					    buf, maddr + offset, bytes);
+			bytes = len;
+			offset = addr & (PAGE_SIZE-1);
+			if (bytes > PAGE_SIZE-offset)
+				bytes = PAGE_SIZE-offset;
+
+			maddr = kmap(page);
+			if (write) {
+				copy_to_user_page(vma, page, addr,
+						  maddr + offset, buf, bytes);
+				set_page_dirty_lock(page);
+			} else {
+				copy_from_user_page(vma, page, addr,
+						    buf, maddr + offset, bytes);
+			}
+			kunmap(page);
+			page_cache_release(page);
 		}
-		kunmap(page);
-		page_cache_release(page);
 		len -= bytes;
 		buf += bytes;
 		addr += bytes;