Diffstat (limited to 'mm/memory.c')
-rw-r--r--   mm/memory.c   246
1 file changed, 156 insertions(+), 90 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 2302d228fe04..a8ca04faaea6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -61,6 +61,8 @@
 #include <linux/swapops.h>
 #include <linux/elf.h>
 
+#include "internal.h"
+
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 /* use the per-pgdat data instead for discontigmem - mbligh */
 unsigned long max_mapnr;
@@ -211,7 +213,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
  *
  * Must be called with pagetable lock held.
  */
-void free_pgd_range(struct mmu_gather **tlb,
+void free_pgd_range(struct mmu_gather *tlb,
 			unsigned long addr, unsigned long end,
 			unsigned long floor, unsigned long ceiling)
 {
@@ -262,16 +264,16 @@ void free_pgd_range(struct mmu_gather **tlb,
 		return;
 
 	start = addr;
-	pgd = pgd_offset((*tlb)->mm, addr);
+	pgd = pgd_offset(tlb->mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
+		free_pud_range(tlb, pgd, addr, next, floor, ceiling);
 	} while (pgd++, addr = next, addr != end);
 }
 
-void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
+void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		unsigned long floor, unsigned long ceiling)
 {
 	while (vma) {
@@ -372,7 +374,8 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
  *
  * The calling function must still handle the error.
  */
-void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
+static void print_bad_pte(struct vm_area_struct *vma, pte_t pte,
+			  unsigned long vaddr)
 {
 	printk(KERN_ERR "Bad pte = %08llx, process = %s, "
 			"vm_flags = %lx, vaddr = %lx\n",
@@ -899,9 +902,23 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
 			}
 
 			if (unlikely(is_vm_hugetlb_page(vma))) {
-				unmap_hugepage_range(vma, start, end);
-				zap_work -= (end - start) /
-						(HPAGE_SIZE / PAGE_SIZE);
+				/*
+				 * It is undesirable to test vma->vm_file as it
+				 * should be non-null for valid hugetlb area.
+				 * However, vm_file will be NULL in the error
+				 * cleanup path of do_mmap_pgoff. When
+				 * hugetlbfs ->mmap method fails,
+				 * do_mmap_pgoff() nullifies vma->vm_file
+				 * before calling this function to clean up.
+				 * Since no pte has actually been setup, it is
+				 * safe to do nothing in this case.
+				 */
+				if (vma->vm_file) {
+					unmap_hugepage_range(vma, start, end, NULL);
+					zap_work -= (end - start) /
+					pages_per_huge_page(hstate_vma(vma));
+				}
+
 				start = end;
 			} else
 				start = unmap_page_range(*tlbp, vma,
@@ -982,19 +999,24 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 		goto no_page_table;
 
 	pud = pud_offset(pgd, address);
-	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
+	if (pud_none(*pud))
 		goto no_page_table;
-
+	if (pud_huge(*pud)) {
+		BUG_ON(flags & FOLL_GET);
+		page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
+		goto out;
+	}
+	if (unlikely(pud_bad(*pud)))
+		goto no_page_table;
+
 	pmd = pmd_offset(pud, address);
 	if (pmd_none(*pmd))
 		goto no_page_table;
-
 	if (pmd_huge(*pmd)) {
 		BUG_ON(flags & FOLL_GET);
 		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
 		goto out;
 	}
-
 	if (unlikely(pmd_bad(*pmd)))
 		goto no_page_table;
 
@@ -1058,11 +1080,9 @@ static inline int use_zero_page(struct vm_area_struct *vma)
 	if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
 		return 0;
 	/*
-	 * And if we have a fault or a nopfn routine, it's not an
-	 * anonymous region.
+	 * And if we have a fault routine, it's not an anonymous region.
 	 */
-	return !vma->vm_ops ||
-		(!vma->vm_ops->fault && !vma->vm_ops->nopfn);
+	return !vma->vm_ops || !vma->vm_ops->fault;
 }
 
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
@@ -1338,6 +1358,11 @@ out:
  *
  * This function should only be called from a vm_ops->fault handler, and
  * in that case the handler should return NULL.
+ *
+ * vma cannot be a COW mapping.
+ *
+ * As this is called only for pages that do not currently exist, we
+ * do not need to flush old virtual caches or the TLB.
  */
 int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 			unsigned long pfn)
@@ -1548,6 +1573,8 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
 	unsigned long next;
 	int err;
 
+	BUG_ON(pud_huge(*pud));
+
 	pmd = pmd_alloc(mm, pud, addr);
 	if (!pmd)
 		return -ENOMEM;
@@ -2501,59 +2528,6 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
 }
 
-
-/*
- * do_no_pfn() tries to create a new page mapping for a page without
- * a struct_page backing it
- *
- * As this is called only for pages that do not currently exist, we
- * do not need to flush old virtual caches or the TLB.
- *
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), and pte mapped but not yet locked.
- * We return with mmap_sem still held, but pte unmapped and unlocked.
- *
- * It is expected that the ->nopfn handler always returns the same pfn
- * for a given virtual mapping.
- *
- * Mark this `noinline' to prevent it from bloating the main pagefault code.
- */
-static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
-		     unsigned long address, pte_t *page_table, pmd_t *pmd,
-		     int write_access)
-{
-	spinlock_t *ptl;
-	pte_t entry;
-	unsigned long pfn;
-
-	pte_unmap(page_table);
-	BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
-	BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
-
-	pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK);
-
-	BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
-
-	if (unlikely(pfn == NOPFN_OOM))
-		return VM_FAULT_OOM;
-	else if (unlikely(pfn == NOPFN_SIGBUS))
-		return VM_FAULT_SIGBUS;
-	else if (unlikely(pfn == NOPFN_REFAULT))
-		return 0;
-
-	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
-
-	/* Only go through if we didn't race with anybody else... */
-	if (pte_none(*page_table)) {
-		entry = pfn_pte(pfn, vma->vm_page_prot);
-		if (write_access)
-			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-		set_pte_at(mm, address, page_table, entry);
-	}
-	pte_unmap_unlock(page_table, ptl);
-	return 0;
-}
-
 /*
  * Fault of a previously existing named mapping. Repopulate the pte
  * from the encoded file_pte if possible. This enables swappable
@@ -2614,9 +2588,6 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 			if (likely(vma->vm_ops->fault))
 				return do_linear_fault(mm, vma, address,
 					pte, pmd, write_access, entry);
-			if (unlikely(vma->vm_ops->nopfn))
-				return do_no_pfn(mm, vma, address, pte,
-							pmd, write_access);
 		}
 		return do_anonymous_page(mm, vma, address,
 					 pte, pmd, write_access);
@@ -2804,6 +2775,86 @@ int in_gate_area_no_task(unsigned long addr)
 
 #endif	/* __HAVE_ARCH_GATE_AREA */
 
+#ifdef CONFIG_HAVE_IOREMAP_PROT
+static resource_size_t follow_phys(struct vm_area_struct *vma,
+			unsigned long address, unsigned int flags,
+			unsigned long *prot)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *ptep, pte;
+	spinlock_t *ptl;
+	resource_size_t phys_addr = 0;
+	struct mm_struct *mm = vma->vm_mm;
+
+	VM_BUG_ON(!(vma->vm_flags & (VM_IO | VM_PFNMAP)));
+
+	pgd = pgd_offset(mm, address);
+	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+		goto no_page_table;
+
+	pud = pud_offset(pgd, address);
+	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
+		goto no_page_table;
+
+	pmd = pmd_offset(pud, address);
+	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
+		goto no_page_table;
+
+	/* We cannot handle huge page PFN maps. Luckily they don't exist. */
+	if (pmd_huge(*pmd))
+		goto no_page_table;
+
+	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+	if (!ptep)
+		goto out;
+
+	pte = *ptep;
+	if (!pte_present(pte))
+		goto unlock;
+	if ((flags & FOLL_WRITE) && !pte_write(pte))
+		goto unlock;
+	phys_addr = pte_pfn(pte);
+	phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */
+
+	*prot = pgprot_val(pte_pgprot(pte));
+
+unlock:
+	pte_unmap_unlock(ptep, ptl);
+out:
+	return phys_addr;
+no_page_table:
+	return 0;
+}
+
+int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
+			void *buf, int len, int write)
+{
+	resource_size_t phys_addr;
+	unsigned long prot = 0;
+	void *maddr;
+	int offset = addr & (PAGE_SIZE-1);
+
+	if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+		return -EINVAL;
+
+	phys_addr = follow_phys(vma, addr, write, &prot);
+
+	if (!phys_addr)
+		return -EINVAL;
+
+	maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
+	if (write)
+		memcpy_toio(maddr + offset, buf, len);
+	else
+		memcpy_fromio(buf, maddr + offset, len);
+	iounmap(maddr);
+
+	return len;
+}
+#endif
+
 /*
  * Access another process' address space.
  * Source/target buffer must be kernel space,
@@ -2813,7 +2864,6 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
 {
 	struct mm_struct *mm;
 	struct vm_area_struct *vma;
-	struct page *page;
 	void *old_buf = buf;
 
 	mm = get_task_mm(tsk);
@@ -2825,28 +2875,44 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
 	while (len) {
 		int bytes, ret, offset;
 		void *maddr;
+		struct page *page = NULL;
 
 		ret = get_user_pages(tsk, mm, addr, 1,
 				write, 1, &page, &vma);
-		if (ret <= 0)
-			break;
-
-		bytes = len;
-		offset = addr & (PAGE_SIZE-1);
-		if (bytes > PAGE_SIZE-offset)
-			bytes = PAGE_SIZE-offset;
-
-		maddr = kmap(page);
-		if (write) {
-			copy_to_user_page(vma, page, addr,
-					  maddr + offset, buf, bytes);
-			set_page_dirty_lock(page);
+		if (ret <= 0) {
+			/*
+			 * Check if this is a VM_IO | VM_PFNMAP VMA, which
+			 * we can access using slightly different code.
+			 */
+#ifdef CONFIG_HAVE_IOREMAP_PROT
+			vma = find_vma(mm, addr);
+			if (!vma)
+				break;
+			if (vma->vm_ops && vma->vm_ops->access)
+				ret = vma->vm_ops->access(vma, addr, buf,
+							  len, write);
+			if (ret <= 0)
+#endif
+				break;
+			bytes = ret;
 		} else {
-			copy_from_user_page(vma, page, addr,
-					    buf, maddr + offset, bytes);
+			bytes = len;
+			offset = addr & (PAGE_SIZE-1);
+			if (bytes > PAGE_SIZE-offset)
+				bytes = PAGE_SIZE-offset;
+
+			maddr = kmap(page);
+			if (write) {
+				copy_to_user_page(vma, page, addr,
+						  maddr + offset, buf, bytes);
+				set_page_dirty_lock(page);
+			} else {
+				copy_from_user_page(vma, page, addr,
+						    buf, maddr + offset, bytes);
+			}
+			kunmap(page);
+			page_cache_release(page);
 		}
-		kunmap(page);
-		page_cache_release(page);
 		len -= bytes;
 		buf += bytes;
 		addr += bytes;
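
Taken together, the last three hunks let access_process_vm() reach VM_IO / VM_PFNMAP mappings that get_user_pages() cannot pin: when the page lookup fails, the fallback path calls the VMA's vm_ops->access method, and generic_access_phys() is the stock implementation for mappings whose pages can be ioremap'd. Below is a minimal sketch of how a driver might wire this up; the device name, physical base, and mmap details are illustrative assumptions, not part of this diff, while the ->access hook and generic_access_phys() (which needs CONFIG_HAVE_IOREMAP_PROT) are the interfaces the diff adds.

/*
 * Illustrative driver glue (hypothetical names): the device memory is
 * mapped with remap_pfn_range(), so the VMA is VM_IO | VM_PFNMAP and
 * get_user_pages() fails on it; setting .access = generic_access_phys
 * lets ptrace()/gdb read and write the mapping through the new path.
 */
#include <linux/fs.h>
#include <linux/mm.h>

#define MYDEV_PHYS_BASE	0xfe000000UL	/* hypothetical device address */

static struct vm_operations_struct mydev_vm_ops = {
	.access = generic_access_phys,
};

static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;

	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
	if (remap_pfn_range(vma, vma->vm_start, MYDEV_PHYS_BASE >> PAGE_SHIFT,
			    size, vma->vm_page_prot))
		return -EAGAIN;

	vma->vm_ops = &mydev_vm_ops;
	return 0;
}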