author	Kirill A. Shutemov <kirill.shutemov@linux.intel.com>	2016-07-26 18:25:23 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-07-26 19:19:19 -0400
commit	7267ec008b5cd8b3579e188b1ff238815643e372
tree	06e45eb3b7b951799e452403dfaf77fefb726b54	/mm/memory.c
parent	bae473a423f65e480db83c85b5e92254f6dfcb28
mm: postpone page table allocation until we have page to map
The idea (and most of the code) is, again, borrowed from Hugh's patchset on huge tmpfs [1].

Instead of allocating the pte page table upfront, we postpone this until we actually have a page to map in hand. This approach opens up the possibility of mapping the page as huge, if the filesystem supports it.

Compared to Hugh's patch, I've pushed the page table allocation a bit further: into do_set_pte(). This way we can postpone the allocation even in the faultaround case, without moving do_fault_around() after __do_fault().

do_set_pte() got renamed to alloc_set_pte(), as it can now allocate a page table if required.

[1] http://lkml.kernel.org/r/alpine.LSU.2.11.1502202015090.14414@eggly.anvils

Link: http://lkml.kernel.org/r/1466021202-61880-10-git-send-email-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
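The ordering the patch introduces is easiest to see outside the kernel. Below is a hedged, userspace-only toy model: the leaf "pte" table is allocated only once a page is actually in hand, not on entry to the fault handler. The names fault_env, pte_alloc_one_map() and alloc_set_pte() mirror the patch; everything else (table size, calloc(), return codes) is purely illustrative and not kernel API.

/*
 * Toy model (userspace, not kernel code) of the allocation ordering this
 * patch introduces: the leaf table is allocated at the moment we have a
 * page to install, not when the fault handler is entered.
 */
#include <stdio.h>
#include <stdlib.h>

#define PTRS_PER_PTE 16

struct fault_env {
	void ***pmd;		/* slot in the upper-level table */
	void **pte;		/* leaf table, NULL until someone needs it */
	unsigned long address;
};

/* Late allocation of the leaf table, loosely analogous to pte_alloc_one_map(). */
static int pte_alloc_one_map(struct fault_env *fe)
{
	if (!*fe->pmd) {
		*fe->pmd = calloc(PTRS_PER_PTE, sizeof(void *));
		if (!*fe->pmd)
			return -1;	/* would be VM_FAULT_OOM in the kernel */
	}
	fe->pte = *fe->pmd;
	return 0;
}

/* Analogous to alloc_set_pte(): allocate the table (if still missing) only
 * now that we have a page in hand, then install the entry. */
static int alloc_set_pte(struct fault_env *fe, void *page)
{
	if (!fe->pte && pte_alloc_one_map(fe))
		return -1;
	fe->pte[fe->address % PTRS_PER_PTE] = page;
	return 0;
}

int main(void)
{
	void **pmd_slot = NULL;
	struct fault_env fe = { .pmd = &pmd_slot, .pte = NULL, .address = 5 };
	char page[] = "payload";

	/* handle_pte_fault() analogue: nothing is allocated until the page
	 * is ready to be mapped. */
	if (alloc_set_pte(&fe, page))
		return 1;
	printf("entry %lu -> %s\n", fe.address % PTRS_PER_PTE, (char *)fe.pte[5]);
	free(pmd_slot);
	return 0;
}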
Diffstat (limited to 'mm/memory.c')
-rw-r--r--	mm/memory.c	298
1 file changed, 178 insertions(+), 120 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 72b520897339..1991105bf67c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2739,8 +2739,6 @@ static int do_anonymous_page(struct fault_env *fe)
 	struct page *page;
 	pte_t entry;
 
-	pte_unmap(fe->pte);
-
 	/* File mapping without ->vm_ops ? */
 	if (vma->vm_flags & VM_SHARED)
 		return VM_FAULT_SIGBUS;
@@ -2749,6 +2747,23 @@ static int do_anonymous_page(struct fault_env *fe)
 	if (check_stack_guard_page(vma, fe->address) < 0)
 		return VM_FAULT_SIGSEGV;
 
+	/*
+	 * Use pte_alloc() instead of pte_alloc_map(). We can't run
+	 * pte_offset_map() on pmds where a huge pmd might be created
+	 * from a different thread.
+	 *
+	 * pte_alloc_map() is safe to use under down_write(mmap_sem) or when
+	 * parallel threads are excluded by other means.
+	 *
+	 * Here we only have down_read(mmap_sem).
+	 */
+	if (pte_alloc(vma->vm_mm, fe->pmd, fe->address))
+		return VM_FAULT_OOM;
+
+	/* See the comment in pte_alloc_one_map() */
+	if (unlikely(pmd_trans_unstable(fe->pmd)))
+		return 0;
+
 	/* Use the zero-page for reads */
 	if (!(fe->flags & FAULT_FLAG_WRITE) &&
 			!mm_forbids_zeropage(vma->vm_mm)) {
@@ -2865,23 +2880,76 @@ static int __do_fault(struct fault_env *fe, pgoff_t pgoff,
 	return ret;
 }
 
+static int pte_alloc_one_map(struct fault_env *fe)
+{
+	struct vm_area_struct *vma = fe->vma;
+
+	if (!pmd_none(*fe->pmd))
+		goto map_pte;
+	if (fe->prealloc_pte) {
+		fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
+		if (unlikely(!pmd_none(*fe->pmd))) {
+			spin_unlock(fe->ptl);
+			goto map_pte;
+		}
+
+		atomic_long_inc(&vma->vm_mm->nr_ptes);
+		pmd_populate(vma->vm_mm, fe->pmd, fe->prealloc_pte);
+		spin_unlock(fe->ptl);
+		fe->prealloc_pte = 0;
+	} else if (unlikely(pte_alloc(vma->vm_mm, fe->pmd, fe->address))) {
+		return VM_FAULT_OOM;
+	}
+map_pte:
+	/*
+	 * If a huge pmd materialized under us just retry later. Use
+	 * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd
+	 * didn't become pmd_trans_huge under us and then back to pmd_none, as
+	 * a result of MADV_DONTNEED running immediately after a huge pmd fault
+	 * in a different thread of this mm, in turn leading to a misleading
+	 * pmd_trans_huge() retval. All we have to ensure is that it is a
+	 * regular pmd that we can walk with pte_offset_map() and we can do that
+	 * through an atomic read in C, which is what pmd_trans_unstable()
+	 * provides.
+	 */
+	if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd))
+		return VM_FAULT_NOPAGE;
+
+	fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
+			&fe->ptl);
+	return 0;
+}
+
 /**
- * do_set_pte - setup new PTE entry for given page and add reverse page mapping.
+ * alloc_set_pte - setup new PTE entry for given page and add reverse page
+ * mapping. If needed, the fucntion allocates page table or use pre-allocated.
  *
  * @fe: fault environment
+ * @memcg: memcg to charge page (only for private mappings)
  * @page: page to map
  *
- * Caller must hold page table lock relevant for @fe->pte.
+ * Caller must take care of unlocking fe->ptl, if fe->pte is non-NULL on return.
  *
  * Target users are page handler itself and implementations of
  * vm_ops->map_pages.
  */
-void do_set_pte(struct fault_env *fe, struct page *page)
+int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
+		struct page *page)
 {
 	struct vm_area_struct *vma = fe->vma;
 	bool write = fe->flags & FAULT_FLAG_WRITE;
 	pte_t entry;
 
+	if (!fe->pte) {
+		int ret = pte_alloc_one_map(fe);
+		if (ret)
+			return ret;
+	}
+
+	/* Re-check under ptl */
+	if (unlikely(!pte_none(*fe->pte)))
+		return VM_FAULT_NOPAGE;
+
 	flush_icache_page(vma, page);
 	entry = mk_pte(page, vma->vm_page_prot);
 	if (write)
@@ -2890,6 +2958,8 @@ void do_set_pte(struct fault_env *fe, struct page *page)
 	if (write && !(vma->vm_flags & VM_SHARED)) {
 		inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
 		page_add_new_anon_rmap(page, vma, fe->address, false);
+		mem_cgroup_commit_charge(page, memcg, false, false);
+		lru_cache_add_active_or_unevictable(page, vma);
 	} else {
 		inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
 		page_add_file_rmap(page);
@@ -2898,6 +2968,8 @@ void do_set_pte(struct fault_env *fe, struct page *page)
 
 	/* no need to invalidate: a not-present page won't be cached */
 	update_mmu_cache(vma, fe->address, fe->pte);
+
+	return 0;
 }
 
 static unsigned long fault_around_bytes __read_mostly =
@@ -2964,19 +3036,17 @@ late_initcall(fault_around_debugfs);
  * fault_around_pages() value (and therefore to page order). This way it's
  * easier to guarantee that we don't cross page table boundaries.
  */
-static void do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)
+static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)
 {
-	unsigned long address = fe->address, start_addr, nr_pages, mask;
-	pte_t *pte = fe->pte;
+	unsigned long address = fe->address, nr_pages, mask;
 	pgoff_t end_pgoff;
-	int off;
+	int off, ret = 0;
 
 	nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
 	mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
 
-	start_addr = max(fe->address & mask, fe->vma->vm_start);
-	off = ((fe->address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
-	fe->pte -= off;
+	fe->address = max(address & mask, fe->vma->vm_start);
+	off = ((address - fe->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
 	start_pgoff -= off;
 
 	/*
@@ -2984,30 +3054,45 @@ static void do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)
  * or fault_around_pages() from start_pgoff, depending what is nearest.
  */
 	end_pgoff = start_pgoff -
-		((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
+		((fe->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
 		PTRS_PER_PTE - 1;
 	end_pgoff = min3(end_pgoff, vma_pages(fe->vma) + fe->vma->vm_pgoff - 1,
 			start_pgoff + nr_pages - 1);
 
-	/* Check if it makes any sense to call ->map_pages */
-	fe->address = start_addr;
-	while (!pte_none(*fe->pte)) {
-		if (++start_pgoff > end_pgoff)
-			goto out;
-		fe->address += PAGE_SIZE;
-		if (fe->address >= fe->vma->vm_end)
-			goto out;
-		fe->pte++;
+	if (pmd_none(*fe->pmd)) {
+		fe->prealloc_pte = pte_alloc_one(fe->vma->vm_mm, fe->address);
+		smp_wmb(); /* See comment in __pte_alloc() */
 	}
 
 	fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff);
+
+	/* preallocated pagetable is unused: free it */
+	if (fe->prealloc_pte) {
+		pte_free(fe->vma->vm_mm, fe->prealloc_pte);
+		fe->prealloc_pte = 0;
+	}
+	/* Huge page is mapped? Page fault is solved */
+	if (pmd_trans_huge(*fe->pmd)) {
+		ret = VM_FAULT_NOPAGE;
+		goto out;
+	}
+
+	/* ->map_pages() haven't done anything useful. Cold page cache? */
+	if (!fe->pte)
+		goto out;
+
+	/* check if the page fault is solved */
+	fe->pte -= (fe->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
+	if (!pte_none(*fe->pte))
+		ret = VM_FAULT_NOPAGE;
+	pte_unmap_unlock(fe->pte, fe->ptl);
 out:
-	/* restore fault_env */
-	fe->pte = pte;
 	fe->address = address;
+	fe->pte = NULL;
+	return ret;
 }
 
-static int do_read_fault(struct fault_env *fe, pgoff_t pgoff, pte_t orig_pte)
+static int do_read_fault(struct fault_env *fe, pgoff_t pgoff)
 {
 	struct vm_area_struct *vma = fe->vma;
 	struct page *fault_page;
@@ -3019,36 +3104,25 @@ static int do_read_fault(struct fault_env *fe, pgoff_t pgoff, pte_t orig_pte)
  * something).
  */
 	if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
-		fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
-				&fe->ptl);
-		if (!pte_same(*fe->pte, orig_pte))
-			goto unlock_out;
-		do_fault_around(fe, pgoff);
-		/* Check if the fault is handled by faultaround */
-		if (!pte_same(*fe->pte, orig_pte))
-			goto unlock_out;
-		pte_unmap_unlock(fe->pte, fe->ptl);
+		ret = do_fault_around(fe, pgoff);
+		if (ret)
+			return ret;
 	}
 
 	ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL);
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
 		return ret;
 
-	fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, &fe->ptl);
-	if (unlikely(!pte_same(*fe->pte, orig_pte))) {
+	ret |= alloc_set_pte(fe, NULL, fault_page);
+	if (fe->pte)
 		pte_unmap_unlock(fe->pte, fe->ptl);
-		unlock_page(fault_page);
-		put_page(fault_page);
-		return ret;
-	}
-	do_set_pte(fe, fault_page);
 	unlock_page(fault_page);
-unlock_out:
-	pte_unmap_unlock(fe->pte, fe->ptl);
+	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+		put_page(fault_page);
 	return ret;
 }
 
-static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff, pte_t orig_pte)
+static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff)
 {
 	struct vm_area_struct *vma = fe->vma;
 	struct page *fault_page, *new_page;
@@ -3077,29 +3151,17 @@ static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff, pte_t orig_pte)
 	copy_user_highpage(new_page, fault_page, fe->address, vma);
 	__SetPageUptodate(new_page);
 
-	fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
-			&fe->ptl);
-	if (unlikely(!pte_same(*fe->pte, orig_pte))) {
+	ret |= alloc_set_pte(fe, memcg, new_page);
+	if (fe->pte)
 		pte_unmap_unlock(fe->pte, fe->ptl);
-		if (!(ret & VM_FAULT_DAX_LOCKED)) {
-			unlock_page(fault_page);
-			put_page(fault_page);
-		} else {
-			dax_unlock_mapping_entry(vma->vm_file->f_mapping,
-					pgoff);
-		}
-		goto uncharge_out;
-	}
-	do_set_pte(fe, new_page);
-	mem_cgroup_commit_charge(new_page, memcg, false, false);
-	lru_cache_add_active_or_unevictable(new_page, vma);
-	pte_unmap_unlock(fe->pte, fe->ptl);
 	if (!(ret & VM_FAULT_DAX_LOCKED)) {
 		unlock_page(fault_page);
 		put_page(fault_page);
 	} else {
 		dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff);
 	}
+	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+		goto uncharge_out;
 	return ret;
 uncharge_out:
 	mem_cgroup_cancel_charge(new_page, memcg, false);
@@ -3107,7 +3169,7 @@ uncharge_out:
 	return ret;
 }
 
-static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff, pte_t orig_pte)
+static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff)
 {
 	struct vm_area_struct *vma = fe->vma;
 	struct page *fault_page;
@@ -3133,16 +3195,15 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff, pte_t orig_pte)
 		}
 	}
 
-	fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
-			&fe->ptl);
-	if (unlikely(!pte_same(*fe->pte, orig_pte))) {
+	ret |= alloc_set_pte(fe, NULL, fault_page);
+	if (fe->pte)
 		pte_unmap_unlock(fe->pte, fe->ptl);
+	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
+					VM_FAULT_RETRY))) {
 		unlock_page(fault_page);
 		put_page(fault_page);
 		return ret;
 	}
-	do_set_pte(fe, fault_page);
-	pte_unmap_unlock(fe->pte, fe->ptl);
 
 	if (set_page_dirty(fault_page))
 		dirtied = 1;
@@ -3174,20 +3235,19 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff, pte_t orig_pte)
  * The mmap_sem may have been released depending on flags and our
  * return value. See filemap_fault() and __lock_page_or_retry().
  */
-static int do_fault(struct fault_env *fe, pte_t orig_pte)
+static int do_fault(struct fault_env *fe)
 {
 	struct vm_area_struct *vma = fe->vma;
 	pgoff_t pgoff = linear_page_index(vma, fe->address);
 
-	pte_unmap(fe->pte);
 	/* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
 	if (!vma->vm_ops->fault)
 		return VM_FAULT_SIGBUS;
 	if (!(fe->flags & FAULT_FLAG_WRITE))
-		return do_read_fault(fe, pgoff, orig_pte);
+		return do_read_fault(fe, pgoff);
 	if (!(vma->vm_flags & VM_SHARED))
-		return do_cow_fault(fe, pgoff, orig_pte);
-	return do_shared_fault(fe, pgoff, orig_pte);
+		return do_cow_fault(fe, pgoff);
+	return do_shared_fault(fe, pgoff);
 }
 
 static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
@@ -3327,37 +3387,63 @@ static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd)
  * with external mmu caches can use to update those (ie the Sparc or
  * PowerPC hashed page tables that act as extended TLBs).
  *
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), and pte mapped but not yet locked.
- * We return with pte unmapped and unlocked.
+ * We enter with non-exclusive mmap_sem (to exclude vma changes, but allow
+ * concurrent faults).
  *
- * The mmap_sem may have been released depending on flags and our
- * return value. See filemap_fault() and __lock_page_or_retry().
+ * The mmap_sem may have been released depending on flags and our return value.
+ * See filemap_fault() and __lock_page_or_retry().
  */
 static int handle_pte_fault(struct fault_env *fe)
 {
 	pte_t entry;
 
-	/*
-	 * some architectures can have larger ptes than wordsize,
-	 * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and CONFIG_32BIT=y,
-	 * so READ_ONCE or ACCESS_ONCE cannot guarantee atomic accesses.
-	 * The code below just needs a consistent view for the ifs and
-	 * we later double check anyway with the ptl lock held. So here
-	 * a barrier will do.
-	 */
-	entry = *fe->pte;
-	barrier();
-	if (!pte_present(entry)) {
+	if (unlikely(pmd_none(*fe->pmd))) {
+		/*
+		 * Leave __pte_alloc() until later: because vm_ops->fault may
+		 * want to allocate huge page, and if we expose page table
+		 * for an instant, it will be difficult to retract from
+		 * concurrent faults and from rmap lookups.
+		 */
+		fe->pte = NULL;
+	} else {
+		/* See comment in pte_alloc_one_map() */
+		if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd))
+			return 0;
+		/*
+		 * A regular pmd is established and it can't morph into a huge
+		 * pmd from under us anymore at this point because we hold the
+		 * mmap_sem read mode and khugepaged takes it in write mode.
+		 * So now it's safe to run pte_offset_map().
+		 */
+		fe->pte = pte_offset_map(fe->pmd, fe->address);
+
+		entry = *fe->pte;
+
+		/*
+		 * some architectures can have larger ptes than wordsize,
+		 * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and
+		 * CONFIG_32BIT=y, so READ_ONCE or ACCESS_ONCE cannot guarantee
+		 * atomic accesses. The code below just needs a consistent
+		 * view for the ifs and we later double check anyway with the
+		 * ptl lock held. So here a barrier will do.
+		 */
+		barrier();
 		if (pte_none(entry)) {
-			if (vma_is_anonymous(fe->vma))
-				return do_anonymous_page(fe);
-			else
-				return do_fault(fe, entry);
+			pte_unmap(fe->pte);
+			fe->pte = NULL;
 		}
-		return do_swap_page(fe, entry);
 	}
 
+	if (!fe->pte) {
+		if (vma_is_anonymous(fe->vma))
+			return do_anonymous_page(fe);
+		else
+			return do_fault(fe);
+	}
+
+	if (!pte_present(entry))
+		return do_swap_page(fe, entry);
+
 	if (pte_protnone(entry))
 		return do_numa_page(fe, entry);
 
@@ -3439,34 +3525,6 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 		}
 	}
 
-	/*
-	 * Use pte_alloc() instead of pte_alloc_map, because we can't
-	 * run pte_offset_map on the pmd, if an huge pmd could
-	 * materialize from under us from a different thread.
-	 */
-	if (unlikely(pte_alloc(fe.vma->vm_mm, fe.pmd, fe.address)))
-		return VM_FAULT_OOM;
-	/*
-	 * If a huge pmd materialized under us just retry later. Use
-	 * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd
-	 * didn't become pmd_trans_huge under us and then back to pmd_none, as
-	 * a result of MADV_DONTNEED running immediately after a huge pmd fault
-	 * in a different thread of this mm, in turn leading to a misleading
-	 * pmd_trans_huge() retval. All we have to ensure is that it is a
-	 * regular pmd that we can walk with pte_offset_map() and we can do that
-	 * through an atomic read in C, which is what pmd_trans_unstable()
-	 * provides.
-	 */
-	if (unlikely(pmd_trans_unstable(fe.pmd) || pmd_devmap(*fe.pmd)))
-		return 0;
-	/*
-	 * A regular pmd is established and it can't morph into a huge pmd
-	 * from under us anymore at this point because we hold the mmap_sem
-	 * read mode and khugepaged takes it in write mode. So now it's
-	 * safe to run pte_offset_map().
-	 */
-	fe.pte = pte_offset_map(fe.pmd, fe.address);
-
 	return handle_pte_fault(&fe);
 }
 
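For orientation after reading the hunks above, here is a hedged, userspace-compilable sketch of the decision flow handle_pte_fault() is left with after this patch. Only the branch structure is condensed from the diff; the struct fields, helper name, and returned strings are stand-ins, not kernel types or APIs.

/*
 * Hedged sketch: branch structure condensed from the new handle_pte_fault();
 * everything else is a stand-in for kernel predicates and handlers.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_fault {
	bool pmd_none;		/* no pte table under this pmd yet */
	bool pmd_unstable;	/* huge pmd materialized concurrently */
	bool pte_none;		/* mapped pte slot is empty */
	bool pte_present;	/* pte points at a resident page */
	bool anon;		/* vma_is_anonymous() */
};

static const char *toy_handle_pte_fault(struct toy_fault *fe)
{
	bool have_pte;

	if (fe->pmd_none) {
		/* Leave page table allocation to the fault handlers. */
		have_pte = false;
	} else {
		if (fe->pmd_unstable)
			return "return 0 (huge pmd raced in, fault retried)";
		/* Safe to walk the existing pte table. */
		have_pte = !fe->pte_none;
	}

	if (!have_pte)
		return fe->anon ? "do_anonymous_page()" : "do_fault()";
	if (!fe->pte_present)
		return "do_swap_page()";
	return "present pte: NUMA / write / dirty handling under ptl";
}

int main(void)
{
	struct toy_fault file_fault = { .pmd_none = true, .anon = false };
	struct toy_fault swap_fault = { .pte_present = false, .pte_none = false };

	printf("%s\n", toy_handle_pte_fault(&file_fault));	/* do_fault() */
	printf("%s\n", toy_handle_pte_fault(&swap_fault));	/* do_swap_page() */
	return 0;
}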