Diffstat (limited to 'mm/memory.c')
-rw-r--r--	mm/memory.c	298
1 file changed, 178 insertions(+), 120 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 72b520897339..1991105bf67c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2739,8 +2739,6 @@ static int do_anonymous_page(struct fault_env *fe)
 	struct page *page;
 	pte_t entry;
 
-	pte_unmap(fe->pte);
-
 	/* File mapping without ->vm_ops ? */
 	if (vma->vm_flags & VM_SHARED)
 		return VM_FAULT_SIGBUS;
@@ -2749,6 +2747,23 @@ static int do_anonymous_page(struct fault_env *fe)
 	if (check_stack_guard_page(vma, fe->address) < 0)
 		return VM_FAULT_SIGSEGV;
 
+	/*
+	 * Use pte_alloc() instead of pte_alloc_map(). We can't run
+	 * pte_offset_map() on pmds where a huge pmd might be created
+	 * from a different thread.
+	 *
+	 * pte_alloc_map() is safe to use under down_write(mmap_sem) or when
+	 * parallel threads are excluded by other means.
+	 *
+	 * Here we only have down_read(mmap_sem).
+	 */
+	if (pte_alloc(vma->vm_mm, fe->pmd, fe->address))
+		return VM_FAULT_OOM;
+
+	/* See the comment in pte_alloc_one_map() */
+	if (unlikely(pmd_trans_unstable(fe->pmd)))
+		return 0;
+
 	/* Use the zero-page for reads */
 	if (!(fe->flags & FAULT_FLAG_WRITE) &&
 			!mm_forbids_zeropage(vma->vm_mm)) {
@@ -2865,23 +2880,76 @@ static int __do_fault(struct fault_env *fe, pgoff_t pgoff,
 	return ret;
 }
 
+static int pte_alloc_one_map(struct fault_env *fe)
+{
+	struct vm_area_struct *vma = fe->vma;
+
+	if (!pmd_none(*fe->pmd))
+		goto map_pte;
+	if (fe->prealloc_pte) {
+		fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
+		if (unlikely(!pmd_none(*fe->pmd))) {
+			spin_unlock(fe->ptl);
+			goto map_pte;
+		}
+
+		atomic_long_inc(&vma->vm_mm->nr_ptes);
+		pmd_populate(vma->vm_mm, fe->pmd, fe->prealloc_pte);
+		spin_unlock(fe->ptl);
+		fe->prealloc_pte = 0;
+	} else if (unlikely(pte_alloc(vma->vm_mm, fe->pmd, fe->address))) {
+		return VM_FAULT_OOM;
+	}
+map_pte:
+	/*
+	 * If a huge pmd materialized under us just retry later. Use
+	 * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd
+	 * didn't become pmd_trans_huge under us and then back to pmd_none, as
+	 * a result of MADV_DONTNEED running immediately after a huge pmd fault
+	 * in a different thread of this mm, in turn leading to a misleading
+	 * pmd_trans_huge() retval. All we have to ensure is that it is a
+	 * regular pmd that we can walk with pte_offset_map() and we can do that
+	 * through an atomic read in C, which is what pmd_trans_unstable()
+	 * provides.
+	 */
+	if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd))
+		return VM_FAULT_NOPAGE;
+
+	fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
+			&fe->ptl);
+	return 0;
+}
+
 /**
- * do_set_pte - setup new PTE entry for given page and add reverse page mapping.
+ * alloc_set_pte - setup new PTE entry for given page and add reverse page
+ * mapping. If needed, the function allocates a page table or uses a pre-allocated one.
  *
  * @fe: fault environment
+ * @memcg: memcg to charge page (only for private mappings)
  * @page: page to map
  *
- * Caller must hold page table lock relevant for @fe->pte.
+ * Caller must take care of unlocking fe->ptl, if fe->pte is non-NULL on return.
  *
  * Target users are page handler itself and implementations of
  * vm_ops->map_pages.
  */
-void do_set_pte(struct fault_env *fe, struct page *page)
+int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
+		struct page *page)
 {
 	struct vm_area_struct *vma = fe->vma;
 	bool write = fe->flags & FAULT_FLAG_WRITE;
 	pte_t entry;
 
+	if (!fe->pte) {
+		int ret = pte_alloc_one_map(fe);
+		if (ret)
+			return ret;
+	}
+
+	/* Re-check under ptl */
+	if (unlikely(!pte_none(*fe->pte)))
+		return VM_FAULT_NOPAGE;
+
 	flush_icache_page(vma, page);
 	entry = mk_pte(page, vma->vm_page_prot);
 	if (write)
@@ -2890,6 +2958,8 @@ void do_set_pte(struct fault_env *fe, struct page *page)
 	if (write && !(vma->vm_flags & VM_SHARED)) {
 		inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
 		page_add_new_anon_rmap(page, vma, fe->address, false);
+		mem_cgroup_commit_charge(page, memcg, false, false);
+		lru_cache_add_active_or_unevictable(page, vma);
 	} else {
 		inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
 		page_add_file_rmap(page);
@@ -2898,6 +2968,8 @@ void do_set_pte(struct fault_env *fe, struct page *page)
 
 	/* no need to invalidate: a not-present page won't be cached */
 	update_mmu_cache(vma, fe->address, fe->pte);
+
+	return 0;
 }
 
 static unsigned long fault_around_bytes __read_mostly =
@@ -2964,19 +3036,17 @@ late_initcall(fault_around_debugfs);
  * fault_around_pages() value (and therefore to page order). This way it's
  * easier to guarantee that we don't cross page table boundaries.
  */
-static void do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)
+static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)
 {
-	unsigned long address = fe->address, start_addr, nr_pages, mask;
-	pte_t *pte = fe->pte;
+	unsigned long address = fe->address, nr_pages, mask;
 	pgoff_t end_pgoff;
-	int off;
+	int off, ret = 0;
 
 	nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
 	mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
 
-	start_addr = max(fe->address & mask, fe->vma->vm_start);
-	off = ((fe->address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
-	fe->pte -= off;
+	fe->address = max(address & mask, fe->vma->vm_start);
+	off = ((address - fe->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
 	start_pgoff -= off;
 
 	/*
@@ -2984,30 +3054,45 @@ static void do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)
 	 * or fault_around_pages() from start_pgoff, depending what is nearest.
 	 */
 	end_pgoff = start_pgoff -
-		((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
+		((fe->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
 		PTRS_PER_PTE - 1;
 	end_pgoff = min3(end_pgoff, vma_pages(fe->vma) + fe->vma->vm_pgoff - 1,
 			start_pgoff + nr_pages - 1);
 
-	/* Check if it makes any sense to call ->map_pages */
-	fe->address = start_addr;
-	while (!pte_none(*fe->pte)) {
-		if (++start_pgoff > end_pgoff)
-			goto out;
-		fe->address += PAGE_SIZE;
-		if (fe->address >= fe->vma->vm_end)
-			goto out;
-		fe->pte++;
+	if (pmd_none(*fe->pmd)) {
+		fe->prealloc_pte = pte_alloc_one(fe->vma->vm_mm, fe->address);
+		smp_wmb(); /* See comment in __pte_alloc() */
 	}
 
 	fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff);
+
+	/* preallocated pagetable is unused: free it */
+	if (fe->prealloc_pte) {
+		pte_free(fe->vma->vm_mm, fe->prealloc_pte);
+		fe->prealloc_pte = 0;
+	}
+	/* Huge page is mapped? Page fault is solved */
+	if (pmd_trans_huge(*fe->pmd)) {
+		ret = VM_FAULT_NOPAGE;
+		goto out;
+	}
+
+	/* ->map_pages() haven't done anything useful. Cold page cache? */
+	if (!fe->pte)
+		goto out;
+
+	/* check if the page fault is solved */
+	fe->pte -= (fe->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
+	if (!pte_none(*fe->pte))
+		ret = VM_FAULT_NOPAGE;
+	pte_unmap_unlock(fe->pte, fe->ptl);
 out:
-	/* restore fault_env */
-	fe->pte = pte;
 	fe->address = address;
+	fe->pte = NULL;
+	return ret;
 }
 
-static int do_read_fault(struct fault_env *fe, pgoff_t pgoff, pte_t orig_pte)
+static int do_read_fault(struct fault_env *fe, pgoff_t pgoff)
 {
 	struct vm_area_struct *vma = fe->vma;
 	struct page *fault_page;
@@ -3019,36 +3104,25 @@ static int do_read_fault(struct fault_env *fe, pgoff_t pgoff, pte_t orig_pte)
 	 * something).
 	 */
 	if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
-		fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
-				&fe->ptl);
-		if (!pte_same(*fe->pte, orig_pte))
-			goto unlock_out;
-		do_fault_around(fe, pgoff);
-		/* Check if the fault is handled by faultaround */
-		if (!pte_same(*fe->pte, orig_pte))
-			goto unlock_out;
-		pte_unmap_unlock(fe->pte, fe->ptl);
+		ret = do_fault_around(fe, pgoff);
+		if (ret)
+			return ret;
 	}
 
 	ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL);
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
 		return ret;
 
-	fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, &fe->ptl);
-	if (unlikely(!pte_same(*fe->pte, orig_pte))) {
+	ret |= alloc_set_pte(fe, NULL, fault_page);
+	if (fe->pte)
 		pte_unmap_unlock(fe->pte, fe->ptl);
-		unlock_page(fault_page);
-		put_page(fault_page);
-		return ret;
-	}
-	do_set_pte(fe, fault_page);
 	unlock_page(fault_page);
-unlock_out:
-	pte_unmap_unlock(fe->pte, fe->ptl);
+	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+		put_page(fault_page);
 	return ret;
 }
 
-static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff, pte_t orig_pte)
+static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff)
 {
 	struct vm_area_struct *vma = fe->vma;
 	struct page *fault_page, *new_page;
@@ -3077,29 +3151,17 @@ static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff, pte_t orig_pte)
 	copy_user_highpage(new_page, fault_page, fe->address, vma);
 	__SetPageUptodate(new_page);
 
-	fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
-			&fe->ptl);
-	if (unlikely(!pte_same(*fe->pte, orig_pte))) {
+	ret |= alloc_set_pte(fe, memcg, new_page);
+	if (fe->pte)
 		pte_unmap_unlock(fe->pte, fe->ptl);
-		if (!(ret & VM_FAULT_DAX_LOCKED)) {
-			unlock_page(fault_page);
-			put_page(fault_page);
-		} else {
-			dax_unlock_mapping_entry(vma->vm_file->f_mapping,
-					pgoff);
-		}
-		goto uncharge_out;
-	}
-	do_set_pte(fe, new_page);
-	mem_cgroup_commit_charge(new_page, memcg, false, false);
-	lru_cache_add_active_or_unevictable(new_page, vma);
-	pte_unmap_unlock(fe->pte, fe->ptl);
 	if (!(ret & VM_FAULT_DAX_LOCKED)) {
 		unlock_page(fault_page);
 		put_page(fault_page);
 	} else {
 		dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff);
 	}
+	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+		goto uncharge_out;
 	return ret;
 uncharge_out:
 	mem_cgroup_cancel_charge(new_page, memcg, false);
@@ -3107,7 +3169,7 @@ uncharge_out:
 	return ret;
 }
 
-static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff, pte_t orig_pte)
+static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff)
 {
 	struct vm_area_struct *vma = fe->vma;
 	struct page *fault_page;
@@ -3133,16 +3195,15 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff, pte_t orig_pte)
 		}
 	}
 
-	fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
-			&fe->ptl);
-	if (unlikely(!pte_same(*fe->pte, orig_pte))) {
+	ret |= alloc_set_pte(fe, NULL, fault_page);
+	if (fe->pte)
 		pte_unmap_unlock(fe->pte, fe->ptl);
+	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
+					VM_FAULT_RETRY))) {
 		unlock_page(fault_page);
 		put_page(fault_page);
 		return ret;
 	}
-	do_set_pte(fe, fault_page);
-	pte_unmap_unlock(fe->pte, fe->ptl);
 
 	if (set_page_dirty(fault_page))
 		dirtied = 1;
@@ -3174,20 +3235,19 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff, pte_t orig_pte)
  * The mmap_sem may have been released depending on flags and our
  * return value. See filemap_fault() and __lock_page_or_retry().
  */
-static int do_fault(struct fault_env *fe, pte_t orig_pte)
+static int do_fault(struct fault_env *fe)
 {
 	struct vm_area_struct *vma = fe->vma;
 	pgoff_t pgoff = linear_page_index(vma, fe->address);
 
-	pte_unmap(fe->pte);
 	/* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
 	if (!vma->vm_ops->fault)
 		return VM_FAULT_SIGBUS;
 	if (!(fe->flags & FAULT_FLAG_WRITE))
-		return do_read_fault(fe, pgoff, orig_pte);
+		return do_read_fault(fe, pgoff);
 	if (!(vma->vm_flags & VM_SHARED))
-		return do_cow_fault(fe, pgoff, orig_pte);
-	return do_shared_fault(fe, pgoff, orig_pte);
+		return do_cow_fault(fe, pgoff);
+	return do_shared_fault(fe, pgoff);
 }
 
 static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
@@ -3327,37 +3387,63 @@ static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd)
  * with external mmu caches can use to update those (ie the Sparc or
  * PowerPC hashed page tables that act as extended TLBs).
  *
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), and pte mapped but not yet locked.
- * We return with pte unmapped and unlocked.
+ * We enter with non-exclusive mmap_sem (to exclude vma changes, but allow
+ * concurrent faults).
  *
- * The mmap_sem may have been released depending on flags and our
- * return value. See filemap_fault() and __lock_page_or_retry().
+ * The mmap_sem may have been released depending on flags and our return value.
+ * See filemap_fault() and __lock_page_or_retry().
  */
 static int handle_pte_fault(struct fault_env *fe)
 {
 	pte_t entry;
 
-	/*
-	 * some architectures can have larger ptes than wordsize,
-	 * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and CONFIG_32BIT=y,
-	 * so READ_ONCE or ACCESS_ONCE cannot guarantee atomic accesses.
-	 * The code below just needs a consistent view for the ifs and
-	 * we later double check anyway with the ptl lock held. So here
-	 * a barrier will do.
-	 */
-	entry = *fe->pte;
-	barrier();
-	if (!pte_present(entry)) {
+	if (unlikely(pmd_none(*fe->pmd))) {
+		/*
+		 * Leave __pte_alloc() until later: because vm_ops->fault may
+		 * want to allocate huge page, and if we expose page table
+		 * for an instant, it will be difficult to retract from
+		 * concurrent faults and from rmap lookups.
+		 */
+		fe->pte = NULL;
+	} else {
+		/* See comment in pte_alloc_one_map() */
+		if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd))
+			return 0;
+		/*
+		 * A regular pmd is established and it can't morph into a huge
+		 * pmd from under us anymore at this point because we hold the
+		 * mmap_sem read mode and khugepaged takes it in write mode.
+		 * So now it's safe to run pte_offset_map().
+		 */
+		fe->pte = pte_offset_map(fe->pmd, fe->address);
+
+		entry = *fe->pte;
+
+		/*
+		 * some architectures can have larger ptes than wordsize,
+		 * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and
+		 * CONFIG_32BIT=y, so READ_ONCE or ACCESS_ONCE cannot guarantee
+		 * atomic accesses. The code below just needs a consistent
+		 * view for the ifs and we later double check anyway with the
+		 * ptl lock held. So here a barrier will do.
+		 */
+		barrier();
 		if (pte_none(entry)) {
-			if (vma_is_anonymous(fe->vma))
-				return do_anonymous_page(fe);
-			else
-				return do_fault(fe, entry);
-		}
-		return do_swap_page(fe, entry);
+			pte_unmap(fe->pte);
+			fe->pte = NULL;
+		}
 	}
 
+	if (!fe->pte) {
+		if (vma_is_anonymous(fe->vma))
+			return do_anonymous_page(fe);
+		else
+			return do_fault(fe);
+	}
+
+	if (!pte_present(entry))
+		return do_swap_page(fe, entry);
+
 	if (pte_protnone(entry))
 		return do_numa_page(fe, entry);
 
@@ -3439,34 +3525,6 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 		}
 	}
 
-	/*
-	 * Use pte_alloc() instead of pte_alloc_map, because we can't
-	 * run pte_offset_map on the pmd, if an huge pmd could
-	 * materialize from under us from a different thread.
-	 */
-	if (unlikely(pte_alloc(fe.vma->vm_mm, fe.pmd, fe.address)))
-		return VM_FAULT_OOM;
-	/*
-	 * If a huge pmd materialized under us just retry later. Use
-	 * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd
-	 * didn't become pmd_trans_huge under us and then back to pmd_none, as
-	 * a result of MADV_DONTNEED running immediately after a huge pmd fault
-	 * in a different thread of this mm, in turn leading to a misleading
-	 * pmd_trans_huge() retval. All we have to ensure is that it is a
-	 * regular pmd that we can walk with pte_offset_map() and we can do that
-	 * through an atomic read in C, which is what pmd_trans_unstable()
-	 * provides.
-	 */
-	if (unlikely(pmd_trans_unstable(fe.pmd) || pmd_devmap(*fe.pmd)))
-		return 0;
-	/*
-	 * A regular pmd is established and it can't morph into a huge pmd
-	 * from under us anymore at this point because we hold the mmap_sem
-	 * read mode and khugepaged takes it in write mode. So now it's
-	 * safe to run pte_offset_map().
-	 */
-	fe.pte = pte_offset_map(fe.pmd, fe.address);
-
 	return handle_pte_fault(&fe);
 }
 