Diffstat (limited to 'mm/memory.c')
-rw-r--r--  mm/memory.c | 298
1 file changed, 178 insertions(+), 120 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 72b520897339..1991105bf67c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2739,8 +2739,6 @@ static int do_anonymous_page(struct fault_env *fe)
         struct page *page;
         pte_t entry;
 
-        pte_unmap(fe->pte);
-
         /* File mapping without ->vm_ops ? */
         if (vma->vm_flags & VM_SHARED)
                 return VM_FAULT_SIGBUS;
@@ -2749,6 +2747,23 @@ static int do_anonymous_page(struct fault_env *fe)
         if (check_stack_guard_page(vma, fe->address) < 0)
                 return VM_FAULT_SIGSEGV;
 
+        /*
+         * Use pte_alloc() instead of pte_alloc_map(). We can't run
+         * pte_offset_map() on pmds where a huge pmd might be created
+         * from a different thread.
+         *
+         * pte_alloc_map() is safe to use under down_write(mmap_sem) or when
+         * parallel threads are excluded by other means.
+         *
+         * Here we only have down_read(mmap_sem).
+         */
+        if (pte_alloc(vma->vm_mm, fe->pmd, fe->address))
+                return VM_FAULT_OOM;
+
+        /* See the comment in pte_alloc_one_map() */
+        if (unlikely(pmd_trans_unstable(fe->pmd)))
+                return 0;
+
         /* Use the zero-page for reads */
         if (!(fe->flags & FAULT_FLAG_WRITE) &&
                         !mm_forbids_zeropage(vma->vm_mm)) {
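The hunk above makes do_anonymous_page() allocate its own page table and then re-check the pmd before walking it, because only down_read(mmap_sem) is held and a huge pmd can appear from another thread in between. Below is a minimal, self-contained userspace model of that allocate-then-recheck pattern; every type and helper in it is a simplified stand-in invented for illustration, not the kernel API:

#include <stdbool.h>
#include <stdio.h>

struct pmd { bool none; bool trans_huge; };     /* toy pmd state, not the kernel's */

static int pte_alloc_stub(struct pmd *pmd)      /* stands in for pte_alloc() */
{
        if (pmd->none)
                pmd->none = false;              /* page table installed */
        /* another thread may still have raced a huge pmd in here */
        return 0;                               /* 0 == success, as in the kernel */
}

static bool pmd_trans_unstable_stub(struct pmd *pmd)
{
        /* true if the pmd is huge, or was huge and went back to none */
        return pmd->trans_huge || pmd->none;
}

static int anonymous_fault_model(struct pmd *pmd)
{
        if (pte_alloc_stub(pmd))
                return -1;                      /* VM_FAULT_OOM in the kernel */
        if (pmd_trans_unstable_stub(pmd))
                return 0;                       /* retry: a huge pmd raced with us */
        /* only now is it safe to walk the table (pte_offset_map() in the kernel) */
        return 1;                               /* fault handled */
}

int main(void)
{
        struct pmd quiet = { .none = true,  .trans_huge = false };
        struct pmd raced = { .none = false, .trans_huge = true  };

        printf("no race:   %d\n", anonymous_fault_model(&quiet)); /* 1: handled */
        printf("huge race: %d\n", anonymous_fault_model(&raced)); /* 0: retry   */
        return 0;
}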
@@ -2865,23 +2880,76 @@ static int __do_fault(struct fault_env *fe, pgoff_t pgoff,
         return ret;
 }
 
+static int pte_alloc_one_map(struct fault_env *fe)
+{
+        struct vm_area_struct *vma = fe->vma;
+
+        if (!pmd_none(*fe->pmd))
+                goto map_pte;
+        if (fe->prealloc_pte) {
+                fe->ptl = pmd_lock(vma->vm_mm, fe->pmd);
+                if (unlikely(!pmd_none(*fe->pmd))) {
+                        spin_unlock(fe->ptl);
+                        goto map_pte;
+                }
+
+                atomic_long_inc(&vma->vm_mm->nr_ptes);
+                pmd_populate(vma->vm_mm, fe->pmd, fe->prealloc_pte);
+                spin_unlock(fe->ptl);
+                fe->prealloc_pte = 0;
+        } else if (unlikely(pte_alloc(vma->vm_mm, fe->pmd, fe->address))) {
+                return VM_FAULT_OOM;
+        }
+map_pte:
+        /*
+         * If a huge pmd materialized under us just retry later. Use
+         * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd
+         * didn't become pmd_trans_huge under us and then back to pmd_none, as
+         * a result of MADV_DONTNEED running immediately after a huge pmd fault
+         * in a different thread of this mm, in turn leading to a misleading
+         * pmd_trans_huge() retval. All we have to ensure is that it is a
+         * regular pmd that we can walk with pte_offset_map() and we can do that
+         * through an atomic read in C, which is what pmd_trans_unstable()
+         * provides.
+         */
+        if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd))
+                return VM_FAULT_NOPAGE;
+
+        fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
+                        &fe->ptl);
+        return 0;
+}
+
 /**
- * do_set_pte - setup new PTE entry for given page and add reverse page mapping.
+ * alloc_set_pte - setup new PTE entry for given page and add reverse page
+ * mapping. If needed, the function allocates a page table or uses a pre-allocated one.
  *
  * @fe: fault environment
+ * @memcg: memcg to charge page (only for private mappings)
  * @page: page to map
  *
- * Caller must hold page table lock relevant for @fe->pte.
+ * Caller must take care of unlocking fe->ptl, if fe->pte is non-NULL on return.
  *
  * Target users are page handler itself and implementations of
  * vm_ops->map_pages.
  */
-void do_set_pte(struct fault_env *fe, struct page *page)
+int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg,
+                struct page *page)
 {
         struct vm_area_struct *vma = fe->vma;
         bool write = fe->flags & FAULT_FLAG_WRITE;
         pte_t entry;
 
+        if (!fe->pte) {
+                int ret = pte_alloc_one_map(fe);
+                if (ret)
+                        return ret;
+        }
+
+        /* Re-check under ptl */
+        if (unlikely(!pte_none(*fe->pte)))
+                return VM_FAULT_NOPAGE;
+
         flush_icache_page(vma, page);
         entry = mk_pte(page, vma->vm_page_prot);
         if (write)
@@ -2890,6 +2958,8 @@ void do_set_pte(struct fault_env *fe, struct page *page)
         if (write && !(vma->vm_flags & VM_SHARED)) {
                 inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
                 page_add_new_anon_rmap(page, vma, fe->address, false);
+                mem_cgroup_commit_charge(page, memcg, false, false);
+                lru_cache_add_active_or_unevictable(page, vma);
         } else {
                 inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
                 page_add_file_rmap(page);
@@ -2898,6 +2968,8 @@ void do_set_pte(struct fault_env *fe, struct page *page)
 
         /* no need to invalidate: a not-present page won't be cached */
         update_mmu_cache(vma, fe->address, fe->pte);
+
+        return 0;
 }
 
 static unsigned long fault_around_bytes __read_mostly =
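For reference, the do_*_fault() hunks later in this patch all converge on the same calling pattern around alloc_set_pte(): let it map (and, if necessary, allocate) the page table, unlock only when fe->pte came back non-NULL, then test the accumulated fault flags. A condensed illustration of that pattern follows (kernel context assumed; error_path is a hypothetical label for the sketch, not from the patch):

        ret |= alloc_set_pte(fe, memcg, page);  /* memcg is NULL for file-backed pages */
        if (fe->pte)                            /* NULL if a huge pmd won the race */
                pte_unmap_unlock(fe->pte, fe->ptl);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
                goto error_path;                /* hypothetical cleanup label */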
@@ -2964,19 +3036,17 @@ late_initcall(fault_around_debugfs);
  * fault_around_pages() value (and therefore to page order). This way it's
  * easier to guarantee that we don't cross page table boundaries.
  */
-static void do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)
+static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)
 {
-        unsigned long address = fe->address, start_addr, nr_pages, mask;
-        pte_t *pte = fe->pte;
+        unsigned long address = fe->address, nr_pages, mask;
         pgoff_t end_pgoff;
-        int off;
+        int off, ret = 0;
 
         nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT;
         mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
 
-        start_addr = max(fe->address & mask, fe->vma->vm_start);
-        off = ((fe->address - start_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
-        fe->pte -= off;
+        fe->address = max(address & mask, fe->vma->vm_start);
+        off = ((address - fe->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
         start_pgoff -= off;
 
         /*
@@ -2984,30 +3054,45 @@ static void do_fault_around(struct fault_env *fe, pgoff_t start_pgoff)
          * or fault_around_pages() from start_pgoff, depending what is nearest.
          */
         end_pgoff = start_pgoff -
-                ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
+                ((fe->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) +
                 PTRS_PER_PTE - 1;
         end_pgoff = min3(end_pgoff, vma_pages(fe->vma) + fe->vma->vm_pgoff - 1,
                         start_pgoff + nr_pages - 1);
 
-        /* Check if it makes any sense to call ->map_pages */
-        fe->address = start_addr;
-        while (!pte_none(*fe->pte)) {
-                if (++start_pgoff > end_pgoff)
-                        goto out;
-                fe->address += PAGE_SIZE;
-                if (fe->address >= fe->vma->vm_end)
-                        goto out;
-                fe->pte++;
+        if (pmd_none(*fe->pmd)) {
+                fe->prealloc_pte = pte_alloc_one(fe->vma->vm_mm, fe->address);
+                smp_wmb(); /* See comment in __pte_alloc() */
         }
 
         fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff);
+
+        /* preallocated pagetable is unused: free it */
+        if (fe->prealloc_pte) {
+                pte_free(fe->vma->vm_mm, fe->prealloc_pte);
+                fe->prealloc_pte = 0;
+        }
+        /* Huge page is mapped? Page fault is solved */
+        if (pmd_trans_huge(*fe->pmd)) {
+                ret = VM_FAULT_NOPAGE;
+                goto out;
+        }
+
+        /* ->map_pages() hasn't done anything useful. Cold page cache? */
+        if (!fe->pte)
+                goto out;
+
+        /* check if the page fault is solved */
+        fe->pte -= (fe->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT);
+        if (!pte_none(*fe->pte))
+                ret = VM_FAULT_NOPAGE;
+        pte_unmap_unlock(fe->pte, fe->ptl);
 out:
-        /* restore fault_env */
-        fe->pte = pte;
         fe->address = address;
+        fe->pte = NULL;
+        return ret;
 }
 
-static int do_read_fault(struct fault_env *fe, pgoff_t pgoff, pte_t orig_pte)
+static int do_read_fault(struct fault_env *fe, pgoff_t pgoff)
 {
         struct vm_area_struct *vma = fe->vma;
         struct page *fault_page;
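To make the new window arithmetic in do_fault_around() concrete, here is a small standalone program that reproduces the address and offset calculation for one example fault. The page-size and PTE constants and the 64KB fault_around_bytes value are assumed x86-64-style defaults, the addresses are made up for the example, and the vma clamping is left out; none of this comes from the patch itself:

#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)
#define PAGE_MASK       (~(PAGE_SIZE - 1))
#define PTRS_PER_PTE    512UL

int main(void)
{
        unsigned long fault_around_bytes = 65536;       /* assumed default */
        unsigned long address = 0x7f12345e3000UL;       /* example faulting address */
        unsigned long start_pgoff = 0x1e3;              /* example file offset, in pages */

        unsigned long nr_pages = fault_around_bytes >> PAGE_SHIFT;
        unsigned long mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;

        /* fe->address in the patch: start of the fault-around window */
        unsigned long win_addr = address & mask;
        unsigned long off = ((address - win_addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);

        start_pgoff -= off;
        /* clamp the end so the window never crosses a page-table boundary */
        unsigned long end_pgoff = start_pgoff -
                ((win_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + PTRS_PER_PTE - 1;
        if (end_pgoff > start_pgoff + nr_pages - 1)
                end_pgoff = start_pgoff + nr_pages - 1;

        /* prints: window start: 0x7f12345e0000, pgoff range: 0x1e0..0x1ef (16 pages) */
        printf("window start: %#lx, pgoff range: %#lx..%#lx (%lu pages)\n",
               win_addr, start_pgoff, end_pgoff, end_pgoff - start_pgoff + 1);
        return 0;
}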
@@ -3019,36 +3104,25 @@ static int do_read_fault(struct fault_env *fe, pgoff_t pgoff, pte_t orig_pte)
          * something).
          */
         if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
-                fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
-                                &fe->ptl);
-                if (!pte_same(*fe->pte, orig_pte))
-                        goto unlock_out;
-                do_fault_around(fe, pgoff);
-                /* Check if the fault is handled by faultaround */
-                if (!pte_same(*fe->pte, orig_pte))
-                        goto unlock_out;
-                pte_unmap_unlock(fe->pte, fe->ptl);
+                ret = do_fault_around(fe, pgoff);
+                if (ret)
+                        return ret;
         }
 
         ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL);
         if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
                 return ret;
 
-        fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, &fe->ptl);
-        if (unlikely(!pte_same(*fe->pte, orig_pte))) {
+        ret |= alloc_set_pte(fe, NULL, fault_page);
+        if (fe->pte)
                 pte_unmap_unlock(fe->pte, fe->ptl);
-                unlock_page(fault_page);
-                put_page(fault_page);
-                return ret;
-        }
-        do_set_pte(fe, fault_page);
         unlock_page(fault_page);
-unlock_out:
-        pte_unmap_unlock(fe->pte, fe->ptl);
+        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+                put_page(fault_page);
         return ret;
 }
 
-static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff, pte_t orig_pte)
+static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff)
 {
         struct vm_area_struct *vma = fe->vma;
         struct page *fault_page, *new_page;
@@ -3077,29 +3151,17 @@ static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff, pte_t orig_pte)
         copy_user_highpage(new_page, fault_page, fe->address, vma);
         __SetPageUptodate(new_page);
 
-        fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
-                        &fe->ptl);
-        if (unlikely(!pte_same(*fe->pte, orig_pte))) {
+        ret |= alloc_set_pte(fe, memcg, new_page);
+        if (fe->pte)
                 pte_unmap_unlock(fe->pte, fe->ptl);
-                if (!(ret & VM_FAULT_DAX_LOCKED)) {
-                        unlock_page(fault_page);
-                        put_page(fault_page);
-                } else {
-                        dax_unlock_mapping_entry(vma->vm_file->f_mapping,
-                                        pgoff);
-                }
-                goto uncharge_out;
-        }
-        do_set_pte(fe, new_page);
-        mem_cgroup_commit_charge(new_page, memcg, false, false);
-        lru_cache_add_active_or_unevictable(new_page, vma);
-        pte_unmap_unlock(fe->pte, fe->ptl);
         if (!(ret & VM_FAULT_DAX_LOCKED)) {
                 unlock_page(fault_page);
                 put_page(fault_page);
         } else {
                 dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff);
         }
+        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
+                goto uncharge_out;
         return ret;
 uncharge_out:
         mem_cgroup_cancel_charge(new_page, memcg, false);
@@ -3107,7 +3169,7 @@ uncharge_out:
         return ret;
 }
 
-static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff, pte_t orig_pte)
+static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff)
 {
         struct vm_area_struct *vma = fe->vma;
         struct page *fault_page;
@@ -3133,16 +3195,15 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff, pte_t orig_pte)
                 }
         }
 
-        fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
-                        &fe->ptl);
-        if (unlikely(!pte_same(*fe->pte, orig_pte))) {
+        ret |= alloc_set_pte(fe, NULL, fault_page);
+        if (fe->pte)
                 pte_unmap_unlock(fe->pte, fe->ptl);
+        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
+                                        VM_FAULT_RETRY))) {
                 unlock_page(fault_page);
                 put_page(fault_page);
                 return ret;
         }
-        do_set_pte(fe, fault_page);
-        pte_unmap_unlock(fe->pte, fe->ptl);
 
         if (set_page_dirty(fault_page))
                 dirtied = 1;
@@ -3174,20 +3235,19 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff, pte_t orig_pte)
  * The mmap_sem may have been released depending on flags and our
  * return value. See filemap_fault() and __lock_page_or_retry().
  */
-static int do_fault(struct fault_env *fe, pte_t orig_pte)
+static int do_fault(struct fault_env *fe)
 {
         struct vm_area_struct *vma = fe->vma;
         pgoff_t pgoff = linear_page_index(vma, fe->address);
 
-        pte_unmap(fe->pte);
         /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
         if (!vma->vm_ops->fault)
                 return VM_FAULT_SIGBUS;
         if (!(fe->flags & FAULT_FLAG_WRITE))
-                return do_read_fault(fe, pgoff, orig_pte);
+                return do_read_fault(fe, pgoff);
         if (!(vma->vm_flags & VM_SHARED))
-                return do_cow_fault(fe, pgoff, orig_pte);
-        return do_shared_fault(fe, pgoff, orig_pte);
+                return do_cow_fault(fe, pgoff);
+        return do_shared_fault(fe, pgoff);
 }
 
 static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
@@ -3327,37 +3387,63 @@ static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd)
  * with external mmu caches can use to update those (ie the Sparc or
  * PowerPC hashed page tables that act as extended TLBs).
  *
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), and pte mapped but not yet locked.
- * We return with pte unmapped and unlocked.
+ * We enter with non-exclusive mmap_sem (to exclude vma changes, but allow
+ * concurrent faults).
  *
- * The mmap_sem may have been released depending on flags and our
- * return value. See filemap_fault() and __lock_page_or_retry().
+ * The mmap_sem may have been released depending on flags and our return value.
+ * See filemap_fault() and __lock_page_or_retry().
  */
 static int handle_pte_fault(struct fault_env *fe)
 {
         pte_t entry;
 
-        /*
-         * some architectures can have larger ptes than wordsize,
-         * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and CONFIG_32BIT=y,
-         * so READ_ONCE or ACCESS_ONCE cannot guarantee atomic accesses.
-         * The code below just needs a consistent view for the ifs and
-         * we later double check anyway with the ptl lock held. So here
-         * a barrier will do.
-         */
-        entry = *fe->pte;
-        barrier();
-        if (!pte_present(entry)) {
+        if (unlikely(pmd_none(*fe->pmd))) {
+                /*
+                 * Leave __pte_alloc() until later: because vm_ops->fault may
+                 * want to allocate huge page, and if we expose page table
+                 * for an instant, it will be difficult to retract from
+                 * concurrent faults and from rmap lookups.
+                 */
+                fe->pte = NULL;
+        } else {
+                /* See comment in pte_alloc_one_map() */
+                if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd))
+                        return 0;
+                /*
+                 * A regular pmd is established and it can't morph into a huge
+                 * pmd from under us anymore at this point because we hold the
+                 * mmap_sem read mode and khugepaged takes it in write mode.
+                 * So now it's safe to run pte_offset_map().
+                 */
+                fe->pte = pte_offset_map(fe->pmd, fe->address);
+
+                entry = *fe->pte;
+
+                /*
+                 * some architectures can have larger ptes than wordsize,
+                 * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and
+                 * CONFIG_32BIT=y, so READ_ONCE or ACCESS_ONCE cannot guarantee
+                 * atomic accesses. The code below just needs a consistent
+                 * view for the ifs and we later double check anyway with the
+                 * ptl lock held. So here a barrier will do.
+                 */
+                barrier();
                 if (pte_none(entry)) {
-                        if (vma_is_anonymous(fe->vma))
-                                return do_anonymous_page(fe);
-                        else
-                                return do_fault(fe, entry);
+                        pte_unmap(fe->pte);
+                        fe->pte = NULL;
                 }
-                return do_swap_page(fe, entry);
         }
 
+        if (!fe->pte) {
+                if (vma_is_anonymous(fe->vma))
+                        return do_anonymous_page(fe);
+                else
+                        return do_fault(fe);
+        }
+
+        if (!pte_present(entry))
+                return do_swap_page(fe, entry);
+
         if (pte_protnone(entry))
                 return do_numa_page(fe, entry);
 
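Condensing the reworked handle_pte_fault() entry logic above into one fragment may help: the function now classifies the pmd first and only then dispatches, with fe->pte == NULL meaning "no page table entry to look at yet". This is an illustrative outline of the hunk, not the verbatim function body:

        if (unlikely(pmd_none(*fe->pmd))) {
                fe->pte = NULL;                 /* defer __pte_alloc() to the fault handlers */
        } else {
                if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd))
                        return 0;               /* huge/devmap pmd raced in: just retry */
                fe->pte = pte_offset_map(fe->pmd, fe->address);
                entry = *fe->pte;
                barrier();                      /* consistent snapshot; re-checked under ptl later */
                if (pte_none(entry)) {
                        pte_unmap(fe->pte);
                        fe->pte = NULL;         /* empty pte: same as "no page table yet" */
                }
        }

        if (!fe->pte)                           /* fault from scratch */
                return vma_is_anonymous(fe->vma) ? do_anonymous_page(fe) : do_fault(fe);
        if (!pte_present(entry))
                return do_swap_page(fe, entry); /* swap or migration entry */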
@@ -3439,34 +3525,6 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
                 }
         }
 
-        /*
-         * Use pte_alloc() instead of pte_alloc_map, because we can't
-         * run pte_offset_map on the pmd, if an huge pmd could
-         * materialize from under us from a different thread.
-         */
-        if (unlikely(pte_alloc(fe.vma->vm_mm, fe.pmd, fe.address)))
-                return VM_FAULT_OOM;
-        /*
-         * If a huge pmd materialized under us just retry later. Use
-         * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd
-         * didn't become pmd_trans_huge under us and then back to pmd_none, as
-         * a result of MADV_DONTNEED running immediately after a huge pmd fault
-         * in a different thread of this mm, in turn leading to a misleading
-         * pmd_trans_huge() retval. All we have to ensure is that it is a
-         * regular pmd that we can walk with pte_offset_map() and we can do that
-         * through an atomic read in C, which is what pmd_trans_unstable()
-         * provides.
-         */
-        if (unlikely(pmd_trans_unstable(fe.pmd) || pmd_devmap(*fe.pmd)))
-                return 0;
-        /*
-         * A regular pmd is established and it can't morph into a huge pmd
-         * from under us anymore at this point because we hold the mmap_sem
-         * read mode and khugepaged takes it in write mode. So now it's
-         * safe to run pte_offset_map().
-         */
-        fe.pte = pte_offset_map(fe.pmd, fe.address);
-
         return handle_pte_fault(&fe);
 }
 