path: root/mm/huge_memory.c
author	Kirill A. Shutemov <kirill.shutemov@linux.intel.com>	2016-01-15 19:54:10 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-01-15 20:56:32 -0500
commit	e9b61f19858a5d6c42ce2298cf138279375d0d9b (patch)
tree	ba2f5851d193c0ab96af67a9df9856b1dd9480ad /mm/huge_memory.c
parent	4e41a30c6d506c884d3da9aeb316352e70679d4b (diff)
thp: reintroduce split_huge_page()
This patch adds an implementation of split_huge_page() for the new refcounting.

Unlike the previous implementation, the new split_huge_page() can fail if somebody holds a GUP pin on the page. It also means that a pin on a page prevents it from being split under you. It makes the situation in many places much cleaner.

The basic scheme of split_huge_page():

  - Check that page_count() equals the sum of the mapcounts of all subpages plus one (the caller's pin). Fail with -EBUSY otherwise. This way we can avoid useless PMD splits.

  - Freeze the page counters by splitting all PMDs and setting up migration PTEs.

  - Re-check the sum of mapcounts against page_count(). The page's counts are stable now. -EBUSY if the page is pinned.

  - Split the compound page.

  - Unfreeze the page by removing migration entries.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Tested-by: Sasha Levin <sasha.levin@oracle.com>
Tested-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Acked-by: Jerome Marchand <jmarchan@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Rik van Riel <riel@redhat.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Steve Capper <steve.capper@linaro.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
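As a quick illustration of the accounting behind the first and third steps (editor's sketch, not part of the patch): an anonymous THP mapped by a single PMD and pinned only by the splitting caller has total_mapcount(head) == 1 and page_count(head) == 2, so the expected relation holds; any extra GUP pin raises page_count() without touching the mapcounts and the check fails. The real check lives in split_huge_page_to_list() in the diff below:

	/* sketch only: the racy pre-check described above */
	if (total_mapcount(head) != page_count(head) - 1)
		return -EBUSY;	/* somebody else holds a pin */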
Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r--	mm/huge_memory.c	374
1 file changed, 371 insertions, 3 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 913559388fda..b6ac6c43d6a4 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -16,6 +16,7 @@
 #include <linux/swap.h>
 #include <linux/shrinker.h>
 #include <linux/mm_inline.h>
+#include <linux/swapops.h>
 #include <linux/dax.h>
 #include <linux/kthread.h>
 #include <linux/khugepaged.h>
@@ -2726,9 +2727,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 	write = pmd_write(*pmd);
 	young = pmd_young(*pmd);
 
-	/* leave pmd empty until pte is filled */
-	pmdp_huge_clear_flush_notify(vma, haddr, pmd);
-
 	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
 	pmd_populate(mm, &_pmd, pgtable);
 
@@ -2778,7 +2776,36 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 	}
 
 	smp_wmb(); /* make pte visible before pmd */
+	/*
+	 * Up to this point the pmd is present and huge and userland has the
+	 * whole access to the hugepage during the split (which happens in
+	 * place). If we overwrite the pmd with the not-huge version pointing
+	 * to the pte here (which of course we could if all CPUs were bug
+	 * free), userland could trigger a small page size TLB miss on the
+	 * small sized TLB while the hugepage TLB entry is still established
+	 * in the huge TLB. Some CPUs don't like that.
+	 * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
+	 * 383 on page 93. Intel should be safe, but it also warns that it's
+	 * only safe if the permission and cache attributes of the two entries
+	 * loaded in the two TLBs are identical (which should be the case
+	 * here). But it is generally safer to never allow small and huge TLB
+	 * entries for the same virtual address to be loaded simultaneously.
+	 * So instead of doing "pmd_populate(); flush_pmd_tlb_range();" we
+	 * first mark the current pmd notpresent (atomically because here the
+	 * pmd_trans_huge and pmd_trans_splitting must remain set at all times
+	 * on the pmd until the split is complete for this pmd), then we flush
+	 * the SMP TLB and finally we write the non-huge version of the pmd
+	 * entry with pmd_populate.
+	 */
+	pmdp_invalidate(vma, haddr, pmd);
 	pmd_populate(mm, pmd, pgtable);
+
+	if (freeze) {
+		for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+			page_remove_rmap(page + i, false);
+			put_page(page + i);
+		}
+	}
 }
 
 void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
@@ -2863,3 +2890,344 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
 		split_huge_pmd_address(next, nstart);
 	}
 }
+
+static void freeze_page_vma(struct vm_area_struct *vma, struct page *page,
+		unsigned long address)
+{
+	spinlock_t *ptl;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	int i, nr = HPAGE_PMD_NR;
+
+	/* Skip pages which don't belong to the VMA */
+	if (address < vma->vm_start) {
+		int off = (vma->vm_start - address) >> PAGE_SHIFT;
+		page += off;
+		nr -= off;
+		address = vma->vm_start;
+	}
+
+	pgd = pgd_offset(vma->vm_mm, address);
+	if (!pgd_present(*pgd))
+		return;
+	pud = pud_offset(pgd, address);
+	if (!pud_present(*pud))
+		return;
+	pmd = pmd_offset(pud, address);
+	ptl = pmd_lock(vma->vm_mm, pmd);
+	if (!pmd_present(*pmd)) {
+		spin_unlock(ptl);
+		return;
+	}
+	if (pmd_trans_huge(*pmd)) {
+		if (page == pmd_page(*pmd))
+			__split_huge_pmd_locked(vma, pmd, address, true);
+		spin_unlock(ptl);
+		return;
+	}
+	spin_unlock(ptl);
+
+	pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
+	for (i = 0; i < nr; i++, address += PAGE_SIZE, page++) {
+		pte_t entry, swp_pte;
+		swp_entry_t swp_entry;
+
+		if (!pte_present(pte[i]))
+			continue;
+		if (page_to_pfn(page) != pte_pfn(pte[i]))
+			continue;
+		flush_cache_page(vma, address, page_to_pfn(page));
+		entry = ptep_clear_flush(vma, address, pte + i);
+		swp_entry = make_migration_entry(page, pte_write(entry));
+		swp_pte = swp_entry_to_pte(swp_entry);
+		if (pte_soft_dirty(entry))
+			swp_pte = pte_swp_mksoft_dirty(swp_pte);
+		set_pte_at(vma->vm_mm, address, pte + i, swp_pte);
+		page_remove_rmap(page, false);
+		put_page(page);
+	}
+	pte_unmap_unlock(pte, ptl);
+}
+
+static void freeze_page(struct anon_vma *anon_vma, struct page *page)
+{
+	struct anon_vma_chain *avc;
+	pgoff_t pgoff = page_to_pgoff(page);
+
+	VM_BUG_ON_PAGE(!PageHead(page), page);
+
+	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff,
+			pgoff + HPAGE_PMD_NR - 1) {
+		unsigned long haddr;
+
+		haddr = __vma_address(page, avc->vma) & HPAGE_PMD_MASK;
+		mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
+				haddr, haddr + HPAGE_PMD_SIZE);
+		freeze_page_vma(avc->vma, page, haddr);
+		mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
+				haddr, haddr + HPAGE_PMD_SIZE);
+	}
+}
+
+static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page,
+		unsigned long address)
+{
+	spinlock_t *ptl;
+	pmd_t *pmd;
+	pte_t *pte, entry;
+	swp_entry_t swp_entry;
+	int i, nr = HPAGE_PMD_NR;
+
+	/* Skip pages which don't belong to the VMA */
+	if (address < vma->vm_start) {
+		int off = (vma->vm_start - address) >> PAGE_SHIFT;
+		page += off;
+		nr -= off;
+		address = vma->vm_start;
+	}
+
+	pmd = mm_find_pmd(vma->vm_mm, address);
+	if (!pmd)
+		return;
+	pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
+	for (i = 0; i < nr; i++, address += PAGE_SIZE, page++) {
+		if (!is_swap_pte(pte[i]))
+			continue;
+
+		swp_entry = pte_to_swp_entry(pte[i]);
+		if (!is_migration_entry(swp_entry))
+			continue;
+		if (migration_entry_to_page(swp_entry) != page)
+			continue;
+
+		get_page(page);
+		page_add_anon_rmap(page, vma, address, false);
+
+		entry = pte_mkold(mk_pte(page, vma->vm_page_prot));
+		entry = pte_mkdirty(entry);
+		if (is_write_migration_entry(swp_entry))
+			entry = maybe_mkwrite(entry, vma);
+
+		flush_dcache_page(page);
+		set_pte_at(vma->vm_mm, address, pte + i, entry);
+
+		/* No need to invalidate - it was non-present before */
+		update_mmu_cache(vma, address, pte + i);
+	}
+	pte_unmap_unlock(pte, ptl);
+}
+
+static void unfreeze_page(struct anon_vma *anon_vma, struct page *page)
+{
+	struct anon_vma_chain *avc;
+	pgoff_t pgoff = page_to_pgoff(page);
+
+	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
+			pgoff, pgoff + HPAGE_PMD_NR - 1) {
+		unsigned long address = __vma_address(page, avc->vma);
+
+		mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
+				address, address + HPAGE_PMD_SIZE);
+		unfreeze_page_vma(avc->vma, page, address);
+		mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
+				address, address + HPAGE_PMD_SIZE);
+	}
+}
+
+static int total_mapcount(struct page *page)
+{
+	int i, ret;
+
+	ret = compound_mapcount(page);
+	for (i = 0; i < HPAGE_PMD_NR; i++)
+		ret += atomic_read(&page[i]._mapcount) + 1;
+
+	if (PageDoubleMap(page))
+		ret -= HPAGE_PMD_NR;
+
+	return ret;
+}
+
+static int __split_huge_page_tail(struct page *head, int tail,
+		struct lruvec *lruvec, struct list_head *list)
+{
+	int mapcount;
+	struct page *page_tail = head + tail;
+
+	mapcount = atomic_read(&page_tail->_mapcount) + 1;
+	VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail);
+
+	/*
+	 * tail_page->_count is zero and not changing from under us. But
+	 * get_page_unless_zero() may be running from under us on the
+	 * tail_page. If we used atomic_set() below instead of atomic_add(),
+	 * we would then run atomic_set() concurrently with
+	 * get_page_unless_zero(), and atomic_set() is implemented in C not
+	 * using locked ops. spin_unlock on x86 sometimes uses locked ops
+	 * because of PPro errata 66, 92, so unless somebody can guarantee
+	 * atomic_set() here would be safe on all archs (and not only on x86),
+	 * it's safer to use atomic_add().
+	 */
+	atomic_add(mapcount + 1, &page_tail->_count);
+
+	page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+	page_tail->flags |= (head->flags &
+			((1L << PG_referenced) |
+			 (1L << PG_swapbacked) |
+			 (1L << PG_mlocked) |
+			 (1L << PG_uptodate) |
+			 (1L << PG_active) |
+			 (1L << PG_locked) |
+			 (1L << PG_unevictable)));
+	page_tail->flags |= (1L << PG_dirty);
+
+	/*
+	 * After clearing PageTail the gup refcount can be released.
+	 * Page flags must also be visible before we make the page
+	 * non-compound.
+	 */
+	smp_wmb();
+
+	clear_compound_head(page_tail);
+
+	if (page_is_young(head))
+		set_page_young(page_tail);
+	if (page_is_idle(head))
+		set_page_idle(page_tail);
+
+	/* ->mapping in first tail page is compound_mapcount */
+	VM_BUG_ON_PAGE(tail != 1 && page_tail->mapping != TAIL_MAPPING,
+			page_tail);
+	page_tail->mapping = head->mapping;
+
+	page_tail->index = head->index + tail;
+	page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
+	lru_add_page_tail(head, page_tail, lruvec, list);
+
+	return mapcount;
+}
+
+static void __split_huge_page(struct page *page, struct list_head *list)
+{
+	struct page *head = compound_head(page);
+	struct zone *zone = page_zone(head);
+	struct lruvec *lruvec;
+	int i, tail_mapcount;
+
+	/* prevent PageLRU from going away from under us, and freeze lru stats */
+	spin_lock_irq(&zone->lru_lock);
+	lruvec = mem_cgroup_page_lruvec(head, zone);
+
+	/* complete memcg work before adding pages to the LRU */
+	mem_cgroup_split_huge_fixup(head);
+
+	tail_mapcount = 0;
+	for (i = HPAGE_PMD_NR - 1; i >= 1; i--)
+		tail_mapcount += __split_huge_page_tail(head, i, lruvec, list);
+	atomic_sub(tail_mapcount, &head->_count);
+
+	ClearPageCompound(head);
+	spin_unlock_irq(&zone->lru_lock);
+
+	unfreeze_page(page_anon_vma(head), head);
+
+	for (i = 0; i < HPAGE_PMD_NR; i++) {
+		struct page *subpage = head + i;
+		if (subpage == page)
+			continue;
+		unlock_page(subpage);
+
+		/*
+		 * Subpages may be freed if there wasn't any mapping,
+		 * e.g. if add_to_swap() is running on an LRU page that
+		 * had its mapping zapped. And freeing these pages
+		 * requires taking the lru_lock, so we do the put_page
+		 * of the tail pages after the split is complete.
+		 */
+		put_page(subpage);
+	}
+}
+
+/*
+ * This function splits a huge page into normal pages. @page can point to
+ * any subpage of the huge page to split; the split doesn't move @page.
+ *
+ * The caller must hold the only pin on @page, otherwise split fails with
+ * -EBUSY. The huge page must be locked.
+ *
+ * If @list is null, tail pages will be added to the LRU list, otherwise
+ * to @list.
+ *
+ * Both head page and tail pages will inherit mapping, flags, and so on from
+ * the hugepage.
+ *
+ * The GUP pin and PG_locked are transferred to @page. The other subpages can
+ * be freed if they are not mapped.
+ *
+ * Returns 0 if the hugepage was split successfully.
+ * Returns -EBUSY if the page is pinned or if the anon_vma disappeared from
+ * under us.
+ */
+int split_huge_page_to_list(struct page *page, struct list_head *list)
+{
+	struct page *head = compound_head(page);
+	struct anon_vma *anon_vma;
+	int count, mapcount, ret;
+
+	VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
+	VM_BUG_ON_PAGE(!PageAnon(page), page);
+	VM_BUG_ON_PAGE(!PageLocked(page), page);
+	VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
+	VM_BUG_ON_PAGE(!PageCompound(page), page);
+
+	/*
+	 * The caller does not necessarily hold an mmap_sem that would prevent
+	 * the anon_vma disappearing, so we first take a reference to it and
+	 * then lock the anon_vma for write. This is similar to
+	 * page_lock_anon_vma_read() except the write lock is taken to
+	 * serialise against parallel split or collapse operations.
+	 */
+	anon_vma = page_get_anon_vma(head);
+	if (!anon_vma) {
+		ret = -EBUSY;
+		goto out;
+	}
+	anon_vma_lock_write(anon_vma);
+
+	/*
+	 * Racy check whether we can split the page, before freeze_page()
+	 * splits the PMDs.
+	 */
+	if (total_mapcount(head) != page_count(head) - 1) {
+		ret = -EBUSY;
+		goto out_unlock;
+	}
+
+	freeze_page(anon_vma, head);
+	VM_BUG_ON_PAGE(compound_mapcount(head), head);
+
+	count = page_count(head);
+	mapcount = total_mapcount(head);
+	if (mapcount == count - 1) {
+		__split_huge_page(page, list);
+		ret = 0;
+	} else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount > count - 1) {
+		pr_alert("total_mapcount: %u, page_count(): %u\n",
+				mapcount, count);
+		if (PageTail(page))
+			dump_page(head, NULL);
+		dump_page(page, "total_mapcount(head) > page_count(head) - 1");
+		BUG();
+	} else {
+		unfreeze_page(anon_vma, head);
+		ret = -EBUSY;
+	}
+
+out_unlock:
+	anon_vma_unlock_write(anon_vma);
+	put_anon_vma(anon_vma);
+out:
+	count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
+	return ret;
+}
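
A minimal caller-side sketch of the calling convention documented above (editor's illustration, not code from this patch; the helper name try_split_one_page() is hypothetical). It assumes the caller already holds exactly one pin on @page, for example from get_user_pages(); that pin is the "+1" the split code expects, and any additional pin makes the split return -EBUSY. It uses the long-standing split_huge_page() wrapper around split_huge_page_to_list(page, NULL) and would need <linux/mm.h>, <linux/huge_mm.h> and <linux/pagemap.h>:

	/* Hypothetical caller sketch: split an anonymous THP we hold one pin on. */
	static int try_split_one_page(struct page *page)
	{
		int ret;

		if (!PageCompound(page))
			return 0;		/* nothing to split */

		lock_page(page);		/* the huge page must be locked */
		ret = split_huge_page(page);	/* 0 on success, -EBUSY if pinned elsewhere */
		unlock_page(page);

		return ret;
	}

On success the caller's pin and the page lock stay with @page, which is now a small page; the remaining subpages are unlocked by the split code and freed if they are unmapped.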