author		Kirill A. Shutemov <kirill.shutemov@linux.intel.com>	2016-01-15 19:55:46 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-01-15 20:56:32 -0500
commit		bd56086f10186e2c205429cc12b16e43aacb1c7e (patch)
tree		b615ef3c093b30c30511b9f2ff625fdff7eab65e /mm/huge_memory.c
parent		b8d3c4c3009d42869dc03a1da0efc2aa687d0ab4 (diff)
thp: fix split_huge_page() after mremap() of THP
Sasha Levin has reported a KASAN out-of-bounds bug[1]. It points to
"if (!is_swap_pte(pte[i]))" in unfreeze_page_vma() as a problematic access.

The cause is that split_huge_page() doesn't handle a THP correctly if it's
not aligned to the PMD boundary. It can happen after mremap().

Test-case (does not always trigger the bug):

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/mman.h>

	#define MB (1024UL*1024)
	#define SIZE (2*MB)
	#define BASE ((void *)0x400000000000)

	int main()
	{
		char *p;

		p = mmap(BASE, SIZE, PROT_READ | PROT_WRITE,
				MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS |
				MAP_POPULATE, -1, 0);
		if (p == MAP_FAILED)
			perror("mmap"), exit(1);

		p = mremap(BASE, SIZE, SIZE, MREMAP_FIXED | MREMAP_MAYMOVE,
				BASE + SIZE + 8192);
		if (p == MAP_FAILED)
			perror("mremap"), exit(1);

		system("echo 1 > /sys/kernel/debug/split_huge_pages");
		return 0;
	}

The patch fixes the freeze and unfreeze paths to handle page table
boundary crossing.

It also makes the mapcount vs. count check in split_huge_page_to_list()
stricter:
 - after freeze we don't expect any subpage to be mapped, as we remove
   them from the rmap when setting up migration entries;
 - count must be 1, meaning only the caller has a reference to the page.

[1] https://gist.github.com/sashalevin/c67fbea55e7c0576972a

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reported-by: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
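Editorial note, not part of the commit: the following is a minimal userspace sketch of the address arithmetic behind the bug, assuming x86-64 geometry (4K base pages, 2M PMD-mapped huge pages). After the mremap() in the test case the THP starts 8K past a 2M boundary, so a walk over its 512 subpages crosses from one page table into the next exactly once; that is the case the freeze/unfreeze loops in the diff below are taught to handle. The macros and names here are local re-definitions for illustration only.

	#include <stdio.h>

	#define PAGE_SIZE	(4UL << 10)		/* assumed: 4K base pages */
	#define HPAGE_PMD_SIZE	(2UL << 20)		/* assumed: 2M huge pages */
	#define HPAGE_PMD_MASK	(~(HPAGE_PMD_SIZE - 1))
	#define HPAGE_PMD_NR	(HPAGE_PMD_SIZE / PAGE_SIZE)

	int main(void)
	{
		/* THP location after the test-case mremap(): BASE + SIZE + 8192 */
		unsigned long addr = 0x400000000000UL + HPAGE_PMD_SIZE + 2 * PAGE_SIZE;
		unsigned long haddr = addr & HPAGE_PMD_MASK;
		unsigned long i, crossings = 0;

		for (i = 0; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
			/* the same condition the patched loops test for */
			if (addr == haddr + HPAGE_PMD_SIZE)
				crossings++;
		}
		printf("THP starts at %#lx, PMD-aligned start is %#lx\n",
		       haddr + 2 * PAGE_SIZE, haddr);
		printf("page-table boundaries crossed: %lu\n", crossings);	/* prints 1 */
		return 0;
	}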
Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r--	mm/huge_memory.c	70
1 file changed, 49 insertions(+), 21 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1a4989fef08f..9d12d63a0ddd 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2990,6 +2990,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
 static void freeze_page_vma(struct vm_area_struct *vma, struct page *page,
 		unsigned long address)
 {
+	unsigned long haddr = address & HPAGE_PMD_MASK;
 	spinlock_t *ptl;
 	pgd_t *pgd;
 	pud_t *pud;
@@ -3019,34 +3020,47 @@ static void freeze_page_vma(struct vm_area_struct *vma, struct page *page,
 	}
 	if (pmd_trans_huge(*pmd)) {
 		if (page == pmd_page(*pmd))
-			__split_huge_pmd_locked(vma, pmd, address, true);
+			__split_huge_pmd_locked(vma, pmd, haddr, true);
 		spin_unlock(ptl);
 		return;
 	}
 	spin_unlock(ptl);
 
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
-	for (i = 0; i < nr; i++, address += PAGE_SIZE, page++) {
+	for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) {
 		pte_t entry, swp_pte;
 		swp_entry_t swp_entry;
 
-		if (!pte_present(pte[i]))
+		/*
+		 * We've just crossed page table boundary: need to map next one.
+		 * It can happen if THP was mremaped to non PMD-aligned address.
+		 */
+		if (unlikely(address == haddr + HPAGE_PMD_SIZE)) {
+			pte_unmap_unlock(pte - 1, ptl);
+			pmd = mm_find_pmd(vma->vm_mm, address);
+			if (!pmd)
+				return;
+			pte = pte_offset_map_lock(vma->vm_mm, pmd,
+					address, &ptl);
+		}
+
+		if (!pte_present(*pte))
 			continue;
-		if (page_to_pfn(page) != pte_pfn(pte[i]))
+		if (page_to_pfn(page) != pte_pfn(*pte))
 			continue;
 		flush_cache_page(vma, address, page_to_pfn(page));
-		entry = ptep_clear_flush(vma, address, pte + i);
+		entry = ptep_clear_flush(vma, address, pte);
 		if (pte_dirty(entry))
 			SetPageDirty(page);
 		swp_entry = make_migration_entry(page, pte_write(entry));
 		swp_pte = swp_entry_to_pte(swp_entry);
 		if (pte_soft_dirty(entry))
 			swp_pte = pte_swp_mksoft_dirty(swp_pte);
-		set_pte_at(vma->vm_mm, address, pte + i, swp_pte);
+		set_pte_at(vma->vm_mm, address, pte, swp_pte);
 		page_remove_rmap(page, false);
 		put_page(page);
 	}
-	pte_unmap_unlock(pte, ptl);
+	pte_unmap_unlock(pte - 1, ptl);
 }
 
 static void freeze_page(struct anon_vma *anon_vma, struct page *page)
@@ -3058,14 +3072,13 @@ static void freeze_page(struct anon_vma *anon_vma, struct page *page)
 
 	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff,
 			pgoff + HPAGE_PMD_NR - 1) {
-		unsigned long haddr;
+		unsigned long address = __vma_address(page, avc->vma);
 
-		haddr = __vma_address(page, avc->vma) & HPAGE_PMD_MASK;
 		mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
-				haddr, haddr + HPAGE_PMD_SIZE);
-		freeze_page_vma(avc->vma, page, haddr);
+				address, address + HPAGE_PMD_SIZE);
+		freeze_page_vma(avc->vma, page, address);
 		mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
-				haddr, haddr + HPAGE_PMD_SIZE);
+				address, address + HPAGE_PMD_SIZE);
 	}
 }
 
@@ -3076,6 +3089,7 @@ static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page,
 	pmd_t *pmd;
 	pte_t *pte, entry;
 	swp_entry_t swp_entry;
+	unsigned long haddr = address & HPAGE_PMD_MASK;
 	int i, nr = HPAGE_PMD_NR;
 
 	/* Skip pages which doesn't belong to the VMA */
@@ -3089,12 +3103,26 @@ static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page,
 	pmd = mm_find_pmd(vma->vm_mm, address);
 	if (!pmd)
 		return;
+
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
-	for (i = 0; i < nr; i++, address += PAGE_SIZE, page++) {
-		if (!is_swap_pte(pte[i]))
+	for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) {
+		/*
+		 * We've just crossed page table boundary: need to map next one.
+		 * It can happen if THP was mremaped to non-PMD aligned address.
+		 */
+		if (unlikely(address == haddr + HPAGE_PMD_SIZE)) {
+			pte_unmap_unlock(pte - 1, ptl);
+			pmd = mm_find_pmd(vma->vm_mm, address);
+			if (!pmd)
+				return;
+			pte = pte_offset_map_lock(vma->vm_mm, pmd,
+					address, &ptl);
+		}
+
+		if (!is_swap_pte(*pte))
 			continue;
 
-		swp_entry = pte_to_swp_entry(pte[i]);
+		swp_entry = pte_to_swp_entry(*pte);
 		if (!is_migration_entry(swp_entry))
 			continue;
 		if (migration_entry_to_page(swp_entry) != page)
@@ -3110,12 +3138,12 @@ static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page,
 		entry = maybe_mkwrite(entry, vma);
 
 		flush_dcache_page(page);
-		set_pte_at(vma->vm_mm, address, pte + i, entry);
+		set_pte_at(vma->vm_mm, address, pte, entry);
 
 		/* No need to invalidate - it was non-present before */
-		update_mmu_cache(vma, address, pte + i);
+		update_mmu_cache(vma, address, pte);
 	}
-	pte_unmap_unlock(pte, ptl);
+	pte_unmap_unlock(pte - 1, ptl);
 }
 
 static void unfreeze_page(struct anon_vma *anon_vma, struct page *page)
@@ -3321,7 +3349,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 	spin_lock(&split_queue_lock);
 	count = page_count(head);
 	mapcount = total_mapcount(head);
-	if (mapcount == count - 1) {
+	if (!mapcount && count == 1) {
 		if (!list_empty(page_deferred_list(head))) {
 			split_queue_len--;
 			list_del(page_deferred_list(head));
@@ -3329,13 +3357,13 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 		spin_unlock(&split_queue_lock);
 		__split_huge_page(page, list);
 		ret = 0;
-	} else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount > count - 1) {
+	} else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
 		spin_unlock(&split_queue_lock);
 		pr_alert("total_mapcount: %u, page_count(): %u\n",
 				mapcount, count);
 		if (PageTail(page))
 			dump_page(head, NULL);
-		dump_page(page, "total_mapcount(head) > page_count(head) - 1");
+		dump_page(page, "total_mapcount(head) > 0");
 		BUG();
 	} else {
 		spin_unlock(&split_queue_lock);
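
Editorial footnote, not part of the commit: the stricter "if (!mapcount && count == 1)" test in the last two hunks encodes the invariant spelled out in the commit message. Written as a hypothetical helper (thp_ready_to_split is an illustrative name, not an existing kernel function), the condition would read:

	#include <linux/mm.h>	/* total_mapcount(), page_count() */

	/*
	 * Sketch only: after freeze_page() every subpage has been removed
	 * from the rmap and the reference held by each mapping dropped, so
	 * a THP may be split only when nothing maps it any more and the
	 * caller holds the sole remaining reference.
	 */
	static inline bool thp_ready_to_split(struct page *head)
	{
		return total_mapcount(head) == 0 && page_count(head) == 1;
	}

With CONFIG_DEBUG_VM, a non-zero mapcount after freezing is treated as a bug; an extra page reference merely makes the split fail so the caller can retry later.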