author	Kirill A. Shutemov <kirill.shutemov@linux.intel.com>	2016-01-15 19:55:46 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-01-15 20:56:32 -0500
commit	bd56086f10186e2c205429cc12b16e43aacb1c7e (patch)
tree	b615ef3c093b30c30511b9f2ff625fdff7eab65e /mm/huge_memory.c
parent	b8d3c4c3009d42869dc03a1da0efc2aa687d0ab4 (diff)
thp: fix split_huge_page() after mremap() of THP
Sasha Levin has reported a KASAN out-of-bounds bug[1]. It points to "if
(!is_swap_pte(pte[i]))" in unfreeze_page_vma() as the problematic access.

The cause is that split_huge_page() doesn't handle a THP correctly if it is
not aligned to a PMD boundary, which can happen after mremap().
Test-case (does not always trigger the bug):
	#define _GNU_SOURCE
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/mman.h>

	#define MB (1024UL*1024)
	#define SIZE (2*MB)
	#define BASE ((void *)0x400000000000)

	int main()
	{
		char *p;

		p = mmap(BASE, SIZE, PROT_READ | PROT_WRITE,
				MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE,
				-1, 0);
		if (p == MAP_FAILED)
			perror("mmap"), exit(1);

		p = mremap(BASE, SIZE, SIZE, MREMAP_FIXED | MREMAP_MAYMOVE,
				BASE + SIZE + 8192);
		if (p == MAP_FAILED)
			perror("mremap"), exit(1);

		system("echo 1 > /sys/kernel/debug/split_huge_pages");
		return 0;
	}
The patch fixes the freeze and unfreeze paths to handle crossing of a page
table boundary during the PTE walk.
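Both freeze_page_vma() and unfreeze_page_vma() walk HPAGE_PMD_NR PTEs starting
from the address of the head page, which after mremap() need not be
PMD-aligned, so the walk can run off the end of one page table into the next.
A minimal sketch of the boundary-crossing handling the two paths now share,
condensed from the diff further down (the per-PTE freeze/unfreeze work is
elided):

	/* haddr is the PMD-aligned start of the range being walked */
	unsigned long haddr = address & HPAGE_PMD_MASK;

	pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
	for (i = 0; i < HPAGE_PMD_NR; i++, address += PAGE_SIZE, page++, pte++) {
		/*
		 * Crossed a page table boundary: the mapped PTE page no
		 * longer covers 'address', so unmap it and map the next one.
		 */
		if (unlikely(address == haddr + HPAGE_PMD_SIZE)) {
			pte_unmap_unlock(pte - 1, ptl);
			pmd = mm_find_pmd(vma->vm_mm, address);
			if (!pmd)
				return;
			pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
		}

		/* ... per-PTE freeze/unfreeze work on *pte ... */
	}
	/* pte has been post-incremented past the last entry handled */
	pte_unmap_unlock(pte - 1, ptl);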
It also makes the mapcount vs. count check in split_huge_page_to_list()
stricter (see the sketch after this list):

 - after freeze we don't expect any subpage to be mapped, since we remove
   them from the rmap when setting up migration entries;

 - count must be 1, meaning only the caller has a reference to the page;
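Concretely, the updated check looks roughly like this (condensed from the last
two hunks of the diff below; locking and deferred-list handling omitted):

	count = page_count(head);
	mapcount = total_mapcount(head);
	if (!mapcount && count == 1) {
		/* no subpage is mapped and only the caller holds a reference */
		__split_huge_page(page, list);
		ret = 0;
	} else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
		/* a subpage is still mapped after freeze: that is a bug */
		dump_page(page, "total_mapcount(head) > 0");
		BUG();
	} else {
		/* extra references are held elsewhere: back out (elided) */
	}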
[1] https://gist.github.com/sashalevin/c67fbea55e7c0576972a
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reported-by: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r--	mm/huge_memory.c	70
1 file changed, 49 insertions(+), 21 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1a4989fef08f..9d12d63a0ddd 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2990,6 +2990,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
 static void freeze_page_vma(struct vm_area_struct *vma, struct page *page,
 		unsigned long address)
 {
+	unsigned long haddr = address & HPAGE_PMD_MASK;
 	spinlock_t *ptl;
 	pgd_t *pgd;
 	pud_t *pud;
@@ -3019,34 +3020,47 @@ static void freeze_page_vma(struct vm_area_struct *vma, struct page *page,
 	}
 	if (pmd_trans_huge(*pmd)) {
 		if (page == pmd_page(*pmd))
-			__split_huge_pmd_locked(vma, pmd, address, true);
+			__split_huge_pmd_locked(vma, pmd, haddr, true);
 		spin_unlock(ptl);
 		return;
 	}
 	spin_unlock(ptl);
 
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
-	for (i = 0; i < nr; i++, address += PAGE_SIZE, page++) {
+	for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) {
 		pte_t entry, swp_pte;
 		swp_entry_t swp_entry;
 
-		if (!pte_present(pte[i]))
+		/*
+		 * We've just crossed page table boundary: need to map next one.
+		 * It can happen if THP was mremaped to non PMD-aligned address.
+		 */
+		if (unlikely(address == haddr + HPAGE_PMD_SIZE)) {
+			pte_unmap_unlock(pte - 1, ptl);
+			pmd = mm_find_pmd(vma->vm_mm, address);
+			if (!pmd)
+				return;
+			pte = pte_offset_map_lock(vma->vm_mm, pmd,
+					address, &ptl);
+		}
+
+		if (!pte_present(*pte))
 			continue;
-		if (page_to_pfn(page) != pte_pfn(pte[i]))
+		if (page_to_pfn(page) != pte_pfn(*pte))
 			continue;
 		flush_cache_page(vma, address, page_to_pfn(page));
-		entry = ptep_clear_flush(vma, address, pte + i);
+		entry = ptep_clear_flush(vma, address, pte);
 		if (pte_dirty(entry))
 			SetPageDirty(page);
 		swp_entry = make_migration_entry(page, pte_write(entry));
 		swp_pte = swp_entry_to_pte(swp_entry);
 		if (pte_soft_dirty(entry))
 			swp_pte = pte_swp_mksoft_dirty(swp_pte);
-		set_pte_at(vma->vm_mm, address, pte + i, swp_pte);
+		set_pte_at(vma->vm_mm, address, pte, swp_pte);
 		page_remove_rmap(page, false);
 		put_page(page);
 	}
-	pte_unmap_unlock(pte, ptl);
+	pte_unmap_unlock(pte - 1, ptl);
 }
 
 static void freeze_page(struct anon_vma *anon_vma, struct page *page)
@@ -3058,14 +3072,13 @@ static void freeze_page(struct anon_vma *anon_vma, struct page *page)
 
 	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff,
 			pgoff + HPAGE_PMD_NR - 1) {
-		unsigned long haddr;
+		unsigned long address = __vma_address(page, avc->vma);
 
-		haddr = __vma_address(page, avc->vma) & HPAGE_PMD_MASK;
 		mmu_notifier_invalidate_range_start(avc->vma->vm_mm,
-				haddr, haddr + HPAGE_PMD_SIZE);
-		freeze_page_vma(avc->vma, page, haddr);
+				address, address + HPAGE_PMD_SIZE);
+		freeze_page_vma(avc->vma, page, address);
 		mmu_notifier_invalidate_range_end(avc->vma->vm_mm,
-				haddr, haddr + HPAGE_PMD_SIZE);
+				address, address + HPAGE_PMD_SIZE);
 	}
 }
 
@@ -3076,6 +3089,7 @@ static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page,
 	pmd_t *pmd;
 	pte_t *pte, entry;
 	swp_entry_t swp_entry;
+	unsigned long haddr = address & HPAGE_PMD_MASK;
 	int i, nr = HPAGE_PMD_NR;
 
 	/* Skip pages which doesn't belong to the VMA */
@@ -3089,12 +3103,26 @@ static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page,
 	pmd = mm_find_pmd(vma->vm_mm, address);
 	if (!pmd)
 		return;
+
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl);
-	for (i = 0; i < nr; i++, address += PAGE_SIZE, page++) {
-		if (!is_swap_pte(pte[i]))
+	for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) {
+		/*
+		 * We've just crossed page table boundary: need to map next one.
+		 * It can happen if THP was mremaped to non-PMD aligned address.
+		 */
+		if (unlikely(address == haddr + HPAGE_PMD_SIZE)) {
+			pte_unmap_unlock(pte - 1, ptl);
+			pmd = mm_find_pmd(vma->vm_mm, address);
+			if (!pmd)
+				return;
+			pte = pte_offset_map_lock(vma->vm_mm, pmd,
+					address, &ptl);
+		}
+
+		if (!is_swap_pte(*pte))
 			continue;
 
-		swp_entry = pte_to_swp_entry(pte[i]);
+		swp_entry = pte_to_swp_entry(*pte);
 		if (!is_migration_entry(swp_entry))
 			continue;
 		if (migration_entry_to_page(swp_entry) != page)
@@ -3110,12 +3138,12 @@ static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page,
 		entry = maybe_mkwrite(entry, vma);
 
 		flush_dcache_page(page);
-		set_pte_at(vma->vm_mm, address, pte + i, entry);
+		set_pte_at(vma->vm_mm, address, pte, entry);
 
 		/* No need to invalidate - it was non-present before */
-		update_mmu_cache(vma, address, pte + i);
+		update_mmu_cache(vma, address, pte);
 	}
-	pte_unmap_unlock(pte, ptl);
+	pte_unmap_unlock(pte - 1, ptl);
 }
 
 static void unfreeze_page(struct anon_vma *anon_vma, struct page *page)
@@ -3321,7 +3349,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 	spin_lock(&split_queue_lock);
 	count = page_count(head);
 	mapcount = total_mapcount(head);
-	if (mapcount == count - 1) {
+	if (!mapcount && count == 1) {
 		if (!list_empty(page_deferred_list(head))) {
 			split_queue_len--;
 			list_del(page_deferred_list(head));
@@ -3329,13 +3357,13 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 		spin_unlock(&split_queue_lock);
 		__split_huge_page(page, list);
 		ret = 0;
-	} else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount > count - 1) {
+	} else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
 		spin_unlock(&split_queue_lock);
 		pr_alert("total_mapcount: %u, page_count(): %u\n",
 				mapcount, count);
 		if (PageTail(page))
 			dump_page(head, NULL);
-		dump_page(page, "total_mapcount(head) > page_count(head) - 1");
+		dump_page(page, "total_mapcount(head) > 0");
 		BUG();
 	} else {
 		spin_unlock(&split_queue_lock);