aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorNaoya Horiguchi <n-horiguchi@ah.jp.nec.com>2015-02-11 18:25:22 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2015-02-11 20:06:01 -0500
commite66f17ff71772b209eed39de35aaa99ba819c93d (patch)
treefcf2db6086cb68e0c10354947c1c3d61bcd9f100
parentcbef8478bee55775ac312a574aad48af7bb9cf9f (diff)
mm/hugetlb: take page table lock in follow_huge_pmd()
We have a race condition between move_pages() and freeing hugepages, where move_pages() calls follow_page(FOLL_GET) for hugepages internally and tries to get its refcount without preventing concurrent freeing. This race crashes the kernel, so this patch fixes it by moving FOLL_GET code for hugepages into follow_huge_pmd() with taking the page table lock. This patch intentionally removes page==NULL check after pte_page. This is justified because pte_page() never returns NULL for any architectures or configurations. This patch changes the behavior of follow_huge_pmd() for tail pages and then tail pages can be pinned/returned. So the caller must be changed to properly handle the returned tail pages. We could have a choice to add the similar locking to follow_huge_(addr|pud) for consistency, but it's not necessary because currently these functions don't support FOLL_GET flag, so let's leave it for future development. Here is the reproducer: $ cat movepages.c #include <stdio.h> #include <stdlib.h> #include <numaif.h> #define ADDR_INPUT 0x700000000000UL #define HPS 0x200000 #define PS 0x1000 int main(int argc, char *argv[]) { int i; int nr_hp = strtol(argv[1], NULL, 0); int nr_p = nr_hp * HPS / PS; int ret; void **addrs; int *status; int *nodes; pid_t pid; pid = strtol(argv[2], NULL, 0); addrs = malloc(sizeof(char *) * nr_p + 1); status = malloc(sizeof(char *) * nr_p + 1); nodes = malloc(sizeof(char *) * nr_p + 1); while (1) { for (i = 0; i < nr_p; i++) { addrs[i] = (void *)ADDR_INPUT + i * PS; nodes[i] = 1; status[i] = 0; } ret = numa_move_pages(pid, nr_p, addrs, nodes, status, MPOL_MF_MOVE_ALL); if (ret == -1) err("move_pages"); for (i = 0; i < nr_p; i++) { addrs[i] = (void *)ADDR_INPUT + i * PS; nodes[i] = 0; status[i] = 0; } ret = numa_move_pages(pid, nr_p, addrs, nodes, status, MPOL_MF_MOVE_ALL); if (ret == -1) err("move_pages"); } return 0; } $ cat hugepage.c #include <stdio.h> #include <sys/mman.h> #include <string.h> #define ADDR_INPUT 0x700000000000UL #define HPS 0x200000 int main(int argc, char *argv[]) { int nr_hp = strtol(argv[1], NULL, 0); char *p; while (1) { p = mmap((void *)ADDR_INPUT, nr_hp * HPS, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); if (p != (void *)ADDR_INPUT) { perror("mmap"); break; } memset(p, 0, nr_hp * HPS); munmap(p, nr_hp * HPS); } } $ sysctl vm.nr_hugepages=40 $ ./hugepage 10 & $ ./movepages 10 $(pgrep -f hugepage) Fixes: e632a938d914 ("mm: migrate: add hugepage migration code to move_pages()") Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Reported-by: Hugh Dickins <hughd@google.com> Cc: James Hogan <james.hogan@imgtec.com> Cc: David Rientjes <rientjes@google.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Michal Hocko <mhocko@suse.cz> Cc: Rik van Riel <riel@redhat.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Luiz Capitulino <lcapitulino@redhat.com> Cc: Nishanth Aravamudan <nacc@linux.vnet.ibm.com> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Steve Capper <steve.capper@linaro.org> Cc: <stable@vger.kernel.org> [3.12+] Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--include/linux/hugetlb.h8
-rw-r--r--include/linux/swapops.h4
-rw-r--r--mm/gup.c25
-rw-r--r--mm/hugetlb.c48
-rw-r--r--mm/migrate.c5
5 files changed, 53 insertions, 37 deletions
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 7d7856359920..7b5785032049 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -99,9 +99,9 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep);
99struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, 99struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
100 int write); 100 int write);
101struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, 101struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
102 pmd_t *pmd, int write); 102 pmd_t *pmd, int flags);
103struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, 103struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
104 pud_t *pud, int write); 104 pud_t *pud, int flags);
105int pmd_huge(pmd_t pmd); 105int pmd_huge(pmd_t pmd);
106int pud_huge(pud_t pmd); 106int pud_huge(pud_t pmd);
107unsigned long hugetlb_change_protection(struct vm_area_struct *vma, 107unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
@@ -133,8 +133,8 @@ static inline void hugetlb_report_meminfo(struct seq_file *m)
133static inline void hugetlb_show_meminfo(void) 133static inline void hugetlb_show_meminfo(void)
134{ 134{
135} 135}
136#define follow_huge_pmd(mm, addr, pmd, write) NULL 136#define follow_huge_pmd(mm, addr, pmd, flags) NULL
137#define follow_huge_pud(mm, addr, pud, write) NULL 137#define follow_huge_pud(mm, addr, pud, flags) NULL
138#define prepare_hugepage_range(file, addr, len) (-EINVAL) 138#define prepare_hugepage_range(file, addr, len) (-EINVAL)
139#define pmd_huge(x) 0 139#define pmd_huge(x) 0
140#define pud_huge(x) 0 140#define pud_huge(x) 0
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index 50cbc876be56..831a3168ab35 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -135,6 +135,8 @@ static inline void make_migration_entry_read(swp_entry_t *entry)
135 *entry = swp_entry(SWP_MIGRATION_READ, swp_offset(*entry)); 135 *entry = swp_entry(SWP_MIGRATION_READ, swp_offset(*entry));
136} 136}
137 137
138extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
139 spinlock_t *ptl);
138extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, 140extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
139 unsigned long address); 141 unsigned long address);
140extern void migration_entry_wait_huge(struct vm_area_struct *vma, 142extern void migration_entry_wait_huge(struct vm_area_struct *vma,
@@ -148,6 +150,8 @@ static inline int is_migration_entry(swp_entry_t swp)
148} 150}
149#define migration_entry_to_page(swp) NULL 151#define migration_entry_to_page(swp) NULL
150static inline void make_migration_entry_read(swp_entry_t *entryp) { } 152static inline void make_migration_entry_read(swp_entry_t *entryp) { }
153static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
154 spinlock_t *ptl) { }
151static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, 155static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
152 unsigned long address) { } 156 unsigned long address) { }
153static inline void migration_entry_wait_huge(struct vm_area_struct *vma, 157static inline void migration_entry_wait_huge(struct vm_area_struct *vma,
diff --git a/mm/gup.c b/mm/gup.c
index 12bc2bc33da7..1a8ab05918e0 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -167,10 +167,10 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
167 if (pud_none(*pud)) 167 if (pud_none(*pud))
168 return no_page_table(vma, flags); 168 return no_page_table(vma, flags);
169 if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { 169 if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
170 if (flags & FOLL_GET) 170 page = follow_huge_pud(mm, address, pud, flags);
171 return NULL; 171 if (page)
172 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); 172 return page;
173 return page; 173 return no_page_table(vma, flags);
174 } 174 }
175 if (unlikely(pud_bad(*pud))) 175 if (unlikely(pud_bad(*pud)))
176 return no_page_table(vma, flags); 176 return no_page_table(vma, flags);
@@ -179,19 +179,10 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
179 if (pmd_none(*pmd)) 179 if (pmd_none(*pmd))
180 return no_page_table(vma, flags); 180 return no_page_table(vma, flags);
181 if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { 181 if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
182 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); 182 page = follow_huge_pmd(mm, address, pmd, flags);
183 if (flags & FOLL_GET) { 183 if (page)
184 /* 184 return page;
185 * Refcount on tail pages are not well-defined and 185 return no_page_table(vma, flags);
186 * shouldn't be taken. The caller should handle a NULL
187 * return when trying to follow tail pages.
188 */
189 if (PageHead(page))
190 get_page(page);
191 else
192 page = NULL;
193 }
194 return page;
195 } 186 }
196 if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) 187 if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
197 return no_page_table(vma, flags); 188 return no_page_table(vma, flags);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d96b8bfa748f..5aca3707450f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3675,28 +3675,48 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address,
3675 3675
3676struct page * __weak 3676struct page * __weak
3677follow_huge_pmd(struct mm_struct *mm, unsigned long address, 3677follow_huge_pmd(struct mm_struct *mm, unsigned long address,
3678 pmd_t *pmd, int write) 3678 pmd_t *pmd, int flags)
3679{ 3679{
3680 struct page *page; 3680 struct page *page = NULL;
3681 3681 spinlock_t *ptl;
3682 if (!pmd_present(*pmd)) 3682retry:
3683 return NULL; 3683 ptl = pmd_lockptr(mm, pmd);
3684 page = pte_page(*(pte_t *)pmd); 3684 spin_lock(ptl);
3685 if (page) 3685 /*
3686 page += ((address & ~PMD_MASK) >> PAGE_SHIFT); 3686 * make sure that the address range covered by this pmd is not
3687 * unmapped from other threads.
3688 */
3689 if (!pmd_huge(*pmd))
3690 goto out;
3691 if (pmd_present(*pmd)) {
3692 page = pte_page(*(pte_t *)pmd) +
3693 ((address & ~PMD_MASK) >> PAGE_SHIFT);
3694 if (flags & FOLL_GET)
3695 get_page(page);
3696 } else {
3697 if (is_hugetlb_entry_migration(huge_ptep_get((pte_t *)pmd))) {
3698 spin_unlock(ptl);
3699 __migration_entry_wait(mm, (pte_t *)pmd, ptl);
3700 goto retry;
3701 }
3702 /*
3703 * hwpoisoned entry is treated as no_page_table in
3704 * follow_page_mask().
3705 */
3706 }
3707out:
3708 spin_unlock(ptl);
3687 return page; 3709 return page;
3688} 3710}
3689 3711
3690struct page * __weak 3712struct page * __weak
3691follow_huge_pud(struct mm_struct *mm, unsigned long address, 3713follow_huge_pud(struct mm_struct *mm, unsigned long address,
3692 pud_t *pud, int write) 3714 pud_t *pud, int flags)
3693{ 3715{
3694 struct page *page; 3716 if (flags & FOLL_GET)
3717 return NULL;
3695 3718
3696 page = pte_page(*(pte_t *)pud); 3719 return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
3697 if (page)
3698 page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
3699 return page;
3700} 3720}
3701 3721
3702#ifdef CONFIG_MEMORY_FAILURE 3722#ifdef CONFIG_MEMORY_FAILURE
diff --git a/mm/migrate.c b/mm/migrate.c
index 6e284bcca8bb..f98067e5d353 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -197,7 +197,7 @@ static void remove_migration_ptes(struct page *old, struct page *new)
197 * get to the page and wait until migration is finished. 197 * get to the page and wait until migration is finished.
198 * When we return from this function the fault will be retried. 198 * When we return from this function the fault will be retried.
199 */ 199 */
200static void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, 200void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
201 spinlock_t *ptl) 201 spinlock_t *ptl)
202{ 202{
203 pte_t pte; 203 pte_t pte;
@@ -1236,7 +1236,8 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
1236 goto put_and_set; 1236 goto put_and_set;
1237 1237
1238 if (PageHuge(page)) { 1238 if (PageHuge(page)) {
1239 isolate_huge_page(page, &pagelist); 1239 if (PageHead(page))
1240 isolate_huge_page(page, &pagelist);
1240 goto put_and_set; 1241 goto put_and_set;
1241 } 1242 }
1242 1243