diff options
author | Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> | 2015-02-11 18:25:22 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-02-11 20:06:01 -0500 |
commit | e66f17ff71772b209eed39de35aaa99ba819c93d (patch) | |
tree | fcf2db6086cb68e0c10354947c1c3d61bcd9f100 | |
parent | cbef8478bee55775ac312a574aad48af7bb9cf9f (diff) |
mm/hugetlb: take page table lock in follow_huge_pmd()
We have a race condition between move_pages() and freeing hugepages, where
move_pages() calls follow_page(FOLL_GET) for hugepages internally and
tries to get its refcount without preventing concurrent freeing. This
race crashes the kernel, so this patch fixes it by moving the FOLL_GET code
for hugepages into follow_huge_pmd(), where it runs under the page table lock.
This patch intentionally removes the page==NULL check after pte_page().
This is justified because pte_page() never returns NULL on any
architecture or configuration.
This patch changes the behavior of follow_huge_pmd() for tail pages:
tail pages can now be pinned and returned, so the caller must be changed to
properly handle the returned tail pages.
We could add similar locking to
follow_huge_(addr|pud) for consistency, but it's not necessary because
these functions currently don't support the FOLL_GET flag, so let's leave it
for future development.
Here is the reproducer:
$ cat movepages.c
#include <stdio.h>
#include <stdlib.h>
#include <numaif.h>
#define ADDR_INPUT 0x700000000000UL
#define HPS 0x200000
#define PS 0x1000
/*
 * Reproducer, mover side: forever bounce every base-page address of the
 * hugepage range at ADDR_INPUT in process <pid> between node 1 and node 0.
 *
 * usage: movepages <nr_hugepages> <pid>
 *
 * Returns 1 on bad arguments, allocation failure, or when
 * numa_move_pages() fails; otherwise it never returns.
 */
int main(int argc, char *argv[]) {
	/* Destination node for each pass, in the order the passes run. */
	static const int target_nodes[] = { 1, 0 };
	long nr_hp;
	size_t nr_p;
	size_t i;
	size_t t;
	int ret;
	void **addrs;
	int *status;
	int *nodes;
	pid_t pid;

	if (argc < 3) {
		fprintf(stderr, "usage: movepages <nr_hugepages> <pid>\n");
		return 1;
	}
	nr_hp = strtol(argv[1], NULL, 0);
	pid = strtol(argv[2], NULL, 0);
	if (nr_hp <= 0) {
		fprintf(stderr, "nr_hugepages must be positive\n");
		return 1;
	}

	/* Base pages per hugepage is HPS / PS; do the math in size_t. */
	nr_p = (size_t)nr_hp * (HPS / PS);

	/* calloc() sizes each array by its own element type and checks overflow. */
	addrs = calloc(nr_p, sizeof *addrs);
	status = calloc(nr_p, sizeof *status);
	nodes = calloc(nr_p, sizeof *nodes);
	if (!addrs || !status || !nodes) {
		perror("calloc");
		free(addrs);
		free(status);
		free(nodes);
		return 1;
	}

	while (1) {
		for (t = 0; t < sizeof target_nodes / sizeof target_nodes[0]; t++) {
			for (i = 0; i < nr_p; i++) {
				addrs[i] = (char *)ADDR_INPUT + i * PS;
				nodes[i] = target_nodes[t];
				status[i] = 0;
			}
			ret = numa_move_pages(pid, nr_p, addrs, nodes, status,
					      MPOL_MF_MOVE_ALL);
			if (ret == -1) {
				perror("move_pages");
				return 1;
			}
		}
	}
	return 0;
}
$ cat hugepage.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#define ADDR_INPUT 0x700000000000UL
#define HPS 0x200000
/*
 * Reproducer, mapper side: forever map <nr_hugepages> private anonymous
 * hugetlb pages at ADDR_INPUT, write zeroes over the whole range, and
 * unmap it again.
 *
 * usage: hugepage <nr_hugepages>
 *
 * Returns 1 on bad arguments or when the mapping cannot be established at
 * ADDR_INPUT; otherwise it never returns.
 */
int main(int argc, char *argv[]) {
	long nr_hp;
	size_t len;
	char *p;

	if (argc < 2) {
		fprintf(stderr, "usage: hugepage <nr_hugepages>\n");
		return 1;
	}
	nr_hp = strtol(argv[1], NULL, 0);
	if (nr_hp <= 0) {
		fprintf(stderr, "nr_hugepages must be positive\n");
		return 1;
	}

	/* Compute the length in size_t: nr_hp * HPS overflows int from 1024 up. */
	len = (size_t)nr_hp * HPS;

	while (1) {
		p = mmap((void *)ADDR_INPUT, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
		if (p == MAP_FAILED) {
			perror("mmap");
			return 1;
		}
		if (p != (void *)ADDR_INPUT) {
			/*
			 * The address is only a hint (no MAP_FIXED), so the
			 * kernel may place the mapping elsewhere; errno is not
			 * meaningful here, and the mapping must be released.
			 */
			fprintf(stderr, "mmap: mapped at %p instead of %p\n",
				(void *)p, (void *)ADDR_INPUT);
			munmap(p, len);
			return 1;
		}
		memset(p, 0, len);
		munmap(p, len);
	}
}
$ sysctl vm.nr_hugepages=40
$ ./hugepage 10 &
$ ./movepages 10 $(pgrep -f hugepage)
Fixes: e632a938d914 ("mm: migrate: add hugepage migration code to move_pages()")
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Reported-by: Hugh Dickins <hughd@google.com>
Cc: James Hogan <james.hogan@imgtec.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Nishanth Aravamudan <nacc@linux.vnet.ibm.com>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Steve Capper <steve.capper@linaro.org>
Cc: <stable@vger.kernel.org> [3.12+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | include/linux/hugetlb.h | 8 | ||||
-rw-r--r-- | include/linux/swapops.h | 4 | ||||
-rw-r--r-- | mm/gup.c | 25 | ||||
-rw-r--r-- | mm/hugetlb.c | 48 | ||||
-rw-r--r-- | mm/migrate.c | 5 |
5 files changed, 53 insertions, 37 deletions
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 7d7856359920..7b5785032049 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h | |||
@@ -99,9 +99,9 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep); | |||
99 | struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, | 99 | struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, |
100 | int write); | 100 | int write); |
101 | struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, | 101 | struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, |
102 | pmd_t *pmd, int write); | 102 | pmd_t *pmd, int flags); |
103 | struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, | 103 | struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, |
104 | pud_t *pud, int write); | 104 | pud_t *pud, int flags); |
105 | int pmd_huge(pmd_t pmd); | 105 | int pmd_huge(pmd_t pmd); |
106 | int pud_huge(pud_t pmd); | 106 | int pud_huge(pud_t pmd); |
107 | unsigned long hugetlb_change_protection(struct vm_area_struct *vma, | 107 | unsigned long hugetlb_change_protection(struct vm_area_struct *vma, |
@@ -133,8 +133,8 @@ static inline void hugetlb_report_meminfo(struct seq_file *m) | |||
133 | static inline void hugetlb_show_meminfo(void) | 133 | static inline void hugetlb_show_meminfo(void) |
134 | { | 134 | { |
135 | } | 135 | } |
136 | #define follow_huge_pmd(mm, addr, pmd, write) NULL | 136 | #define follow_huge_pmd(mm, addr, pmd, flags) NULL |
137 | #define follow_huge_pud(mm, addr, pud, write) NULL | 137 | #define follow_huge_pud(mm, addr, pud, flags) NULL |
138 | #define prepare_hugepage_range(file, addr, len) (-EINVAL) | 138 | #define prepare_hugepage_range(file, addr, len) (-EINVAL) |
139 | #define pmd_huge(x) 0 | 139 | #define pmd_huge(x) 0 |
140 | #define pud_huge(x) 0 | 140 | #define pud_huge(x) 0 |
diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 50cbc876be56..831a3168ab35 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h | |||
@@ -135,6 +135,8 @@ static inline void make_migration_entry_read(swp_entry_t *entry) | |||
135 | *entry = swp_entry(SWP_MIGRATION_READ, swp_offset(*entry)); | 135 | *entry = swp_entry(SWP_MIGRATION_READ, swp_offset(*entry)); |
136 | } | 136 | } |
137 | 137 | ||
138 | extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, | ||
139 | spinlock_t *ptl); | ||
138 | extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, | 140 | extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, |
139 | unsigned long address); | 141 | unsigned long address); |
140 | extern void migration_entry_wait_huge(struct vm_area_struct *vma, | 142 | extern void migration_entry_wait_huge(struct vm_area_struct *vma, |
@@ -148,6 +150,8 @@ static inline int is_migration_entry(swp_entry_t swp) | |||
148 | } | 150 | } |
149 | #define migration_entry_to_page(swp) NULL | 151 | #define migration_entry_to_page(swp) NULL |
150 | static inline void make_migration_entry_read(swp_entry_t *entryp) { } | 152 | static inline void make_migration_entry_read(swp_entry_t *entryp) { } |
153 | static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, | ||
154 | spinlock_t *ptl) { } | ||
151 | static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, | 155 | static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, |
152 | unsigned long address) { } | 156 | unsigned long address) { } |
153 | static inline void migration_entry_wait_huge(struct vm_area_struct *vma, | 157 | static inline void migration_entry_wait_huge(struct vm_area_struct *vma, |
@@ -167,10 +167,10 @@ struct page *follow_page_mask(struct vm_area_struct *vma, | |||
167 | if (pud_none(*pud)) | 167 | if (pud_none(*pud)) |
168 | return no_page_table(vma, flags); | 168 | return no_page_table(vma, flags); |
169 | if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { | 169 | if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { |
170 | if (flags & FOLL_GET) | 170 | page = follow_huge_pud(mm, address, pud, flags); |
171 | return NULL; | 171 | if (page) |
172 | page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); | 172 | return page; |
173 | return page; | 173 | return no_page_table(vma, flags); |
174 | } | 174 | } |
175 | if (unlikely(pud_bad(*pud))) | 175 | if (unlikely(pud_bad(*pud))) |
176 | return no_page_table(vma, flags); | 176 | return no_page_table(vma, flags); |
@@ -179,19 +179,10 @@ struct page *follow_page_mask(struct vm_area_struct *vma, | |||
179 | if (pmd_none(*pmd)) | 179 | if (pmd_none(*pmd)) |
180 | return no_page_table(vma, flags); | 180 | return no_page_table(vma, flags); |
181 | if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { | 181 | if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { |
182 | page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); | 182 | page = follow_huge_pmd(mm, address, pmd, flags); |
183 | if (flags & FOLL_GET) { | 183 | if (page) |
184 | /* | 184 | return page; |
185 | * Refcount on tail pages are not well-defined and | 185 | return no_page_table(vma, flags); |
186 | * shouldn't be taken. The caller should handle a NULL | ||
187 | * return when trying to follow tail pages. | ||
188 | */ | ||
189 | if (PageHead(page)) | ||
190 | get_page(page); | ||
191 | else | ||
192 | page = NULL; | ||
193 | } | ||
194 | return page; | ||
195 | } | 186 | } |
196 | if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) | 187 | if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) |
197 | return no_page_table(vma, flags); | 188 | return no_page_table(vma, flags); |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d96b8bfa748f..5aca3707450f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -3675,28 +3675,48 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address, | |||
3675 | 3675 | ||
3676 | struct page * __weak | 3676 | struct page * __weak |
3677 | follow_huge_pmd(struct mm_struct *mm, unsigned long address, | 3677 | follow_huge_pmd(struct mm_struct *mm, unsigned long address, |
3678 | pmd_t *pmd, int write) | 3678 | pmd_t *pmd, int flags) |
3679 | { | 3679 | { |
3680 | struct page *page; | 3680 | struct page *page = NULL; |
3681 | 3681 | spinlock_t *ptl; | |
3682 | if (!pmd_present(*pmd)) | 3682 | retry: |
3683 | return NULL; | 3683 | ptl = pmd_lockptr(mm, pmd); |
3684 | page = pte_page(*(pte_t *)pmd); | 3684 | spin_lock(ptl); |
3685 | if (page) | 3685 | /* |
3686 | page += ((address & ~PMD_MASK) >> PAGE_SHIFT); | 3686 | * make sure that the address range covered by this pmd is not |
3687 | * unmapped from other threads. | ||
3688 | */ | ||
3689 | if (!pmd_huge(*pmd)) | ||
3690 | goto out; | ||
3691 | if (pmd_present(*pmd)) { | ||
3692 | page = pte_page(*(pte_t *)pmd) + | ||
3693 | ((address & ~PMD_MASK) >> PAGE_SHIFT); | ||
3694 | if (flags & FOLL_GET) | ||
3695 | get_page(page); | ||
3696 | } else { | ||
3697 | if (is_hugetlb_entry_migration(huge_ptep_get((pte_t *)pmd))) { | ||
3698 | spin_unlock(ptl); | ||
3699 | __migration_entry_wait(mm, (pte_t *)pmd, ptl); | ||
3700 | goto retry; | ||
3701 | } | ||
3702 | /* | ||
3703 | * hwpoisoned entry is treated as no_page_table in | ||
3704 | * follow_page_mask(). | ||
3705 | */ | ||
3706 | } | ||
3707 | out: | ||
3708 | spin_unlock(ptl); | ||
3687 | return page; | 3709 | return page; |
3688 | } | 3710 | } |
3689 | 3711 | ||
3690 | struct page * __weak | 3712 | struct page * __weak |
3691 | follow_huge_pud(struct mm_struct *mm, unsigned long address, | 3713 | follow_huge_pud(struct mm_struct *mm, unsigned long address, |
3692 | pud_t *pud, int write) | 3714 | pud_t *pud, int flags) |
3693 | { | 3715 | { |
3694 | struct page *page; | 3716 | if (flags & FOLL_GET) |
3717 | return NULL; | ||
3695 | 3718 | ||
3696 | page = pte_page(*(pte_t *)pud); | 3719 | return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); |
3697 | if (page) | ||
3698 | page += ((address & ~PUD_MASK) >> PAGE_SHIFT); | ||
3699 | return page; | ||
3700 | } | 3720 | } |
3701 | 3721 | ||
3702 | #ifdef CONFIG_MEMORY_FAILURE | 3722 | #ifdef CONFIG_MEMORY_FAILURE |
diff --git a/mm/migrate.c b/mm/migrate.c index 6e284bcca8bb..f98067e5d353 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -197,7 +197,7 @@ static void remove_migration_ptes(struct page *old, struct page *new) | |||
197 | * get to the page and wait until migration is finished. | 197 | * get to the page and wait until migration is finished. |
198 | * When we return from this function the fault will be retried. | 198 | * When we return from this function the fault will be retried. |
199 | */ | 199 | */ |
200 | static void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, | 200 | void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, |
201 | spinlock_t *ptl) | 201 | spinlock_t *ptl) |
202 | { | 202 | { |
203 | pte_t pte; | 203 | pte_t pte; |
@@ -1236,7 +1236,8 @@ static int do_move_page_to_node_array(struct mm_struct *mm, | |||
1236 | goto put_and_set; | 1236 | goto put_and_set; |
1237 | 1237 | ||
1238 | if (PageHuge(page)) { | 1238 | if (PageHuge(page)) { |
1239 | isolate_huge_page(page, &pagelist); | 1239 | if (PageHead(page)) |
1240 | isolate_huge_page(page, &pagelist); | ||
1240 | goto put_and_set; | 1241 | goto put_and_set; |
1241 | } | 1242 | } |
1242 | 1243 | ||