author		Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>	2015-02-11 18:25:25 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2015-02-11 20:06:01 -0500
commit		0f792cf949a0be506c2aa8bfac0605746b146dda
tree		5d96ac2ff92c4e65265a9d4c990b9872bd7de8eb /mm
parent		e66f17ff71772b209eed39de35aaa99ba819c93d
mm/hugetlb: fix getting refcount 0 page in hugetlb_fault()
When running the test which triggers the race shown in the previous patch,
we can hit the BUG "get_page() on refcount 0 page" in hugetlb_fault().

This race happens when the pte turns into a migration entry just after the
first is_hugetlb_entry_migration() check in hugetlb_fault() has returned
false. To fix this, we need to check pte_present() again after
huge_ptep_get().

This patch also reorders taking the ptl and calling pte_page(), because
pte_page() should be called under the ptl. Due to this reordering, we need
to use trylock_page() in the page != pagecache_page case to respect the
locking order.
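The fix is an instance of the classic check-lock-recheck pattern: any value
sampled without the lock must be re-validated once the lock is held. As a
purely illustrative userspace analogue (a sketch, not kernel code: the names
shared_entry, reader and writer are hypothetical, and a pthread mutex stands
in for the ptl):

```c
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; /* stands in for the ptl */
static int shared_entry = 42;                            /* stands in for the pte */

/* Reader: mirrors hugetlb_fault() re-checking the pte under the ptl. */
static void *reader(void *arg)
{
	int entry = shared_entry;    /* lockless snapshot, like huge_ptep_get() */

	(void)arg;
	pthread_mutex_lock(&lock);
	if (entry != shared_entry) { /* like !pte_same(entry, huge_ptep_get(ptep)) */
		/* Raced with the writer: back out rather than use stale state. */
		pthread_mutex_unlock(&lock);
		return NULL;
	}
	/* Safe to act on 'entry' here: it is known to still be current. */
	printf("entry is still %d\n", entry);
	pthread_mutex_unlock(&lock);
	return NULL;
}

/* Writer: mirrors migration turning the pte into a migration entry. */
static void *writer(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	shared_entry = -1;
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t r, w;

	pthread_create(&r, NULL, reader, NULL);
	pthread_create(&w, NULL, writer, NULL);
	pthread_join(r, NULL);
	pthread_join(w, NULL);
	return 0;
}
```

If the writer wins the race between the snapshot and the lock, the reader
backs out instead of acting on stale state; in hugetlb_fault() that back-out
is what prevents get_page() from running on a refcount-0 page.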
Fixes: 66aebce747ea ("hugetlb: fix race condition in hugetlb_fault()")
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: James Hogan <james.hogan@imgtec.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Nishanth Aravamudan <nacc@linux.vnet.ibm.com>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Steve Capper <steve.capper@linaro.org>
Cc: <stable@vger.kernel.org> [3.2+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--	mm/hugetlb.c	52
1 file changed, 36 insertions(+), 16 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5aca3707450f..385c3a1aced7 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3134,6 +3134,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	struct page *pagecache_page = NULL;
 	struct hstate *h = hstate_vma(vma);
 	struct address_space *mapping;
+	int need_wait_lock = 0;
 
 	address &= huge_page_mask(h);
 
@@ -3172,6 +3173,16 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	ret = 0;
 
 	/*
+	 * entry could be a migration/hwpoison entry at this point, so this
+	 * check prevents the kernel from going below assuming that we have
+	 * a active hugepage in pagecache. This goto expects the 2nd page fault,
+	 * and is_hugetlb_entry_(migration|hwpoisoned) check will properly
+	 * handle it.
+	 */
+	if (!pte_present(entry))
+		goto out_mutex;
+
+	/*
 	 * If we are going to COW the mapping later, we examine the pending
 	 * reservations for this page now. This will ensure that any
 	 * allocations necessary to record that reservation occur outside the
@@ -3190,30 +3201,31 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 							vma, address);
 	}
 
+	ptl = huge_pte_lock(h, mm, ptep);
+
+	/* Check for a racing update before calling hugetlb_cow */
+	if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
+		goto out_ptl;
+
 	/*
 	 * hugetlb_cow() requires page locks of pte_page(entry) and
 	 * pagecache_page, so here we need take the former one
 	 * when page != pagecache_page or !pagecache_page.
-	 * Note that locking order is always pagecache_page -> page,
-	 * so no worry about deadlock.
 	 */
 	page = pte_page(entry);
-	get_page(page);
 	if (page != pagecache_page)
-		lock_page(page);
-
-	ptl = huge_pte_lockptr(h, mm, ptep);
-	spin_lock(ptl);
-	/* Check for a racing update before calling hugetlb_cow */
-	if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
-		goto out_ptl;
+		if (!trylock_page(page)) {
+			need_wait_lock = 1;
+			goto out_ptl;
+		}
 
+	get_page(page);
 
 	if (flags & FAULT_FLAG_WRITE) {
 		if (!huge_pte_write(entry)) {
 			ret = hugetlb_cow(mm, vma, address, ptep, entry,
 					pagecache_page, ptl);
-			goto out_ptl;
+			goto out_put_page;
 		}
 		entry = huge_pte_mkdirty(entry);
 	}
@@ -3221,7 +3233,10 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (huge_ptep_set_access_flags(vma, address, ptep, entry,
 						flags & FAULT_FLAG_WRITE))
 		update_mmu_cache(vma, address, ptep);
-
+out_put_page:
+	if (page != pagecache_page)
+		unlock_page(page);
+	put_page(page);
 out_ptl:
 	spin_unlock(ptl);
 
@@ -3229,12 +3244,17 @@ out_ptl:
 		unlock_page(pagecache_page);
 		put_page(pagecache_page);
 	}
-	if (page != pagecache_page)
-		unlock_page(page);
-	put_page(page);
-
 out_mutex:
 	mutex_unlock(&htlb_fault_mutex_table[hash]);
+	/*
+	 * Generally it's safe to hold refcount during waiting page lock. But
+	 * here we just wait to defer the next page fault to avoid busy loop and
+	 * the page is not used after unlocked before returning from the current
+	 * page fault. So we are safe from accessing freed page, even if we wait
+	 * here without taking refcount.
+	 */
+	if (need_wait_lock)
+		wait_on_page_locked(page);
 	return ret;
 }
 
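The need_wait_lock path in the hunks above follows a common shape for
avoiding lock-order inversion: when the lock that by convention must be
taken first cannot be acquired without sleeping, fail fast with a trylock,
back out of everything, and only then wait, so that the retried operation
(here, the second page fault) can make progress. A userspace sketch of the
same shape (hypothetical names; pthread_mutex_trylock() stands in for
trylock_page(), and a bare lock/unlock pair for wait_on_page_locked()):

```c
#include <pthread.h>
#include <stdio.h>

/* Ordering rule in this sketch: page_lock must be taken before ptl. */
static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t ptl = PTHREAD_MUTEX_INITIALIZER;

/* Returns 0 on success, -1 if the caller should retry (the analogue of
 * returning from hugetlb_fault() so the fault is simply taken again). */
static int fault_once(void)
{
	int need_wait = 0;
	int ret = 0;

	pthread_mutex_lock(&ptl);

	/* We already hold ptl, so sleeping on page_lock here would invert
	 * the documented order; a trylock is the only legal attempt. */
	if (pthread_mutex_trylock(&page_lock) != 0) {
		need_wait = 1;
		ret = -1;
		goto out_ptl;
	}

	/* ... work that needs both locks would go here ... */
	pthread_mutex_unlock(&page_lock);

out_ptl:
	pthread_mutex_unlock(&ptl);

	/* Like wait_on_page_locked(): block until the contended lock is
	 * released, with no other locks held, then let the caller retry. */
	if (need_wait) {
		pthread_mutex_lock(&page_lock);
		pthread_mutex_unlock(&page_lock);
	}
	return ret;
}

int main(void)
{
	while (fault_once() != 0)
		;	/* retry, as the deferred second fault would */
	puts("done");
	return 0;
}
```

Note that the kernel code waits without holding a reference; as the new
comment in the patch explains, that is safe only because the page is never
touched again after the wait.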