aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
authorAndrea Arcangeli <aarcange@redhat.com>2012-06-20 15:52:57 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2012-06-20 17:39:35 -0400
commite4eed03fd06578571c01d4f1478c874bb432c815 (patch)
tree9cfd16247d8208a1fe55ad81b0d4d85c17b80415 /arch/x86
parentabca7c4965845924f65d40e0aa1092bdd895e314 (diff)
thp: avoid atomic64_read in pmd_read_atomic for 32bit PAE
In the x86 32bit PAE CONFIG_TRANSPARENT_HUGEPAGE=y case while holding the mmap_sem for reading, cmpxchg8b cannot be used to read pmd contents under Xen. So instead of dealing only with "consistent" pmdvals in pmd_none_or_trans_huge_or_clear_bad() (which would be conceptually simpler) we let pmd_none_or_trans_huge_or_clear_bad() deal with pmdvals where the low 32bit and high 32bit could be inconsistent (to avoid having to use cmpxchg8b). The only guarantee we get from pmd_read_atomic is that if the low part of the pmd was found null, the high part will be null too (so the pmd will be considered unstable). And if the low part of the pmd is found "stable" later, then it means the whole pmd was read atomically (because after a pmd is stable, neither MADV_DONTNEED nor page faults can alter it anymore, and we read the high part after the low part). In the 32bit PAE x86 case, it is enough to read the low part of the pmdval atomically to declare the pmd as "stable" and that's true for THP and no THP, furthermore in the THP case we also have a barrier() that will prevent any inconsistent pmdvals to be cached by a later re-read of the *pmd. Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> Cc: Jonathan Nieder <jrnieder@gmail.com> Cc: Ulrich Obergfell <uobergfe@redhat.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Hugh Dickins <hughd@google.com> Cc: Larry Woodman <lwoodman@redhat.com> Cc: Petr Matousek <pmatouse@redhat.com> Cc: Rik van Riel <riel@redhat.com> Cc: Jan Beulich <jbeulich@suse.com> Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com> Tested-by: Andrew Jones <drjones@redhat.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/include/asm/pgtable-3level.h30
1 files changed, 17 insertions, 13 deletions
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 43876f16caf1..cb00ccc7d571 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -47,16 +47,26 @@ static inline void native_set_pte(pte_t *ptep, pte_t pte)
47 * they can run pmd_offset_map_lock or pmd_trans_huge or other pmd 47 * they can run pmd_offset_map_lock or pmd_trans_huge or other pmd
48 * operations. 48 * operations.
49 * 49 *
50 * Without THP if the mmap_sem is hold for reading, the 50 * Without THP if the mmap_sem is hold for reading, the pmd can only
51 * pmd can only transition from null to not null while pmd_read_atomic runs. 51 * transition from null to not null while pmd_read_atomic runs. So
52 * So there's no need of literally reading it atomically. 52 * we can always return atomic pmd values with this function.
53 * 53 *
54 * With THP if the mmap_sem is hold for reading, the pmd can become 54 * With THP if the mmap_sem is hold for reading, the pmd can become
55 * THP or null or point to a pte (and in turn become "stable") at any 55 * trans_huge or none or point to a pte (and in turn become "stable")
56 * time under pmd_read_atomic, so it's mandatory to read it atomically 56 * at any time under pmd_read_atomic. We could read it really
57 * with cmpxchg8b. 57 * atomically here with a atomic64_read for the THP enabled case (and
58 * it would be a whole lot simpler), but to avoid using cmpxchg8b we
59 * only return an atomic pmdval if the low part of the pmdval is later
60 * found stable (i.e. pointing to a pte). And we're returning a none
61 * pmdval if the low part of the pmd is none. In some cases the high
62 * and low part of the pmdval returned may not be consistent if THP is
63 * enabled (the low part may point to previously mapped hugepage,
64 * while the high part may point to a more recently mapped hugepage),
65 * but pmd_none_or_trans_huge_or_clear_bad() only needs the low part
66 * of the pmd to be read atomically to decide if the pmd is unstable
67 * or not, with the only exception of when the low part of the pmd is
68 * zero in which case we return a none pmd.
58 */ 69 */
59#ifndef CONFIG_TRANSPARENT_HUGEPAGE
60static inline pmd_t pmd_read_atomic(pmd_t *pmdp) 70static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
61{ 71{
62 pmdval_t ret; 72 pmdval_t ret;
@@ -74,12 +84,6 @@ static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
74 84
75 return (pmd_t) { ret }; 85 return (pmd_t) { ret };
76} 86}
77#else /* CONFIG_TRANSPARENT_HUGEPAGE */
78static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
79{
80 return (pmd_t) { atomic64_read((atomic64_t *)pmdp) };
81}
82#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
83 87
84static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) 88static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
85{ 89{