diff options
-rw-r--r-- | arch/x86/include/asm/pgtable-3level.h | 50 | ||||
-rw-r--r-- | include/asm-generic/pgtable.h | 22 |
2 files changed, 70 insertions, 2 deletions
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index effff47a3c82..43876f16caf1 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h | |||
@@ -31,6 +31,56 @@ static inline void native_set_pte(pte_t *ptep, pte_t pte) | |||
31 | ptep->pte_low = pte.pte_low; | 31 | ptep->pte_low = pte.pte_low; |
32 | } | 32 | } |
33 | 33 | ||
#define pmd_read_atomic pmd_read_atomic
/*
 * pte_offset_map_lock on 32bit PAE kernels was reading the pmd_t with
 * a "*pmdp" dereference done by gcc. Problem is, in certain places
 * where pte_offset_map_lock is called, concurrent page faults are
 * allowed, if the mmap_sem is held for reading. An example is mincore
 * vs page faults vs MADV_DONTNEED. On the page fault side
 * pmd_populate rightfully does a set_64bit, but if we're reading the
 * pmd_t with a "*pmdp" on the mincore side, an SMP race can happen
 * because gcc will not read the 64bit of the pmd atomically. To fix
 * this, all places running pte_offset_map_lock() while holding the
 * mmap_sem in read mode shall read the pmdp pointer using this
 * function to know if the pmd is null or not, and in turn to know if
 * they can run pte_offset_map_lock or pmd_trans_huge or other pmd
 * operations.
 *
 * Without THP, if the mmap_sem is held for reading, the pmd can only
 * transition from null to not null while pmd_read_atomic runs.
 * So there's no need of literally reading it atomically.
 *
 * With THP, if the mmap_sem is held for reading, the pmd can become
 * THP or null or point to a pte (and in turn become "stable") at any
 * time under pmd_read_atomic, so it's mandatory to read it atomically
 * with cmpxchg8b.
 */
#ifndef CONFIG_TRANSPARENT_HUGEPAGE
static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
{
	pmdval_t ret;
	u32 *tmp = (u32 *)pmdp;

	/* Read the low 32 bits first; a null pmd has a null low half. */
	ret = (pmdval_t) (*tmp);
	if (ret) {
		/*
		 * If the low part is null, we must not read the high part
		 * or we can end up with a partial pmd.  The smp_rmb()
		 * orders this read of the high half after the read of
		 * the low half above.
		 */
		smp_rmb();
		ret |= ((pmdval_t)*(tmp + 1)) << 32;
	}

	return (pmd_t) { ret };
}
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
{
	/*
	 * atomic64_read provides a single atomic 64bit load (via
	 * cmpxchg8b on 32bit PAE, per the comment above), so a
	 * concurrent set_64bit can never be observed half-done.
	 */
	return (pmd_t) { atomic64_read((atomic64_t *)pmdp) };
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
83 | |||
34 | static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) | 84 | static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) |
35 | { | 85 | { |
36 | set_64bit((unsigned long long *)(ptep), native_pte_val(pte)); | 86 | set_64bit((unsigned long long *)(ptep), native_pte_val(pte)); |
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index e2768f188f55..6f2b45a9b6bc 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h | |||
@@ -445,6 +445,18 @@ static inline int pmd_write(pmd_t pmd) | |||
445 | #endif /* __HAVE_ARCH_PMD_WRITE */ | 445 | #endif /* __HAVE_ARCH_PMD_WRITE */ |
446 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 446 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
447 | 447 | ||
#ifndef pmd_read_atomic
static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
{
	/*
	 * Generic fallback: depend on the compiler for an atomic pmd
	 * read.  NOTE: this is only going to work if the pmdval_t
	 * isn't larger than an unsigned long; architectures with a
	 * wider pmd (e.g. 32bit PAE x86) must override this by
	 * defining pmd_read_atomic themselves.
	 */
	return *pmdp;
}
#endif
459 | |||
448 | /* | 460 | /* |
449 | * This function is meant to be used by sites walking pagetables with | 461 | * This function is meant to be used by sites walking pagetables with |
450 | * the mmap_sem held in read mode to protect against MADV_DONTNEED and | 462 | * the mmap_sem held in read mode to protect against MADV_DONTNEED and |
@@ -458,11 +470,17 @@ static inline int pmd_write(pmd_t pmd) | |||
458 | * undefined so behaving as if the pmd was none is safe (because it | 470 | * undefined so behaving as if the pmd was none is safe (because it |
459 | * can return none anyway). The compiler level barrier() is critically | 471 | * can return none anyway). The compiler level barrier() is critically |
460 | * important to compute the two checks atomically on the same pmdval. | 472 | * important to compute the two checks atomically on the same pmdval. |
473 | * | ||
474 | * For 32bit kernels with a 64bit large pmd_t this automatically takes | ||
475 | * care of reading the pmd atomically to avoid SMP race conditions | ||
476 | * against pmd_populate() when the mmap_sem is held for reading by the | ||
477 | * caller (a special atomic read not done by "gcc" as in the generic | ||
478 | * version above, is also needed when THP is disabled because the page | ||
479 | * fault can populate the pmd from under us). | ||
461 | */ | 480 | */ |
462 | static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd) | 481 | static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd) |
463 | { | 482 | { |
464 | /* depend on compiler for an atomic pmd read */ | 483 | pmd_t pmdval = pmd_read_atomic(pmd); |
465 | pmd_t pmdval = *pmd; | ||
466 | /* | 484 | /* |
467 | * The barrier will stabilize the pmdval in a register or on | 485 | * The barrier will stabilize the pmdval in a register or on |
468 | * the stack so that it will stop changing under the code. | 486 | * the stack so that it will stop changing under the code. |