diff options
-rw-r--r-- | arch/x86/include/asm/pgtable-3level.h | 50 | ||||
-rw-r--r-- | include/asm-generic/pgtable.h | 22 |
2 files changed, 70 insertions, 2 deletions
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index effff47a3c82..43876f16caf1 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h | |||
@@ -31,6 +31,56 @@ static inline void native_set_pte(pte_t *ptep, pte_t pte) | |||
31 | ptep->pte_low = pte.pte_low; | 31 | ptep->pte_low = pte.pte_low; |
32 | } | 32 | } |
33 | 33 | ||
#define pmd_read_atomic pmd_read_atomic
/*
 * pte_offset_map_lock on 32bit PAE kernels was reading the pmd_t with
 * a "*pmdp" dereference done by gcc. Problem is, in certain places
 * where pte_offset_map_lock is called, concurrent page faults are
 * allowed, if the mmap_sem is held for reading. An example is mincore
 * vs page faults vs MADV_DONTNEED. On the page fault side
 * pmd_populate rightfully does a set_64bit, but if we're reading the
 * pmd_t with a "*pmdp" on the mincore side, an SMP race can happen
 * because gcc will not read the 64bit of the pmd atomically. To fix
 * this, all places running pte_offset_map_lock() while holding the
 * mmap_sem in read mode shall read the pmdp pointer using this
 * function to know if the pmd is null or not, and in turn to know if
 * they can run pte_offset_map_lock or pmd_trans_huge or other pmd
 * operations.
 *
 * Without THP, if the mmap_sem is held for reading, the pmd can only
 * transition from null to not null while pmd_read_atomic runs.
 * So there's no need of literally reading it atomically.
 *
 * With THP, if the mmap_sem is held for reading, the pmd can become
 * THP or null or point to a pte (and in turn become "stable") at any
 * time under pmd_read_atomic, so it's mandatory to read it atomically
 * with cmpxchg8b.
 */
#ifndef CONFIG_TRANSPARENT_HUGEPAGE
static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
{
	pmdval_t ret;
	u32 *tmp = (u32 *)pmdp;

	/* Read the low 32 bits first; a null pmd has a null low half. */
	ret = (pmdval_t) (*tmp);
	if (ret) {
		/*
		 * If the low part is null, we must not read the high part
		 * or we can end up with a partial pmd.  The smp_rmb()
		 * orders this read of the high half after the read of
		 * the low half above.
		 */
		smp_rmb();
		ret |= ((pmdval_t)*(tmp + 1)) << 32;
	}

	return (pmd_t) { ret };
}
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
{
	/*
	 * atomic64_read provides a single atomic 64bit load (via
	 * cmpxchg8b on 32bit PAE, per the comment above), so a
	 * concurrent set_64bit can never be observed half-done.
	 */
	return (pmd_t) { atomic64_read((atomic64_t *)pmdp) };
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
83 | |||
34 | static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) | 84 | static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) |
35 | { | 85 | { |
36 | set_64bit((unsigned long long *)(ptep), native_pte_val(pte)); | 86 | set_64bit((unsigned long long *)(ptep), native_pte_val(pte)); |
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index e2768f188f55..6f2b45a9b6bc 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h | |||
@@ -445,6 +445,18 @@ static inline int pmd_write(pmd_t pmd) | |||
445 | #endif /* __HAVE_ARCH_PMD_WRITE */ | 445 | #endif /* __HAVE_ARCH_PMD_WRITE */ |
446 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 446 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
447 | 447 | ||
#ifndef pmd_read_atomic
static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
{
	/*
	 * Generic fallback: depend on the compiler for an atomic pmd
	 * read.  NOTE: this is only going to work if the pmdval_t
	 * isn't larger than an unsigned long; architectures with a
	 * wider pmd (e.g. 32bit PAE x86) must override this by
	 * defining pmd_read_atomic themselves.
	 */
	return *pmdp;
}
#endif
459 | |||
448 | /* | 460 | /* |
449 | * This function is meant to be used by sites walking pagetables with | 461 | * This function is meant to be used by sites walking pagetables with |
450 | * the mmap_sem held in read mode to protect against MADV_DONTNEED and | 462 | * the mmap_sem held in read mode to protect against MADV_DONTNEED and |
@@ -458,11 +470,17 @@ static inline int pmd_write(pmd_t pmd) | |||
458 | * undefined so behaving as if the pmd was none is safe (because it | 470 | * undefined so behaving as if the pmd was none is safe (because it |
459 | * can return none anyway). The compiler level barrier() is critically | 471 | * can return none anyway). The compiler level barrier() is critically |
460 | * important to compute the two checks atomically on the same pmdval. | 472 | * important to compute the two checks atomically on the same pmdval. |
473 | * | ||
474 | * For 32bit kernels with a 64bit large pmd_t this automatically takes | ||
475 | * care of reading the pmd atomically to avoid SMP race conditions | ||
476 | * against pmd_populate() when the mmap_sem is held for reading by the | ||
477 | * caller (a special atomic read not done by "gcc" as in the generic | ||
478 | * version above, is also needed when THP is disabled because the page | ||
479 | * fault can populate the pmd from under us). | ||
461 | */ | 480 | */ |
462 | static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd) | 481 | static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd) |
463 | { | 482 | { |
464 | /* depend on compiler for an atomic pmd read */ | 483 | pmd_t pmdval = pmd_read_atomic(pmd); |
465 | pmd_t pmdval = *pmd; | ||
466 | /* | 484 | /* |
467 | * The barrier will stabilize the pmdval in a register or on | 485 | * The barrier will stabilize the pmdval in a register or on |
468 | * the stack so that it will stop changing under the code. | 486 | * the stack so that it will stop changing under the code. |