aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/include
diff options
context:
space:
mode:
author Mel Gorman <mgorman@suse.de> 2015-02-12 17:58:32 -0500
committer Linus Torvalds <torvalds@linux-foundation.org> 2015-02-12 21:54:08 -0500
commit 21d9ee3eda7792c45880b2f11bff8e95c9a061fb (patch)
tree 2e20932b8f1526e6d1f48add9e818ed43d7be8ee /arch/x86/include
parent 4d9424669946532be754a6e116618dcb58430cb4 (diff)
mm: remove remaining references to NUMA hinting bits and helpers
This patch removes the NUMA PTE bits and associated helpers. As a side-effect it increases the maximum possible swap space on x86-64.

One potential source of problems is races between the marking of PTEs PROT_NONE, NUMA hinting faults and migration. It must be guaranteed that a PTE being protected is not faulted in parallel, seen as a pte_none and corrupting memory. The base case is safe but transhuge has had problems in the past due to a different migration mechanism and a dependence on the page lock to serialise migrations, and so warrants a closer look.

task_work hinting update            parallel fault
------------------------            --------------
change_pmd_range
  change_huge_pmd
    __pmd_trans_huge_lock
      pmdp_get_and_clear
                                    __handle_mm_fault
                                    pmd_none
                                      do_huge_pmd_anonymous_page
                                      read? pmd_lock blocks until hinting complete, fail !pmd_none test
                                      write? __do_huge_pmd_anonymous_page acquires pmd_lock, checks pmd_none
      pmd_modify
      set_pmd_at

task_work hinting update            parallel migration
------------------------            ------------------
change_pmd_range
  change_huge_pmd
    __pmd_trans_huge_lock
      pmdp_get_and_clear
                                    __handle_mm_fault
                                      do_huge_pmd_numa_page
                                        migrate_misplaced_transhuge_page
                                        pmd_lock waits for updates to complete, recheck pmd_same
      pmd_modify
      set_pmd_at

Both of those are safe and the case where a transhuge page is inserted during a protection update is unchanged. The case where two processes try migrating at the same time is unchanged by this series so should still be ok. I could not find a case where we are accidentally depending on the PTE not being cleared and flushed. If one is missed, it'll manifest as corruption problems that start triggering shortly after this series is merged and only happen when NUMA balancing is enabled.
Signed-off-by: Mel Gorman <mgorman@suse.de>
Tested-by: Sasha Levin <sasha.levin@oracle.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Dave Jones <davej@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Kirill Shutemov <kirill.shutemov@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'arch/x86/include')
-rw-r--r-- arch/x86/include/asm/pgtable.h 22
-rw-r--r-- arch/x86/include/asm/pgtable_64.h 5
-rw-r--r-- arch/x86/include/asm/pgtable_types.h 41
3 files changed, 5 insertions, 63 deletions
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index f519b0b529dd..34d42a7d5595 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -300,7 +300,7 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd)
300 300
301static inline pmd_t pmd_mknotpresent(pmd_t pmd) 301static inline pmd_t pmd_mknotpresent(pmd_t pmd)
302{ 302{
303 return pmd_clear_flags(pmd, _PAGE_PRESENT); 303 return pmd_clear_flags(pmd, _PAGE_PRESENT | _PAGE_PROTNONE);
304} 304}
305 305
306#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY 306#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
@@ -443,13 +443,6 @@ static inline int pte_same(pte_t a, pte_t b)
443 443
444static inline int pte_present(pte_t a) 444static inline int pte_present(pte_t a)
445{ 445{
446 return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE |
447 _PAGE_NUMA);
448}
449
450#define pte_present_nonuma pte_present_nonuma
451static inline int pte_present_nonuma(pte_t a)
452{
453 return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); 446 return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
454} 447}
455 448
@@ -459,7 +452,7 @@ static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
459 if (pte_flags(a) & _PAGE_PRESENT) 452 if (pte_flags(a) & _PAGE_PRESENT)
460 return true; 453 return true;
461 454
462 if ((pte_flags(a) & (_PAGE_PROTNONE | _PAGE_NUMA)) && 455 if ((pte_flags(a) & _PAGE_PROTNONE) &&
463 mm_tlb_flush_pending(mm)) 456 mm_tlb_flush_pending(mm))
464 return true; 457 return true;
465 458
@@ -479,8 +472,7 @@ static inline int pmd_present(pmd_t pmd)
479 * the _PAGE_PSE flag will remain set at all times while the 472 * the _PAGE_PSE flag will remain set at all times while the
480 * _PAGE_PRESENT bit is clear). 473 * _PAGE_PRESENT bit is clear).
481 */ 474 */
482 return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE | 475 return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE);
483 _PAGE_NUMA);
484} 476}
485 477
486#ifdef CONFIG_NUMA_BALANCING 478#ifdef CONFIG_NUMA_BALANCING
@@ -555,11 +547,6 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
555 547
556static inline int pmd_bad(pmd_t pmd) 548static inline int pmd_bad(pmd_t pmd)
557{ 549{
558#ifdef CONFIG_NUMA_BALANCING
559 /* pmd_numa check */
560 if ((pmd_flags(pmd) & (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA)
561 return 0;
562#endif
563 return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; 550 return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE;
564} 551}
565 552
@@ -878,19 +865,16 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
878#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY 865#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
879static inline pte_t pte_swp_mksoft_dirty(pte_t pte) 866static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
880{ 867{
881 VM_BUG_ON(pte_present_nonuma(pte));
882 return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); 868 return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY);
883} 869}
884 870
885static inline int pte_swp_soft_dirty(pte_t pte) 871static inline int pte_swp_soft_dirty(pte_t pte)
886{ 872{
887 VM_BUG_ON(pte_present_nonuma(pte));
888 return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; 873 return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY;
889} 874}
890 875
891static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) 876static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
892{ 877{
893 VM_BUG_ON(pte_present_nonuma(pte));
894 return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); 878 return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY);
895} 879}
896#endif 880#endif
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index e227970f983e..2ee781114d34 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -142,12 +142,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
142 142
143/* Encode and de-code a swap entry */ 143/* Encode and de-code a swap entry */
144#define SWP_TYPE_BITS 5 144#define SWP_TYPE_BITS 5
145#ifdef CONFIG_NUMA_BALANCING
146/* Automatic NUMA balancing needs to be distinguishable from swap entries */
147#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 2)
148#else
149#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) 145#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
150#endif
151 146
152#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) 147#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
153 148
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 3e0230c94cff..8c7c10802e9c 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -27,14 +27,6 @@
27#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ 27#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
28#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ 28#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
29 29
30/*
31 * Swap offsets on configurations that allow automatic NUMA balancing use the
32 * bits after _PAGE_BIT_GLOBAL. To uniquely distinguish NUMA hinting PTEs from
33 * swap entries, we use the first bit after _PAGE_BIT_GLOBAL and shrink the
34 * maximum possible swap space from 16TB to 8TB.
35 */
36#define _PAGE_BIT_NUMA (_PAGE_BIT_GLOBAL+1)
37
38/* If _PAGE_BIT_PRESENT is clear, we use these: */ 30/* If _PAGE_BIT_PRESENT is clear, we use these: */
39/* - if the user mapped it with PROT_NONE; pte_present gives true */ 31/* - if the user mapped it with PROT_NONE; pte_present gives true */
40#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL 32#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
@@ -76,21 +68,6 @@
76#endif 68#endif
77 69
78/* 70/*
79 * _PAGE_NUMA distinguishes between a numa hinting minor fault and a page
80 * that is not present. The hinting fault gathers numa placement statistics
81 * (see pte_numa()). The bit is always zero when the PTE is not present.
82 *
83 * The bit picked must be always zero when the pmd is present and not
84 * present, so that we don't lose information when we set it while
85 * atomically clearing the present bit.
86 */
87#ifdef CONFIG_NUMA_BALANCING
88#define _PAGE_NUMA (_AT(pteval_t, 1) << _PAGE_BIT_NUMA)
89#else
90#define _PAGE_NUMA (_AT(pteval_t, 0))
91#endif
92
93/*
94 * Tracking soft dirty bit when a page goes to a swap is tricky. 71 * Tracking soft dirty bit when a page goes to a swap is tricky.
95 * We need a bit which can be stored in pte _and_ not conflict 72 * We need a bit which can be stored in pte _and_ not conflict
96 * with swap entry format. On x86 bits 6 and 7 are *not* involved 73 * with swap entry format. On x86 bits 6 and 7 are *not* involved
@@ -122,8 +99,8 @@
122/* Set of bits not changed in pte_modify */ 99/* Set of bits not changed in pte_modify */
123#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ 100#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
124 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ 101 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \
125 _PAGE_SOFT_DIRTY | _PAGE_NUMA) 102 _PAGE_SOFT_DIRTY)
126#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_NUMA) 103#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
127 104
128/* 105/*
129 * The cache modes defined here are used to translate between pure SW usage 106 * The cache modes defined here are used to translate between pure SW usage
@@ -324,20 +301,6 @@ static inline pteval_t pte_flags(pte_t pte)
324 return native_pte_val(pte) & PTE_FLAGS_MASK; 301 return native_pte_val(pte) & PTE_FLAGS_MASK;
325} 302}
326 303
327#ifdef CONFIG_NUMA_BALANCING
328/* Set of bits that distinguishes present, prot_none and numa ptes */
329#define _PAGE_NUMA_MASK (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)
330static inline pteval_t ptenuma_flags(pte_t pte)
331{
332 return pte_flags(pte) & _PAGE_NUMA_MASK;
333}
334
335static inline pmdval_t pmdnuma_flags(pmd_t pmd)
336{
337 return pmd_flags(pmd) & _PAGE_NUMA_MASK;
338}
339#endif /* CONFIG_NUMA_BALANCING */
340
341#define pgprot_val(x) ((x).pgprot) 304#define pgprot_val(x) ((x).pgprot)
342#define __pgprot(x) ((pgprot_t) { (x) } ) 305#define __pgprot(x) ((pgprot_t) { (x) } )
343 306