aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
author Mel Gorman <mgorman@suse.de> 2014-06-04 19:06:30 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2014-06-04 19:53:55 -0400
commit c46a7c817e662a820373bb76b88d0ad67d6abe5d (patch)
tree 41a7d24c818c9cfe71f335f7fdae162896aa3b23 /arch/x86
parent 4468dd76f51f8be75d4f04f1d721e379596e7262 (diff)
x86: define _PAGE_NUMA by reusing software bits on the PMD and PTE levels
_PAGE_NUMA is currently an alias of _PAGE_PROTNONE to trap NUMA hinting faults on x86. Care is taken such that _PAGE_NUMA is used only in situations where the VMA flags distinguish between NUMA hinting faults and prot_none faults. This decision was x86-specific, and conceptually it is difficult, requiring special casing to distinguish between PROTNONE and NUMA ptes based on context.

Fundamentally, we only need the _PAGE_NUMA bit to tell the difference between an entry that is really unmapped and a page that is protected for NUMA hinting faults, because if the PTE is not present then a fault will be trapped.

Swap PTEs on x86-64 use the bits after _PAGE_GLOBAL for the offset. This patch shrinks the maximum possible swap size and uses the bit to uniquely distinguish between NUMA hinting ptes and swap ptes.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Peter Anvin <hpa@zytor.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Steven Noonan <steven@uplinklabs.net>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'arch/x86')
-rw-r--r-- arch/x86/include/asm/pgtable.h 15
-rw-r--r-- arch/x86/include/asm/pgtable_64.h 8
-rw-r--r-- arch/x86/include/asm/pgtable_types.h 66
-rw-r--r-- arch/x86/mm/pageattr-test.c 2
4 files changed, 55 insertions, 36 deletions
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index b459ddf27d64..66276c1d23bb 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -131,7 +131,8 @@ static inline int pte_exec(pte_t pte)
131 131
132static inline int pte_special(pte_t pte) 132static inline int pte_special(pte_t pte)
133{ 133{
134 return pte_flags(pte) & _PAGE_SPECIAL; 134 return (pte_flags(pte) & (_PAGE_PRESENT|_PAGE_SPECIAL)) ==
135 (_PAGE_PRESENT|_PAGE_SPECIAL);
135} 136}
136 137
137static inline unsigned long pte_pfn(pte_t pte) 138static inline unsigned long pte_pfn(pte_t pte)
@@ -452,6 +453,12 @@ static inline int pte_present(pte_t a)
452 _PAGE_NUMA); 453 _PAGE_NUMA);
453} 454}
454 455
456#define pte_present_nonuma pte_present_nonuma
457static inline int pte_present_nonuma(pte_t a)
458{
459 return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
460}
461
455#define pte_accessible pte_accessible 462#define pte_accessible pte_accessible
456static inline bool pte_accessible(struct mm_struct *mm, pte_t a) 463static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
457{ 464{
@@ -860,19 +867,19 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
860 867
861static inline pte_t pte_swp_mksoft_dirty(pte_t pte) 868static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
862{ 869{
863 VM_BUG_ON(pte_present(pte)); 870 VM_BUG_ON(pte_present_nonuma(pte));
864 return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); 871 return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY);
865} 872}
866 873
867static inline int pte_swp_soft_dirty(pte_t pte) 874static inline int pte_swp_soft_dirty(pte_t pte)
868{ 875{
869 VM_BUG_ON(pte_present(pte)); 876 VM_BUG_ON(pte_present_nonuma(pte));
870 return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; 877 return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY;
871} 878}
872 879
873static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) 880static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
874{ 881{
875 VM_BUG_ON(pte_present(pte)); 882 VM_BUG_ON(pte_present_nonuma(pte));
876 return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); 883 return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY);
877} 884}
878 885
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index e22c1dbf7feb..6d6ecd09883c 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -145,8 +145,16 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
145/* Encode and de-code a swap entry */ 145/* Encode and de-code a swap entry */
146#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE 146#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
147#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) 147#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
148#ifdef CONFIG_NUMA_BALANCING
149/* Automatic NUMA balancing needs to be distinguishable from swap entries */
150#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 2)
151#else
148#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) 152#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
153#endif
149#else 154#else
155#ifdef CONFIG_NUMA_BALANCING
156#error Incompatible format for automatic NUMA balancing
157#endif
150#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1) 158#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1)
151#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1) 159#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1)
152#endif 160#endif
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index eb3d44945133..f216963760e5 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -16,15 +16,26 @@
16#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ 16#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
17#define _PAGE_BIT_PAT 7 /* on 4KB pages */ 17#define _PAGE_BIT_PAT 7 /* on 4KB pages */
18#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ 18#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
19#define _PAGE_BIT_UNUSED1 9 /* available for programmer */ 19#define _PAGE_BIT_SOFTW1 9 /* available for programmer */
20#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */ 20#define _PAGE_BIT_SOFTW2 10 /* " */
21#define _PAGE_BIT_HIDDEN 11 /* hidden by kmemcheck */ 21#define _PAGE_BIT_SOFTW3 11 /* " */
22#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ 22#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
23#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 23#define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1
24#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 24#define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1
25#define _PAGE_BIT_SPLITTING _PAGE_BIT_UNUSED1 /* only valid on a PSE pmd */ 25#define _PAGE_BIT_SPLITTING _PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */
26#define _PAGE_BIT_IOMAP _PAGE_BIT_SOFTW2 /* flag used to indicate IO mapping */
27#define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */
28#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
26#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ 29#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
27 30
31/*
32 * Swap offsets on configurations that allow automatic NUMA balancing use the
33 * bits after _PAGE_BIT_GLOBAL. To uniquely distinguish NUMA hinting PTEs from
34 * swap entries, we use the first bit after _PAGE_BIT_GLOBAL and shrink the
35 * maximum possible swap space from 16TB to 8TB.
36 */
37#define _PAGE_BIT_NUMA (_PAGE_BIT_GLOBAL+1)
38
28/* If _PAGE_BIT_PRESENT is clear, we use these: */ 39/* If _PAGE_BIT_PRESENT is clear, we use these: */
29/* - if the user mapped it with PROT_NONE; pte_present gives true */ 40/* - if the user mapped it with PROT_NONE; pte_present gives true */
30#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL 41#define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL
@@ -40,7 +51,7 @@
40#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) 51#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
41#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) 52#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
42#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) 53#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
43#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1) 54#define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
44#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP) 55#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
45#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) 56#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
46#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) 57#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
@@ -61,8 +72,6 @@
61 * they do not conflict with each other. 72 * they do not conflict with each other.
62 */ 73 */
63 74
64#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_HIDDEN
65
66#ifdef CONFIG_MEM_SOFT_DIRTY 75#ifdef CONFIG_MEM_SOFT_DIRTY
67#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY) 76#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY)
68#else 77#else
@@ -70,6 +79,21 @@
70#endif 79#endif
71 80
72/* 81/*
82 * _PAGE_NUMA distinguishes between a numa hinting minor fault and a page
83 * that is not present. The hinting fault gathers numa placement statistics
84 * (see pte_numa()). The bit is always zero when the PTE is not present.
85 *
86 * The bit picked must be always zero when the pmd is present and not
87 * present, so that we don't lose information when we set it while
88 * atomically clearing the present bit.
89 */
90#ifdef CONFIG_NUMA_BALANCING
91#define _PAGE_NUMA (_AT(pteval_t, 1) << _PAGE_BIT_NUMA)
92#else
93#define _PAGE_NUMA (_AT(pteval_t, 0))
94#endif
95
96/*
73 * Tracking soft dirty bit when a page goes to a swap is tricky. 97 * Tracking soft dirty bit when a page goes to a swap is tricky.
74 * We need a bit which can be stored in pte _and_ not conflict 98 * We need a bit which can be stored in pte _and_ not conflict
75 * with swap entry format. On x86 bits 6 and 7 are *not* involved 99 * with swap entry format. On x86 bits 6 and 7 are *not* involved
@@ -94,26 +118,6 @@
94#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) 118#define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE)
95#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) 119#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
96 120
97/*
98 * _PAGE_NUMA indicates that this page will trigger a numa hinting
99 * minor page fault to gather numa placement statistics (see
100 * pte_numa()). The bit picked (8) is within the range between
101 * _PAGE_FILE (6) and _PAGE_PROTNONE (8) bits. Therefore, it doesn't
102 * require changes to the swp entry format because that bit is always
103 * zero when the pte is not present.
104 *
105 * The bit picked must be always zero when the pmd is present and not
106 * present, so that we don't lose information when we set it while
107 * atomically clearing the present bit.
108 *
109 * Because we shared the same bit (8) with _PAGE_PROTNONE this can be
110 * interpreted as _PAGE_NUMA only in places that _PAGE_PROTNONE
111 * couldn't reach, like handle_mm_fault() (see access_error in
112 * arch/x86/mm/fault.c, the vma protection must not be PROT_NONE for
113 * handle_mm_fault() to be invoked).
114 */
115#define _PAGE_NUMA _PAGE_PROTNONE
116
117#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ 121#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
118 _PAGE_ACCESSED | _PAGE_DIRTY) 122 _PAGE_ACCESSED | _PAGE_DIRTY)
119#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ 123#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
@@ -122,8 +126,8 @@
122/* Set of bits not changed in pte_modify */ 126/* Set of bits not changed in pte_modify */
123#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ 127#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
124 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ 128 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \
125 _PAGE_SOFT_DIRTY) 129 _PAGE_SOFT_DIRTY | _PAGE_NUMA)
126#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) 130#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_NUMA)
127 131
128#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT) 132#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT)
129#define _PAGE_CACHE_WB (0) 133#define _PAGE_CACHE_WB (0)
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index 461bc8289024..6629f397b467 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -35,7 +35,7 @@ enum {
35 35
36static int pte_testbit(pte_t pte) 36static int pte_testbit(pte_t pte)
37{ 37{
38 return pte_flags(pte) & _PAGE_UNUSED1; 38 return pte_flags(pte) & _PAGE_SOFTW1;
39} 39}
40 40
41struct split_state { 41struct split_state {