diff options
author | Mel Gorman <mgorman@suse.de> | 2014-06-04 19:06:30 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2014-06-04 19:53:55 -0400 |
commit | c46a7c817e662a820373bb76b88d0ad67d6abe5d (patch) | |
tree | 41a7d24c818c9cfe71f335f7fdae162896aa3b23 /arch/x86 | |
parent | 4468dd76f51f8be75d4f04f1d721e379596e7262 (diff) |
x86: define _PAGE_NUMA by reusing software bits on the PMD and PTE levels
_PAGE_NUMA is currently an alias of _PAGE_PROTNONE to trap NUMA hinting
faults on x86. Care is taken such that _PAGE_NUMA is used only in
situations where the VMA flags distinguish between NUMA hinting faults
and prot_none faults. This decision was x86-specific and it is
conceptually difficult, requiring special casing to distinguish between
PROTNONE and NUMA ptes based on context.
Fundamentally, we only need the _PAGE_NUMA bit to tell the difference
between an entry that is really unmapped and a page that is protected
for NUMA hinting faults, because if the PTE is not present then a fault
will be trapped.
Swap PTEs on x86-64 use the bits after _PAGE_GLOBAL for the offset.
This patch shrinks the maximum possible swap size (from 16TB to 8TB) and
uses the freed bit, the first one after _PAGE_GLOBAL, to uniquely
distinguish between NUMA hinting ptes and swap ptes.
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Peter Anvin <hpa@zytor.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Steven Noonan <steven@uplinklabs.net>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'arch/x86')
-rw-r--r-- | arch/x86/include/asm/pgtable.h | 15 | ||||
-rw-r--r-- | arch/x86/include/asm/pgtable_64.h | 8 | ||||
-rw-r--r-- | arch/x86/include/asm/pgtable_types.h | 66 | ||||
-rw-r--r-- | arch/x86/mm/pageattr-test.c | 2 |
4 files changed, 55 insertions, 36 deletions
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index b459ddf27d64..66276c1d23bb 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h | |||
@@ -131,7 +131,8 @@ static inline int pte_exec(pte_t pte) | |||
131 | 131 | ||
132 | static inline int pte_special(pte_t pte) | 132 | static inline int pte_special(pte_t pte) |
133 | { | 133 | { |
134 | return pte_flags(pte) & _PAGE_SPECIAL; | 134 | return (pte_flags(pte) & (_PAGE_PRESENT|_PAGE_SPECIAL)) == |
135 | (_PAGE_PRESENT|_PAGE_SPECIAL); | ||
135 | } | 136 | } |
136 | 137 | ||
137 | static inline unsigned long pte_pfn(pte_t pte) | 138 | static inline unsigned long pte_pfn(pte_t pte) |
@@ -452,6 +453,12 @@ static inline int pte_present(pte_t a) | |||
452 | _PAGE_NUMA); | 453 | _PAGE_NUMA); |
453 | } | 454 | } |
454 | 455 | ||
456 | #define pte_present_nonuma pte_present_nonuma | ||
457 | static inline int pte_present_nonuma(pte_t a) | ||
458 | { | ||
459 | return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); | ||
460 | } | ||
461 | |||
455 | #define pte_accessible pte_accessible | 462 | #define pte_accessible pte_accessible |
456 | static inline bool pte_accessible(struct mm_struct *mm, pte_t a) | 463 | static inline bool pte_accessible(struct mm_struct *mm, pte_t a) |
457 | { | 464 | { |
@@ -860,19 +867,19 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, | |||
860 | 867 | ||
861 | static inline pte_t pte_swp_mksoft_dirty(pte_t pte) | 868 | static inline pte_t pte_swp_mksoft_dirty(pte_t pte) |
862 | { | 869 | { |
863 | VM_BUG_ON(pte_present(pte)); | 870 | VM_BUG_ON(pte_present_nonuma(pte)); |
864 | return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); | 871 | return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); |
865 | } | 872 | } |
866 | 873 | ||
867 | static inline int pte_swp_soft_dirty(pte_t pte) | 874 | static inline int pte_swp_soft_dirty(pte_t pte) |
868 | { | 875 | { |
869 | VM_BUG_ON(pte_present(pte)); | 876 | VM_BUG_ON(pte_present_nonuma(pte)); |
870 | return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; | 877 | return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; |
871 | } | 878 | } |
872 | 879 | ||
873 | static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) | 880 | static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) |
874 | { | 881 | { |
875 | VM_BUG_ON(pte_present(pte)); | 882 | VM_BUG_ON(pte_present_nonuma(pte)); |
876 | return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); | 883 | return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); |
877 | } | 884 | } |
878 | 885 | ||
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index e22c1dbf7feb..6d6ecd09883c 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h | |||
@@ -145,8 +145,16 @@ static inline int pgd_large(pgd_t pgd) { return 0; } | |||
145 | /* Encode and de-code a swap entry */ | 145 | /* Encode and de-code a swap entry */ |
146 | #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE | 146 | #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE |
147 | #define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) | 147 | #define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) |
148 | #ifdef CONFIG_NUMA_BALANCING | ||
149 | /* Automatic NUMA balancing needs to be distinguishable from swap entries */ | ||
150 | #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 2) | ||
151 | #else | ||
148 | #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) | 152 | #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) |
153 | #endif | ||
149 | #else | 154 | #else |
155 | #ifdef CONFIG_NUMA_BALANCING | ||
156 | #error Incompatible format for automatic NUMA balancing | ||
157 | #endif | ||
150 | #define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1) | 158 | #define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1) |
151 | #define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1) | 159 | #define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1) |
152 | #endif | 160 | #endif |
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index eb3d44945133..f216963760e5 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h | |||
@@ -16,15 +16,26 @@ | |||
16 | #define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ | 16 | #define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ |
17 | #define _PAGE_BIT_PAT 7 /* on 4KB pages */ | 17 | #define _PAGE_BIT_PAT 7 /* on 4KB pages */ |
18 | #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ | 18 | #define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ |
19 | #define _PAGE_BIT_UNUSED1 9 /* available for programmer */ | 19 | #define _PAGE_BIT_SOFTW1 9 /* available for programmer */ |
20 | #define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */ | 20 | #define _PAGE_BIT_SOFTW2 10 /* " */ |
21 | #define _PAGE_BIT_HIDDEN 11 /* hidden by kmemcheck */ | 21 | #define _PAGE_BIT_SOFTW3 11 /* " */ |
22 | #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ | 22 | #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ |
23 | #define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 | 23 | #define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1 |
24 | #define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 | 24 | #define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1 |
25 | #define _PAGE_BIT_SPLITTING _PAGE_BIT_UNUSED1 /* only valid on a PSE pmd */ | 25 | #define _PAGE_BIT_SPLITTING _PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */ |
26 | #define _PAGE_BIT_IOMAP _PAGE_BIT_SOFTW2 /* flag used to indicate IO mapping */ | ||
27 | #define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */ | ||
28 | #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ | ||
26 | #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ | 29 | #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ |
27 | 30 | ||
31 | /* | ||
32 | * Swap offsets on configurations that allow automatic NUMA balancing use the | ||
33 | * bits after _PAGE_BIT_GLOBAL. To uniquely distinguish NUMA hinting PTEs from | ||
34 | * swap entries, we use the first bit after _PAGE_BIT_GLOBAL and shrink the | ||
35 | * maximum possible swap space from 16TB to 8TB. | ||
36 | */ | ||
37 | #define _PAGE_BIT_NUMA (_PAGE_BIT_GLOBAL+1) | ||
38 | |||
28 | /* If _PAGE_BIT_PRESENT is clear, we use these: */ | 39 | /* If _PAGE_BIT_PRESENT is clear, we use these: */ |
29 | /* - if the user mapped it with PROT_NONE; pte_present gives true */ | 40 | /* - if the user mapped it with PROT_NONE; pte_present gives true */ |
30 | #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL | 41 | #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL |
@@ -40,7 +51,7 @@ | |||
40 | #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) | 51 | #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) |
41 | #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) | 52 | #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) |
42 | #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) | 53 | #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) |
43 | #define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1) | 54 | #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) |
44 | #define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP) | 55 | #define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP) |
45 | #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) | 56 | #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) |
46 | #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) | 57 | #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) |
@@ -61,8 +72,6 @@ | |||
61 | * they do not conflict with each other. | 72 | * they do not conflict with each other. |
62 | */ | 73 | */ |
63 | 74 | ||
64 | #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_HIDDEN | ||
65 | |||
66 | #ifdef CONFIG_MEM_SOFT_DIRTY | 75 | #ifdef CONFIG_MEM_SOFT_DIRTY |
67 | #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY) | 76 | #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY) |
68 | #else | 77 | #else |
@@ -70,6 +79,21 @@ | |||
70 | #endif | 79 | #endif |
71 | 80 | ||
72 | /* | 81 | /* |
82 | * _PAGE_NUMA distinguishes between a numa hinting minor fault and a page | ||
83 | * that is not present. The hinting fault gathers numa placement statistics | ||
84 | * (see pte_numa()). The bit is always zero when the PTE is not present. | ||
85 | * | ||
86 | * The bit picked must be always zero when the pmd is present and not | ||
87 | * present, so that we don't lose information when we set it while | ||
88 | * atomically clearing the present bit. | ||
89 | */ | ||
90 | #ifdef CONFIG_NUMA_BALANCING | ||
91 | #define _PAGE_NUMA (_AT(pteval_t, 1) << _PAGE_BIT_NUMA) | ||
92 | #else | ||
93 | #define _PAGE_NUMA (_AT(pteval_t, 0)) | ||
94 | #endif | ||
95 | |||
96 | /* | ||
73 | * Tracking soft dirty bit when a page goes to a swap is tricky. | 97 | * Tracking soft dirty bit when a page goes to a swap is tricky. |
74 | * We need a bit which can be stored in pte _and_ not conflict | 98 | * We need a bit which can be stored in pte _and_ not conflict |
75 | * with swap entry format. On x86 bits 6 and 7 are *not* involved | 99 | * with swap entry format. On x86 bits 6 and 7 are *not* involved |
@@ -94,26 +118,6 @@ | |||
94 | #define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) | 118 | #define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) |
95 | #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) | 119 | #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) |
96 | 120 | ||
97 | /* | ||
98 | * _PAGE_NUMA indicates that this page will trigger a numa hinting | ||
99 | * minor page fault to gather numa placement statistics (see | ||
100 | * pte_numa()). The bit picked (8) is within the range between | ||
101 | * _PAGE_FILE (6) and _PAGE_PROTNONE (8) bits. Therefore, it doesn't | ||
102 | * require changes to the swp entry format because that bit is always | ||
103 | * zero when the pte is not present. | ||
104 | * | ||
105 | * The bit picked must be always zero when the pmd is present and not | ||
106 | * present, so that we don't lose information when we set it while | ||
107 | * atomically clearing the present bit. | ||
108 | * | ||
109 | * Because we shared the same bit (8) with _PAGE_PROTNONE this can be | ||
110 | * interpreted as _PAGE_NUMA only in places that _PAGE_PROTNONE | ||
111 | * couldn't reach, like handle_mm_fault() (see access_error in | ||
112 | * arch/x86/mm/fault.c, the vma protection must not be PROT_NONE for | ||
113 | * handle_mm_fault() to be invoked). | ||
114 | */ | ||
115 | #define _PAGE_NUMA _PAGE_PROTNONE | ||
116 | |||
117 | #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ | 121 | #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ |
118 | _PAGE_ACCESSED | _PAGE_DIRTY) | 122 | _PAGE_ACCESSED | _PAGE_DIRTY) |
119 | #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ | 123 | #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ |
@@ -122,8 +126,8 @@ | |||
122 | /* Set of bits not changed in pte_modify */ | 126 | /* Set of bits not changed in pte_modify */ |
123 | #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ | 127 | #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ |
124 | _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ | 128 | _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ |
125 | _PAGE_SOFT_DIRTY) | 129 | _PAGE_SOFT_DIRTY | _PAGE_NUMA) |
126 | #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) | 130 | #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_NUMA) |
127 | 131 | ||
128 | #define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT) | 132 | #define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT) |
129 | #define _PAGE_CACHE_WB (0) | 133 | #define _PAGE_CACHE_WB (0) |
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index 461bc8289024..6629f397b467 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c | |||
@@ -35,7 +35,7 @@ enum { | |||
35 | 35 | ||
36 | static int pte_testbit(pte_t pte) | 36 | static int pte_testbit(pte_t pte) |
37 | { | 37 | { |
38 | return pte_flags(pte) & _PAGE_UNUSED1; | 38 | return pte_flags(pte) & _PAGE_SOFTW1; |
39 | } | 39 | } |
40 | 40 | ||
41 | struct split_state { | 41 | struct split_state { |