author	Dave Hansen <dave@linux.vnet.ibm.com>	2013-01-30 19:56:16 -0500
committer	H. Peter Anvin <hpa@linux.intel.com>	2013-01-31 17:12:30 -0500
commit	f03574f2d5b2d6229dcdf2d322848065f72953c7 (patch)
tree	ec9c4000c006f74d8cd67626b7ff0367376fca2c
parent	1e9209edc71b851d81f0316ca03a0e6335c0ef9a (diff)
x86-32, mm: Rip out x86_32 NUMA remapping code
This code was an optimization for 32-bit NUMA systems.  It has probably been the cause of a number of subtle bugs over the years, although the conditions to excite them would have been hard to trigger.  Essentially, we remap part of the kernel linear mapping area, and then sometimes part of that area gets freed back into the bootmem allocator.  If those pages get used by kernel data structures (say mem_map[] or a dentry), there's no big deal.  But, if anyone ever tried to use the linear mapping for these pages _and_ cared about their physical address, bad things happen.

For instance, say you passed __GFP_ZERO to the page allocator and then happened to get handed one of these pages.  It would zero the remapped page, but it would make a pte to the _old_ page.  There are probably a hundred other ways that it could screw with things.

We don't need to hang on to performance optimizations for these old boxes any more.  All my 32-bit NUMA systems are long dead and buried, and I probably had access to more than most people.  This code is causing real things to break today:

	https://lkml.org/lkml/2013/1/9/376

I looked into actually fixing this, but it requires surgery to way too much brittle code, as well as stuff like per_cpu_ptr_to_phys().

[ hpa: Cc: this for -stable, since it is a memory corruption issue.
  However, an alternative is to simply mark NUMA as depends BROKEN
  rather than EXPERIMENTAL in the X86_32 subclause... ]

Link: http://lkml.kernel.org/r/20130131005616.1C79F411@kernel.stglabs.ibm.com
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: <stable@vger.kernel.org>
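For readers who want to see the failure mode concretely, below is a minimal, hypothetical userspace sketch of the aliasing hazard the message describes.  It only models the bookkeeping: phys[], pte[], write_via_vpage() and vpage_to_pfn_linear() are names invented for this illustration, not kernel interfaces.  The zeroing goes through the remapped virtual page, while a virt_to_phys()-style calculation that still assumes the linear mapping points at the original, never-zeroed frame.

/*
 * Hypothetical userspace model of the aliasing hazard described above.
 * All names here are invented for the sketch; none of this is kernel code.
 */
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 16    /* tiny "pages" keep the model small */
#define NR_PAGES  8

static unsigned char phys[NR_PAGES][PAGE_SIZE]; /* model of physical memory   */
static int pte[NR_PAGES];                       /* virtual page -> page frame */

/* Write through a virtual page, honouring the page table. */
static void write_via_vpage(int vpage, int val)
{
        memset(phys[pte[vpage]], val, PAGE_SIZE);
}

/* virt_to_phys() analogue that assumes the linear (1:1) mapping still holds. */
static int vpage_to_pfn_linear(int vpage)
{
        return vpage;
}

int main(void)
{
        int vpage, pfn;

        /* Kernel linear mapping: virtual page N covers page frame N. */
        for (vpage = 0; vpage < NR_PAGES; vpage++)
                pte[vpage] = vpage;
        memset(phys, 0xAA, sizeof(phys));       /* stale contents everywhere */

        /* init_alloc_remap()-style remap: virtual page 2 now covers frame 6. */
        vpage = 2;
        pte[vpage] = 6;

        /*
         * Later the remapped page is handed out as an ordinary page and the
         * caller asked for zeroed memory: the memset goes through the remap
         * alias and lands in frame 6...
         */
        write_via_vpage(vpage, 0x00);

        /*
         * ...but anything deriving the physical address from the virtual
         * address via the linear-map rule gets the old frame 2, which was
         * never zeroed.  A pte built on that frame exposes stale data.
         */
        pfn = vpage_to_pfn_linear(vpage);
        printf("frame actually zeroed:       pfn %d, first byte 0x%02x\n",
               pte[vpage], phys[pte[vpage]][0]);
        printf("frame the linear rule gives: pfn %d, first byte 0x%02x\n",
               pfn, phys[pfn][0]);
        return 0;
}

Built with any C compiler, the two printed frames differ: frame 6 was zeroed, while frame 2 (the one the linear-map rule yields) still holds the stale 0xAA pattern.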
-rw-r--r--	arch/x86/Kconfig	4
-rw-r--r--	arch/x86/mm/numa.c	3
-rw-r--r--	arch/x86/mm/numa_32.c	161
-rw-r--r--	arch/x86/mm/numa_internal.h	6
4 files changed, 0 insertions, 174 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 79795af59810..108efcb21c9e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1253,10 +1253,6 @@ config NODES_SHIFT
 	  Specify the maximum number of NUMA Nodes available on the target
 	  system.  Increases memory reserved to accommodate various tables.
 
-config HAVE_ARCH_ALLOC_REMAP
-	def_bool y
-	depends on X86_32 && NUMA
-
 config ARCH_HAVE_MEMORY_PRESENT
 	def_bool y
 	depends on X86_32 && DISCONTIGMEM
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index b2313c6739f5..61c2b6f5ff88 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -205,9 +205,6 @@ static void __init setup_node_data(int nid, u64 start, u64 end)
 	if (end && (end - start) < NODE_MIN_SIZE)
 		return;
 
-	/* initialize remap allocator before aligning to ZONE_ALIGN */
-	init_alloc_remap(nid, start, end);
-
 	start = roundup(start, ZONE_ALIGN);
 
 	printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n",
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 534255a36b6b..73a6d7395bd3 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -73,167 +73,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
 
 extern unsigned long highend_pfn, highstart_pfn;
 
-#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
-
-static void *node_remap_start_vaddr[MAX_NUMNODES];
-void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
-
-/*
- * Remap memory allocator
- */
-static unsigned long node_remap_start_pfn[MAX_NUMNODES];
-static void *node_remap_end_vaddr[MAX_NUMNODES];
-static void *node_remap_alloc_vaddr[MAX_NUMNODES];
-
-/**
- * alloc_remap - Allocate remapped memory
- * @nid: NUMA node to allocate memory from
- * @size: The size of allocation
- *
- * Allocate @size bytes from the remap area of NUMA node @nid.  The
- * size of the remap area is predetermined by init_alloc_remap() and
- * only the callers considered there should call this function.  For
- * more info, please read the comment on top of init_alloc_remap().
- *
- * The caller must be ready to handle allocation failure from this
- * function and fall back to regular memory allocator in such cases.
- *
- * CONTEXT:
- * Single CPU early boot context.
- *
- * RETURNS:
- * Pointer to the allocated memory on success, %NULL on failure.
- */
-void *alloc_remap(int nid, unsigned long size)
-{
-        void *allocation = node_remap_alloc_vaddr[nid];
-
-        size = ALIGN(size, L1_CACHE_BYTES);
-
-        if (!allocation || (allocation + size) > node_remap_end_vaddr[nid])
-                return NULL;
-
-        node_remap_alloc_vaddr[nid] += size;
-        memset(allocation, 0, size);
-
-        return allocation;
-}
-
-#ifdef CONFIG_HIBERNATION
-/**
- * resume_map_numa_kva - add KVA mapping to the temporary page tables created
- *                       during resume from hibernation
- * @pgd_base - temporary resume page directory
- */
-void resume_map_numa_kva(pgd_t *pgd_base)
-{
-        int node;
-
-        for_each_online_node(node) {
-                unsigned long start_va, start_pfn, nr_pages, pfn;
-
-                start_va = (unsigned long)node_remap_start_vaddr[node];
-                start_pfn = node_remap_start_pfn[node];
-                nr_pages = (node_remap_end_vaddr[node] -
-                            node_remap_start_vaddr[node]) >> PAGE_SHIFT;
-
-                printk(KERN_DEBUG "%s: node %d\n", __func__, node);
-
-                for (pfn = 0; pfn < nr_pages; pfn += PTRS_PER_PTE) {
-                        unsigned long vaddr = start_va + (pfn << PAGE_SHIFT);
-                        pgd_t *pgd = pgd_base + pgd_index(vaddr);
-                        pud_t *pud = pud_offset(pgd, vaddr);
-                        pmd_t *pmd = pmd_offset(pud, vaddr);
-
-                        set_pmd(pmd, pfn_pmd(start_pfn + pfn,
-                                                PAGE_KERNEL_LARGE_EXEC));
-
-                        printk(KERN_DEBUG "%s: %08lx -> pfn %08lx\n",
-                                __func__, vaddr, start_pfn + pfn);
-                }
-        }
-}
-#endif
-
-/**
- * init_alloc_remap - Initialize remap allocator for a NUMA node
- * @nid: NUMA node to initizlie remap allocator for
- *
- * NUMA nodes may end up without any lowmem.  As allocating pgdat and
- * memmap on a different node with lowmem is inefficient, a special
- * remap allocator is implemented which can be used by alloc_remap().
- *
- * For each node, the amount of memory which will be necessary for
- * pgdat and memmap is calculated and two memory areas of the size are
- * allocated - one in the node and the other in lowmem; then, the area
- * in the node is remapped to the lowmem area.
- *
- * As pgdat and memmap must be allocated in lowmem anyway, this
- * doesn't waste lowmem address space; however, the actual lowmem
- * which gets remapped over is wasted.  The amount shouldn't be
- * problematic on machines this feature will be used.
- *
- * Initialization failure isn't fatal.  alloc_remap() is used
- * opportunistically and the callers will fall back to other memory
- * allocation mechanisms on failure.
- */
-void __init init_alloc_remap(int nid, u64 start, u64 end)
-{
-        unsigned long start_pfn = start >> PAGE_SHIFT;
-        unsigned long end_pfn = end >> PAGE_SHIFT;
-        unsigned long size, pfn;
-        u64 node_pa, remap_pa;
-        void *remap_va;
-
-        /*
-         * The acpi/srat node info can show hot-add memroy zones where
-         * memory could be added but not currently present.
-         */
-        printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
-               nid, start_pfn, end_pfn);
-
-        /* calculate the necessary space aligned to large page size */
-        size = node_memmap_size_bytes(nid, start_pfn, end_pfn);
-        size += ALIGN(sizeof(pg_data_t), PAGE_SIZE);
-        size = ALIGN(size, LARGE_PAGE_BYTES);
-
-        /* allocate node memory and the lowmem remap area */
-        node_pa = memblock_find_in_range(start, end, size, LARGE_PAGE_BYTES);
-        if (!node_pa) {
-                pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n",
-                           size, nid);
-                return;
-        }
-        memblock_reserve(node_pa, size);
-
-        remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT,
-                                          max_low_pfn << PAGE_SHIFT,
-                                          size, LARGE_PAGE_BYTES);
-        if (!remap_pa) {
-                pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n",
-                           size, nid);
-                memblock_free(node_pa, size);
-                return;
-        }
-        memblock_reserve(remap_pa, size);
-        remap_va = phys_to_virt(remap_pa);
-
-        /* perform actual remap */
-        for (pfn = 0; pfn < size >> PAGE_SHIFT; pfn += PTRS_PER_PTE)
-                set_pmd_pfn((unsigned long)remap_va + (pfn << PAGE_SHIFT),
-                            (node_pa >> PAGE_SHIFT) + pfn,
-                            PAGE_KERNEL_LARGE);
-
-        /* initialize remap allocator parameters */
-        node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT;
-        node_remap_start_vaddr[nid] = remap_va;
-        node_remap_end_vaddr[nid] = remap_va + size;
-        node_remap_alloc_vaddr[nid] = remap_va;
-
-        printk(KERN_DEBUG "remap_alloc: node %d [%08llx-%08llx) -> [%p-%p)\n",
-               nid, node_pa, node_pa + size, remap_va, remap_va + size);
-}
-
 void __init initmem_init(void)
 {
 	x86_numa_init();
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
index 7178c3afe05e..ad86ec91e640 100644
--- a/arch/x86/mm/numa_internal.h
+++ b/arch/x86/mm/numa_internal.h
@@ -21,12 +21,6 @@ void __init numa_reset_distance(void);
 
 void __init x86_numa_init(void);
 
-#ifdef CONFIG_X86_64
-static inline void init_alloc_remap(int nid, u64 start, u64 end) { }
-#else
-void __init init_alloc_remap(int nid, u64 start, u64 end);
-#endif
-
 #ifdef CONFIG_NUMA_EMU
 void __init numa_emulation(struct numa_meminfo *numa_meminfo,
 			   int numa_dist_cnt);