path: root/arch/x86/mm/init.c
author    Jacob Shin <jacob.shin@amd.com>    2012-11-16 22:38:52 -0500
committer H. Peter Anvin <hpa@linux.intel.com>    2012-11-17 14:59:14 -0500
commit    66520ebc2df3fe52eb4792f8101fac573b766baf (patch)
tree      cc31359fa406833d946eff89ee8804b18e81c673 /arch/x86/mm/init.c
parent    e8c57d40519d7226acb8e662f3ab496202ebc7a6 (diff)
x86, mm: Only direct map addresses that are marked as E820_RAM
Currently direct mappings are created for [ 0 to max_low_pfn<<PAGE_SHIFT )
and [ 4GB to max_pfn<<PAGE_SHIFT ), which may include regions that are not
backed by actual DRAM. This is fine for holes under 4GB, which are covered
by fixed and variable range MTRRs to be UC. However, we run into trouble
on higher memory addresses which cannot be covered by MTRRs.

Our system with 1TB of RAM has an e820 that looks like this:

 BIOS-e820: [mem 0x0000000000000000-0x00000000000983ff] usable
 BIOS-e820: [mem 0x0000000000098400-0x000000000009ffff] reserved
 BIOS-e820: [mem 0x00000000000d0000-0x00000000000fffff] reserved
 BIOS-e820: [mem 0x0000000000100000-0x00000000c7ebffff] usable
 BIOS-e820: [mem 0x00000000c7ec0000-0x00000000c7ed7fff] ACPI data
 BIOS-e820: [mem 0x00000000c7ed8000-0x00000000c7ed9fff] ACPI NVS
 BIOS-e820: [mem 0x00000000c7eda000-0x00000000c7ffffff] reserved
 BIOS-e820: [mem 0x00000000fec00000-0x00000000fec0ffff] reserved
 BIOS-e820: [mem 0x00000000fee00000-0x00000000fee00fff] reserved
 BIOS-e820: [mem 0x00000000fff00000-0x00000000ffffffff] reserved
 BIOS-e820: [mem 0x0000000100000000-0x000000e037ffffff] usable
 BIOS-e820: [mem 0x000000e038000000-0x000000fcffffffff] reserved
 BIOS-e820: [mem 0x0000010000000000-0x0000011ffeffffff] usable

and so direct mappings are created for the huge memory hole between
0x000000e038000000 and 0x0000010000000000. Even though the kernel never
generates memory accesses in that region, since the page tables mark it
incorrectly as WB, our (AMD) processor ends up causing an MCE while doing
some memory bookkeeping/optimizations around that area.

This patch iterates through e820 and only direct maps ranges that are
marked as E820_RAM, and keeps track of those pfn ranges. Depending on the
alignment of E820 ranges, this may possibly result in using smaller size
(i.e. 4K instead of 2M or 1G) page tables.

-v2: move changes from setup.c to mm/init.c, also use
     for_each_mem_pfn_range instead. - Yinghai Lu
-v3: add calculate_all_table_space_size() to get the correct needed page
     table size. - Yinghai Lu
-v4: fix add_pfn_range_mapped() to get the correct max_low_pfn_mapped when
     the mem map has a hole under 4g, as found by Konrad on a xen domU with
     8g ram. - Yinghai

Signed-off-by: Jacob Shin <jacob.shin@amd.com>
Link: http://lkml.kernel.org/r/1353123563-3103-16-git-send-email-yinghai@kernel.org
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Reviewed-by: Pekka Enberg <penberg@kernel.org>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
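[Editor's note] For readers without the kernel sources handy, here is a minimal standalone C sketch of the idea the patch implements. The struct, enum, map_ram_ranges() and main() are illustrative only, not the kernel's API; in the diff below the real work is done by for_each_mem_pfn_range() and init_memory_mapping(). The sketch walks an e820-style table, skips anything that is not RAM, clamps the low end to the always-mapped ISA region, and "maps" only what survives.

#include <stdio.h>
#include <stdint.h>

#define ISA_END_ADDRESS 0x100000ULL		/* first 1MB is always mapped */

enum e820_type { E820_RAM = 1, E820_RESERVED = 2 };

struct e820_entry {
	uint64_t start, end;			/* physical range [start, end) */
	enum e820_type type;
};

/* stand-in for the patch's loop over for_each_mem_pfn_range() */
static void map_ram_ranges(const struct e820_entry *map, int n)
{
	for (int i = 0; i < n; i++) {
		uint64_t start = map[i].start, end = map[i].end;

		if (map[i].type != E820_RAM)	/* holes/reserved: never mapped */
			continue;
		if (end <= ISA_END_ADDRESS)	/* already covered by the ISA mapping */
			continue;
		if (start < ISA_END_ADDRESS)
			start = ISA_END_ADDRESS;

		/* here the kernel would call init_memory_mapping(start, end) */
		printf("direct map [mem %#014llx-%#014llx]\n",
		       (unsigned long long)start,
		       (unsigned long long)(end - 1));
	}
}

int main(void)
{
	/* excerpt of the e820 map quoted above */
	const struct e820_entry map[] = {
		{ 0x0000000000000000ULL, 0x0000000000098400ULL, E820_RAM },
		{ 0x0000000100000000ULL, 0x000000e038000000ULL, E820_RAM },
		{ 0x000000e038000000ULL, 0x000000fd00000000ULL, E820_RESERVED },
		{ 0x0000010000000000ULL, 0x0000011fff000000ULL, E820_RAM },
	};

	map_ram_ranges(map, (int)(sizeof(map) / sizeof(map[0])));
	return 0;
}

With this excerpt, the reserved hole between 0xe038000000 and 0x10000000000 is simply never handed to the mapper, which is what keeps it out of the page tables.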
Diffstat (limited to 'arch/x86/mm/init.c')
-rw-r--r--  arch/x86/mm/init.c  120
1 file changed, 109 insertions(+), 11 deletions(-)
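[Editor's note] The other half of the patch is bookkeeping: remembering which pfn ranges actually got mapped so pfn_range_is_mapped() can answer queries later. Below is a minimal sketch of that idea, assuming a fixed-size array and no range merging; the kernel version in the diff uses add_range_with_merge() and clean_sort_range(), and all names here are illustrative.

#include <stdbool.h>

struct pfn_range { unsigned long start, end; };	/* pfns, [start, end) */

#define MAX_RANGES 64
static struct pfn_range mapped[MAX_RANGES];
static int nr_mapped;

/* record a newly direct-mapped pfn range (no merging in this sketch) */
static void add_pfn_range(unsigned long start, unsigned long end)
{
	if (nr_mapped < MAX_RANGES) {
		mapped[nr_mapped].start = start;
		mapped[nr_mapped].end = end;
		nr_mapped++;
	}
}

/* true only if [start, end) lies entirely inside one recorded range */
static bool pfn_range_mapped(unsigned long start, unsigned long end)
{
	for (int i = 0; i < nr_mapped; i++)
		if (start >= mapped[i].start && end <= mapped[i].end)
			return true;
	return false;
}

Because this sketch never merges adjacent ranges, a query spanning two back-to-back mappings would be missed; that is exactly why the kernel merges and sorts the array on every insert.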
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 7b961d0b1389..bb44e9f2cc49 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -243,6 +243,38 @@ static unsigned long __init calculate_table_space_size(unsigned long start, unsi
 	return tables;
 }
 
+static unsigned long __init calculate_all_table_space_size(void)
+{
+	unsigned long start_pfn, end_pfn;
+	unsigned long tables;
+	int i;
+
+	/* the ISA range is always mapped regardless of memory holes */
+	tables = calculate_table_space_size(0, ISA_END_ADDRESS);
+
+	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
+		u64 start = start_pfn << PAGE_SHIFT;
+		u64 end = end_pfn << PAGE_SHIFT;
+
+		if (end <= ISA_END_ADDRESS)
+			continue;
+
+		if (start < ISA_END_ADDRESS)
+			start = ISA_END_ADDRESS;
+#ifdef CONFIG_X86_32
+		/* on 32 bit, we only map up to max_low_pfn */
+		if ((start >> PAGE_SHIFT) >= max_low_pfn)
+			continue;
+
+		if ((end >> PAGE_SHIFT) > max_low_pfn)
+			end = max_low_pfn << PAGE_SHIFT;
+#endif
+		tables += calculate_table_space_size(start, end);
+	}
+
+	return tables;
+}
+
 static void __init find_early_table_space(unsigned long start,
 					  unsigned long good_end,
 					  unsigned long tables)
@@ -258,6 +290,34 @@ static void __init find_early_table_space(unsigned long start,
 	pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
 }
 
+static struct range pfn_mapped[E820_X_MAX];
+static int nr_pfn_mapped;
+
+static void add_pfn_range_mapped(unsigned long start_pfn, unsigned long end_pfn)
+{
+	nr_pfn_mapped = add_range_with_merge(pfn_mapped, E820_X_MAX,
+					     nr_pfn_mapped, start_pfn, end_pfn);
+	nr_pfn_mapped = clean_sort_range(pfn_mapped, E820_X_MAX);
+
+	max_pfn_mapped = max(max_pfn_mapped, end_pfn);
+
+	if (start_pfn < (1UL<<(32-PAGE_SHIFT)))
+		max_low_pfn_mapped = max(max_low_pfn_mapped,
+					 min(end_pfn, 1UL<<(32-PAGE_SHIFT)));
+}
+
+bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn)
+{
+	int i;
+
+	for (i = 0; i < nr_pfn_mapped; i++)
+		if ((start_pfn >= pfn_mapped[i].start) &&
+		    (end_pfn <= pfn_mapped[i].end))
+			return true;
+
+	return false;
+}
+
 /*
  * Setup the direct mapping of the physical memory at PAGE_OFFSET.
  * This runs before bootmem is initialized and gets pages directly from
@@ -288,9 +348,55 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 
 	__flush_tlb_all();
 
+	add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT);
+
 	return ret >> PAGE_SHIFT;
 }
 
+/*
+ * Iterate through E820 memory map and create direct mappings for only E820_RAM
+ * regions. We cannot simply create direct mappings for all pfns from
+ * [0 to max_low_pfn) and [4GB to max_pfn) because of possible memory holes in
+ * high addresses that cannot be marked as UC by fixed/variable range MTRRs.
+ * Depending on the alignment of E820 ranges, this may possibly result in using
+ * smaller size (i.e. 4K instead of 2M or 1G) page tables.
+ */
+static void __init init_all_memory_mapping(void)
+{
+	unsigned long start_pfn, end_pfn;
+	int i;
+
+	/* the ISA range is always mapped regardless of memory holes */
+	init_memory_mapping(0, ISA_END_ADDRESS);
+
+	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
+		u64 start = (u64)start_pfn << PAGE_SHIFT;
+		u64 end = (u64)end_pfn << PAGE_SHIFT;
+
+		if (end <= ISA_END_ADDRESS)
+			continue;
+
+		if (start < ISA_END_ADDRESS)
+			start = ISA_END_ADDRESS;
+#ifdef CONFIG_X86_32
+		/* on 32 bit, we only map up to max_low_pfn */
+		if ((start >> PAGE_SHIFT) >= max_low_pfn)
+			continue;
+
+		if ((end >> PAGE_SHIFT) > max_low_pfn)
+			end = max_low_pfn << PAGE_SHIFT;
+#endif
+		init_memory_mapping(start, end);
+	}
+
+#ifdef CONFIG_X86_64
+	if (max_pfn > max_low_pfn) {
+		/* can we preseve max_low_pfn ?*/
+		max_low_pfn = max_pfn;
+	}
+#endif
+}
+
 void __init init_mem_mapping(void)
 {
 	unsigned long tables, good_end, end;
@@ -311,23 +417,15 @@ void __init init_mem_mapping(void)
 	end = max_low_pfn << PAGE_SHIFT;
 	good_end = max_pfn_mapped << PAGE_SHIFT;
 #endif
-	tables = calculate_table_space_size(0, end);
+	tables = calculate_all_table_space_size();
 	find_early_table_space(0, good_end, tables);
 	printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx] prealloc\n",
 		end - 1, pgt_buf_start << PAGE_SHIFT,
 		(pgt_buf_top << PAGE_SHIFT) - 1);
 
-	max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
-	max_pfn_mapped = max_low_pfn_mapped;
+	max_pfn_mapped = 0; /* will get exact value next */
+	init_all_memory_mapping();
 
-#ifdef CONFIG_X86_64
-	if (max_pfn > max_low_pfn) {
-		max_pfn_mapped = init_memory_mapping(1UL<<32,
-					max_pfn<<PAGE_SHIFT);
-		/* can we preseve max_low_pfn ?*/
-		max_low_pfn = max_pfn;
-	}
-#endif
 	/*
 	 * Reserve the kernel pagetable pages we used (pgt_buf_start -
 	 * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top)