aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
author: Jan Beulich <JBeulich@suse.com> 2014-12-19 11:10:54 -0500
committer: Ingo Molnar <mingo@kernel.org> 2014-12-23 05:39:34 -0500
commit: 132978b94e66f8ad7d20790f8332f0e9c1426029 (patch)
tree: e928ea7e3fe2abc0d129e4cb42de9c1b3c90ce89 /arch/x86
parent: fbe1bf140671619508dfa575d74a185ae53c5dbb (diff)
x86: Fix step size adjustment during initial memory mapping
The old scheme can lead to failure in certain cases - the problem is that after bumping step_size the next (non-final) iteration is only guaranteed to make available a memory block the size of what step_size was before. E.g. for a memory block [0,3004600000) we'd have:

  iter  start       end         step  amount
  1     3004400000  30045fffff    2M      2M
  2     3004000000  30043fffff   64M      4M
  3     3000000000  3003ffffff    2G     64M
  4     2000000000  2fffffffff   64G     64G

Yet to map 64G with 4k pages (as happens e.g. under PV Xen) we need slightly over 128M, but the first three iterations made only about 70M available.

The condition (new_mapped_ram_size > mapped_ram_size) for bumping step_size is just not suitable. Instead we want to bump it when we know we have enough memory available to cover a block of the new step_size. And rather than making that condition more complicated than needed, simply adjust step_size by the largest possible factor we know we can cover at that point - which is shifting it left by one less than the difference between page table level shifts. (Interestingly the original STEP_SIZE_SHIFT definition had a comment hinting at that having been the intention, just that it should have been PUD_SHIFT-PMD_SHIFT-1 instead of (PUD_SHIFT-PMD_SHIFT)/2, and of course for non-PAE 32-bit we can't really use these two constants as they're equal there.)

Furthermore the comment in get_new_step_size() didn't get updated when the bottom-up mapping logic got added. Yet while an overflow (flushing step_size to zero) of the shift doesn't matter for the top-down method, it does for bottom-up because round_up(x, 0) = 0, and an upper range boundary of zero can't really work well.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Link: http://lkml.kernel.org/r/54945C1E020000780005114E@mail.emea.novell.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'arch/x86')
-rw-r--r-- arch/x86/mm/init.c | 37
1 files changed, 17 insertions, 20 deletions
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index a97ee0801475..08a7d313538a 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -438,20 +438,20 @@ static unsigned long __init init_range_memory_mapping(
438static unsigned long __init get_new_step_size(unsigned long step_size) 438static unsigned long __init get_new_step_size(unsigned long step_size)
439{ 439{
440 /* 440 /*
441 * Explain why we shift by 5 and why we don't have to worry about 441 * Initial mapped size is PMD_SIZE (2M).
442 * 'step_size << 5' overflowing:
443 *
444 * initial mapped size is PMD_SIZE (2M).
445 * We can not set step_size to be PUD_SIZE (1G) yet. 442 * We can not set step_size to be PUD_SIZE (1G) yet.
446 * In worse case, when we cross the 1G boundary, and 443 * In worse case, when we cross the 1G boundary, and
447 * PG_LEVEL_2M is not set, we will need 1+1+512 pages (2M + 8k) 444 * PG_LEVEL_2M is not set, we will need 1+1+512 pages (2M + 8k)
448 * to map 1G range with PTE. Use 5 as shift for now. 445 * to map 1G range with PTE. Hence we use one less than the
446 * difference of page table level shifts.
449 * 447 *
450 * Don't need to worry about overflow, on 32bit, when step_size 448 * Don't need to worry about overflow in the top-down case, on 32bit,
451 * is 0, round_down() returns 0 for start, and that turns it 449 * when step_size is 0, round_down() returns 0 for start, and that
452 * into 0x100000000ULL. 450 * turns it into 0x100000000ULL.
451 * In the bottom-up case, round_up(x, 0) returns 0 though too, which
452 * needs to be taken into consideration by the code below.
453 */ 453 */
454 return step_size << 5; 454 return step_size << (PMD_SHIFT - PAGE_SHIFT - 1);
455} 455}
456 456
457/** 457/**
@@ -471,7 +471,6 @@ static void __init memory_map_top_down(unsigned long map_start,
471 unsigned long step_size; 471 unsigned long step_size;
472 unsigned long addr; 472 unsigned long addr;
473 unsigned long mapped_ram_size = 0; 473 unsigned long mapped_ram_size = 0;
474 unsigned long new_mapped_ram_size;
475 474
476 /* xen has big range in reserved near end of ram, skip it at first.*/ 475 /* xen has big range in reserved near end of ram, skip it at first.*/
477 addr = memblock_find_in_range(map_start, map_end, PMD_SIZE, PMD_SIZE); 476 addr = memblock_find_in_range(map_start, map_end, PMD_SIZE, PMD_SIZE);
@@ -496,14 +495,12 @@ static void __init memory_map_top_down(unsigned long map_start,
496 start = map_start; 495 start = map_start;
497 } else 496 } else
498 start = map_start; 497 start = map_start;
499 new_mapped_ram_size = init_range_memory_mapping(start, 498 mapped_ram_size += init_range_memory_mapping(start,
500 last_start); 499 last_start);
501 last_start = start; 500 last_start = start;
502 min_pfn_mapped = last_start >> PAGE_SHIFT; 501 min_pfn_mapped = last_start >> PAGE_SHIFT;
503 /* only increase step_size after big range get mapped */ 502 if (mapped_ram_size >= step_size)
504 if (new_mapped_ram_size > mapped_ram_size)
505 step_size = get_new_step_size(step_size); 503 step_size = get_new_step_size(step_size);
506 mapped_ram_size += new_mapped_ram_size;
507 } 504 }
508 505
509 if (real_end < map_end) 506 if (real_end < map_end)
@@ -524,7 +521,7 @@ static void __init memory_map_top_down(unsigned long map_start,
524static void __init memory_map_bottom_up(unsigned long map_start, 521static void __init memory_map_bottom_up(unsigned long map_start,
525 unsigned long map_end) 522 unsigned long map_end)
526{ 523{
527 unsigned long next, new_mapped_ram_size, start; 524 unsigned long next, start;
528 unsigned long mapped_ram_size = 0; 525 unsigned long mapped_ram_size = 0;
529 /* step_size need to be small so pgt_buf from BRK could cover it */ 526 /* step_size need to be small so pgt_buf from BRK could cover it */
530 unsigned long step_size = PMD_SIZE; 527 unsigned long step_size = PMD_SIZE;
@@ -539,19 +536,19 @@ static void __init memory_map_bottom_up(unsigned long map_start,
539 * for page table. 536 * for page table.
540 */ 537 */
541 while (start < map_end) { 538 while (start < map_end) {
542 if (map_end - start > step_size) { 539 if (step_size && map_end - start > step_size) {
543 next = round_up(start + 1, step_size); 540 next = round_up(start + 1, step_size);
544 if (next > map_end) 541 if (next > map_end)
545 next = map_end; 542 next = map_end;
546 } else 543 } else {
547 next = map_end; 544 next = map_end;
545 }
548 546
549 new_mapped_ram_size = init_range_memory_mapping(start, next); 547 mapped_ram_size += init_range_memory_mapping(start, next);
550 start = next; 548 start = next;
551 549
552 if (new_mapped_ram_size > mapped_ram_size) 550 if (mapped_ram_size >= step_size)
553 step_size = get_new_step_size(step_size); 551 step_size = get_new_step_size(step_size);
554 mapped_ram_size += new_mapped_ram_size;
555 } 552 }
556} 553}
557 554