diff options
author | Joerg Roedel <joerg.roedel@amd.com> | 2010-08-16 08:38:33 -0400 |
---|---|---|
committer | H. Peter Anvin <hpa@zytor.com> | 2010-08-18 12:17:20 -0400 |
commit | fd89a137924e0710078c3ae855e7cec1c43cb845 (patch) | |
tree | 901f2048e637d3369ea4f0431dd618081ebeef93 /arch/x86/kernel | |
parent | 07a7795ca2e6e66d00b184efb46bd0e23d90d3fe (diff) |
x86-32: Separate 1:1 pagetables from swapper_pg_dir
This patch fixes machine crashes which occur when heavily exercising the
CPU hotplug codepaths on a 32-bit kernel. These crashes are caused by
AMD Erratum 383 and result in a fatal machine check exception. Here's
the scenario:
1. On 32-bit, the swapper_pg_dir page table is used as the initial page
table for booting a secondary CPU.
2. To make this work, swapper_pg_dir needs a direct mapping of physical
memory in it (the low mappings). By adding those low, large page (2M)
mappings (PAE kernel), we create the necessary conditions for Erratum
383 to occur.
3. Other CPUs which do not participate in the off- and onlining game may
use swapper_pg_dir while the low mappings are present (when leave_mm is
called). For all steps below, the CPU referred to is a CPU that is using
swapper_pg_dir, and not the CPU which is being onlined.
4. The presence of the low mappings in swapper_pg_dir can result
in TLB entries for addresses below __PAGE_OFFSET to be established
speculatively. These TLB entries are marked global and large.
5. When the CPU with such TLB entry switches to another page table, this
TLB entry remains because it is global.
6. The process then generates an access to an address covered by the
above TLB entry but there is a permission mismatch - the TLB entry
covers a large global page not accessible to userspace.
7. Due to this permission mismatch a new 4kb, user TLB entry gets
established. Further, Erratum 383 provides for a small window of time
where both TLB entries are present. This results in an uncorrectable
machine check exception signalling a TLB multimatch which panics the
machine.
There are two ways to fix this issue:
1. Always do a global TLB flush when a new cr3 is loaded and the
old page table was swapper_pg_dir. I consider this a hack hard
to understand and with performance implications
2. Do not use swapper_pg_dir to boot secondary CPUs like 64-bit
does.
This patch implements solution 2. It introduces a trampoline_pg_dir
which has the same layout as swapper_pg_dir with low_mappings. This page
table is used as the initial page table of the booting CPU. Later in the
bringup process, it switches to swapper_pg_dir and does a global TLB
flush. This fixes the crashes in our test cases.
-v2: switch to swapper_pg_dir right after entering start_secondary() so
that we are able to access percpu data which might not be mapped in the
trampoline page table.
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
LKML-Reference: <20100816123833.GB28147@aftab>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r-- | arch/x86/kernel/head_32.S | 8 | ||||
-rw-r--r-- | arch/x86/kernel/setup.c | 2 | ||||
-rw-r--r-- | arch/x86/kernel/smpboot.c | 32 | ||||
-rw-r--r-- | arch/x86/kernel/trampoline.c | 18 |
4 files changed, 40 insertions, 20 deletions
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index ff4c453e13f3..fa8c1b8e09fb 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S | |||
@@ -334,7 +334,7 @@ ENTRY(startup_32_smp) | |||
334 | /* | 334 | /* |
335 | * Enable paging | 335 | * Enable paging |
336 | */ | 336 | */ |
337 | movl $pa(swapper_pg_dir),%eax | 337 | movl pa(initial_page_table), %eax |
338 | movl %eax,%cr3 /* set the page table pointer.. */ | 338 | movl %eax,%cr3 /* set the page table pointer.. */ |
339 | movl %cr0,%eax | 339 | movl %cr0,%eax |
340 | orl $X86_CR0_PG,%eax | 340 | orl $X86_CR0_PG,%eax |
@@ -614,6 +614,8 @@ ignore_int: | |||
614 | .align 4 | 614 | .align 4 |
615 | ENTRY(initial_code) | 615 | ENTRY(initial_code) |
616 | .long i386_start_kernel | 616 | .long i386_start_kernel |
617 | ENTRY(initial_page_table) | ||
618 | .long pa(swapper_pg_dir) | ||
617 | 619 | ||
618 | /* | 620 | /* |
619 | * BSS section | 621 | * BSS section |
@@ -629,6 +631,10 @@ ENTRY(swapper_pg_dir) | |||
629 | #endif | 631 | #endif |
630 | swapper_pg_fixmap: | 632 | swapper_pg_fixmap: |
631 | .fill 1024,4,0 | 633 | .fill 1024,4,0 |
634 | #ifdef CONFIG_X86_TRAMPOLINE | ||
635 | ENTRY(trampoline_pg_dir) | ||
636 | .fill 1024,4,0 | ||
637 | #endif | ||
632 | ENTRY(empty_zero_page) | 638 | ENTRY(empty_zero_page) |
633 | .fill 4096,1,0 | 639 | .fill 4096,1,0 |
634 | 640 | ||
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b008e7883207..c3a4fbb2b996 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c | |||
@@ -1014,6 +1014,8 @@ void __init setup_arch(char **cmdline_p) | |||
1014 | paging_init(); | 1014 | paging_init(); |
1015 | x86_init.paging.pagetable_setup_done(swapper_pg_dir); | 1015 | x86_init.paging.pagetable_setup_done(swapper_pg_dir); |
1016 | 1016 | ||
1017 | setup_trampoline_page_table(); | ||
1018 | |||
1017 | tboot_probe(); | 1019 | tboot_probe(); |
1018 | 1020 | ||
1019 | #ifdef CONFIG_X86_64 | 1021 | #ifdef CONFIG_X86_64 |
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index a5e928b0cb5f..abf4a86ffc54 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c | |||
@@ -73,7 +73,6 @@ | |||
73 | 73 | ||
74 | #ifdef CONFIG_X86_32 | 74 | #ifdef CONFIG_X86_32 |
75 | u8 apicid_2_node[MAX_APICID]; | 75 | u8 apicid_2_node[MAX_APICID]; |
76 | static int low_mappings; | ||
77 | #endif | 76 | #endif |
78 | 77 | ||
79 | /* State of each CPU */ | 78 | /* State of each CPU */ |
@@ -281,6 +280,18 @@ notrace static void __cpuinit start_secondary(void *unused) | |||
281 | * fragile that we want to limit the things done here to the | 280 | * fragile that we want to limit the things done here to the |
282 | * most necessary things. | 281 | * most necessary things. |
283 | */ | 282 | */ |
283 | |||
284 | #ifdef CONFIG_X86_32 | ||
285 | /* | ||
286 | * Switch away from the trampoline page-table | ||
287 | * | ||
288 | * Do this before cpu_init() because it needs to access per-cpu | ||
289 | * data which may not be mapped in the trampoline page-table. | ||
290 | */ | ||
291 | load_cr3(swapper_pg_dir); | ||
292 | __flush_tlb_all(); | ||
293 | #endif | ||
294 | |||
284 | vmi_bringup(); | 295 | vmi_bringup(); |
285 | cpu_init(); | 296 | cpu_init(); |
286 | preempt_disable(); | 297 | preempt_disable(); |
@@ -299,12 +310,6 @@ notrace static void __cpuinit start_secondary(void *unused) | |||
299 | legacy_pic->chip->unmask(0); | 310 | legacy_pic->chip->unmask(0); |
300 | } | 311 | } |
301 | 312 | ||
302 | #ifdef CONFIG_X86_32 | ||
303 | while (low_mappings) | ||
304 | cpu_relax(); | ||
305 | __flush_tlb_all(); | ||
306 | #endif | ||
307 | |||
308 | /* This must be done before setting cpu_online_mask */ | 313 | /* This must be done before setting cpu_online_mask */ |
309 | set_cpu_sibling_map(raw_smp_processor_id()); | 314 | set_cpu_sibling_map(raw_smp_processor_id()); |
310 | wmb(); | 315 | wmb(); |
@@ -750,6 +755,7 @@ do_rest: | |||
750 | #ifdef CONFIG_X86_32 | 755 | #ifdef CONFIG_X86_32 |
751 | /* Stack for startup_32 can be just as for start_secondary onwards */ | 756 | /* Stack for startup_32 can be just as for start_secondary onwards */ |
752 | irq_ctx_init(cpu); | 757 | irq_ctx_init(cpu); |
758 | initial_page_table = __pa(&trampoline_pg_dir); | ||
753 | #else | 759 | #else |
754 | clear_tsk_thread_flag(c_idle.idle, TIF_FORK); | 760 | clear_tsk_thread_flag(c_idle.idle, TIF_FORK); |
755 | initial_gs = per_cpu_offset(cpu); | 761 | initial_gs = per_cpu_offset(cpu); |
@@ -897,20 +903,8 @@ int __cpuinit native_cpu_up(unsigned int cpu) | |||
897 | 903 | ||
898 | per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; | 904 | per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; |
899 | 905 | ||
900 | #ifdef CONFIG_X86_32 | ||
901 | /* init low mem mapping */ | ||
902 | clone_pgd_range(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY, | ||
903 | min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); | ||
904 | flush_tlb_all(); | ||
905 | low_mappings = 1; | ||
906 | |||
907 | err = do_boot_cpu(apicid, cpu); | 906 | err = do_boot_cpu(apicid, cpu); |
908 | 907 | ||
909 | zap_low_mappings(false); | ||
910 | low_mappings = 0; | ||
911 | #else | ||
912 | err = do_boot_cpu(apicid, cpu); | ||
913 | #endif | ||
914 | if (err) { | 908 | if (err) { |
915 | pr_debug("do_boot_cpu failed %d\n", err); | 909 | pr_debug("do_boot_cpu failed %d\n", err); |
916 | return -EIO; | 910 | return -EIO; |
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index c652ef62742d..a874495b3673 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c | |||
@@ -1,6 +1,7 @@ | |||
1 | #include <linux/io.h> | 1 | #include <linux/io.h> |
2 | 2 | ||
3 | #include <asm/trampoline.h> | 3 | #include <asm/trampoline.h> |
4 | #include <asm/pgtable.h> | ||
4 | #include <asm/e820.h> | 5 | #include <asm/e820.h> |
5 | 6 | ||
6 | #if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP) | 7 | #if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP) |
@@ -37,3 +38,20 @@ unsigned long __trampinit setup_trampoline(void) | |||
37 | memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); | 38 | memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); |
38 | return virt_to_phys(trampoline_base); | 39 | return virt_to_phys(trampoline_base); |
39 | } | 40 | } |
41 | |||
42 | void __init setup_trampoline_page_table(void) | ||
43 | { | ||
44 | #ifdef CONFIG_X86_32 | ||
45 | /* Copy kernel address range */ | ||
46 | clone_pgd_range(trampoline_pg_dir + KERNEL_PGD_BOUNDARY, | ||
47 | swapper_pg_dir + KERNEL_PGD_BOUNDARY, | ||
48 | min_t(unsigned long, KERNEL_PGD_PTRS, | ||
49 | KERNEL_PGD_BOUNDARY)); | ||
50 | |||
51 | /* Initialize low mappings */ | ||
52 | clone_pgd_range(trampoline_pg_dir, | ||
53 | swapper_pg_dir + KERNEL_PGD_BOUNDARY, | ||
54 | min_t(unsigned long, KERNEL_PGD_PTRS, | ||
55 | KERNEL_PGD_BOUNDARY)); | ||
56 | #endif | ||
57 | } | ||