author     Joerg Roedel <joerg.roedel@amd.com>      2010-08-16 08:38:33 -0400
committer  H. Peter Anvin <hpa@zytor.com>           2010-08-18 12:17:20 -0400
commit     fd89a137924e0710078c3ae855e7cec1c43cb845 (patch)
tree       901f2048e637d3369ea4f0431dd618081ebeef93 /arch/x86/kernel
parent     07a7795ca2e6e66d00b184efb46bd0e23d90d3fe (diff)
x86-32: Separate 1:1 pagetables from swapper_pg_dir
This patch fixes machine crashes which occur when heavily exercising the
CPU hotplug code paths on a 32-bit kernel. These crashes are caused by
AMD Erratum 383 and result in a fatal machine check exception. Here's
the scenario:

1. On 32-bit, the swapper_pg_dir page table is used as the initial page
   table for booting a secondary CPU.

2. To make this work, swapper_pg_dir needs a direct mapping of physical
   memory in it (the low mappings). By adding those low, large page (2M)
   mappings (PAE kernel), we create the necessary conditions for Erratum
   383 to occur.

3. Other CPUs which do not participate in the off- and onlining game may
   use swapper_pg_dir while the low mappings are present (when leave_mm
   is called). For all steps below, the CPU referred to is a CPU that is
   using swapper_pg_dir, not the CPU which is being onlined.

4. The presence of the low mappings in swapper_pg_dir can cause TLB
   entries for addresses below __PAGE_OFFSET to be established
   speculatively. These TLB entries are marked global and large.

5. When the CPU with such a TLB entry switches to another page table,
   this TLB entry remains because it is global.

6. The process then generates an access to an address covered by the
   above TLB entry, but there is a permission mismatch: the TLB entry
   covers a large global page not accessible to userspace.

7. Due to this permission mismatch a new 4KB, user TLB entry gets
   established. Further, Erratum 383 provides for a small window of time
   where both TLB entries are present. This results in an uncorrectable
   machine check exception signalling a TLB multimatch, which panics the
   machine.

There are two ways to fix this issue:

1. Always do a global TLB flush when a new cr3 is loaded and the old
   page table was swapper_pg_dir. I consider this a hack that is hard
   to understand and that has performance implications.

2. Do not use swapper_pg_dir to boot secondary CPUs, which is what
   64-bit already does.

This patch implements solution 2. It introduces a trampoline_pg_dir
which has the same layout as swapper_pg_dir with the low mappings
included. This page table is used as the initial page table of the
booting CPU. Later in the bringup process, the CPU switches to
swapper_pg_dir and does a global TLB flush. This fixes the crashes in
our test cases.

-v2: switch to swapper_pg_dir right after entering start_secondary() so
that we are able to access percpu data which might not be mapped in the
trampoline page table.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
LKML-Reference: <20100816123833.GB28147@aftab>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
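To make the shape of the fix concrete before reading the diff, here is a small stand-alone user-space sketch of the copy pattern that setup_trampoline_page_table() uses, assuming a PAE configuration with the default 3G/1G split (4 top-level PGD entries, kernel half at entry 3). The entry values, the min_ul() helper and the trivial clone_pgd_range() stand-in are mock-ups for illustration only; just the copy pattern mirrors the patch.

#include <stdio.h>
#include <string.h>

/* User-space mock of the PAE i386 top level: 4 PGD entries, the kernel half
 * starting at entry 3 (PAGE_OFFSET = 0xC0000000, PGDIR_SHIFT = 30).  The
 * entry values are fake markers, not real page-table contents. */
#define PTRS_PER_PGD        4
#define KERNEL_PGD_BOUNDARY 3
#define KERNEL_PGD_PTRS     (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY)

typedef unsigned long pgd_t;

/* Stand-in for the kernel's clone_pgd_range(): copy 'count' PGD entries. */
static void clone_pgd_range(pgd_t *dst, const pgd_t *src, unsigned long count)
{
	memcpy(dst, src, count * sizeof(pgd_t));
}

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* swapper_pg_dir: only the kernel half is populated, no low mappings. */
	pgd_t swapper_pg_dir[PTRS_PER_PGD]    = { 0, 0, 0, 0xCAFE };
	pgd_t trampoline_pg_dir[PTRS_PER_PGD] = { 0, 0, 0, 0 };
	int i;

	/* Copy the kernel address range (mirrors the patch's first
	 * clone_pgd_range() call in setup_trampoline_page_table()). */
	clone_pgd_range(trampoline_pg_dir + KERNEL_PGD_BOUNDARY,
			swapper_pg_dir + KERNEL_PGD_BOUNDARY,
			min_ul(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY));

	/* Initialize the low 1:1 mappings (mirrors the second call): the
	 * kernel entries are duplicated into the low slots, so the low
	 * virtual range maps the same physical memory as the kernel range. */
	clone_pgd_range(trampoline_pg_dir,
			swapper_pg_dir + KERNEL_PGD_BOUNDARY,
			min_ul(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY));

	for (i = 0; i < PTRS_PER_PGD; i++)
		printf("pgd[%d]: swapper=%#lx trampoline=%#lx\n",
		       i, swapper_pg_dir[i], trampoline_pg_dir[i]);

	/* swapper_pg_dir stays untouched: the low mappings now exist only in
	 * trampoline_pg_dir, which only the booting CPU ever loads. */
	return 0;
}

The secondary CPU then loads trampoline_pg_dir via initial_page_table in startup_32_smp and switches to swapper_pg_dir, followed by a full TLB flush, at the top of start_secondary(), as the smpboot.c hunks below show.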
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/head_32.S    |  8
-rw-r--r--  arch/x86/kernel/setup.c      |  2
-rw-r--r--  arch/x86/kernel/smpboot.c    | 32
-rw-r--r--  arch/x86/kernel/trampoline.c | 18
4 files changed, 40 insertions(+), 20 deletions(-)
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index ff4c453e13f3..fa8c1b8e09fb 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -334,7 +334,7 @@ ENTRY(startup_32_smp)
 /*
  * Enable paging
  */
-	movl $pa(swapper_pg_dir),%eax
+	movl pa(initial_page_table), %eax
 	movl %eax,%cr3		/* set the page table pointer.. */
 	movl %cr0,%eax
 	orl $X86_CR0_PG,%eax
@@ -614,6 +614,8 @@ ignore_int:
 .align 4
 ENTRY(initial_code)
 	.long i386_start_kernel
+ENTRY(initial_page_table)
+	.long pa(swapper_pg_dir)
 
 /*
  * BSS section
@@ -629,6 +631,10 @@ ENTRY(swapper_pg_dir)
 #endif
 swapper_pg_fixmap:
 	.fill 1024,4,0
+#ifdef CONFIG_X86_TRAMPOLINE
+ENTRY(trampoline_pg_dir)
+	.fill 1024,4,0
+#endif
 ENTRY(empty_zero_page)
 	.fill 4096,1,0
 
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b008e7883207..c3a4fbb2b996 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1014,6 +1014,8 @@ void __init setup_arch(char **cmdline_p)
 	paging_init();
 	x86_init.paging.pagetable_setup_done(swapper_pg_dir);
 
+	setup_trampoline_page_table();
+
 	tboot_probe();
 
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index a5e928b0cb5f..abf4a86ffc54 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -73,7 +73,6 @@
 
 #ifdef CONFIG_X86_32
 u8 apicid_2_node[MAX_APICID];
-static int low_mappings;
 #endif
 
 /* State of each CPU */
@@ -281,6 +280,18 @@ notrace static void __cpuinit start_secondary(void *unused)
 	 * fragile that we want to limit the things done here to the
 	 * most necessary things.
 	 */
+
+#ifdef CONFIG_X86_32
+	/*
+	 * Switch away from the trampoline page-table
+	 *
+	 * Do this before cpu_init() because it needs to access per-cpu
+	 * data which may not be mapped in the trampoline page-table.
+	 */
+	load_cr3(swapper_pg_dir);
+	__flush_tlb_all();
+#endif
+
 	vmi_bringup();
 	cpu_init();
 	preempt_disable();
@@ -299,12 +310,6 @@ notrace static void __cpuinit start_secondary(void *unused)
 		legacy_pic->chip->unmask(0);
 	}
 
-#ifdef CONFIG_X86_32
-	while (low_mappings)
-		cpu_relax();
-	__flush_tlb_all();
-#endif
-
 	/* This must be done before setting cpu_online_mask */
 	set_cpu_sibling_map(raw_smp_processor_id());
 	wmb();
@@ -750,6 +755,7 @@ do_rest:
 #ifdef CONFIG_X86_32
 	/* Stack for startup_32 can be just as for start_secondary onwards */
 	irq_ctx_init(cpu);
+	initial_page_table = __pa(&trampoline_pg_dir);
 #else
 	clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
 	initial_gs = per_cpu_offset(cpu);
@@ -897,20 +903,8 @@ int __cpuinit native_cpu_up(unsigned int cpu)
 
 	per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
 
-#ifdef CONFIG_X86_32
-	/* init low mem mapping */
-	clone_pgd_range(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY,
-			min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY));
-	flush_tlb_all();
-	low_mappings = 1;
-
 	err = do_boot_cpu(apicid, cpu);
 
-	zap_low_mappings(false);
-	low_mappings = 0;
-#else
-	err = do_boot_cpu(apicid, cpu);
-#endif
 	if (err) {
 		pr_debug("do_boot_cpu failed %d\n", err);
 		return -EIO;
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index c652ef62742d..a874495b3673 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -1,6 +1,7 @@
 #include <linux/io.h>
 
 #include <asm/trampoline.h>
+#include <asm/pgtable.h>
 #include <asm/e820.h>
 
 #if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP)
@@ -37,3 +38,20 @@ unsigned long __trampinit setup_trampoline(void)
 	memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE);
 	return virt_to_phys(trampoline_base);
 }
+
+void __init setup_trampoline_page_table(void)
+{
+#ifdef CONFIG_X86_32
+	/* Copy kernel address range */
+	clone_pgd_range(trampoline_pg_dir + KERNEL_PGD_BOUNDARY,
+			swapper_pg_dir + KERNEL_PGD_BOUNDARY,
+			min_t(unsigned long, KERNEL_PGD_PTRS,
+			      KERNEL_PGD_BOUNDARY));
+
+	/* Initialize low mappings */
+	clone_pgd_range(trampoline_pg_dir,
+			swapper_pg_dir + KERNEL_PGD_BOUNDARY,
+			min_t(unsigned long, KERNEL_PGD_PTRS,
+			      KERNEL_PGD_BOUNDARY));
+#endif
+}
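Both clone_pgd_range() calls above copy min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY) entries, which is easiest to see with the numbers plugged in. Below is a small stand-alone sketch, assuming the default 3G/1G split (PAGE_OFFSET = 0xC0000000); the macro values are re-derived here for illustration rather than pulled from the kernel headers.

#include <stdio.h>

/* Values re-derived for illustration; the kernel defines the real macros in
 * arch/x86/include/asm/pgtable_32_types.h and asm/pgtable.h. */
#define PAGE_OFFSET 0xC0000000UL	/* default 3G/1G split assumed */

static void show(const char *cfg, unsigned int pgdir_shift,
		 unsigned long ptrs_per_pgd)
{
	/* KERNEL_PGD_BOUNDARY = pgd_index(PAGE_OFFSET) */
	unsigned long boundary = (PAGE_OFFSET >> pgdir_shift) & (ptrs_per_pgd - 1);
	/* KERNEL_PGD_PTRS = PTRS_PER_PGD - KERNEL_PGD_BOUNDARY */
	unsigned long kernel_ptrs = ptrs_per_pgd - boundary;
	unsigned long copied = kernel_ptrs < boundary ? kernel_ptrs : boundary;

	printf("%-46s KERNEL_PGD_BOUNDARY=%-4lu KERNEL_PGD_PTRS=%-4lu -> copies %lu low entr%s\n",
	       cfg, boundary, kernel_ptrs, copied, copied == 1 ? "y" : "ies");
}

int main(void)
{
	show("non-PAE (PGDIR_SHIFT=22, PTRS_PER_PGD=1024):", 22, 1024);
	show("PAE     (PGDIR_SHIFT=30, PTRS_PER_PGD=4):", 30, 4);
	return 0;
}

So on a non-PAE kernel the 256 PGD entries covering the kernel mapping at PAGE_OFFSET are duplicated into slots 0-255, and on a PAE kernel the single kernel entry goes into slot 0. Either way swapper_pg_dir itself no longer carries those low mappings; only trampoline_pg_dir does, and the secondary CPU drops it again in start_secondary().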