Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/Makefile | 14
-rw-r--r--  arch/x86/kernel/acpi/sleep.c | 1
-rw-r--r--  arch/x86/kernel/apic.c | 50
-rw-r--r--  arch/x86/kernel/asm-offsets_64.c | 11
-rw-r--r--  arch/x86/kernel/cpu/addon_cpuid_features.c | 34
-rw-r--r--  arch/x86/kernel/cpu/common.c | 202
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 20
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 25
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c | 63
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 21
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 1
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c | 12
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 35
-rw-r--r--  arch/x86/kernel/efi.c | 2
-rw-r--r--  arch/x86/kernel/efi_64.c | 1
-rw-r--r--  arch/x86/kernel/entry_32.S | 8
-rw-r--r--  arch/x86/kernel/entry_64.S | 45
-rw-r--r--  arch/x86/kernel/genapic_64.c | 2
-rw-r--r--  arch/x86/kernel/genx2apic_uv_x.c | 1
-rw-r--r--  arch/x86/kernel/head64.c | 23
-rw-r--r--  arch/x86/kernel/head_32.S | 19
-rw-r--r--  arch/x86/kernel/head_64.S | 36
-rw-r--r--  arch/x86/kernel/hpet.c | 3
-rw-r--r--  arch/x86/kernel/io_apic.c | 46
-rw-r--r--  arch/x86/kernel/irq.c | 6
-rw-r--r--  arch/x86/kernel/irq_32.c | 2
-rw-r--r--  arch/x86/kernel/irq_64.c | 9
-rw-r--r--  arch/x86/kernel/irqinit_32.c | 11
-rw-r--r--  arch/x86/kernel/kprobes.c | 2
-rw-r--r--  arch/x86/kernel/mpparse.c | 1
-rw-r--r--  arch/x86/kernel/nmi.c | 10
-rw-r--r--  arch/x86/kernel/paravirt.c | 1
-rw-r--r--  arch/x86/kernel/process_32.c | 6
-rw-r--r--  arch/x86/kernel/process_64.c | 43
-rw-r--r--  arch/x86/kernel/setup_percpu.c | 411
-rw-r--r--  arch/x86/kernel/signal.c | 11
-rw-r--r--  arch/x86/kernel/smpboot.c | 73
-rw-r--r--  arch/x86/kernel/smpcommon.c | 30
-rw-r--r--  arch/x86/kernel/syscall_table_32.S | 2
-rw-r--r--  arch/x86/kernel/tlb_32.c | 256
-rw-r--r--  arch/x86/kernel/tlb_64.c | 284
-rw-r--r--  arch/x86/kernel/tlb_uv.c | 69
-rw-r--r--  arch/x86/kernel/traps.c | 1
-rw-r--r--  arch/x86/kernel/vmi_32.c | 2
-rw-r--r--  arch/x86/kernel/vmlinux_32.lds.S | 9
-rw-r--r--  arch/x86/kernel/vmlinux_64.lds.S | 26
-rw-r--r--  arch/x86/kernel/x8664_ksyms_64.c | 2
47 files changed, 589 insertions(+), 1353 deletions(-)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index d364df03c1d6..37fa30bada17 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -23,11 +23,12 @@ nostackp := $(call cc-option, -fno-stack-protector)
 CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
 CFLAGS_hpet.o := $(nostackp)
 CFLAGS_tsc.o := $(nostackp)
+CFLAGS_paravirt.o := $(nostackp)
 
 obj-y := process_$(BITS).o signal.o entry_$(BITS).o
 obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
 obj-y += time_$(BITS).o ioport.o ldt.o dumpstack.o
-obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o
+obj-y += setup.o i8259.o irqinit_$(BITS).o
 obj-$(CONFIG_X86_VISWS) += visws_quirks.o
 obj-$(CONFIG_X86_32) += probe_roms_32.o
 obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
@@ -57,9 +58,9 @@ obj-$(CONFIG_PCI) += early-quirks.o
 apm-y := apm_32.o
 obj-$(CONFIG_APM) += apm.o
 obj-$(CONFIG_X86_SMP) += smp.o
-obj-$(CONFIG_X86_SMP) += smpboot.o tsc_sync.o ipi.o tlb_$(BITS).o
-obj-$(CONFIG_X86_32_SMP) += smpcommon.o
-obj-$(CONFIG_X86_64_SMP) += tsc_sync.o smpcommon.o
+obj-$(CONFIG_X86_SMP) += smpboot.o tsc_sync.o ipi.o
+obj-$(CONFIG_SMP) += setup_percpu.o
+obj-$(CONFIG_X86_64_SMP) += tsc_sync.o
 obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o
 obj-$(CONFIG_X86_MPPARSE) += mpparse.o
 obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
@@ -114,10 +115,11 @@ obj-$(CONFIG_SWIOTLB) += pci-swiotlb_64.o # NB rename without _64
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
-        obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o
-        obj-y += bios_uv.o uv_irq.o uv_sysfs.o
+        obj-y += genapic_64.o genapic_flat_64.o
         obj-y += genx2apic_cluster.o
         obj-y += genx2apic_phys.o
+        obj-$(CONFIG_X86_UV) += genx2apic_uv_x.o tlb_uv.o
+        obj-$(CONFIG_X86_UV) += bios_uv.o uv_irq.o uv_sysfs.o
         obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
         obj-$(CONFIG_AUDIT) += audit_64.o
 
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 707c1f6f95fa..4abff454c55b 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -101,6 +101,7 @@ int acpi_save_state_mem(void)
 	stack_start.sp = temp_stack + sizeof(temp_stack);
 	early_gdt_descr.address =
 			(unsigned long)get_cpu_gdt_table(smp_processor_id());
+	initial_gs = per_cpu_offset(smp_processor_id());
 #endif
 	initial_code = (unsigned long)wakeup_long64;
 	saved_magic = 0x123456789abcdef0;
diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index 566a08466b19..c6f15647eba9 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -47,6 +47,7 @@
 #include <asm/proto.h>
 #include <asm/apic.h>
 #include <asm/i8259.h>
+#include <asm/smp.h>
 
 #include <mach_apic.h>
 #include <mach_apicdef.h>
@@ -59,6 +60,24 @@
 # error SPURIOUS_APIC_VECTOR definition error
 #endif
 
+unsigned int num_processors;
+unsigned disabled_cpus __cpuinitdata;
+/* Processor that is doing the boot up */
+unsigned int boot_cpu_physical_apicid = -1U;
+EXPORT_SYMBOL(boot_cpu_physical_apicid);
+unsigned int max_physical_apicid;
+
+/* Bitmask of physically existing CPUs */
+physid_mask_t phys_cpu_present_map;
+
+/*
+ * Map cpu index to physical APIC ID
+ */
+DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
+DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
+
 #ifdef CONFIG_X86_32
 /*
  * Knob to control our willingness to enable the local APIC.
@@ -894,6 +913,10 @@ void disable_local_APIC(void)
 {
 	unsigned int value;
 
+	/* APIC hasn't been mapped yet */
+	if (!apic_phys)
+		return;
+
 	clear_local_APIC();
 
 	/*
@@ -1125,6 +1148,13 @@ void __cpuinit setup_local_APIC(void)
 	unsigned int value;
 	int i, j;
 
+	if (disable_apic) {
+#ifdef CONFIG_X86_IO_APIC
+		disable_ioapic_setup();
+#endif
+		return;
+	}
+
 #ifdef CONFIG_X86_32
 	/* Pound the ESR really hard over the head with a big hammer - mbligh */
 	if (lapic_is_integrated() && esr_disable) {
@@ -1565,11 +1595,11 @@ int apic_version[MAX_APICS];
 
 int __init APIC_init_uniprocessor(void)
 {
-#ifdef CONFIG_X86_64
 	if (disable_apic) {
 		pr_info("Apic disabled\n");
 		return -1;
 	}
+#ifdef CONFIG_X86_64
 	if (!cpu_has_apic) {
 		disable_apic = 1;
 		pr_info("Apic disabled by BIOS\n");
@@ -1832,6 +1862,11 @@ void __cpuinit generic_processor_info(int apicid, int version)
 	num_processors++;
 	cpu = cpumask_next_zero(-1, cpu_present_mask);
 
+	if (version != apic_version[boot_cpu_physical_apicid])
+		WARN_ONCE(1,
+			"ACPI: apic version mismatch, bootcpu: %x cpu %d: %x\n",
+			apic_version[boot_cpu_physical_apicid], cpu, version);
+
 	physid_set(apicid, phys_cpu_present_map);
 	if (apicid == boot_cpu_physical_apicid) {
 		/*
@@ -1867,17 +1902,8 @@ void __cpuinit generic_processor_info(int apicid, int version)
 #endif
 
 #if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64)
-	/* are we being called early in kernel startup? */
-	if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
-		u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
-		u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
-
-		cpu_to_apicid[cpu] = apicid;
-		bios_cpu_apicid[cpu] = apicid;
-	} else {
-		per_cpu(x86_cpu_to_apicid, cpu) = apicid;
-		per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
-	}
+	early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
+	early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
 #endif
 
 	set_cpu_possible(cpu, true);
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 1d41d3f1edbc..8793ab33e2c1 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -11,7 +11,6 @@
 #include <linux/hardirq.h>
 #include <linux/suspend.h>
 #include <linux/kbuild.h>
-#include <asm/pda.h>
 #include <asm/processor.h>
 #include <asm/segment.h>
 #include <asm/thread_info.h>
@@ -48,16 +47,6 @@ int main(void)
 #endif
 	BLANK();
 #undef ENTRY
-#define ENTRY(entry) DEFINE(pda_ ## entry, offsetof(struct x8664_pda, entry))
-	ENTRY(kernelstack);
-	ENTRY(oldrsp);
-	ENTRY(pcurrent);
-	ENTRY(irqcount);
-	ENTRY(cpunumber);
-	ENTRY(irqstackptr);
-	ENTRY(data_offset);
-	BLANK();
-#undef ENTRY
 #ifdef CONFIG_PARAVIRT
 	BLANK();
 	OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 2cf23634b6d9..4e581fdc0a5a 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -143,37 +143,3 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
 	return;
 #endif
 }
-
-#ifdef CONFIG_X86_PAT
-void __cpuinit validate_pat_support(struct cpuinfo_x86 *c)
-{
-	if (!cpu_has_pat)
-		pat_disable("PAT not supported by CPU.");
-
-	switch (c->x86_vendor) {
-	case X86_VENDOR_INTEL:
-		/*
-		 * There is a known erratum on Pentium III and Core Solo
-		 * and Core Duo CPUs.
-		 * " Page with PAT set to WC while associated MTRR is UC
-		 * may consolidate to UC "
-		 * Because of this erratum, it is better to stick with
-		 * setting WC in MTRR rather than using PAT on these CPUs.
-		 *
-		 * Enable PAT WC only on P4, Core 2 or later CPUs.
-		 */
-		if (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 15))
-			return;
-
-		pat_disable("PAT WC disabled due to known CPU erratum.");
-		return;
-
-	case X86_VENDOR_AMD:
-	case X86_VENDOR_CENTAUR:
-	case X86_VENDOR_TRANSMETA:
-		return;
-	}
-
-	pat_disable("PAT disabled. Not yet verified on this CPU type.");
-}
-#endif
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index f00258462444..275e2cb43b91 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -28,9 +28,9 @@
 #include <asm/apic.h>
 #include <mach_apic.h>
 #include <asm/genapic.h>
+#include <asm/uv/uv.h>
 #endif
 
-#include <asm/pda.h>
 #include <asm/pgtable.h>
 #include <asm/processor.h>
 #include <asm/desc.h>
@@ -52,6 +52,15 @@ cpumask_var_t cpu_initialized_mask;
 /* representing cpus for which sibling maps can be computed */
 cpumask_var_t cpu_sibling_setup_mask;
 
+/* correctly size the local cpu masks */
+void __init setup_cpu_local_masks(void)
+{
+	alloc_bootmem_cpumask_var(&cpu_initialized_mask);
+	alloc_bootmem_cpumask_var(&cpu_callin_mask);
+	alloc_bootmem_cpumask_var(&cpu_callout_mask);
+	alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
+}
+
 #else /* CONFIG_X86_32 */
 
 cpumask_t cpu_callin_map;
@@ -64,23 +73,23 @@ cpumask_t cpu_sibling_setup_map;
 
 static struct cpu_dev *this_cpu __cpuinitdata;
 
+DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
 #ifdef CONFIG_X86_64
-/* We need valid kernel segments for data and code in long mode too
- * IRET will check the segment types kkeil 2000/10/28
- * Also sysret mandates a special GDT layout
- */
-/* The TLS descriptors are currently at a different place compared to i386.
-   Hopefully nobody expects them at a fixed place (Wine?) */
-DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
+	/*
+	 * We need valid kernel segments for data and code in long mode too
+	 * IRET will check the segment types kkeil 2000/10/28
+	 * Also sysret mandates a special GDT layout
+	 *
+	 * The TLS descriptors are currently at a different place compared to i386.
+	 * Hopefully nobody expects them at a fixed place (Wine?)
+	 */
 	[GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
 	[GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
 	[GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
 	[GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
 	[GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
 	[GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
-} };
 #else
-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
 	[GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
 	[GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
 	[GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
@@ -112,9 +121,9 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
 	[GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },
 
 	[GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
-	[GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } },
-} };
+	[GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } },
 #endif
+} };
 EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
 
 #ifdef CONFIG_X86_32
@@ -215,6 +224,49 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
 #endif
 
 /*
+ * Some CPU features depend on higher CPUID levels, which may not always
+ * be available due to CPUID level capping or broken virtualization
+ * software. Add those features to this table to auto-disable them.
+ */
+struct cpuid_dependent_feature {
+	u32 feature;
+	u32 level;
+};
+static const struct cpuid_dependent_feature __cpuinitconst
+cpuid_dependent_features[] = {
+	{ X86_FEATURE_MWAIT, 0x00000005 },
+	{ X86_FEATURE_DCA, 0x00000009 },
+	{ X86_FEATURE_XSAVE, 0x0000000d },
+	{ 0, 0 }
+};
+
+static void __cpuinit filter_cpuid_features(struct cpuinfo_x86 *c, bool warn)
+{
+	const struct cpuid_dependent_feature *df;
+	for (df = cpuid_dependent_features; df->feature; df++) {
+		/*
+		 * Note: cpuid_level is set to -1 if unavailable, but
+		 * extended_extended_level is set to 0 if unavailable
+		 * and the legitimate extended levels are all negative
+		 * when signed; hence the weird messing around with
+		 * signs here...
+		 */
+		if (cpu_has(c, df->feature) &&
+		    ((s32)df->feature < 0 ?
+		     (u32)df->feature > (u32)c->extended_cpuid_level :
+		     (s32)df->feature > (s32)c->cpuid_level)) {
+			clear_cpu_cap(c, df->feature);
+			if (warn)
+				printk(KERN_WARNING
+				       "CPU: CPU feature %s disabled "
+				       "due to lack of CPUID level 0x%x\n",
+				       x86_cap_flags[df->feature],
+				       df->level);
+		}
+	}
+}
+
+/*
  * Naming convention should be: <Name> [(<Codename>)]
  * This table only is used unless init_<vendor>() below doesn't set it;
  * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
@@ -249,12 +301,17 @@ __u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
 void switch_to_new_gdt(void)
 {
 	struct desc_ptr gdt_descr;
+	int cpu = smp_processor_id();
 
-	gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
+	gdt_descr.address = (long)get_cpu_gdt_table(cpu);
 	gdt_descr.size = GDT_SIZE - 1;
 	load_gdt(&gdt_descr);
+	/* Reload the per-cpu base */
 #ifdef CONFIG_X86_32
-	asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
+	loadsegment(fs, __KERNEL_PERCPU);
+#else
+	loadsegment(gs, 0);
+	wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu));
 #endif
 }
 
@@ -572,11 +629,10 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 	if (this_cpu->c_early_init)
 		this_cpu->c_early_init(c);
 
-	validate_pat_support(c);
-
 #ifdef CONFIG_SMP
 	c->cpu_index = boot_cpu_id;
 #endif
+	filter_cpuid_features(c, false);
 }
 
 void __init early_cpu_init(void)
@@ -710,6 +766,9 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 	 * we do "generic changes."
 	 */
 
+	/* Filter out anything that depends on CPUID levels we don't have */
+	filter_cpuid_features(c, true);
+
 	/* If the model name is still unset, do table lookup. */
 	if (!c->x86_model_id[0]) {
 		char *p;
@@ -879,54 +938,26 @@ static __init int setup_disablecpuid(char *arg)
 __setup("clearcpuid=", setup_disablecpuid);
 
 #ifdef CONFIG_X86_64
-struct x8664_pda **_cpu_pda __read_mostly;
-EXPORT_SYMBOL(_cpu_pda);
-
 struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
 
-static char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
-
-void __cpuinit pda_init(int cpu)
-{
-	struct x8664_pda *pda = cpu_pda(cpu);
+DEFINE_PER_CPU_FIRST(union irq_stack_union,
		     irq_stack_union) __aligned(PAGE_SIZE);
+#ifdef CONFIG_SMP
+DEFINE_PER_CPU(char *, irq_stack_ptr); /* will be set during per cpu init */
+#else
+DEFINE_PER_CPU(char *, irq_stack_ptr) =
+	per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64;
+#endif
 
-	/* Setup up data that may be needed in __get_free_pages early */
-	loadsegment(fs, 0);
-	loadsegment(gs, 0);
-	/* Memory clobbers used to order PDA accessed */
-	mb();
-	wrmsrl(MSR_GS_BASE, pda);
-	mb();
-
-	pda->cpunumber = cpu;
-	pda->irqcount = -1;
-	pda->kernelstack = (unsigned long)stack_thread_info() -
-		PDA_STACKOFFSET + THREAD_SIZE;
-	pda->active_mm = &init_mm;
-	pda->mmu_state = 0;
-
-	if (cpu == 0) {
-		/* others are initialized in smpboot.c */
-		pda->pcurrent = &init_task;
-		pda->irqstackptr = boot_cpu_stack;
-		pda->irqstackptr += IRQSTACKSIZE - 64;
-	} else {
-		if (!pda->irqstackptr) {
-			pda->irqstackptr = (char *)
-				__get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
-			if (!pda->irqstackptr)
-				panic("cannot allocate irqstack for cpu %d",
-				      cpu);
-			pda->irqstackptr += IRQSTACKSIZE - 64;
-		}
+DEFINE_PER_CPU(unsigned long, kernel_stack) =
+	(unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE;
+EXPORT_PER_CPU_SYMBOL(kernel_stack);
 
-	if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
-		pda->nodenumber = cpu_to_node(cpu);
-	}
-}
+DEFINE_PER_CPU(unsigned int, irq_count) = -1;
 
-static char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
-				  DEBUG_STKSZ] __page_aligned_bss;
+static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
	[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ])
+	__aligned(PAGE_SIZE);
 
 extern asmlinkage void ignore_sysret(void);
 
@@ -984,15 +1015,14 @@ void __cpuinit cpu_init(void)
 	struct tss_struct *t = &per_cpu(init_tss, cpu);
 	struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
 	unsigned long v;
-	char *estacks = NULL;
 	struct task_struct *me;
 	int i;
 
-	/* CPU 0 is initialised in head64.c */
-	if (cpu != 0)
-		pda_init(cpu);
-	else
-		estacks = boot_exception_stacks;
+#ifdef CONFIG_NUMA
+	if (cpu != 0 && percpu_read(node_number) == 0 &&
+	    cpu_to_node(cpu) != NUMA_NO_NODE)
+		percpu_write(node_number, cpu_to_node(cpu));
+#endif
 
 	me = current;
 
@@ -1009,6 +1039,8 @@ void __cpuinit cpu_init(void)
 	 */
 
 	switch_to_new_gdt();
+	loadsegment(fs, 0);
+
 	load_idt((const struct desc_ptr *)&idt_descr);
 
 	memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
@@ -1026,18 +1058,13 @@ void __cpuinit cpu_init(void)
 	 * set up and load the per-CPU TSS
 	 */
 	if (!orig_ist->ist[0]) {
-		static const unsigned int order[N_EXCEPTION_STACKS] = {
-		  [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
-		  [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
+		static const unsigned int sizes[N_EXCEPTION_STACKS] = {
+		  [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
+		  [DEBUG_STACK - 1] = DEBUG_STKSZ
 		};
+		char *estacks = per_cpu(exception_stacks, cpu);
 		for (v = 0; v < N_EXCEPTION_STACKS; v++) {
-			if (cpu) {
-				estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
-				if (!estacks)
-					panic("Cannot allocate exception "
-					      "stack %ld %d\n", v, cpu);
-			}
-			estacks += PAGE_SIZE << order[v];
+			estacks += sizes[v];
 			orig_ist->ist[v] = t->x86_tss.ist[v] =
 					(unsigned long)estacks;
 		}
@@ -1071,22 +1098,19 @@ void __cpuinit cpu_init(void)
 	 */
 	if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
 		arch_kgdb_ops.correct_hw_break();
-	else {
+	else
 #endif
-	/*
-	 * Clear all 6 debug registers:
-	 */
-
-	set_debugreg(0UL, 0);
-	set_debugreg(0UL, 1);
-	set_debugreg(0UL, 2);
-	set_debugreg(0UL, 3);
-	set_debugreg(0UL, 6);
-	set_debugreg(0UL, 7);
-#ifdef CONFIG_KGDB
-	/* If the kgdb is connected no debug regs should be altered. */
+	{
+		/*
+		 * Clear all 6 debug registers:
+		 */
+		set_debugreg(0UL, 0);
+		set_debugreg(0UL, 1);
+		set_debugreg(0UL, 2);
+		set_debugreg(0UL, 3);
+		set_debugreg(0UL, 6);
+		set_debugreg(0UL, 7);
 	}
-#endif
 
 	fpu_init();
 
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 06fcd8f9323c..4b1c319d30c3 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -145,7 +145,7 @@ typedef union {
 
 struct drv_cmd {
 	unsigned int type;
-	cpumask_var_t mask;
+	const struct cpumask *mask;
 	drv_addr_union addr;
 	u32 val;
 };
@@ -231,15 +231,9 @@ static u32 get_cur_val(const struct cpumask *mask)
 		return 0;
 	}
 
-	if (unlikely(!alloc_cpumask_var(&cmd.mask, GFP_KERNEL)))
-		return 0;
-
-	cpumask_copy(cmd.mask, mask);
-
+	cmd.mask = mask;
 	drv_read(&cmd);
 
-	free_cpumask_var(cmd.mask);
-
 	dprintk("get_cur_val = %u\n", cmd.val);
 
 	return cmd.val;
@@ -369,7 +363,7 @@ static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
 	return freq;
 }
 
-static unsigned int check_freqs(const cpumask_t *mask, unsigned int freq,
+static unsigned int check_freqs(const struct cpumask *mask, unsigned int freq,
				struct acpi_cpufreq_data *data)
 {
 	unsigned int cur_freq;
@@ -404,9 +398,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
 		return -ENODEV;
 	}
 
-	if (unlikely(!alloc_cpumask_var(&cmd.mask, GFP_KERNEL)))
-		return -ENOMEM;
-
 	perf = data->acpi_data;
 	result = cpufreq_frequency_table_target(policy,
						data->freq_table,
@@ -451,9 +442,9 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
 
 	/* cpufreq holds the hotplug lock, so we are safe from here on */
 	if (policy->shared_type != CPUFREQ_SHARED_TYPE_ANY)
-		cpumask_and(cmd.mask, cpu_online_mask, policy->cpus);
+		cmd.mask = policy->cpus;
 	else
-		cpumask_copy(cmd.mask, cpumask_of(policy->cpu));
+		cmd.mask = cpumask_of(policy->cpu);
 
 	freqs.old = perf->states[perf->state].core_frequency * 1000;
 	freqs.new = data->freq_table[next_state].frequency;
@@ -480,7 +471,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
 		perf->state = next_perf_state;
 
 out:
-	free_cpumask_var(cmd.mask);
 	return result;
 }
 
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 8ea6929e974c..5deefae9064d 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -29,6 +29,19 @@
 
 static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
 {
+	/* Unmask CPUID levels if masked: */
+	if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
+		u64 misc_enable;
+
+		rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
+
+		if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) {
+			misc_enable &= ~MSR_IA32_MISC_ENABLE_LIMIT_CPUID;
+			wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
+			c->cpuid_level = cpuid_eax(0);
+		}
+	}
+
 	if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
 	    (c->x86 == 0x6 && c->x86_model >= 0x0e))
 		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
@@ -50,6 +63,18 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
 		set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
 	}
 
+	/*
+	 * There is a known erratum on Pentium III and Core Solo
+	 * and Core Duo CPUs.
+	 * " Page with PAT set to WC while associated MTRR is UC
+	 * may consolidate to UC "
+	 * Because of this erratum, it is better to stick with
+	 * setting WC in MTRR rather than using PAT on these CPUs.
+	 *
+	 * Enable PAT WC only on P4, Core 2 or later CPUs.
+	 */
+	if (c->x86 == 6 && c->x86_model < 15)
+		clear_cpu_cap(c, X86_FEATURE_PAT);
 }
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 48533d77be78..58527a9fc404 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -132,7 +132,16 @@ struct _cpuid4_info {
 	union _cpuid4_leaf_ecx ecx;
 	unsigned long size;
 	unsigned long can_disable;
-	cpumask_t shared_cpu_map; /* future?: only cpus/node is needed */
+	DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
+};
+
+/* subset of above _cpuid4_info w/o shared_cpu_map */
+struct _cpuid4_info_regs {
+	union _cpuid4_leaf_eax eax;
+	union _cpuid4_leaf_ebx ebx;
+	union _cpuid4_leaf_ecx ecx;
+	unsigned long size;
+	unsigned long can_disable;
 };
 
 #ifdef CONFIG_PCI
@@ -263,7 +272,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
 }
 
 static void __cpuinit
-amd_check_l3_disable(int index, struct _cpuid4_info *this_leaf)
+amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
 {
 	if (index < 3)
 		return;
@@ -271,7 +280,8 @@ amd_check_l3_disable(int index, struct _cpuid4_info *this_leaf)
 }
 
 static int
-__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
+__cpuinit cpuid4_cache_lookup_regs(int index,
				   struct _cpuid4_info_regs *this_leaf)
 {
 	union _cpuid4_leaf_eax eax;
 	union _cpuid4_leaf_ebx ebx;
@@ -299,6 +309,15 @@ __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
 	return 0;
 }
 
+static int
+__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
+{
+	struct _cpuid4_info_regs *leaf_regs =
+		(struct _cpuid4_info_regs *)this_leaf;
+
+	return cpuid4_cache_lookup_regs(index, leaf_regs);
+}
+
 static int __cpuinit find_num_cache_leaves(void)
 {
 	unsigned int eax, ebx, ecx, edx;
@@ -338,11 +357,10 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
 	 * parameters cpuid leaf to find the cache details
 	 */
 	for (i = 0; i < num_cache_leaves; i++) {
-		struct _cpuid4_info this_leaf;
-
+		struct _cpuid4_info_regs this_leaf;
 		int retval;
 
-		retval = cpuid4_cache_lookup(i, &this_leaf);
+		retval = cpuid4_cache_lookup_regs(i, &this_leaf);
 		if (retval >= 0) {
 			switch(this_leaf.eax.split.level) {
 			case 1:
@@ -491,17 +509,20 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
 	num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing;
 
 	if (num_threads_sharing == 1)
-		cpu_set(cpu, this_leaf->shared_cpu_map);
+		cpumask_set_cpu(cpu, to_cpumask(this_leaf->shared_cpu_map));
 	else {
 		index_msb = get_count_order(num_threads_sharing);
 
 		for_each_online_cpu(i) {
 			if (cpu_data(i).apicid >> index_msb ==
 			    c->apicid >> index_msb) {
-				cpu_set(i, this_leaf->shared_cpu_map);
+				cpumask_set_cpu(i,
					to_cpumask(this_leaf->shared_cpu_map));
 				if (i != cpu && per_cpu(cpuid4_info, i)) {
-					sibling_leaf = CPUID4_INFO_IDX(i, index);
-					cpu_set(cpu, sibling_leaf->shared_cpu_map);
+					sibling_leaf =
						CPUID4_INFO_IDX(i, index);
+					cpumask_set_cpu(cpu, to_cpumask(
						sibling_leaf->shared_cpu_map));
 				}
 			}
 		}
@@ -513,9 +534,10 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index)
 	int sibling;
 
 	this_leaf = CPUID4_INFO_IDX(cpu, index);
-	for_each_cpu_mask_nr(sibling, this_leaf->shared_cpu_map) {
+	for_each_cpu(sibling, to_cpumask(this_leaf->shared_cpu_map)) {
 		sibling_leaf = CPUID4_INFO_IDX(sibling, index);
-		cpu_clear(cpu, sibling_leaf->shared_cpu_map);
+		cpumask_clear_cpu(cpu,
				  to_cpumask(sibling_leaf->shared_cpu_map));
 	}
 }
 #else
@@ -620,8 +642,9 @@ static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
 	int n = 0;
 
 	if (len > 1) {
-		cpumask_t *mask = &this_leaf->shared_cpu_map;
+		const struct cpumask *mask;
 
+		mask = to_cpumask(this_leaf->shared_cpu_map);
 		n = type?
 			cpulist_scnprintf(buf, len-2, mask) :
 			cpumask_scnprintf(buf, len-2, mask);
@@ -684,7 +707,8 @@ static struct pci_dev *get_k8_northbridge(int node)
 
 static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf)
 {
-	int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map));
+	const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
+	int node = cpu_to_node(cpumask_first(mask));
 	struct pci_dev *dev = NULL;
 	ssize_t ret = 0;
 	int i;
@@ -718,7 +742,8 @@ static ssize_t
 store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf,
		    size_t count)
 {
-	int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map));
+	const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
+	int node = cpu_to_node(cpumask_first(mask));
 	struct pci_dev *dev = NULL;
 	unsigned int ret, index, val;
 
@@ -863,7 +888,7 @@ err_out:
 	return -ENOMEM;
 }
 
-static cpumask_t cache_dev_map = CPU_MASK_NONE;
+static DECLARE_BITMAP(cache_dev_map, NR_CPUS);
 
 /* Add/Remove cache interface for CPU device */
 static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
@@ -903,7 +928,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
 		}
 		kobject_uevent(&(this_object->kobj), KOBJ_ADD);
 	}
-	cpu_set(cpu, cache_dev_map);
+	cpumask_set_cpu(cpu, to_cpumask(cache_dev_map));
 
 	kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD);
 	return 0;
@@ -916,9 +941,9 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
 
 	if (per_cpu(cpuid4_info, cpu) == NULL)
 		return;
-	if (!cpu_isset(cpu, cache_dev_map))
+	if (!cpumask_test_cpu(cpu, to_cpumask(cache_dev_map)))
 		return;
-	cpu_clear(cpu, cache_dev_map);
+	cpumask_clear_cpu(cpu, to_cpumask(cache_dev_map));
 
 	for (i = 0; i < num_cache_leaves; i++)
 		kobject_put(&(INDEX_KOBJECT_PTR(cpu,i)->kobj));
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 8ae8c4ff094d..4772e91e8246 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -67,7 +67,7 @@ static struct threshold_block threshold_defaults = {
 struct threshold_bank {
 	struct kobject *kobj;
 	struct threshold_block *blocks;
-	cpumask_t cpus;
+	cpumask_var_t cpus;
 };
 static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]);
 
@@ -481,7 +481,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 
 #ifdef CONFIG_SMP
 	if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */
-		i = first_cpu(per_cpu(cpu_core_map, cpu));
+		i = cpumask_first(&per_cpu(cpu_core_map, cpu));
 
 		/* first core not up yet */
 		if (cpu_data(i).cpu_core_id)
@@ -501,7 +501,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 		if (err)
 			goto out;
 
-		b->cpus = per_cpu(cpu_core_map, cpu);
+		cpumask_copy(b->cpus, &per_cpu(cpu_core_map, cpu));
 		per_cpu(threshold_banks, cpu)[bank] = b;
 		goto out;
 	}
@@ -512,15 +512,20 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 		err = -ENOMEM;
 		goto out;
 	}
+	if (!alloc_cpumask_var(&b->cpus, GFP_KERNEL)) {
+		kfree(b);
+		err = -ENOMEM;
+		goto out;
+	}
 
 	b->kobj = kobject_create_and_add(name, &per_cpu(device_mce, cpu).kobj);
 	if (!b->kobj)
 		goto out_free;
 
 #ifndef CONFIG_SMP
-	b->cpus = CPU_MASK_ALL;
+	cpumask_setall(b->cpus);
 #else
-	b->cpus = per_cpu(cpu_core_map, cpu);
+	cpumask_copy(b->cpus, &per_cpu(cpu_core_map, cpu));
 #endif
 
 	per_cpu(threshold_banks, cpu)[bank] = b;
@@ -529,7 +534,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 	if (err)
 		goto out_free;
 
-	for_each_cpu_mask_nr(i, b->cpus) {
+	for_each_cpu(i, b->cpus) {
 		if (i == cpu)
 			continue;
 
@@ -545,6 +550,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 
 out_free:
 	per_cpu(threshold_banks, cpu)[bank] = NULL;
+	free_cpumask_var(b->cpus);
 	kfree(b);
 out:
 	return err;
@@ -619,7 +625,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
 #endif
 
 	/* remove all sibling symlinks before unregistering */
-	for_each_cpu_mask_nr(i, b->cpus) {
+	for_each_cpu(i, b->cpus) {
 		if (i == cpu)
 			continue;
 
@@ -632,6 +638,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
 free_out:
 	kobject_del(b->kobj);
 	kobject_put(b->kobj);
+	free_cpumask_var(b->cpus);
 	kfree(b);
 	per_cpu(threshold_banks, cpu)[bank] = NULL;
 }
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index 4b48f251fd39..5e8c79e748a6 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -7,6 +7,7 @@
 #include <linux/interrupt.h>
 #include <linux/percpu.h>
 #include <asm/processor.h>
+#include <asm/apic.h>
 #include <asm/msr.h>
 #include <asm/mce.h>
 #include <asm/hw_irq.h>
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index b59ddcc88cd8..0c0a455fe95c 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -33,11 +33,13 @@ u64 mtrr_tom2;
 struct mtrr_state_type mtrr_state = {};
 EXPORT_SYMBOL_GPL(mtrr_state);
 
-#undef MODULE_PARAM_PREFIX
-#define MODULE_PARAM_PREFIX "mtrr."
-
-static int mtrr_show;
-module_param_named(show, mtrr_show, bool, 0);
+static int __initdata mtrr_show;
+static int __init mtrr_debug(char *opt)
+{
+	mtrr_show = 1;
+	return 0;
+}
+early_param("mtrr.show", mtrr_debug);
 
 /*
  * Returns the effective MTRR type for the region
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index c302d0707048..d35db5993fd6 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -106,7 +106,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
		const struct stacktrace_ops *ops, void *data)
 {
 	const unsigned cpu = get_cpu();
-	unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
+	unsigned long *irq_stack_end =
+		(unsigned long *)per_cpu(irq_stack_ptr, cpu);
 	unsigned used = 0;
 	struct thread_info *tinfo;
 	int graph = 0;
@@ -160,23 +161,23 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
			stack = (unsigned long *) estack_end[-2];
			continue;
		}
-		if (irqstack_end) {
-			unsigned long *irqstack;
-			irqstack = irqstack_end -
-				(IRQSTACKSIZE - 64) / sizeof(*irqstack);
+		if (irq_stack_end) {
+			unsigned long *irq_stack;
+			irq_stack = irq_stack_end -
+				(IRQ_STACK_SIZE - 64) / sizeof(*irq_stack);
 
-			if (stack >= irqstack && stack < irqstack_end) {
+			if (stack >= irq_stack && stack < irq_stack_end) {
				if (ops->stack(data, "IRQ") < 0)
					break;
				bp = print_context_stack(tinfo, stack, bp,
-					ops, data, irqstack_end, &graph);
+					ops, data, irq_stack_end, &graph);
				/*
				 * We link to the next stack (which would be
				 * the process stack normally) the last
				 * pointer (index -1 to end) in the IRQ stack:
				 */
-				stack = (unsigned long *) (irqstack_end[-1]);
-				irqstack_end = NULL;
+				stack = (unsigned long *) (irq_stack_end[-1]);
+				irq_stack_end = NULL;
				ops->stack(data, "EOI");
				continue;
			}
@@ -199,10 +200,10 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
 	unsigned long *stack;
 	int i;
 	const int cpu = smp_processor_id();
-	unsigned long *irqstack_end =
-		(unsigned long *) (cpu_pda(cpu)->irqstackptr);
-	unsigned long *irqstack =
-		(unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
+	unsigned long *irq_stack_end =
+		(unsigned long *)(per_cpu(irq_stack_ptr, cpu));
+	unsigned long *irq_stack =
+		(unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE);
 
 	/*
	 * debugging aid: "show_stack(NULL, NULL);" prints the
@@ -218,9 +219,9 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
 
 	stack = sp;
 	for (i = 0; i < kstack_depth_to_print; i++) {
-		if (stack >= irqstack && stack <= irqstack_end) {
-			if (stack == irqstack_end) {
-				stack = (unsigned long *) (irqstack_end[-1]);
+		if (stack >= irq_stack && stack <= irq_stack_end) {
+			if (stack == irq_stack_end) {
+				stack = (unsigned long *) (irq_stack_end[-1]);
				printk(" <EOI> ");
			}
		} else {
@@ -241,7 +242,7 @@ void show_registers(struct pt_regs *regs)
 	int i;
 	unsigned long sp;
 	const int cpu = smp_processor_id();
-	struct task_struct *cur = cpu_pda(cpu)->pcurrent;
+	struct task_struct *cur = current;
 
 	sp = regs->sp;
 	printk("CPU %d ", cpu);
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 1119d247fe11..b205272ad394 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -366,10 +366,12 @@ void __init efi_init(void)
			   SMBIOS_TABLE_GUID)) {
			efi.smbios = config_tables[i].table;
			printk(" SMBIOS=0x%lx ", config_tables[i].table);
+#ifdef CONFIG_X86_UV
		} else if (!efi_guidcmp(config_tables[i].guid,
					UV_SYSTEM_TABLE_GUID)) {
			efi.uv_systab = config_tables[i].table;
			printk(" UVsystab=0x%lx ", config_tables[i].table);
+#endif
		} else if (!efi_guidcmp(config_tables[i].guid,
					HCDP_TABLE_GUID)) {
			efi.hcdp = config_tables[i].table;
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
index 652c5287215f..a4ee29127fdf 100644
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@ -36,6 +36,7 @@
 #include <asm/proto.h>
 #include <asm/efi.h>
 #include <asm/cacheflush.h>
+#include <asm/fixmap.h>
 
 static pgd_t save_pgd __initdata;
 static unsigned long efi_flags __initdata;
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index d6f0490a7391..a0b91aac72a1 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -672,7 +672,7 @@ common_interrupt:
 ENDPROC(common_interrupt)
	CFI_ENDPROC
 
-#define BUILD_INTERRUPT(name, nr)	\
+#define BUILD_INTERRUPT3(name, nr, fn)	\
 ENTRY(name)				\
	RING0_INT_FRAME;		\
	pushl $~(nr);			\
@@ -680,11 +680,13 @@ ENTRY(name) \
	SAVE_ALL;			\
	TRACE_IRQS_OFF			\
	movl %esp,%eax;			\
-	call smp_##name;		\
+	call fn;			\
	jmp ret_from_intr;		\
	CFI_ENDPROC;			\
 ENDPROC(name)
 
+#define BUILD_INTERRUPT(name, nr)	BUILD_INTERRUPT3(name, nr, smp_##name)
+
 /* The include is where all of the SMP etc. interrupts come from */
 #include "entry_arch.h"
 
@@ -1203,7 +1205,6 @@ nmi_stack_correct:
	pushl %eax
	CFI_ADJUST_CFA_OFFSET 4
	SAVE_ALL
-	TRACE_IRQS_OFF
	xorl %edx,%edx		# zero error code
	movl %esp,%eax		# pt_regs pointer
	call do_nmi
@@ -1244,7 +1245,6 @@ nmi_espfix_stack:
	pushl %eax
	CFI_ADJUST_CFA_OFFSET 4
	SAVE_ALL
-	TRACE_IRQS_OFF
	FIXUP_ESPFIX_STACK		# %eax == %esp
	xorl %edx,%edx			# zero error code
	call do_nmi
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index e28c7a987793..82801fd2e931 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -52,6 +52,7 @@
52#include <asm/irqflags.h> 52#include <asm/irqflags.h>
53#include <asm/paravirt.h> 53#include <asm/paravirt.h>
54#include <asm/ftrace.h> 54#include <asm/ftrace.h>
55#include <asm/percpu.h>
55 56
56/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ 57/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
57#include <linux/elf-em.h> 58#include <linux/elf-em.h>
@@ -209,7 +210,7 @@ ENTRY(native_usergs_sysret64)
209 210
210 /* %rsp:at FRAMEEND */ 211 /* %rsp:at FRAMEEND */
211 .macro FIXUP_TOP_OF_STACK tmp offset=0 212 .macro FIXUP_TOP_OF_STACK tmp offset=0
212 movq %gs:pda_oldrsp,\tmp 213 movq PER_CPU_VAR(old_rsp),\tmp
213 movq \tmp,RSP+\offset(%rsp) 214 movq \tmp,RSP+\offset(%rsp)
214 movq $__USER_DS,SS+\offset(%rsp) 215 movq $__USER_DS,SS+\offset(%rsp)
215 movq $__USER_CS,CS+\offset(%rsp) 216 movq $__USER_CS,CS+\offset(%rsp)
@@ -220,7 +221,7 @@ ENTRY(native_usergs_sysret64)
220 221
221 .macro RESTORE_TOP_OF_STACK tmp offset=0 222 .macro RESTORE_TOP_OF_STACK tmp offset=0
222 movq RSP+\offset(%rsp),\tmp 223 movq RSP+\offset(%rsp),\tmp
223 movq \tmp,%gs:pda_oldrsp 224 movq \tmp,PER_CPU_VAR(old_rsp)
224 movq EFLAGS+\offset(%rsp),\tmp 225 movq EFLAGS+\offset(%rsp),\tmp
225 movq \tmp,R11+\offset(%rsp) 226 movq \tmp,R11+\offset(%rsp)
226 .endm 227 .endm
@@ -336,15 +337,15 @@ ENTRY(save_args)
336 je 1f 337 je 1f
337 SWAPGS 338 SWAPGS
338 /* 339 /*
339 * irqcount is used to check if a CPU is already on an interrupt stack 340 * irq_count is used to check if a CPU is already on an interrupt stack
340 * or not. While this is essentially redundant with preempt_count it is 341 * or not. While this is essentially redundant with preempt_count it is
341 * a little cheaper to use a separate counter in the PDA (short of 342 * a little cheaper to use a separate counter in the PDA (short of
342 * moving irq_enter into assembly, which would be too much work) 343 * moving irq_enter into assembly, which would be too much work)
343 */ 344 */
3441: incl %gs:pda_irqcount 3451: incl PER_CPU_VAR(irq_count)
345 jne 2f 346 jne 2f
346 popq_cfi %rax /* move return address... */ 347 popq_cfi %rax /* move return address... */
347 mov %gs:pda_irqstackptr,%rsp 348 mov PER_CPU_VAR(irq_stack_ptr),%rsp
348 EMPTY_FRAME 0 349 EMPTY_FRAME 0
349 pushq_cfi %rax /* ... to the new stack */ 350 pushq_cfi %rax /* ... to the new stack */
350 /* 351 /*
@@ -408,6 +409,8 @@ END(save_paranoid)
408ENTRY(ret_from_fork) 409ENTRY(ret_from_fork)
409 DEFAULT_FRAME 410 DEFAULT_FRAME
410 411
412 LOCK ; btr $TIF_FORK,TI_flags(%r8)
413
411 push kernel_eflags(%rip) 414 push kernel_eflags(%rip)
412 CFI_ADJUST_CFA_OFFSET 8 415 CFI_ADJUST_CFA_OFFSET 8
413 popf # reset kernel eflags 416 popf # reset kernel eflags
@@ -467,7 +470,7 @@ END(ret_from_fork)
467ENTRY(system_call) 470ENTRY(system_call)
468 CFI_STARTPROC simple 471 CFI_STARTPROC simple
469 CFI_SIGNAL_FRAME 472 CFI_SIGNAL_FRAME
470 CFI_DEF_CFA rsp,PDA_STACKOFFSET 473 CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET
471 CFI_REGISTER rip,rcx 474 CFI_REGISTER rip,rcx
472 /*CFI_REGISTER rflags,r11*/ 475 /*CFI_REGISTER rflags,r11*/
473 SWAPGS_UNSAFE_STACK 476 SWAPGS_UNSAFE_STACK
@@ -478,8 +481,8 @@ ENTRY(system_call)
478 */ 481 */
479ENTRY(system_call_after_swapgs) 482ENTRY(system_call_after_swapgs)
480 483
481 movq %rsp,%gs:pda_oldrsp 484 movq %rsp,PER_CPU_VAR(old_rsp)
482 movq %gs:pda_kernelstack,%rsp 485 movq PER_CPU_VAR(kernel_stack),%rsp
483 /* 486 /*
484 * No need to follow this irqs off/on section - it's straight 487 * No need to follow this irqs off/on section - it's straight
485 * and short: 488 * and short:
@@ -522,7 +525,7 @@ sysret_check:
522 CFI_REGISTER rip,rcx 525 CFI_REGISTER rip,rcx
523 RESTORE_ARGS 0,-ARG_SKIP,1 526 RESTORE_ARGS 0,-ARG_SKIP,1
524 /*CFI_REGISTER rflags,r11*/ 527 /*CFI_REGISTER rflags,r11*/
525 movq %gs:pda_oldrsp, %rsp 528 movq PER_CPU_VAR(old_rsp), %rsp
526 USERGS_SYSRET64 529 USERGS_SYSRET64
527 530
528 CFI_RESTORE_STATE 531 CFI_RESTORE_STATE
@@ -832,11 +835,11 @@ common_interrupt:
832 XCPT_FRAME 835 XCPT_FRAME
833 addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ 836 addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */
834 interrupt do_IRQ 837 interrupt do_IRQ
835 /* 0(%rsp): oldrsp-ARGOFFSET */ 838 /* 0(%rsp): old_rsp-ARGOFFSET */
836ret_from_intr: 839ret_from_intr:
837 DISABLE_INTERRUPTS(CLBR_NONE) 840 DISABLE_INTERRUPTS(CLBR_NONE)
838 TRACE_IRQS_OFF 841 TRACE_IRQS_OFF
839 decl %gs:pda_irqcount 842 decl PER_CPU_VAR(irq_count)
840 leaveq 843 leaveq
841 CFI_DEF_CFA_REGISTER rsp 844 CFI_DEF_CFA_REGISTER rsp
842 CFI_ADJUST_CFA_OFFSET -8 845 CFI_ADJUST_CFA_OFFSET -8
@@ -981,8 +984,10 @@ apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
981 irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt 984 irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
982#endif 985#endif
983 986
987#ifdef CONFIG_X86_UV
984apicinterrupt UV_BAU_MESSAGE \ 988apicinterrupt UV_BAU_MESSAGE \
985 uv_bau_message_intr1 uv_bau_message_interrupt 989 uv_bau_message_intr1 uv_bau_message_interrupt
990#endif
986apicinterrupt LOCAL_TIMER_VECTOR \ 991apicinterrupt LOCAL_TIMER_VECTOR \
987 apic_timer_interrupt smp_apic_timer_interrupt 992 apic_timer_interrupt smp_apic_timer_interrupt
988 993
@@ -1072,10 +1077,10 @@ ENTRY(\sym)
1072 TRACE_IRQS_OFF 1077 TRACE_IRQS_OFF
1073 movq %rsp,%rdi /* pt_regs pointer */ 1078 movq %rsp,%rdi /* pt_regs pointer */
1074 xorl %esi,%esi /* no error code */ 1079 xorl %esi,%esi /* no error code */
1075 movq %gs:pda_data_offset, %rbp 1080 PER_CPU(init_tss, %rbp)
1076 subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) 1081 subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp)
1077 call \do_sym 1082 call \do_sym
1078 addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) 1083 addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp)
1079 jmp paranoid_exit /* %ebx: no swapgs flag */ 1084 jmp paranoid_exit /* %ebx: no swapgs flag */
1080 CFI_ENDPROC 1085 CFI_ENDPROC
1081END(\sym) 1086END(\sym)
@@ -1259,14 +1264,14 @@ ENTRY(call_softirq)
1259 CFI_REL_OFFSET rbp,0 1264 CFI_REL_OFFSET rbp,0
1260 mov %rsp,%rbp 1265 mov %rsp,%rbp
1261 CFI_DEF_CFA_REGISTER rbp 1266 CFI_DEF_CFA_REGISTER rbp
1262 incl %gs:pda_irqcount 1267 incl PER_CPU_VAR(irq_count)
1263 cmove %gs:pda_irqstackptr,%rsp 1268 cmove PER_CPU_VAR(irq_stack_ptr),%rsp
1264 push %rbp # backlink for old unwinder 1269 push %rbp # backlink for old unwinder
1265 call __do_softirq 1270 call __do_softirq
1266 leaveq 1271 leaveq
1267 CFI_DEF_CFA_REGISTER rsp 1272 CFI_DEF_CFA_REGISTER rsp
1268 CFI_ADJUST_CFA_OFFSET -8 1273 CFI_ADJUST_CFA_OFFSET -8
1269 decl %gs:pda_irqcount 1274 decl PER_CPU_VAR(irq_count)
1270 ret 1275 ret
1271 CFI_ENDPROC 1276 CFI_ENDPROC
1272END(call_softirq) 1277END(call_softirq)
@@ -1296,15 +1301,15 @@ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
1296 movq %rdi, %rsp # we don't return, adjust the stack frame 1301 movq %rdi, %rsp # we don't return, adjust the stack frame
1297 CFI_ENDPROC 1302 CFI_ENDPROC
1298 DEFAULT_FRAME 1303 DEFAULT_FRAME
129911: incl %gs:pda_irqcount 130411: incl PER_CPU_VAR(irq_count)
1300 movq %rsp,%rbp 1305 movq %rsp,%rbp
1301 CFI_DEF_CFA_REGISTER rbp 1306 CFI_DEF_CFA_REGISTER rbp
1302 cmovzq %gs:pda_irqstackptr,%rsp 1307 cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
1303 pushq %rbp # backlink for old unwinder 1308 pushq %rbp # backlink for old unwinder
1304 call xen_evtchn_do_upcall 1309 call xen_evtchn_do_upcall
1305 popq %rsp 1310 popq %rsp
1306 CFI_DEF_CFA_REGISTER rsp 1311 CFI_DEF_CFA_REGISTER rsp
1307 decl %gs:pda_irqcount 1312 decl PER_CPU_VAR(irq_count)
1308 jmp error_exit 1313 jmp error_exit
1309 CFI_ENDPROC 1314 CFI_ENDPROC
1310END(do_hypervisor_callback) 1315END(do_hypervisor_callback)
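
The entry_64.S hunks above replace the old PDA fields (%gs:pda_irqcount, %gs:pda_irqstackptr, %gs:pda_oldrsp, ...) with ordinary per-cpu variables accessed through PER_CPU_VAR(), i.e. a %gs-relative offset into that CPU's private copy of the per-cpu area. Below is a minimal user-space sketch of the underlying idea (one template, one private copy per CPU, one offset per CPU); the struct fields, NR_CPUS and the malloc-based allocation are illustrative, not the kernel's implementation.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

#define NR_CPUS 4

struct percpu_template {
        unsigned int irq_count;
        unsigned long old_rsp;
};

static struct percpu_template template;         /* stands in for the per-cpu section image */
static intptr_t per_cpu_offset[NR_CPUS];

/* per_cpu(var, cpu): the template address plus that CPU's offset */
#define per_cpu_ptr(cpu) \
        ((struct percpu_template *)((intptr_t)&template + per_cpu_offset[cpu]))

int main(void)
{
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                void *area = malloc(sizeof(template));

                memcpy(area, &template, sizeof(template));      /* copy the template */
                per_cpu_offset[cpu] = (intptr_t)area - (intptr_t)&template;
        }

        per_cpu_ptr(2)->irq_count++;    /* like "incl PER_CPU_VAR(irq_count)" on cpu 2 */

        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                printf("cpu%d irq_count=%u\n", cpu, per_cpu_ptr(cpu)->irq_count);
        return 0;
}
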
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
index 2bced78b0b8e..e656c2721154 100644
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -32,7 +32,9 @@ extern struct genapic apic_x2apic_cluster;
32struct genapic __read_mostly *genapic = &apic_flat; 32struct genapic __read_mostly *genapic = &apic_flat;
33 33
34static struct genapic *apic_probe[] __initdata = { 34static struct genapic *apic_probe[] __initdata = {
35#ifdef CONFIG_X86_UV
35 &apic_x2apic_uv_x, 36 &apic_x2apic_uv_x,
37#endif
36 &apic_x2apic_phys, 38 &apic_x2apic_phys,
37 &apic_x2apic_cluster, 39 &apic_x2apic_cluster,
38 &apic_physflat, 40 &apic_physflat,
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index b193e082f6ce..bfe36249145c 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -25,6 +25,7 @@
25#include <asm/ipi.h> 25#include <asm/ipi.h>
26#include <asm/genapic.h> 26#include <asm/genapic.h>
27#include <asm/pgtable.h> 27#include <asm/pgtable.h>
28#include <asm/uv/uv.h>
28#include <asm/uv/uv_mmrs.h> 29#include <asm/uv/uv_mmrs.h>
29#include <asm/uv/uv_hub.h> 30#include <asm/uv/uv_hub.h>
30#include <asm/uv/bios.h> 31#include <asm/uv/bios.h>
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index b9a4d8c4b935..f5b272247690 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -26,27 +26,6 @@
26#include <asm/bios_ebda.h> 26#include <asm/bios_ebda.h>
27#include <asm/trampoline.h> 27#include <asm/trampoline.h>
28 28
29/* boot cpu pda */
30static struct x8664_pda _boot_cpu_pda;
31
32#ifdef CONFIG_SMP
33/*
34 * We install an empty cpu_pda pointer table to indicate to early users
35 * (numa_set_node) that the cpu_pda pointer table for cpus other than
36 * the boot cpu is not yet setup.
37 */
38static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
39#else
40static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
41#endif
42
43void __init x86_64_init_pda(void)
44{
45 _cpu_pda = __cpu_pda;
46 cpu_pda(0) = &_boot_cpu_pda;
47 pda_init(0);
48}
49
50static void __init zap_identity_mappings(void) 29static void __init zap_identity_mappings(void)
51{ 30{
52 pgd_t *pgd = pgd_offset_k(0UL); 31 pgd_t *pgd = pgd_offset_k(0UL);
@@ -112,8 +91,6 @@ void __init x86_64_start_kernel(char * real_mode_data)
112 if (console_loglevel == 10) 91 if (console_loglevel == 10)
113 early_printk("Kernel alive\n"); 92 early_printk("Kernel alive\n");
114 93
115 x86_64_init_pda();
116
117 x86_64_start_reservations(real_mode_data); 94 x86_64_start_reservations(real_mode_data);
118} 95}
119 96
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index e835b4eea70b..722464c520cf 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -429,12 +429,14 @@ is386: movl $2,%ecx # set MP
429 ljmp $(__KERNEL_CS),$1f 429 ljmp $(__KERNEL_CS),$1f
4301: movl $(__KERNEL_DS),%eax # reload all the segment registers 4301: movl $(__KERNEL_DS),%eax # reload all the segment registers
431 movl %eax,%ss # after changing gdt. 431 movl %eax,%ss # after changing gdt.
432 movl %eax,%fs # gets reset once there's real percpu
433 432
434 movl $(__USER_DS),%eax # DS/ES contains default USER segment 433 movl $(__USER_DS),%eax # DS/ES contains default USER segment
435 movl %eax,%ds 434 movl %eax,%ds
436 movl %eax,%es 435 movl %eax,%es
437 436
437 movl $(__KERNEL_PERCPU), %eax
438 movl %eax,%fs # set this cpu's percpu
439
438 xorl %eax,%eax # Clear GS and LDT 440 xorl %eax,%eax # Clear GS and LDT
439 movl %eax,%gs 441 movl %eax,%gs
440 lldt %ax 442 lldt %ax
@@ -446,8 +448,6 @@ is386: movl $2,%ecx # set MP
446 movb $1, ready 448 movb $1, ready
447 cmpb $0,%cl # the first CPU calls start_kernel 449 cmpb $0,%cl # the first CPU calls start_kernel
448 je 1f 450 je 1f
449 movl $(__KERNEL_PERCPU), %eax
450 movl %eax,%fs # set this cpu's percpu
451 movl (stack_start), %esp 451 movl (stack_start), %esp
4521: 4521:
453#endif /* CONFIG_SMP */ 453#endif /* CONFIG_SMP */
@@ -548,12 +548,8 @@ early_fault:
548 pushl %eax 548 pushl %eax
549 pushl %edx /* trapno */ 549 pushl %edx /* trapno */
550 pushl $fault_msg 550 pushl $fault_msg
551#ifdef CONFIG_EARLY_PRINTK
552 call early_printk
553#else
554 call printk 551 call printk
555#endif 552#endif
556#endif
557 call dump_stack 553 call dump_stack
558hlt_loop: 554hlt_loop:
559 hlt 555 hlt
@@ -580,11 +576,10 @@ ignore_int:
580 pushl 32(%esp) 576 pushl 32(%esp)
581 pushl 40(%esp) 577 pushl 40(%esp)
582 pushl $int_msg 578 pushl $int_msg
583#ifdef CONFIG_EARLY_PRINTK
584 call early_printk
585#else
586 call printk 579 call printk
587#endif 580
581 call dump_stack
582
588 addl $(5*4),%esp 583 addl $(5*4),%esp
589 popl %ds 584 popl %ds
590 popl %es 585 popl %es
@@ -660,7 +655,7 @@ early_recursion_flag:
660 .long 0 655 .long 0
661 656
662int_msg: 657int_msg:
663 .asciz "Unknown interrupt or fault at EIP %p %p %p\n" 658 .asciz "Unknown interrupt or fault at: %p %p %p\n"
664 659
665fault_msg: 660fault_msg:
666/* fault info: */ 661/* fault info: */
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 0e275d495563..a0a2b5ca9b7d 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -19,6 +19,7 @@
19#include <asm/msr.h> 19#include <asm/msr.h>
20#include <asm/cache.h> 20#include <asm/cache.h>
21#include <asm/processor-flags.h> 21#include <asm/processor-flags.h>
22#include <asm/percpu.h>
22 23
23#ifdef CONFIG_PARAVIRT 24#ifdef CONFIG_PARAVIRT
24#include <asm/asm-offsets.h> 25#include <asm/asm-offsets.h>
@@ -204,6 +205,19 @@ ENTRY(secondary_startup_64)
204 pushq $0 205 pushq $0
205 popfq 206 popfq
206 207
208#ifdef CONFIG_SMP
209 /*
210 * Fix up static pointers that need __per_cpu_load added. The assembler
211 * is unable to do this directly. This is only needed for the boot cpu.
212 * These values are set up with the correct base addresses by C code for
213 * secondary cpus.
214 */
215 movq initial_gs(%rip), %rax
216 cmpl $0, per_cpu__cpu_number(%rax)
217 jne 1f
218 addq %rax, early_gdt_descr_base(%rip)
2191:
220#endif
207 /* 221 /*
208 * We must switch to a new descriptor in kernel space for the GDT 222 * We must switch to a new descriptor in kernel space for the GDT
209 * because soon the kernel won't have access anymore to the userspace 223 * because soon the kernel won't have access anymore to the userspace
@@ -226,12 +240,15 @@ ENTRY(secondary_startup_64)
226 movl %eax,%fs 240 movl %eax,%fs
227 movl %eax,%gs 241 movl %eax,%gs
228 242
229 /* 243 /* Set up %gs.
230 * Setup up a dummy PDA. this is just for some early bootup code 244 *
231 * that does in_interrupt() 245 * The base of %gs always points to the bottom of the irqstack
232 */ 246 * union. If the stack protector canary is enabled, it is
247 * located at %gs:40. Note that, on SMP, the boot cpu uses
248 * init data section till per cpu areas are set up.
249 */
233 movl $MSR_GS_BASE,%ecx 250 movl $MSR_GS_BASE,%ecx
234 movq $empty_zero_page,%rax 251 movq initial_gs(%rip),%rax
235 movq %rax,%rdx 252 movq %rax,%rdx
236 shrq $32,%rdx 253 shrq $32,%rdx
237 wrmsr 254 wrmsr
@@ -257,6 +274,12 @@ ENTRY(secondary_startup_64)
257 .align 8 274 .align 8
258 ENTRY(initial_code) 275 ENTRY(initial_code)
259 .quad x86_64_start_kernel 276 .quad x86_64_start_kernel
277 ENTRY(initial_gs)
278#ifdef CONFIG_SMP
279 .quad __per_cpu_load
280#else
281 .quad PER_CPU_VAR(irq_stack_union)
282#endif
260 __FINITDATA 283 __FINITDATA
261 284
262 ENTRY(stack_start) 285 ENTRY(stack_start)
@@ -401,7 +424,8 @@ NEXT_PAGE(level2_spare_pgt)
401 .globl early_gdt_descr 424 .globl early_gdt_descr
402early_gdt_descr: 425early_gdt_descr:
403 .word GDT_ENTRIES*8-1 426 .word GDT_ENTRIES*8-1
404 .quad per_cpu__gdt_page 427early_gdt_descr_base:
428 .quad per_cpu__gdt_page
405 429
406ENTRY(phys_base) 430ENTRY(phys_base)
407 /* This must match the first entry in level2_kernel_pgt */ 431 /* This must match the first entry in level2_kernel_pgt */
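
secondary_startup_64 now loads initial_gs into MSR_GS_BASE; wrmsr takes the 64-bit value split across EDX:EAX, which is exactly what the movq %rax,%rdx / shrq $32,%rdx pair prepares. A tiny sketch of that split, using a made-up address.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t initial_gs = 0xffffffff81a00000ULL;    /* illustrative address */
        uint32_t eax = (uint32_t)initial_gs;            /* low 32 bits  */
        uint32_t edx = (uint32_t)(initial_gs >> 32);    /* high 32 bits */

        printf("eax=0x%08x edx=0x%08x\n", eax, edx);
        printf("reassembled=0x%016llx\n",
               (unsigned long long)(((uint64_t)edx << 32) | eax));
        return 0;
}
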
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index cd759ad90690..64d5ad0b8add 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -628,11 +628,12 @@ static int hpet_cpuhp_notify(struct notifier_block *n,
628 628
629 switch (action & 0xf) { 629 switch (action & 0xf) {
630 case CPU_ONLINE: 630 case CPU_ONLINE:
631 INIT_DELAYED_WORK(&work.work, hpet_work); 631 INIT_DELAYED_WORK_ON_STACK(&work.work, hpet_work);
632 init_completion(&work.complete); 632 init_completion(&work.complete);
633 /* FIXME: add schedule_work_on() */ 633 /* FIXME: add schedule_work_on() */
634 schedule_delayed_work_on(cpu, &work.work, 0); 634 schedule_delayed_work_on(cpu, &work.work, 0);
635 wait_for_completion(&work.complete); 635 wait_for_completion(&work.complete);
636 destroy_timer_on_stack(&work.work.timer);
636 break; 637 break;
637 case CPU_DEAD: 638 case CPU_DEAD:
638 if (hdev) { 639 if (hdev) {
diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index 157aafa45583..bfb7d734062a 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -357,7 +357,7 @@ set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
357 357
358 if (!cfg->move_in_progress) { 358 if (!cfg->move_in_progress) {
359 /* it means that domain is not changed */ 359 /* it means that domain is not changed */
360 if (!cpumask_intersects(&desc->affinity, mask)) 360 if (!cpumask_intersects(desc->affinity, mask))
361 cfg->move_desc_pending = 1; 361 cfg->move_desc_pending = 1;
362 } 362 }
363} 363}
@@ -580,9 +580,9 @@ set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
580 if (assign_irq_vector(irq, cfg, mask)) 580 if (assign_irq_vector(irq, cfg, mask))
581 return BAD_APICID; 581 return BAD_APICID;
582 582
583 cpumask_and(&desc->affinity, cfg->domain, mask); 583 cpumask_and(desc->affinity, cfg->domain, mask);
584 set_extra_move_desc(desc, mask); 584 set_extra_move_desc(desc, mask);
585 return cpu_mask_to_apicid_and(&desc->affinity, cpu_online_mask); 585 return cpu_mask_to_apicid_and(desc->affinity, cpu_online_mask);
586} 586}
587 587
588static void 588static void
@@ -2382,7 +2382,7 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2382 if (cfg->move_in_progress) 2382 if (cfg->move_in_progress)
2383 send_cleanup_vector(cfg); 2383 send_cleanup_vector(cfg);
2384 2384
2385 cpumask_copy(&desc->affinity, mask); 2385 cpumask_copy(desc->affinity, mask);
2386} 2386}
2387 2387
2388static int migrate_irq_remapped_level_desc(struct irq_desc *desc) 2388static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
@@ -2404,11 +2404,11 @@ static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
2404 } 2404 }
2405 2405
2406 /* everthing is clear. we have right of way */ 2406 /* everthing is clear. we have right of way */
2407 migrate_ioapic_irq_desc(desc, &desc->pending_mask); 2407 migrate_ioapic_irq_desc(desc, desc->pending_mask);
2408 2408
2409 ret = 0; 2409 ret = 0;
2410 desc->status &= ~IRQ_MOVE_PENDING; 2410 desc->status &= ~IRQ_MOVE_PENDING;
2411 cpumask_clear(&desc->pending_mask); 2411 cpumask_clear(desc->pending_mask);
2412 2412
2413unmask: 2413unmask:
2414 unmask_IO_APIC_irq_desc(desc); 2414 unmask_IO_APIC_irq_desc(desc);
@@ -2433,7 +2433,7 @@ static void ir_irq_migration(struct work_struct *work)
2433 continue; 2433 continue;
2434 } 2434 }
2435 2435
2436 desc->chip->set_affinity(irq, &desc->pending_mask); 2436 desc->chip->set_affinity(irq, desc->pending_mask);
2437 spin_unlock_irqrestore(&desc->lock, flags); 2437 spin_unlock_irqrestore(&desc->lock, flags);
2438 } 2438 }
2439 } 2439 }
@@ -2447,7 +2447,7 @@ static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
2447{ 2447{
2448 if (desc->status & IRQ_LEVEL) { 2448 if (desc->status & IRQ_LEVEL) {
2449 desc->status |= IRQ_MOVE_PENDING; 2449 desc->status |= IRQ_MOVE_PENDING;
2450 cpumask_copy(&desc->pending_mask, mask); 2450 cpumask_copy(desc->pending_mask, mask);
2451 migrate_irq_remapped_level_desc(desc); 2451 migrate_irq_remapped_level_desc(desc);
2452 return; 2452 return;
2453 } 2453 }
@@ -2515,7 +2515,7 @@ static void irq_complete_move(struct irq_desc **descp)
2515 2515
2516 /* domain has not changed, but affinity did */ 2516 /* domain has not changed, but affinity did */
2517 me = smp_processor_id(); 2517 me = smp_processor_id();
2518 if (cpu_isset(me, desc->affinity)) { 2518 if (cpumask_test_cpu(me, desc->affinity)) {
2519 *descp = desc = move_irq_desc(desc, me); 2519 *descp = desc = move_irq_desc(desc, me);
2520 /* get the new one */ 2520 /* get the new one */
2521 cfg = desc->chip_data; 2521 cfg = desc->chip_data;
@@ -3182,7 +3182,7 @@ unsigned int create_irq_nr(unsigned int irq_want)
3182 3182
3183 irq = 0; 3183 irq = 0;
3184 spin_lock_irqsave(&vector_lock, flags); 3184 spin_lock_irqsave(&vector_lock, flags);
3185 for (new = irq_want; new < NR_IRQS; new++) { 3185 for (new = irq_want; new < nr_irqs; new++) {
3186 if (platform_legacy_irq(new)) 3186 if (platform_legacy_irq(new))
3187 continue; 3187 continue;
3188 3188
@@ -3257,6 +3257,9 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
3257 int err; 3257 int err;
3258 unsigned dest; 3258 unsigned dest;
3259 3259
3260 if (disable_apic)
3261 return -ENXIO;
3262
3260 cfg = irq_cfg(irq); 3263 cfg = irq_cfg(irq);
3261 err = assign_irq_vector(irq, cfg, TARGET_CPUS); 3264 err = assign_irq_vector(irq, cfg, TARGET_CPUS);
3262 if (err) 3265 if (err)
@@ -3691,6 +3694,9 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3691 struct irq_cfg *cfg; 3694 struct irq_cfg *cfg;
3692 int err; 3695 int err;
3693 3696
3697 if (disable_apic)
3698 return -ENXIO;
3699
3694 cfg = irq_cfg(irq); 3700 cfg = irq_cfg(irq);
3695 err = assign_irq_vector(irq, cfg, TARGET_CPUS); 3701 err = assign_irq_vector(irq, cfg, TARGET_CPUS);
3696 if (!err) { 3702 if (!err) {
@@ -3725,7 +3731,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3725} 3731}
3726#endif /* CONFIG_HT_IRQ */ 3732#endif /* CONFIG_HT_IRQ */
3727 3733
3728#ifdef CONFIG_X86_64 3734#ifdef CONFIG_X86_UV
3729/* 3735/*
3730 * Re-target the irq to the specified CPU and enable the specified MMR located 3736 * Re-target the irq to the specified CPU and enable the specified MMR located
3731 * on the specified blade to allow the sending of MSIs to the specified CPU. 3737 * on the specified blade to allow the sending of MSIs to the specified CPU.
@@ -3815,6 +3821,22 @@ void __init probe_nr_irqs_gsi(void)
3815 nr_irqs_gsi = nr; 3821 nr_irqs_gsi = nr;
3816} 3822}
3817 3823
3824#ifdef CONFIG_SPARSE_IRQ
3825int __init arch_probe_nr_irqs(void)
3826{
3827 int nr;
3828
3829 nr = ((8 * nr_cpu_ids) > (32 * nr_ioapics) ?
3830 (NR_VECTORS + (8 * nr_cpu_ids)) :
3831 (NR_VECTORS + (32 * nr_ioapics)));
3832
3833 if (nr < nr_irqs && nr > nr_irqs_gsi)
3834 nr_irqs = nr;
3835
3836 return 0;
3837}
3838#endif
3839
3818/* -------------------------------------------------------------------------- 3840/* --------------------------------------------------------------------------
3819 ACPI-based IOAPIC Configuration 3841 ACPI-based IOAPIC Configuration
3820 -------------------------------------------------------------------------- */ 3842 -------------------------------------------------------------------------- */
@@ -4004,7 +4026,7 @@ void __init setup_ioapic_dest(void)
4004 */ 4026 */
4005 if (desc->status & 4027 if (desc->status &
4006 (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) 4028 (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
4007 mask = &desc->affinity; 4029 mask = desc->affinity;
4008 else 4030 else
4009 mask = TARGET_CPUS; 4031 mask = TARGET_CPUS;
4010 4032
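
The new arch_probe_nr_irqs() caps nr_irqs at NR_VECTORS plus the larger of 8 vectors per CPU and 32 per I/O APIC, as long as that still exceeds the GSI count. The same computation, run with made-up platform numbers.

#include <stdio.h>

#define NR_VECTORS 256

int main(void)
{
        int nr_cpu_ids = 16, nr_ioapics = 2;    /* made-up platform */
        int nr_irqs = 4096, nr_irqs_gsi = 48;   /* made-up current values */
        int nr;

        nr = ((8 * nr_cpu_ids) > (32 * nr_ioapics) ?
              (NR_VECTORS + (8 * nr_cpu_ids)) :
              (NR_VECTORS + (32 * nr_ioapics)));

        if (nr < nr_irqs && nr > nr_irqs_gsi)
                nr_irqs = nr;

        printf("nr=%d -> nr_irqs=%d\n", nr, nr_irqs);
        return 0;
}
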
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 3973e2df7f87..8b30d0c2512c 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -36,11 +36,7 @@ void ack_bad_irq(unsigned int irq)
36#endif 36#endif
37} 37}
38 38
39#ifdef CONFIG_X86_32 39#define irq_stats(x) (&per_cpu(irq_stat, x))
40# define irq_stats(x) (&per_cpu(irq_stat, x))
41#else
42# define irq_stats(x) cpu_pda(x)
43#endif
44/* 40/*
45 * /proc/interrupts printing: 41 * /proc/interrupts printing:
46 */ 42 */
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 74b9ff7341e9..e0f29be8ab0b 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -248,7 +248,7 @@ void fixup_irqs(void)
248 if (irq == 2) 248 if (irq == 2)
249 continue; 249 continue;
250 250
251 affinity = &desc->affinity; 251 affinity = desc->affinity;
252 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { 252 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
253 printk("Breaking affinity for irq %i\n", irq); 253 printk("Breaking affinity for irq %i\n", irq);
254 affinity = cpu_all_mask; 254 affinity = cpu_all_mask;
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 63c88e6ec025..018963aa6ee3 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -18,6 +18,13 @@
18#include <linux/smp.h> 18#include <linux/smp.h>
19#include <asm/io_apic.h> 19#include <asm/io_apic.h>
20#include <asm/idle.h> 20#include <asm/idle.h>
21#include <asm/apic.h>
22
23DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
24EXPORT_PER_CPU_SYMBOL(irq_stat);
25
26DEFINE_PER_CPU(struct pt_regs *, irq_regs);
27EXPORT_PER_CPU_SYMBOL(irq_regs);
21 28
22/* 29/*
23 * Probabilistic stack overflow check: 30 * Probabilistic stack overflow check:
@@ -100,7 +107,7 @@ void fixup_irqs(void)
100 /* interrupt's are disabled at this point */ 107 /* interrupt's are disabled at this point */
101 spin_lock(&desc->lock); 108 spin_lock(&desc->lock);
102 109
103 affinity = &desc->affinity; 110 affinity = desc->affinity;
104 if (!irq_has_action(irq) || 111 if (!irq_has_action(irq) ||
105 cpumask_equal(affinity, cpu_online_mask)) { 112 cpumask_equal(affinity, cpu_online_mask)) {
106 spin_unlock(&desc->lock); 113 spin_unlock(&desc->lock);
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 1507ad4e674d..bf629cadec1a 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -149,8 +149,15 @@ void __init native_init_IRQ(void)
149 */ 149 */
150 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); 150 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
151 151
152 /* IPI for invalidation */ 152 /* IPIs for invalidation */
153 alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); 153 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
154 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
155 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
156 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
157 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
158 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
159 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
160 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
154 161
155 /* IPI for generic function call */ 162 /* IPI for generic function call */
156 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); 163 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
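
32-bit now registers eight TLB-invalidate vectors instead of one, matching the 64-bit scheme where each concurrent flush sender picks its own vector (and flush-state slot) from its CPU number, so senders do not serialize on a single global lock. A sketch of that selection; the vector base value here is illustrative.

#include <stdio.h>

#define INVALIDATE_TLB_VECTOR_START     0xe0    /* illustrative base */
#define NUM_INVALIDATE_TLB_VECTORS      8

static int flush_vector_for(int sender_cpu)
{
        return INVALIDATE_TLB_VECTOR_START +
               (sender_cpu % NUM_INVALIDATE_TLB_VECTORS);
}

int main(void)
{
        for (int cpu = 0; cpu < 12; cpu++)
                printf("cpu%2d sends flush IPIs on vector 0x%02x\n",
                       cpu, flush_vector_for(cpu));
        return 0;
}
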
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 884d985b8b82..e948b28a5a9a 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -446,7 +446,7 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
446static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, 446static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
447 struct kprobe_ctlblk *kcb) 447 struct kprobe_ctlblk *kcb)
448{ 448{
449#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PM) 449#if !defined(CONFIG_PREEMPT) || defined(CONFIG_FREEZER)
450 if (p->ainsn.boostable == 1 && !p->post_handler) { 450 if (p->ainsn.boostable == 1 && !p->post_handler) {
451 /* Boost up -- we can execute copied instructions directly */ 451 /* Boost up -- we can execute copied instructions directly */
452 reset_current_kprobe(); 452 reset_current_kprobe();
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index ad36377dc935..fa6bb263892e 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -27,6 +27,7 @@
27#include <asm/e820.h> 27#include <asm/e820.h>
28#include <asm/trampoline.h> 28#include <asm/trampoline.h>
29#include <asm/setup.h> 29#include <asm/setup.h>
30#include <asm/smp.h>
30 31
31#include <mach_apic.h> 32#include <mach_apic.h>
32#ifdef CONFIG_X86_32 33#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 7228979f1e7f..23b6d9e6e4f5 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -61,11 +61,7 @@ static int endflag __initdata;
61 61
62static inline unsigned int get_nmi_count(int cpu) 62static inline unsigned int get_nmi_count(int cpu)
63{ 63{
64#ifdef CONFIG_X86_64 64 return per_cpu(irq_stat, cpu).__nmi_count;
65 return cpu_pda(cpu)->__nmi_count;
66#else
67 return nmi_count(cpu);
68#endif
69} 65}
70 66
71static inline int mce_in_progress(void) 67static inline int mce_in_progress(void)
@@ -82,12 +78,8 @@ static inline int mce_in_progress(void)
82 */ 78 */
83static inline unsigned int get_timer_irqs(int cpu) 79static inline unsigned int get_timer_irqs(int cpu)
84{ 80{
85#ifdef CONFIG_X86_64
86 return read_pda(apic_timer_irqs) + read_pda(irq0_irqs);
87#else
88 return per_cpu(irq_stat, cpu).apic_timer_irqs + 81 return per_cpu(irq_stat, cpu).apic_timer_irqs +
89 per_cpu(irq_stat, cpu).irq0_irqs; 82 per_cpu(irq_stat, cpu).irq0_irqs;
90#endif
91} 83}
92 84
93#ifdef CONFIG_SMP 85#ifdef CONFIG_SMP
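
With the PDA gone, the NMI watchdog and /proc/interrupts accounting read a per-cpu irq_stat structure on 64-bit just as 32-bit always did. A user-space model of per-CPU counters read per CPU; the field names follow the kernel's, the values are made up.

#include <stdio.h>

#define NR_CPUS 4

struct irq_cpustat {
        unsigned int __nmi_count;
        unsigned int apic_timer_irqs;
        unsigned int irq0_irqs;
};

static struct irq_cpustat irq_stat[NR_CPUS];    /* stands in for per_cpu(irq_stat, cpu) */

static unsigned int get_timer_irqs(int cpu)
{
        return irq_stat[cpu].apic_timer_irqs + irq_stat[cpu].irq0_irqs;
}

int main(void)
{
        /* pretend each CPU handled a few interrupts */
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                irq_stat[cpu].apic_timer_irqs = 10 * (cpu + 1);
                irq_stat[cpu].irq0_irqs = cpu;
        }

        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                printf("cpu%d timer irqs seen: %u\n", cpu, get_timer_irqs(cpu));
        return 0;
}
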
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index e4c8fb608873..202514be5923 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -435,7 +435,6 @@ struct pv_mmu_ops pv_mmu_ops = {
435#endif /* PAGETABLE_LEVELS >= 3 */ 435#endif /* PAGETABLE_LEVELS >= 3 */
436 436
437 .pte_val = native_pte_val, 437 .pte_val = native_pte_val,
438 .pte_flags = native_pte_flags,
439 .pgd_val = native_pgd_val, 438 .pgd_val = native_pgd_val,
440 439
441 .make_pte = native_make_pte, 440 .make_pte = native_make_pte,
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index a546f55c77b4..1a1ae8edc40c 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -66,9 +66,6 @@ asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
66DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; 66DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
67EXPORT_PER_CPU_SYMBOL(current_task); 67EXPORT_PER_CPU_SYMBOL(current_task);
68 68
69DEFINE_PER_CPU(int, cpu_number);
70EXPORT_PER_CPU_SYMBOL(cpu_number);
71
72/* 69/*
73 * Return saved PC of a blocked thread. 70 * Return saved PC of a blocked thread.
74 */ 71 */
@@ -111,7 +108,6 @@ void cpu_idle(void)
111 play_dead(); 108 play_dead();
112 109
113 local_irq_disable(); 110 local_irq_disable();
114 __get_cpu_var(irq_stat).idle_timestamp = jiffies;
115 /* Don't trace irqs off for idle */ 111 /* Don't trace irqs off for idle */
116 stop_critical_timings(); 112 stop_critical_timings();
117 pm_idle(); 113 pm_idle();
@@ -591,7 +587,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
591 if (prev->gs | next->gs) 587 if (prev->gs | next->gs)
592 loadsegment(gs, next->gs); 588 loadsegment(gs, next->gs);
593 589
594 x86_write_percpu(current_task, next_p); 590 percpu_write(current_task, next_p);
595 591
596 return prev_p; 592 return prev_p;
597} 593}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 416fb9282f4f..c422eebb0c58 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -16,6 +16,7 @@
16 16
17#include <stdarg.h> 17#include <stdarg.h>
18 18
19#include <linux/stackprotector.h>
19#include <linux/cpu.h> 20#include <linux/cpu.h>
20#include <linux/errno.h> 21#include <linux/errno.h>
21#include <linux/sched.h> 22#include <linux/sched.h>
@@ -46,7 +47,6 @@
46#include <asm/processor.h> 47#include <asm/processor.h>
47#include <asm/i387.h> 48#include <asm/i387.h>
48#include <asm/mmu_context.h> 49#include <asm/mmu_context.h>
49#include <asm/pda.h>
50#include <asm/prctl.h> 50#include <asm/prctl.h>
51#include <asm/desc.h> 51#include <asm/desc.h>
52#include <asm/proto.h> 52#include <asm/proto.h>
@@ -57,6 +57,12 @@
57 57
58asmlinkage extern void ret_from_fork(void); 58asmlinkage extern void ret_from_fork(void);
59 59
60DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
61EXPORT_PER_CPU_SYMBOL(current_task);
62
63DEFINE_PER_CPU(unsigned long, old_rsp);
64static DEFINE_PER_CPU(unsigned char, is_idle);
65
60unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED; 66unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
61 67
62static ATOMIC_NOTIFIER_HEAD(idle_notifier); 68static ATOMIC_NOTIFIER_HEAD(idle_notifier);
@@ -75,13 +81,13 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister);
75 81
76void enter_idle(void) 82void enter_idle(void)
77{ 83{
78 write_pda(isidle, 1); 84 percpu_write(is_idle, 1);
79 atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); 85 atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
80} 86}
81 87
82static void __exit_idle(void) 88static void __exit_idle(void)
83{ 89{
84 if (test_and_clear_bit_pda(0, isidle) == 0) 90 if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
85 return; 91 return;
86 atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); 92 atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
87} 93}
@@ -111,6 +117,17 @@ static inline void play_dead(void)
111void cpu_idle(void) 117void cpu_idle(void)
112{ 118{
113 current_thread_info()->status |= TS_POLLING; 119 current_thread_info()->status |= TS_POLLING;
120
121 /*
122 * If we're the non-boot CPU, nothing set the PDA stack
123 * canary up for us - and if we are the boot CPU we have
124 * a 0 stack canary. This is a good place for updating
125 * it, as we wont ever return from this function (so the
126 * invalid canaries already on the stack wont ever
127 * trigger):
128 */
129 boot_init_stack_canary();
130
114 /* endless idle loop with no priority at all */ 131 /* endless idle loop with no priority at all */
115 while (1) { 132 while (1) {
116 tick_nohz_stop_sched_tick(1); 133 tick_nohz_stop_sched_tick(1);
@@ -392,7 +409,7 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
392 load_gs_index(0); 409 load_gs_index(0);
393 regs->ip = new_ip; 410 regs->ip = new_ip;
394 regs->sp = new_sp; 411 regs->sp = new_sp;
395 write_pda(oldrsp, new_sp); 412 percpu_write(old_rsp, new_sp);
396 regs->cs = __USER_CS; 413 regs->cs = __USER_CS;
397 regs->ss = __USER_DS; 414 regs->ss = __USER_DS;
398 regs->flags = 0x200; 415 regs->flags = 0x200;
@@ -613,21 +630,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
613 /* 630 /*
614 * Switch the PDA and FPU contexts. 631 * Switch the PDA and FPU contexts.
615 */ 632 */
616 prev->usersp = read_pda(oldrsp); 633 prev->usersp = percpu_read(old_rsp);
617 write_pda(oldrsp, next->usersp); 634 percpu_write(old_rsp, next->usersp);
618 write_pda(pcurrent, next_p); 635 percpu_write(current_task, next_p);
619 636
620 write_pda(kernelstack, 637 percpu_write(kernel_stack,
621 (unsigned long)task_stack_page(next_p) + 638 (unsigned long)task_stack_page(next_p) +
622 THREAD_SIZE - PDA_STACKOFFSET); 639 THREAD_SIZE - KERNEL_STACK_OFFSET);
623#ifdef CONFIG_CC_STACKPROTECTOR
624 write_pda(stack_canary, next_p->stack_canary);
625 /*
626 * Build time only check to make sure the stack_canary is at
627 * offset 40 in the pda; this is a gcc ABI requirement
628 */
629 BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
630#endif
631 640
632 /* 641 /*
633 * Now maybe reload the debug registers and handle I/O bitmaps 642 * Now maybe reload the debug registers and handle I/O bitmaps
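
__switch_to() now stores the per-cpu kernel_stack value directly: the top of the next task's stack minus KERNEL_STACK_OFFSET (the old PDA_STACKOFFSET), which the syscall entry path loads into %rsp. The arithmetic, with illustrative sizes and offset.

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#define THREAD_SIZE             (8 * 1024)      /* illustrative */
#define KERNEL_STACK_OFFSET     (5 * 8)         /* illustrative */

int main(void)
{
        /* stand-in for task_stack_page(next_p) */
        void *stack_page = aligned_alloc(THREAD_SIZE, THREAD_SIZE);
        uintptr_t kernel_stack;

        kernel_stack = (uintptr_t)stack_page + THREAD_SIZE - KERNEL_STACK_OFFSET;

        printf("stack page   %p\n", stack_page);
        printf("kernel_stack 0x%lx (top minus %d bytes)\n",
               (unsigned long)kernel_stack, KERNEL_STACK_OFFSET);
        free(stack_page);
        return 0;
}
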
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index bf63de72b643..0d1e7ac439f4 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -13,146 +13,46 @@
13#include <asm/mpspec.h> 13#include <asm/mpspec.h>
14#include <asm/apicdef.h> 14#include <asm/apicdef.h>
15#include <asm/highmem.h> 15#include <asm/highmem.h>
16#include <asm/proto.h>
16#include <asm/cpumask.h> 17#include <asm/cpumask.h>
18#include <asm/cpu.h>
17 19
18#ifdef CONFIG_X86_LOCAL_APIC 20#ifdef CONFIG_DEBUG_PER_CPU_MAPS
19unsigned int num_processors; 21# define DBG(x...) printk(KERN_DEBUG x)
20unsigned disabled_cpus __cpuinitdata;
21/* Processor that is doing the boot up */
22unsigned int boot_cpu_physical_apicid = -1U;
23EXPORT_SYMBOL(boot_cpu_physical_apicid);
24unsigned int max_physical_apicid;
25
26/* Bitmask of physically existing CPUs */
27physid_mask_t phys_cpu_present_map;
28#endif
29
30/* map cpu index to physical APIC ID */
31DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
32DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
33EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
34EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
35
36#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
37#define X86_64_NUMA 1
38
39/* map cpu index to node index */
40DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
41EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
42
43/* which logical CPUs are on which nodes */
44cpumask_t *node_to_cpumask_map;
45EXPORT_SYMBOL(node_to_cpumask_map);
46
47/* setup node_to_cpumask_map */
48static void __init setup_node_to_cpumask_map(void);
49
50#else 22#else
51static inline void setup_node_to_cpumask_map(void) { } 23# define DBG(x...)
52#endif 24#endif
53 25
54#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP) 26DEFINE_PER_CPU(int, cpu_number);
55/* 27EXPORT_PER_CPU_SYMBOL(cpu_number);
56 * Copy data used in early init routines from the initial arrays to the
57 * per cpu data areas. These arrays then become expendable and the
58 * *_early_ptr's are zeroed indicating that the static arrays are gone.
59 */
60static void __init setup_per_cpu_maps(void)
61{
62 int cpu;
63 28
64 for_each_possible_cpu(cpu) { 29#ifdef CONFIG_X86_64
65 per_cpu(x86_cpu_to_apicid, cpu) = 30#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load)
66 early_per_cpu_map(x86_cpu_to_apicid, cpu); 31#else
67 per_cpu(x86_bios_cpu_apicid, cpu) = 32#define BOOT_PERCPU_OFFSET 0
68 early_per_cpu_map(x86_bios_cpu_apicid, cpu);
69#ifdef X86_64_NUMA
70 per_cpu(x86_cpu_to_node_map, cpu) =
71 early_per_cpu_map(x86_cpu_to_node_map, cpu);
72#endif 33#endif
73 }
74 34
75 /* indicate the early static arrays will soon be gone */ 35DEFINE_PER_CPU(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET;
76 early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; 36EXPORT_PER_CPU_SYMBOL(this_cpu_off);
77 early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
78#ifdef X86_64_NUMA
79 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
80#endif
81}
82 37
83#ifdef CONFIG_X86_32 38unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
84/* 39 [0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET,
85 * Great future not-so-futuristic plan: make i386 and x86_64 do it 40};
86 * the same way
87 */
88unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
89EXPORT_SYMBOL(__per_cpu_offset); 41EXPORT_SYMBOL(__per_cpu_offset);
90static inline void setup_cpu_pda_map(void) { }
91
92#elif !defined(CONFIG_SMP)
93static inline void setup_cpu_pda_map(void) { }
94
95#else /* CONFIG_SMP && CONFIG_X86_64 */
96
97/*
98 * Allocate cpu_pda pointer table and array via alloc_bootmem.
99 */
100static void __init setup_cpu_pda_map(void)
101{
102 char *pda;
103 struct x8664_pda **new_cpu_pda;
104 unsigned long size;
105 int cpu;
106
107 size = roundup(sizeof(struct x8664_pda), cache_line_size());
108
109 /* allocate cpu_pda array and pointer table */
110 {
111 unsigned long tsize = nr_cpu_ids * sizeof(void *);
112 unsigned long asize = size * (nr_cpu_ids - 1);
113 42
114 tsize = roundup(tsize, cache_line_size()); 43static inline void setup_percpu_segment(int cpu)
115 new_cpu_pda = alloc_bootmem(tsize + asize);
116 pda = (char *)new_cpu_pda + tsize;
117 }
118
119 /* initialize pointer table to static pda's */
120 for_each_possible_cpu(cpu) {
121 if (cpu == 0) {
122 /* leave boot cpu pda in place */
123 new_cpu_pda[0] = cpu_pda(0);
124 continue;
125 }
126 new_cpu_pda[cpu] = (struct x8664_pda *)pda;
127 new_cpu_pda[cpu]->in_bootmem = 1;
128 pda += size;
129 }
130
131 /* point to new pointer table */
132 _cpu_pda = new_cpu_pda;
133}
134
135#endif /* CONFIG_SMP && CONFIG_X86_64 */
136
137#ifdef CONFIG_X86_64
138
139/* correctly size the local cpu masks */
140static void setup_cpu_local_masks(void)
141{ 44{
142 alloc_bootmem_cpumask_var(&cpu_initialized_mask); 45#ifdef CONFIG_X86_32
143 alloc_bootmem_cpumask_var(&cpu_callin_mask); 46 struct desc_struct gdt;
144 alloc_bootmem_cpumask_var(&cpu_callout_mask);
145 alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
146}
147
148#else /* CONFIG_X86_32 */
149 47
150static inline void setup_cpu_local_masks(void) 48 pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF,
151{ 49 0x2 | DESCTYPE_S, 0x8);
50 gdt.s = 1;
51 write_gdt_entry(get_cpu_gdt_table(cpu),
52 GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
53#endif
152} 54}
153 55
154#endif /* CONFIG_X86_32 */
155
156/* 56/*
157 * Great future plan: 57 * Great future plan:
158 * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data. 58 * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
@@ -160,18 +60,12 @@ static inline void setup_cpu_local_masks(void)
160 */ 60 */
161void __init setup_per_cpu_areas(void) 61void __init setup_per_cpu_areas(void)
162{ 62{
163 ssize_t size, old_size; 63 ssize_t size;
164 char *ptr; 64 char *ptr;
165 int cpu; 65 int cpu;
166 unsigned long align = 1;
167
168 /* Setup cpu_pda map */
169 setup_cpu_pda_map();
170 66
171 /* Copy section for each CPU (we discard the original) */ 67 /* Copy section for each CPU (we discard the original) */
172 old_size = PERCPU_ENOUGH_ROOM; 68 size = roundup(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
173 align = max_t(unsigned long, PAGE_SIZE, align);
174 size = roundup(old_size, align);
175 69
176 pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", 70 pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
177 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); 71 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
@@ -180,30 +74,67 @@ void __init setup_per_cpu_areas(void)
180 74
181 for_each_possible_cpu(cpu) { 75 for_each_possible_cpu(cpu) {
182#ifndef CONFIG_NEED_MULTIPLE_NODES 76#ifndef CONFIG_NEED_MULTIPLE_NODES
183 ptr = __alloc_bootmem(size, align, 77 ptr = alloc_bootmem_pages(size);
184 __pa(MAX_DMA_ADDRESS));
185#else 78#else
186 int node = early_cpu_to_node(cpu); 79 int node = early_cpu_to_node(cpu);
187 if (!node_online(node) || !NODE_DATA(node)) { 80 if (!node_online(node) || !NODE_DATA(node)) {
188 ptr = __alloc_bootmem(size, align, 81 ptr = alloc_bootmem_pages(size);
189 __pa(MAX_DMA_ADDRESS));
190 pr_info("cpu %d has no node %d or node-local memory\n", 82 pr_info("cpu %d has no node %d or node-local memory\n",
191 cpu, node); 83 cpu, node);
192 pr_debug("per cpu data for cpu%d at %016lx\n", 84 pr_debug("per cpu data for cpu%d at %016lx\n",
193 cpu, __pa(ptr)); 85 cpu, __pa(ptr));
194 } else { 86 } else {
195 ptr = __alloc_bootmem_node(NODE_DATA(node), size, align, 87 ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
196 __pa(MAX_DMA_ADDRESS));
197 pr_debug("per cpu data for cpu%d on node%d at %016lx\n", 88 pr_debug("per cpu data for cpu%d on node%d at %016lx\n",
198 cpu, node, __pa(ptr)); 89 cpu, node, __pa(ptr));
199 } 90 }
200#endif 91#endif
92
93 memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start);
201 per_cpu_offset(cpu) = ptr - __per_cpu_start; 94 per_cpu_offset(cpu) = ptr - __per_cpu_start;
202 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); 95 per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
96 per_cpu(cpu_number, cpu) = cpu;
97 setup_percpu_segment(cpu);
98 /*
99 * Copy data used in early init routines from the
100 * initial arrays to the per cpu data areas. These
101 * arrays then become expendable and the *_early_ptr's
102 * are zeroed indicating that the static arrays are
103 * gone.
104 */
105#ifdef CONFIG_X86_LOCAL_APIC
106 per_cpu(x86_cpu_to_apicid, cpu) =
107 early_per_cpu_map(x86_cpu_to_apicid, cpu);
108 per_cpu(x86_bios_cpu_apicid, cpu) =
109 early_per_cpu_map(x86_bios_cpu_apicid, cpu);
110#endif
111#ifdef CONFIG_X86_64
112 per_cpu(irq_stack_ptr, cpu) =
113 per_cpu(irq_stack_union.irq_stack, cpu) +
114 IRQ_STACK_SIZE - 64;
115#ifdef CONFIG_NUMA
116 per_cpu(x86_cpu_to_node_map, cpu) =
117 early_per_cpu_map(x86_cpu_to_node_map, cpu);
118#endif
119#endif
120 /*
121 * Up to this point, the boot CPU has been using .data.init
122 * area. Reload any changed state for the boot CPU.
123 */
124 if (cpu == boot_cpu_id)
125 switch_to_new_gdt();
126
127 DBG("PERCPU: cpu %4d %p\n", cpu, ptr);
203 } 128 }
204 129
205 /* Setup percpu data maps */ 130 /* indicate the early static arrays will soon be gone */
206 setup_per_cpu_maps(); 131#ifdef CONFIG_X86_LOCAL_APIC
132 early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
133 early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
134#endif
135#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
136 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
137#endif
207 138
208 /* Setup node to cpumask map */ 139 /* Setup node to cpumask map */
209 setup_node_to_cpumask_map(); 140 setup_node_to_cpumask_map();
@@ -211,199 +142,3 @@ void __init setup_per_cpu_areas(void)
211 /* Setup cpu initialized, callin, callout masks */ 142 /* Setup cpu initialized, callin, callout masks */
212 setup_cpu_local_masks(); 143 setup_cpu_local_masks();
213} 144}
214
215#endif
216
217#ifdef X86_64_NUMA
218
219/*
220 * Allocate node_to_cpumask_map based on number of available nodes
221 * Requires node_possible_map to be valid.
222 *
223 * Note: node_to_cpumask() is not valid until after this is done.
224 */
225static void __init setup_node_to_cpumask_map(void)
226{
227 unsigned int node, num = 0;
228 cpumask_t *map;
229
230 /* setup nr_node_ids if not done yet */
231 if (nr_node_ids == MAX_NUMNODES) {
232 for_each_node_mask(node, node_possible_map)
233 num = node;
234 nr_node_ids = num + 1;
235 }
236
237 /* allocate the map */
238 map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
239
240 pr_debug("Node to cpumask map at %p for %d nodes\n",
241 map, nr_node_ids);
242
243 /* node_to_cpumask() will now work */
244 node_to_cpumask_map = map;
245}
246
247void __cpuinit numa_set_node(int cpu, int node)
248{
249 int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
250
251 if (cpu_pda(cpu) && node != NUMA_NO_NODE)
252 cpu_pda(cpu)->nodenumber = node;
253
254 if (cpu_to_node_map)
255 cpu_to_node_map[cpu] = node;
256
257 else if (per_cpu_offset(cpu))
258 per_cpu(x86_cpu_to_node_map, cpu) = node;
259
260 else
261 pr_debug("Setting node for non-present cpu %d\n", cpu);
262}
263
264void __cpuinit numa_clear_node(int cpu)
265{
266 numa_set_node(cpu, NUMA_NO_NODE);
267}
268
269#ifndef CONFIG_DEBUG_PER_CPU_MAPS
270
271void __cpuinit numa_add_cpu(int cpu)
272{
273 cpu_set(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
274}
275
276void __cpuinit numa_remove_cpu(int cpu)
277{
278 cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]);
279}
280
281#else /* CONFIG_DEBUG_PER_CPU_MAPS */
282
283/*
284 * --------- debug versions of the numa functions ---------
285 */
286static void __cpuinit numa_set_cpumask(int cpu, int enable)
287{
288 int node = cpu_to_node(cpu);
289 cpumask_t *mask;
290 char buf[64];
291
292 if (node_to_cpumask_map == NULL) {
293 printk(KERN_ERR "node_to_cpumask_map NULL\n");
294 dump_stack();
295 return;
296 }
297
298 mask = &node_to_cpumask_map[node];
299 if (enable)
300 cpu_set(cpu, *mask);
301 else
302 cpu_clear(cpu, *mask);
303
304 cpulist_scnprintf(buf, sizeof(buf), mask);
305 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
306 enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf);
307}
308
309void __cpuinit numa_add_cpu(int cpu)
310{
311 numa_set_cpumask(cpu, 1);
312}
313
314void __cpuinit numa_remove_cpu(int cpu)
315{
316 numa_set_cpumask(cpu, 0);
317}
318
319int cpu_to_node(int cpu)
320{
321 if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
322 printk(KERN_WARNING
323 "cpu_to_node(%d): usage too early!\n", cpu);
324 dump_stack();
325 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
326 }
327 return per_cpu(x86_cpu_to_node_map, cpu);
328}
329EXPORT_SYMBOL(cpu_to_node);
330
331/*
332 * Same function as cpu_to_node() but used if called before the
333 * per_cpu areas are setup.
334 */
335int early_cpu_to_node(int cpu)
336{
337 if (early_per_cpu_ptr(x86_cpu_to_node_map))
338 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
339
340 if (!per_cpu_offset(cpu)) {
341 printk(KERN_WARNING
342 "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
343 dump_stack();
344 return NUMA_NO_NODE;
345 }
346 return per_cpu(x86_cpu_to_node_map, cpu);
347}
348
349
350/* empty cpumask */
351static const cpumask_t cpu_mask_none;
352
353/*
354 * Returns a pointer to the bitmask of CPUs on Node 'node'.
355 */
356const cpumask_t *cpumask_of_node(int node)
357{
358 if (node_to_cpumask_map == NULL) {
359 printk(KERN_WARNING
360 "cpumask_of_node(%d): no node_to_cpumask_map!\n",
361 node);
362 dump_stack();
363 return (const cpumask_t *)&cpu_online_map;
364 }
365 if (node >= nr_node_ids) {
366 printk(KERN_WARNING
367 "cpumask_of_node(%d): node > nr_node_ids(%d)\n",
368 node, nr_node_ids);
369 dump_stack();
370 return &cpu_mask_none;
371 }
372 return &node_to_cpumask_map[node];
373}
374EXPORT_SYMBOL(cpumask_of_node);
375
376/*
377 * Returns a bitmask of CPUs on Node 'node'.
378 *
379 * Side note: this function creates the returned cpumask on the stack
380 * so with a high NR_CPUS count, excessive stack space is used. The
381 * node_to_cpumask_ptr function should be used whenever possible.
382 */
383cpumask_t node_to_cpumask(int node)
384{
385 if (node_to_cpumask_map == NULL) {
386 printk(KERN_WARNING
387 "node_to_cpumask(%d): no node_to_cpumask_map!\n", node);
388 dump_stack();
389 return cpu_online_map;
390 }
391 if (node >= nr_node_ids) {
392 printk(KERN_WARNING
393 "node_to_cpumask(%d): node > nr_node_ids(%d)\n",
394 node, nr_node_ids);
395 dump_stack();
396 return cpu_mask_none;
397 }
398 return node_to_cpumask_map[node];
399}
400EXPORT_SYMBOL(node_to_cpumask);
401
402/*
403 * --------- end of debug versions of the numa functions ---------
404 */
405
406#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
407
408#endif /* X86_64_NUMA */
409
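
Besides copying the __per_cpu_load template into each CPU's area and recording per_cpu_offset(), the new setup_per_cpu_areas() points irq_stack_ptr 64 bytes below the top of each CPU's IRQ stack, as in the hunk above. A sketch of that pointer arithmetic only; NR_CPUS and IRQ_STACK_SIZE here are illustrative.

#include <stdio.h>

#define NR_CPUS         2
#define IRQ_STACK_SIZE  (16 * 1024)     /* illustrative */

static char irq_stack[NR_CPUS][IRQ_STACK_SIZE];

int main(void)
{
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                char *irq_stack_ptr = irq_stack[cpu] + IRQ_STACK_SIZE - 64;

                printf("cpu%d irq_stack %p .. %p, irq_stack_ptr %p\n",
                       cpu, (void *)irq_stack[cpu],
                       (void *)(irq_stack[cpu] + IRQ_STACK_SIZE),
                       (void *)irq_stack_ptr);
        }
        return 0;
}
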
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index cf34eb37fbee..7fc78b019815 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -649,9 +649,16 @@ badframe:
649} 649}
650 650
651#ifdef CONFIG_X86_32 651#ifdef CONFIG_X86_32
652asmlinkage int sys_rt_sigreturn(struct pt_regs regs) 652/*
653 * Note: do not pass in pt_regs directly as with tail-call optimization
654 * GCC will incorrectly stomp on the caller's frame and corrupt user-space
655 * register state:
656 */
657asmlinkage int sys_rt_sigreturn(unsigned long __unused)
653{ 658{
654 return do_rt_sigreturn(&regs); 659 struct pt_regs *regs = (struct pt_regs *)&__unused;
660
661 return do_rt_sigreturn(regs);
655} 662}
656#else /* !CONFIG_X86_32 */ 663#else /* !CONFIG_X86_32 */
657asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) 664asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 6c2b8444b830..f9dbcff43546 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -62,6 +62,7 @@
62#include <asm/vmi.h> 62#include <asm/vmi.h>
63#include <asm/genapic.h> 63#include <asm/genapic.h>
64#include <asm/setup.h> 64#include <asm/setup.h>
65#include <asm/uv/uv.h>
65#include <linux/mc146818rtc.h> 66#include <linux/mc146818rtc.h>
66 67
67#include <mach_apic.h> 68#include <mach_apic.h>
@@ -744,52 +745,6 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
744 complete(&c_idle->done); 745 complete(&c_idle->done);
745} 746}
746 747
747#ifdef CONFIG_X86_64
748
749/* __ref because it's safe to call free_bootmem when after_bootmem == 0. */
750static void __ref free_bootmem_pda(struct x8664_pda *oldpda)
751{
752 if (!after_bootmem)
753 free_bootmem((unsigned long)oldpda, sizeof(*oldpda));
754}
755
756/*
757 * Allocate node local memory for the AP pda.
758 *
759 * Must be called after the _cpu_pda pointer table is initialized.
760 */
761int __cpuinit get_local_pda(int cpu)
762{
763 struct x8664_pda *oldpda, *newpda;
764 unsigned long size = sizeof(struct x8664_pda);
765 int node = cpu_to_node(cpu);
766
767 if (cpu_pda(cpu) && !cpu_pda(cpu)->in_bootmem)
768 return 0;
769
770 oldpda = cpu_pda(cpu);
771 newpda = kmalloc_node(size, GFP_ATOMIC, node);
772 if (!newpda) {
773 printk(KERN_ERR "Could not allocate node local PDA "
774 "for CPU %d on node %d\n", cpu, node);
775
776 if (oldpda)
777 return 0; /* have a usable pda */
778 else
779 return -1;
780 }
781
782 if (oldpda) {
783 memcpy(newpda, oldpda, size);
784 free_bootmem_pda(oldpda);
785 }
786
787 newpda->in_bootmem = 0;
788 cpu_pda(cpu) = newpda;
789 return 0;
790}
791#endif /* CONFIG_X86_64 */
792
793static int __cpuinit do_boot_cpu(int apicid, int cpu) 748static int __cpuinit do_boot_cpu(int apicid, int cpu)
794/* 749/*
795 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad 750 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
@@ -807,16 +762,6 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
807 }; 762 };
808 INIT_WORK(&c_idle.work, do_fork_idle); 763 INIT_WORK(&c_idle.work, do_fork_idle);
809 764
810#ifdef CONFIG_X86_64
811 /* Allocate node local memory for AP pdas */
812 if (cpu > 0) {
813 boot_error = get_local_pda(cpu);
814 if (boot_error)
815 goto restore_state;
816 /* if can't get pda memory, can't start cpu */
817 }
818#endif
819
820 alternatives_smp_switch(1); 765 alternatives_smp_switch(1);
821 766
822 c_idle.idle = get_idle_for_cpu(cpu); 767 c_idle.idle = get_idle_for_cpu(cpu);
@@ -846,14 +791,16 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
846 791
847 set_idle_for_cpu(cpu, c_idle.idle); 792 set_idle_for_cpu(cpu, c_idle.idle);
848do_rest: 793do_rest:
849#ifdef CONFIG_X86_32
850 per_cpu(current_task, cpu) = c_idle.idle; 794 per_cpu(current_task, cpu) = c_idle.idle;
851 init_gdt(cpu); 795#ifdef CONFIG_X86_32
852 /* Stack for startup_32 can be just as for start_secondary onwards */ 796 /* Stack for startup_32 can be just as for start_secondary onwards */
853 irq_ctx_init(cpu); 797 irq_ctx_init(cpu);
854#else 798#else
855 cpu_pda(cpu)->pcurrent = c_idle.idle;
856 clear_tsk_thread_flag(c_idle.idle, TIF_FORK); 799 clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
800 initial_gs = per_cpu_offset(cpu);
801 per_cpu(kernel_stack, cpu) =
802 (unsigned long)task_stack_page(c_idle.idle) -
803 KERNEL_STACK_OFFSET + THREAD_SIZE;
857#endif 804#endif
858 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); 805 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
859 initial_code = (unsigned long)start_secondary; 806 initial_code = (unsigned long)start_secondary;
@@ -930,9 +877,7 @@ do_rest:
930 inquire_remote_apic(apicid); 877 inquire_remote_apic(apicid);
931 } 878 }
932 } 879 }
933#ifdef CONFIG_X86_64 880
934restore_state:
935#endif
936 if (boot_error) { 881 if (boot_error) {
937 /* Try to put things back the way they were before ... */ 882 /* Try to put things back the way they were before ... */
938 numa_remove_cpu(cpu); /* was set by numa_add_cpu */ 883 numa_remove_cpu(cpu); /* was set by numa_add_cpu */
@@ -1124,6 +1069,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
1124 printk(KERN_ERR "... forcing use of dummy APIC emulation." 1069 printk(KERN_ERR "... forcing use of dummy APIC emulation."
1125 "(tell your hw vendor)\n"); 1070 "(tell your hw vendor)\n");
1126 smpboot_clear_io_apic(); 1071 smpboot_clear_io_apic();
1072 disable_ioapic_setup();
1127 return -1; 1073 return -1;
1128 } 1074 }
1129 1075
@@ -1239,9 +1185,6 @@ out:
1239void __init native_smp_prepare_boot_cpu(void) 1185void __init native_smp_prepare_boot_cpu(void)
1240{ 1186{
1241 int me = smp_processor_id(); 1187 int me = smp_processor_id();
1242#ifdef CONFIG_X86_32
1243 init_gdt(me);
1244#endif
1245 switch_to_new_gdt(); 1188 switch_to_new_gdt();
1246 /* already set me in cpu_online_mask in boot_cpu_init() */ 1189 /* already set me in cpu_online_mask in boot_cpu_init() */
1247 cpumask_set_cpu(me, cpu_callout_mask); 1190 cpumask_set_cpu(me, cpu_callout_mask);
diff --git a/arch/x86/kernel/smpcommon.c b/arch/x86/kernel/smpcommon.c
deleted file mode 100644
index 397e309839dd..000000000000
--- a/arch/x86/kernel/smpcommon.c
+++ /dev/null
@@ -1,30 +0,0 @@
1/*
2 * SMP stuff which is common to all sub-architectures.
3 */
4#include <linux/module.h>
5#include <asm/smp.h>
6
7#ifdef CONFIG_X86_32
8DEFINE_PER_CPU(unsigned long, this_cpu_off);
9EXPORT_PER_CPU_SYMBOL(this_cpu_off);
10
11/*
12 * Initialize the CPU's GDT. This is either the boot CPU doing itself
13 * (still using the master per-cpu area), or a CPU doing it for a
14 * secondary which will soon come up.
15 */
16__cpuinit void init_gdt(int cpu)
17{
18 struct desc_struct gdt;
19
20 pack_descriptor(&gdt, __per_cpu_offset[cpu], 0xFFFFF,
21 0x2 | DESCTYPE_S, 0x8);
22 gdt.s = 1;
23
24 write_gdt_entry(get_cpu_gdt_table(cpu),
25 GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
26
27 per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
28 per_cpu(cpu_number, cpu) = cpu;
29}
30#endif
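
The deleted init_gdt() (its work now lives in setup_percpu_segment() above) packed a ring-0 data segment whose base is the CPU's per-cpu offset, so 32-bit %fs-relative accesses land in that CPU's per-cpu area. Below is a sketch of the standard 8-byte GDT descriptor packing; the base address and the access/flag values are illustrative rather than the kernel's exact ones.

#include <stdio.h>
#include <stdint.h>

static uint64_t pack_gdt_entry(uint32_t base, uint32_t limit,
                               uint8_t access, uint8_t flags)
{
        uint64_t d = 0;

        d |= limit & 0xffffULL;                         /* limit[15:0]  */
        d |= (uint64_t)(base & 0xffffff) << 16;         /* base[23:0]   */
        d |= (uint64_t)access << 40;                    /* P/DPL/S/type */
        d |= (uint64_t)((limit >> 16) & 0xf) << 48;     /* limit[19:16] */
        d |= (uint64_t)(flags & 0xf) << 52;             /* G/D/L/AVL    */
        d |= (uint64_t)(base >> 24) << 56;              /* base[31:24]  */
        return d;
}

int main(void)
{
        uint32_t percpu_base = 0x01a00000;      /* made-up per-cpu offset */

        /* access 0x92: present, ring 0, writable data; flags 0xc: 4K granularity, 32-bit */
        printf("percpu GDT entry = 0x%016llx\n",
               (unsigned long long)pack_gdt_entry(percpu_base, 0xfffff, 0x92, 0xc));
        return 0;
}
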
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index d44395ff34c3..e2e86a08f31d 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -88,7 +88,7 @@ ENTRY(sys_call_table)
88 .long sys_uselib 88 .long sys_uselib
89 .long sys_swapon 89 .long sys_swapon
90 .long sys_reboot 90 .long sys_reboot
91 .long old_readdir 91 .long sys_old_readdir
92 .long old_mmap /* 90 */ 92 .long old_mmap /* 90 */
93 .long sys_munmap 93 .long sys_munmap
94 .long sys_truncate 94 .long sys_truncate
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
deleted file mode 100644
index ce5054642247..000000000000
--- a/arch/x86/kernel/tlb_32.c
+++ /dev/null
@@ -1,256 +0,0 @@
1#include <linux/spinlock.h>
2#include <linux/cpu.h>
3#include <linux/interrupt.h>
4
5#include <asm/tlbflush.h>
6
7DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate)
8 ____cacheline_aligned = { &init_mm, 0, };
9
10/* must come after the send_IPI functions above for inlining */
11#include <mach_ipi.h>
12
13/*
14 * Smarter SMP flushing macros.
15 * c/o Linus Torvalds.
16 *
17 * These mean you can really definitely utterly forget about
18 * writing to user space from interrupts. (Its not allowed anyway).
19 *
20 * Optimizations Manfred Spraul <manfred@colorfullife.com>
21 */
22
23static cpumask_t flush_cpumask;
24static struct mm_struct *flush_mm;
25static unsigned long flush_va;
26static DEFINE_SPINLOCK(tlbstate_lock);
27
28/*
29 * We cannot call mmdrop() because we are in interrupt context,
30 * instead update mm->cpu_vm_mask.
31 *
32 * We need to reload %cr3 since the page tables may be going
33 * away from under us..
34 */
35void leave_mm(int cpu)
36{
37 BUG_ON(x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK);
38 cpu_clear(cpu, x86_read_percpu(cpu_tlbstate.active_mm)->cpu_vm_mask);
39 load_cr3(swapper_pg_dir);
40}
41EXPORT_SYMBOL_GPL(leave_mm);
42
43/*
44 *
45 * The flush IPI assumes that a thread switch happens in this order:
46 * [cpu0: the cpu that switches]
47 * 1) switch_mm() either 1a) or 1b)
48 * 1a) thread switch to a different mm
49 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
50 * Stop ipi delivery for the old mm. This is not synchronized with
51 * the other cpus, but smp_invalidate_interrupt ignore flush ipis
52 * for the wrong mm, and in the worst case we perform a superfluous
53 * tlb flush.
54 * 1a2) set cpu_tlbstate to TLBSTATE_OK
55 * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
56 * was in lazy tlb mode.
57 * 1a3) update cpu_tlbstate[].active_mm
58 * Now cpu0 accepts tlb flushes for the new mm.
59 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
60 * Now the other cpus will send tlb flush ipis.
61 * 1a4) change cr3.
62 * 1b) thread switch without mm change
63 * cpu_tlbstate[].active_mm is correct, cpu0 already handles
64 * flush ipis.
65 * 1b1) set cpu_tlbstate to TLBSTATE_OK
66 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
67 * Atomically set the bit [other cpus will start sending flush ipis],
68 * and test the bit.
69 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
70 * 2) switch %%esp, ie current
71 *
72 * The interrupt must handle 2 special cases:
73 * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
74 * - the cpu performs speculative tlb reads, i.e. even if the cpu only
75 * runs in kernel space, the cpu could load tlb entries for user space
76 * pages.
77 *
78 * The good news is that cpu_tlbstate is local to each cpu, no
79 * write/read ordering problems.
80 */
81
82/*
83 * TLB flush IPI:
84 *
85 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
86 * 2) Leave the mm if we are in the lazy tlb mode.
87 */
88
89void smp_invalidate_interrupt(struct pt_regs *regs)
90{
91 unsigned long cpu;
92
93 cpu = get_cpu();
94
95 if (!cpu_isset(cpu, flush_cpumask))
96 goto out;
97 /*
98 * This was a BUG() but until someone can quote me the
99 * line from the intel manual that guarantees an IPI to
100 * multiple CPUs is retried _only_ on the erroring CPUs
101 * its staying as a return
102 *
103 * BUG();
104 */
105
106 if (flush_mm == x86_read_percpu(cpu_tlbstate.active_mm)) {
107 if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) {
108 if (flush_va == TLB_FLUSH_ALL)
109 local_flush_tlb();
110 else
111 __flush_tlb_one(flush_va);
112 } else
113 leave_mm(cpu);
114 }
115 ack_APIC_irq();
116 smp_mb__before_clear_bit();
117 cpu_clear(cpu, flush_cpumask);
118 smp_mb__after_clear_bit();
119out:
120 put_cpu_no_resched();
121 inc_irq_stat(irq_tlb_count);
122}
123
124void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
125 unsigned long va)
126{
127 cpumask_t cpumask = *cpumaskp;
128
129 /*
130 * A couple of (to be removed) sanity checks:
131 *
132 * - current CPU must not be in mask
133 * - mask must exist :)
134 */
135 BUG_ON(cpus_empty(cpumask));
136 BUG_ON(cpu_isset(smp_processor_id(), cpumask));
137 BUG_ON(!mm);
138
139#ifdef CONFIG_HOTPLUG_CPU
140 /* If a CPU which we ran on has gone down, OK. */
141 cpus_and(cpumask, cpumask, cpu_online_map);
142 if (unlikely(cpus_empty(cpumask)))
143 return;
144#endif
145
146 /*
 147 * I'm not happy about this global shared spinlock in the
148 * MM hot path, but we'll see how contended it is.
149 * AK: x86-64 has a faster method that could be ported.
150 */
151 spin_lock(&tlbstate_lock);
152
153 flush_mm = mm;
154 flush_va = va;
155 cpus_or(flush_cpumask, cpumask, flush_cpumask);
156
157 /*
158 * Make the above memory operations globally visible before
159 * sending the IPI.
160 */
161 smp_mb();
162 /*
163 * We have to send the IPI only to
 164 * the CPUs affected.
165 */
166 send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR);
167
168 while (!cpus_empty(flush_cpumask))
169 /* nothing. lockup detection does not belong here */
170 cpu_relax();
171
172 flush_mm = NULL;
173 flush_va = 0;
174 spin_unlock(&tlbstate_lock);
175}
176
177void flush_tlb_current_task(void)
178{
179 struct mm_struct *mm = current->mm;
180 cpumask_t cpu_mask;
181
182 preempt_disable();
183 cpu_mask = mm->cpu_vm_mask;
184 cpu_clear(smp_processor_id(), cpu_mask);
185
186 local_flush_tlb();
187 if (!cpus_empty(cpu_mask))
188 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
189 preempt_enable();
190}
191
192void flush_tlb_mm(struct mm_struct *mm)
193{
194 cpumask_t cpu_mask;
195
196 preempt_disable();
197 cpu_mask = mm->cpu_vm_mask;
198 cpu_clear(smp_processor_id(), cpu_mask);
199
200 if (current->active_mm == mm) {
201 if (current->mm)
202 local_flush_tlb();
203 else
204 leave_mm(smp_processor_id());
205 }
206 if (!cpus_empty(cpu_mask))
207 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
208
209 preempt_enable();
210}
211
212void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
213{
214 struct mm_struct *mm = vma->vm_mm;
215 cpumask_t cpu_mask;
216
217 preempt_disable();
218 cpu_mask = mm->cpu_vm_mask;
219 cpu_clear(smp_processor_id(), cpu_mask);
220
221 if (current->active_mm == mm) {
222 if (current->mm)
223 __flush_tlb_one(va);
224 else
225 leave_mm(smp_processor_id());
226 }
227
228 if (!cpus_empty(cpu_mask))
229 flush_tlb_others(cpu_mask, mm, va);
230
231 preempt_enable();
232}
233EXPORT_SYMBOL(flush_tlb_page);
234
235static void do_flush_tlb_all(void *info)
236{
237 unsigned long cpu = smp_processor_id();
238
239 __flush_tlb_all();
240 if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_LAZY)
241 leave_mm(cpu);
242}
243
244void flush_tlb_all(void)
245{
246 on_each_cpu(do_flush_tlb_all, NULL, 1);
247}
248
249void reset_lazy_tlbstate(void)
250{
251 int cpu = raw_smp_processor_id();
252
253 per_cpu(cpu_tlbstate, cpu).state = 0;
254 per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
255}
256
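
The 32-bit flusher above keeps exactly one flush in flight at a time: flush_mm, flush_va and flush_cpumask form a single globally shared slot, guarded by tlbstate_lock and drained through the single INVALIDATE_TLB_VECTOR. The following standalone program is a minimal userspace model of that handshake (NCPUS, cpu_thread() and the printed output are inventions of the sketch, not kernel interfaces): the sender publishes the flush data, raises one mask bit per target, and spins until every target has acknowledged by clearing its bit.

/*
 * Minimal userspace model of the single-slot flush handshake in
 * tlb_32.c above.  NCPUS, cpu_thread() and the printed output are
 * inventions of this sketch, not kernel interfaces.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NCPUS 4

static pthread_mutex_t tlbstate_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long flush_va;          /* shared flush data, one slot  */
static atomic_uint flush_cpumask;       /* bit n set == CPU n not done  */

static void *cpu_thread(void *arg)
{
    unsigned int cpu = (unsigned int)(unsigned long)arg;

    /* wait for our bit, "flush", then acknowledge by clearing it */
    while (!(atomic_load(&flush_cpumask) & (1u << cpu)))
        ;                                       /* cpu_relax() */
    printf("cpu%u: flushing va=%#lx\n", cpu, flush_va);
    atomic_fetch_and(&flush_cpumask, ~(1u << cpu));
    return NULL;
}

int main(void)
{
    pthread_t tid[NCPUS];
    unsigned int cpu;

    for (cpu = 1; cpu < NCPUS; cpu++)
        pthread_create(&tid[cpu], NULL, cpu_thread,
                       (void *)(unsigned long)cpu);

    /* the sender: only one flush can be in flight, as in tlb_32.c */
    pthread_mutex_lock(&tlbstate_lock);
    flush_va = 0x1000;                          /* publish flush data */
    atomic_fetch_or(&flush_cpumask, 0xe);       /* "IPI" CPUs 1-3     */
    while (atomic_load(&flush_cpumask))
        ;                                       /* wait for all acks  */
    pthread_mutex_unlock(&tlbstate_lock);

    for (cpu = 1; cpu < NCPUS; cpu++)
        pthread_join(tid[cpu], NULL);
    return 0;
}

Because there is only the one slot, concurrent flushers serialize on tlbstate_lock, which is the hot-path contention the deleted 64-bit code below avoided with per-sender state.
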
diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c
deleted file mode 100644
index f8be6f1d2e48..000000000000
--- a/arch/x86/kernel/tlb_64.c
+++ /dev/null
@@ -1,284 +0,0 @@
1#include <linux/init.h>
2
3#include <linux/mm.h>
4#include <linux/delay.h>
5#include <linux/spinlock.h>
6#include <linux/smp.h>
7#include <linux/kernel_stat.h>
8#include <linux/mc146818rtc.h>
9#include <linux/interrupt.h>
10
11#include <asm/mtrr.h>
12#include <asm/pgalloc.h>
13#include <asm/tlbflush.h>
14#include <asm/mmu_context.h>
15#include <asm/proto.h>
16#include <asm/apicdef.h>
17#include <asm/idle.h>
18#include <asm/uv/uv_hub.h>
19#include <asm/uv/uv_bau.h>
20
21#include <mach_ipi.h>
22/*
23 * Smarter SMP flushing macros.
24 * c/o Linus Torvalds.
25 *
26 * These mean you can really definitely utterly forget about
 27 * writing to user space from interrupts. (It's not allowed anyway).
28 *
29 * Optimizations Manfred Spraul <manfred@colorfullife.com>
30 *
31 * More scalable flush, from Andi Kleen
32 *
33 * To avoid global state use 8 different call vectors.
34 * Each CPU uses a specific vector to trigger flushes on other
35 * CPUs. Depending on the received vector the target CPUs look into
36 * the right per cpu variable for the flush data.
37 *
38 * With more than 8 CPUs they are hashed to the 8 available
39 * vectors. The limited global vector space forces us to this right now.
40 * In future when interrupts are split into per CPU domains this could be
41 * fixed, at the cost of triggering multiple IPIs in some cases.
42 */
43
44union smp_flush_state {
45 struct {
46 cpumask_t flush_cpumask;
47 struct mm_struct *flush_mm;
48 unsigned long flush_va;
49 spinlock_t tlbstate_lock;
50 };
51 char pad[SMP_CACHE_BYTES];
52} ____cacheline_aligned;
53
54/* State is put into the per CPU data section, but padded
55 to a full cache line because other CPUs can access it and we don't
56 want false sharing in the per cpu data segment. */
57static DEFINE_PER_CPU(union smp_flush_state, flush_state);
58
59/*
60 * We cannot call mmdrop() because we are in interrupt context,
61 * instead update mm->cpu_vm_mask.
62 */
63void leave_mm(int cpu)
64{
65 if (read_pda(mmu_state) == TLBSTATE_OK)
66 BUG();
67 cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask);
68 load_cr3(swapper_pg_dir);
69}
70EXPORT_SYMBOL_GPL(leave_mm);
71
72/*
73 *
74 * The flush IPI assumes that a thread switch happens in this order:
75 * [cpu0: the cpu that switches]
76 * 1) switch_mm() either 1a) or 1b)
77 * 1a) thread switch to a different mm
78 * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
79 * Stop ipi delivery for the old mm. This is not synchronized with
 80 * the other cpus, but smp_invalidate_interrupt ignores flush ipis
81 * for the wrong mm, and in the worst case we perform a superfluous
82 * tlb flush.
83 * 1a2) set cpu mmu_state to TLBSTATE_OK
84 * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
85 * was in lazy tlb mode.
86 * 1a3) update cpu active_mm
87 * Now cpu0 accepts tlb flushes for the new mm.
88 * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
89 * Now the other cpus will send tlb flush ipis.
 90 * 1a5) change cr3.
91 * 1b) thread switch without mm change
92 * cpu active_mm is correct, cpu0 already handles
93 * flush ipis.
94 * 1b1) set cpu mmu_state to TLBSTATE_OK
95 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
96 * Atomically set the bit [other cpus will start sending flush ipis],
97 * and test the bit.
98 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
99 * 2) switch %%esp, ie current
100 *
101 * The interrupt must handle 2 special cases:
102 * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
103 * - the cpu performs speculative tlb reads, i.e. even if the cpu only
104 * runs in kernel space, the cpu could load tlb entries for user space
105 * pages.
106 *
107 * The good news is that cpu mmu_state is local to each cpu, no
108 * write/read ordering problems.
109 */
110
111/*
112 * TLB flush IPI:
113 *
114 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
115 * 2) Leave the mm if we are in the lazy tlb mode.
116 *
117 * Interrupts are disabled.
118 */
119
120asmlinkage void smp_invalidate_interrupt(struct pt_regs *regs)
121{
122 int cpu;
123 int sender;
124 union smp_flush_state *f;
125
126 cpu = smp_processor_id();
127 /*
128 * orig_rax contains the negated interrupt vector.
129 * Use that to determine where the sender put the data.
130 */
131 sender = ~regs->orig_ax - INVALIDATE_TLB_VECTOR_START;
132 f = &per_cpu(flush_state, sender);
133
134 if (!cpu_isset(cpu, f->flush_cpumask))
135 goto out;
136 /*
137 * This was a BUG() but until someone can quote me the
138 * line from the intel manual that guarantees an IPI to
139 * multiple CPUs is retried _only_ on the erroring CPUs
 140 * it's staying as a return
141 *
142 * BUG();
143 */
144
145 if (f->flush_mm == read_pda(active_mm)) {
146 if (read_pda(mmu_state) == TLBSTATE_OK) {
147 if (f->flush_va == TLB_FLUSH_ALL)
148 local_flush_tlb();
149 else
150 __flush_tlb_one(f->flush_va);
151 } else
152 leave_mm(cpu);
153 }
154out:
155 ack_APIC_irq();
156 cpu_clear(cpu, f->flush_cpumask);
157 inc_irq_stat(irq_tlb_count);
158}
159
160void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
161 unsigned long va)
162{
163 int sender;
164 union smp_flush_state *f;
165 cpumask_t cpumask = *cpumaskp;
166
167 if (is_uv_system() && uv_flush_tlb_others(&cpumask, mm, va))
168 return;
169
170 /* Caller has disabled preemption */
171 sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
172 f = &per_cpu(flush_state, sender);
173
174 /*
175 * Could avoid this lock when
176 * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
177 * probably not worth checking this for a cache-hot lock.
178 */
179 spin_lock(&f->tlbstate_lock);
180
181 f->flush_mm = mm;
182 f->flush_va = va;
183 cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask);
184
185 /*
186 * Make the above memory operations globally visible before
187 * sending the IPI.
188 */
189 smp_mb();
190 /*
191 * We have to send the IPI only to
 192 * the CPUs affected.
193 */
194 send_IPI_mask(&cpumask, INVALIDATE_TLB_VECTOR_START + sender);
195
196 while (!cpus_empty(f->flush_cpumask))
197 cpu_relax();
198
199 f->flush_mm = NULL;
200 f->flush_va = 0;
201 spin_unlock(&f->tlbstate_lock);
202}
203
204static int __cpuinit init_smp_flush(void)
205{
206 int i;
207
208 for_each_possible_cpu(i)
209 spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
210
211 return 0;
212}
213core_initcall(init_smp_flush);
214
215void flush_tlb_current_task(void)
216{
217 struct mm_struct *mm = current->mm;
218 cpumask_t cpu_mask;
219
220 preempt_disable();
221 cpu_mask = mm->cpu_vm_mask;
222 cpu_clear(smp_processor_id(), cpu_mask);
223
224 local_flush_tlb();
225 if (!cpus_empty(cpu_mask))
226 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
227 preempt_enable();
228}
229
230void flush_tlb_mm(struct mm_struct *mm)
231{
232 cpumask_t cpu_mask;
233
234 preempt_disable();
235 cpu_mask = mm->cpu_vm_mask;
236 cpu_clear(smp_processor_id(), cpu_mask);
237
238 if (current->active_mm == mm) {
239 if (current->mm)
240 local_flush_tlb();
241 else
242 leave_mm(smp_processor_id());
243 }
244 if (!cpus_empty(cpu_mask))
245 flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
246
247 preempt_enable();
248}
249
250void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
251{
252 struct mm_struct *mm = vma->vm_mm;
253 cpumask_t cpu_mask;
254
255 preempt_disable();
256 cpu_mask = mm->cpu_vm_mask;
257 cpu_clear(smp_processor_id(), cpu_mask);
258
259 if (current->active_mm == mm) {
260 if (current->mm)
261 __flush_tlb_one(va);
262 else
263 leave_mm(smp_processor_id());
264 }
265
266 if (!cpus_empty(cpu_mask))
267 flush_tlb_others(cpu_mask, mm, va);
268
269 preempt_enable();
270}
271
272static void do_flush_tlb_all(void *info)
273{
274 unsigned long cpu = smp_processor_id();
275
276 __flush_tlb_all();
277 if (read_pda(mmu_state) == TLBSTATE_LAZY)
278 leave_mm(cpu);
279}
280
281void flush_tlb_all(void)
282{
283 on_each_cpu(do_flush_tlb_all, NULL, 1);
284}
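
The 64-bit code removed here spreads that contention over NUM_INVALIDATE_TLB_VECTORS (8) per-sender flush_state slots: each sender hashes itself onto a slot with smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS and raises the matching vector, and the handler recovers the slot from the negated vector number saved in orig_ax. The small program below only checks that round-trip arithmetic; the 0xf0 base vector is an assumption made for the sketch, not something taken from this patch.

/*
 * Round-trip check of the sender-slot hashing used by the deleted
 * tlb_64.c: the sender picks slot = cpu % 8 and raises vector
 * START + slot; the handler recovers the slot from the bitwise-negated
 * vector saved in orig_ax.  The 0xf0 base is an assumption made for
 * this sketch.
 */
#include <assert.h>
#include <stdio.h>

#define NUM_INVALIDATE_TLB_VECTORS  8
#define INVALIDATE_TLB_VECTOR_START 0xf0    /* assumed base vector */

int main(void)
{
    int cpu;

    for (cpu = 0; cpu < 32; cpu++) {
        int sender = cpu % NUM_INVALIDATE_TLB_VECTORS;
        int vector = INVALIDATE_TLB_VECTOR_START + sender;
        long orig_ax = ~(long)vector;   /* as saved by the entry code */
        int slot = (int)(~orig_ax - INVALIDATE_TLB_VECTOR_START);

        assert(slot == sender);
        if (cpu < NUM_INVALIDATE_TLB_VECTORS)
            printf("cpu %2d -> vector %#x, slot %d\n",
                   cpu, (unsigned int)vector, slot);
    }
    return 0;
}

With more than eight CPUs several senders share a slot, so the per-slot tlbstate_lock still serializes them, but the contention is spread over eight cache-aligned slots instead of one global lock.
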
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index f885023167e0..89fce1b6d01f 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -11,6 +11,7 @@
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12 12
13#include <asm/mmu_context.h> 13#include <asm/mmu_context.h>
14#include <asm/uv/uv.h>
14#include <asm/uv/uv_mmrs.h> 15#include <asm/uv/uv_mmrs.h>
15#include <asm/uv/uv_hub.h> 16#include <asm/uv/uv_hub.h>
16#include <asm/uv/uv_bau.h> 17#include <asm/uv/uv_bau.h>
@@ -200,6 +201,7 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
200 destination_timeouts = 0; 201 destination_timeouts = 0;
201 } 202 }
202 } 203 }
204 cpu_relax();
203 } 205 }
204 return FLUSH_COMPLETE; 206 return FLUSH_COMPLETE;
205} 207}
@@ -209,14 +211,15 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
209 * 211 *
210 * Send a broadcast and wait for a broadcast message to complete. 212 * Send a broadcast and wait for a broadcast message to complete.
211 * 213 *
212 * The cpumaskp mask contains the cpus the broadcast was sent to. 214 * The flush_mask contains the cpus the broadcast was sent to.
213 * 215 *
214 * Returns 1 if all remote flushing was done. The mask is zeroed. 216 * Returns NULL if all remote flushing was done. The mask is zeroed.
215 * Returns 0 if some remote flushing remains to be done. The mask is left 217 * Returns @flush_mask if some remote flushing remains to be done. The
216 * unchanged. 218 * mask will have some bits still set.
217 */ 219 */
218int uv_flush_send_and_wait(int cpu, int this_blade, struct bau_desc *bau_desc, 220const struct cpumask *uv_flush_send_and_wait(int cpu, int this_blade,
219 cpumask_t *cpumaskp) 221 struct bau_desc *bau_desc,
222 struct cpumask *flush_mask)
220{ 223{
221 int completion_status = 0; 224 int completion_status = 0;
222 int right_shift; 225 int right_shift;
@@ -263,59 +266,69 @@ int uv_flush_send_and_wait(int cpu, int this_blade, struct bau_desc *bau_desc,
263 * Success, so clear the remote cpu's from the mask so we don't 266 * Success, so clear the remote cpu's from the mask so we don't
264 * use the IPI method of shootdown on them. 267 * use the IPI method of shootdown on them.
265 */ 268 */
266 for_each_cpu_mask(bit, *cpumaskp) { 269 for_each_cpu(bit, flush_mask) {
267 blade = uv_cpu_to_blade_id(bit); 270 blade = uv_cpu_to_blade_id(bit);
268 if (blade == this_blade) 271 if (blade == this_blade)
269 continue; 272 continue;
270 cpu_clear(bit, *cpumaskp); 273 cpumask_clear_cpu(bit, flush_mask);
271 } 274 }
272 if (!cpus_empty(*cpumaskp)) 275 if (!cpumask_empty(flush_mask))
273 return 0; 276 return flush_mask;
274 return 1; 277 return NULL;
275} 278}
276 279
277/** 280/**
278 * uv_flush_tlb_others - globally purge translation cache of a virtual 281 * uv_flush_tlb_others - globally purge translation cache of a virtual
279 * address or all TLB's 282 * address or all TLB's
280 * @cpumaskp: mask of all cpu's in which the address is to be removed 283 * @cpumask: mask of all cpu's in which the address is to be removed
281 * @mm: mm_struct containing virtual address range 284 * @mm: mm_struct containing virtual address range
282 * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu) 285 * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu)
286 * @cpu: the current cpu
283 * 287 *
284 * This is the entry point for initiating any UV global TLB shootdown. 288 * This is the entry point for initiating any UV global TLB shootdown.
285 * 289 *
286 * Purges the translation caches of all specified processors of the given 290 * Purges the translation caches of all specified processors of the given
287 * virtual address, or purges all TLB's on specified processors. 291 * virtual address, or purges all TLB's on specified processors.
288 * 292 *
289 * The caller has derived the cpumaskp from the mm_struct and has subtracted 293 * The caller has derived the cpumask from the mm_struct. This function
290 * the local cpu from the mask. This function is called only if there 294 * is called only if there are bits set in the mask. (e.g. flush_tlb_page())
291 * are bits set in the mask. (e.g. flush_tlb_page())
292 * 295 *
293 * The cpumaskp is converted into a nodemask of the nodes containing 296 * The cpumask is converted into a nodemask of the nodes containing
294 * the cpus. 297 * the cpus.
295 * 298 *
296 * Returns 1 if all remote flushing was done. 299 * Note that this function should be called with preemption disabled.
297 * Returns 0 if some remote flushing remains to be done. 300 *
301 * Returns NULL if all remote flushing was done.
302 * Returns pointer to cpumask if some remote flushing remains to be
303 * done. The returned pointer is valid till preemption is re-enabled.
298 */ 304 */
299int uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm, 305const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
300 unsigned long va) 306 struct mm_struct *mm,
307 unsigned long va, unsigned int cpu)
301{ 308{
309 static DEFINE_PER_CPU(cpumask_t, flush_tlb_mask);
310 struct cpumask *flush_mask = &__get_cpu_var(flush_tlb_mask);
302 int i; 311 int i;
303 int bit; 312 int bit;
304 int blade; 313 int blade;
305 int cpu; 314 int uv_cpu;
306 int this_blade; 315 int this_blade;
307 int locals = 0; 316 int locals = 0;
308 struct bau_desc *bau_desc; 317 struct bau_desc *bau_desc;
309 318
310 cpu = uv_blade_processor_id(); 319 WARN_ON(!in_atomic());
320
321 cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
322
323 uv_cpu = uv_blade_processor_id();
311 this_blade = uv_numa_blade_id(); 324 this_blade = uv_numa_blade_id();
312 bau_desc = __get_cpu_var(bau_control).descriptor_base; 325 bau_desc = __get_cpu_var(bau_control).descriptor_base;
313 bau_desc += UV_ITEMS_PER_DESCRIPTOR * cpu; 326 bau_desc += UV_ITEMS_PER_DESCRIPTOR * uv_cpu;
314 327
315 bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); 328 bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
316 329
317 i = 0; 330 i = 0;
318 for_each_cpu_mask(bit, *cpumaskp) { 331 for_each_cpu(bit, flush_mask) {
319 blade = uv_cpu_to_blade_id(bit); 332 blade = uv_cpu_to_blade_id(bit);
320 BUG_ON(blade > (UV_DISTRIBUTION_SIZE - 1)); 333 BUG_ON(blade > (UV_DISTRIBUTION_SIZE - 1));
321 if (blade == this_blade) { 334 if (blade == this_blade) {
@@ -330,17 +343,17 @@ int uv_flush_tlb_others(cpumask_t *cpumaskp, struct mm_struct *mm,
330 * no off_node flushing; return status for local node 343 * no off_node flushing; return status for local node
331 */ 344 */
332 if (locals) 345 if (locals)
333 return 0; 346 return flush_mask;
334 else 347 else
335 return 1; 348 return NULL;
336 } 349 }
337 __get_cpu_var(ptcstats).requestor++; 350 __get_cpu_var(ptcstats).requestor++;
338 __get_cpu_var(ptcstats).ntargeted += i; 351 __get_cpu_var(ptcstats).ntargeted += i;
339 352
340 bau_desc->payload.address = va; 353 bau_desc->payload.address = va;
341 bau_desc->payload.sending_cpu = smp_processor_id(); 354 bau_desc->payload.sending_cpu = cpu;
342 355
343 return uv_flush_send_and_wait(cpu, this_blade, bau_desc, cpumaskp); 356 return uv_flush_send_and_wait(uv_cpu, this_blade, bau_desc, flush_mask);
344} 357}
345 358
346/* 359/*
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 98c2d055284b..ed5aee5f3fcc 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -59,7 +59,6 @@
59#ifdef CONFIG_X86_64 59#ifdef CONFIG_X86_64
60#include <asm/pgalloc.h> 60#include <asm/pgalloc.h>
61#include <asm/proto.h> 61#include <asm/proto.h>
62#include <asm/pda.h>
63#else 62#else
64#include <asm/processor-flags.h> 63#include <asm/processor-flags.h>
65#include <asm/arch_hooks.h> 64#include <asm/arch_hooks.h>
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 23206ba16874..1d3302cc2ddf 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -858,7 +858,7 @@ void __init vmi_init(void)
858#endif 858#endif
859} 859}
860 860
861void vmi_activate(void) 861void __init vmi_activate(void)
862{ 862{
863 unsigned long flags; 863 unsigned long flags;
864 864
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 82c67559dde7..3eba7f7bac05 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -178,14 +178,7 @@ SECTIONS
178 __initramfs_end = .; 178 __initramfs_end = .;
179 } 179 }
180#endif 180#endif
181 . = ALIGN(PAGE_SIZE); 181 PERCPU(PAGE_SIZE)
182 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
183 __per_cpu_start = .;
184 *(.data.percpu.page_aligned)
185 *(.data.percpu)
186 *(.data.percpu.shared_aligned)
187 __per_cpu_end = .;
188 }
189 . = ALIGN(PAGE_SIZE); 182 . = ALIGN(PAGE_SIZE);
190 /* freed after init ends here */ 183 /* freed after init ends here */
191 184
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 1a614c0e6bef..c9740996430a 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -5,6 +5,7 @@
5#define LOAD_OFFSET __START_KERNEL_map 5#define LOAD_OFFSET __START_KERNEL_map
6 6
7#include <asm-generic/vmlinux.lds.h> 7#include <asm-generic/vmlinux.lds.h>
8#include <asm/asm-offsets.h>
8#include <asm/page.h> 9#include <asm/page.h>
9 10
10#undef i386 /* in case the preprocessor is a 32bit one */ 11#undef i386 /* in case the preprocessor is a 32bit one */
@@ -13,12 +14,14 @@ OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
13OUTPUT_ARCH(i386:x86-64) 14OUTPUT_ARCH(i386:x86-64)
14ENTRY(phys_startup_64) 15ENTRY(phys_startup_64)
15jiffies_64 = jiffies; 16jiffies_64 = jiffies;
16_proxy_pda = 1;
17PHDRS { 17PHDRS {
18 text PT_LOAD FLAGS(5); /* R_E */ 18 text PT_LOAD FLAGS(5); /* R_E */
19 data PT_LOAD FLAGS(7); /* RWE */ 19 data PT_LOAD FLAGS(7); /* RWE */
20 user PT_LOAD FLAGS(7); /* RWE */ 20 user PT_LOAD FLAGS(7); /* RWE */
21 data.init PT_LOAD FLAGS(7); /* RWE */ 21 data.init PT_LOAD FLAGS(7); /* RWE */
22#ifdef CONFIG_SMP
23 percpu PT_LOAD FLAGS(7); /* RWE */
24#endif
22 note PT_NOTE FLAGS(0); /* ___ */ 25 note PT_NOTE FLAGS(0); /* ___ */
23} 26}
24SECTIONS 27SECTIONS
@@ -208,14 +211,28 @@ SECTIONS
208 __initramfs_end = .; 211 __initramfs_end = .;
209#endif 212#endif
210 213
214#ifdef CONFIG_SMP
215 /*
216 * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
217 * output PHDR, so the next output section - __data_nosave - should
218 * switch it back to data.init. Also, pda should be at the head of
219 * percpu area. Preallocate it and define the percpu offset symbol
220 * so that it can be accessed as a percpu variable.
221 */
222 . = ALIGN(PAGE_SIZE);
223 PERCPU_VADDR(0, :percpu)
224#else
211 PERCPU(PAGE_SIZE) 225 PERCPU(PAGE_SIZE)
226#endif
212 227
213 . = ALIGN(PAGE_SIZE); 228 . = ALIGN(PAGE_SIZE);
214 __init_end = .; 229 __init_end = .;
215 230
216 . = ALIGN(PAGE_SIZE); 231 . = ALIGN(PAGE_SIZE);
217 __nosave_begin = .; 232 __nosave_begin = .;
218 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) } 233 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
234 *(.data.nosave)
235 } :data.init /* switch back to data.init, see PERCPU_VADDR() above */
219 . = ALIGN(PAGE_SIZE); 236 . = ALIGN(PAGE_SIZE);
220 __nosave_end = .; 237 __nosave_end = .;
221 238
@@ -244,3 +261,8 @@ SECTIONS
244 */ 261 */
245ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), 262ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
246 "kernel image bigger than KERNEL_IMAGE_SIZE") 263 "kernel image bigger than KERNEL_IMAGE_SIZE")
264
265#ifdef CONFIG_SMP
266ASSERT((per_cpu__irq_stack_union == 0),
267 "irq_stack_union is not at start of per-cpu area");
268#endif
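
The comment and ASSERT added to vmlinux_64.lds.S encode two requirements: on SMP the per-cpu section is linked at virtual address 0 (PERCPU_VADDR(0, :percpu)), so every per-cpu symbol's link address is simply its offset into the area, and irq_stack_union must be the first object so that its offset is 0, mirroring the comment's requirement that the pda replacement sit at the head of the per-cpu area. The toy program below models only that addressing scheme; the names, sizes and offsets are invented for the sketch.

/*
 * Toy model of the zero-based per-cpu addressing described in the
 * PERCPU_VADDR() comment above.  Every "link address" is just an
 * offset into the per-cpu area; the real kernel adds it to a per-cpu
 * base held in %gs.  Names, sizes and offsets are invented for the
 * sketch.
 */
#include <stdio.h>
#include <stdlib.h>

#define NCPUS       4
#define PERCPU_SIZE 64

/* zero-based offsets inside the per-cpu section */
enum { off_irq_stack_union = 0, off_tlbstate = 32 };

static char *percpu_base[NCPUS];        /* what %gs would hold per cpu */

static void *toy_per_cpu_ptr(int cpu, int off)
{
    return percpu_base[cpu] + off;      /* base + zero-based offset */
}

int main(void)
{
    int cpu;

    for (cpu = 0; cpu < NCPUS; cpu++) {
        percpu_base[cpu] = calloc(1, PERCPU_SIZE);
        /* what the linker-script ASSERT checks: first object at offset 0 */
        if (toy_per_cpu_ptr(cpu, off_irq_stack_union) != (void *)percpu_base[cpu])
            abort();
        sprintf(toy_per_cpu_ptr(cpu, off_tlbstate), "cpu%d state", cpu);
    }
    for (cpu = 0; cpu < NCPUS; cpu++) {
        printf("%s at base+%d\n",
               (char *)toy_per_cpu_ptr(cpu, off_tlbstate), off_tlbstate);
        free(percpu_base[cpu]);
    }
    return 0;
}

In the real kernel the per-cpu base lives in %gs rather than an array, but the arithmetic is the same: base plus zero-based offset, with the ASSERT guaranteeing that the first offset really is 0.
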
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 695e426aa354..3909e3ba5ce3 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -58,5 +58,3 @@ EXPORT_SYMBOL(__memcpy);
58EXPORT_SYMBOL(empty_zero_page); 58EXPORT_SYMBOL(empty_zero_page);
59EXPORT_SYMBOL(init_level4_pgt); 59EXPORT_SYMBOL(init_level4_pgt);
60EXPORT_SYMBOL(load_gs_index); 60EXPORT_SYMBOL(load_gs_index);
61
62EXPORT_SYMBOL(_proxy_pda);