author		Frederic Weisbecker <fweisbec@gmail.com>	2009-10-17 19:09:09 -0400
committer	Frederic Weisbecker <fweisbec@gmail.com>	2009-10-17 19:12:33 -0400
commit		0f8f86c7bdd1c954fbe153af437a0d91a6c5721a (patch)
tree		94a8d419a470a4f9852ca397bb9bbe48db92ff5c /arch/x86/kernel
parent		dca2d6ac09d9ef59ff46820d4f0c94b08a671202 (diff)
parent		f39cdf25bf77219676ec5360980ac40b1a7e144a (diff)
Merge commit 'perf/core' into perf/hw-breakpoint
Conflicts:
	kernel/Makefile
	kernel/trace/Makefile
	kernel/trace/trace.h
	samples/Makefile

Merge reason: We need to be up to date with the perf events development branch because we plan to rewrite the breakpoints API on top of perf events.
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/Makefile | 7
-rw-r--r--  arch/x86/kernel/acpi/cstate.c | 2
-rw-r--r--  arch/x86/kernel/acpi/realmode/wakeup.lds.S | 4
-rw-r--r--  arch/x86/kernel/apic/apic.c | 40
-rw-r--r--  arch/x86/kernel/apic/bigsmp_32.c | 2
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 63
-rw-r--r--  arch/x86/kernel/apic/nmi.c | 6
-rw-r--r--  arch/x86/kernel/apic/numaq_32.c | 57
-rw-r--r--  arch/x86/kernel/apic/probe_64.c | 15
-rw-r--r--  arch/x86/kernel/apic/summit_32.c | 2
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c | 11
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 4
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 12
-rw-r--r--  arch/x86/kernel/cpu/common.c | 5
-rw-r--r--  arch/x86/kernel/cpu/cpu_debug.c | 4
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 116
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 44
-rw-r--r--  arch/x86/kernel/cpu/hypervisor.c | 14
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 6
-rw-r--r--  arch/x86/kernel/cpu/mcheck/Makefile | 5
-rw-r--r--  arch/x86/kernel/cpu/mcheck/k7.c | 116
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-inject.c | 159
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-internal.h | 15
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-severity.c | 8
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 396
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd.c | 5
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel.c | 11
-rw-r--r--  arch/x86/kernel/cpu/mcheck/non-fatal.c | 94
-rw-r--r--  arch/x86/kernel/cpu/mcheck/p4.c | 163
-rw-r--r--  arch/x86/kernel/cpu/mcheck/p6.c | 127
-rw-r--r--  arch/x86/kernel/cpu/mcheck/therm_throt.c | 80
-rw-r--r--  arch/x86/kernel/cpu/mtrr/if.c | 29
-rw-r--r--  arch/x86/kernel/cpu/mtrr/main.c | 46
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c (renamed from arch/x86/kernel/cpu/perf_counter.c) | 781
-rw-r--r--  arch/x86/kernel/cpu/perfctr-watchdog.c | 2
-rw-r--r--  arch/x86/kernel/cpu/sched.c | 55
-rw-r--r--  arch/x86/kernel/cpu/vmware.c | 27
-rw-r--r--  arch/x86/kernel/cpuid.c | 4
-rw-r--r--  arch/x86/kernel/dumpstack_32.c | 1
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 1
-rw-r--r--  arch/x86/kernel/e820.c | 25
-rw-r--r--  arch/x86/kernel/early_printk.c | 788
-rw-r--r--  arch/x86/kernel/efi.c | 4
-rw-r--r--  arch/x86/kernel/entry_32.S | 7
-rw-r--r--  arch/x86/kernel/entry_64.S | 32
-rw-r--r--  arch/x86/kernel/ftrace.c | 84
-rw-r--r--  arch/x86/kernel/head32.c | 26
-rw-r--r--  arch/x86/kernel/head64.c | 2
-rw-r--r--  arch/x86/kernel/head_32.S | 7
-rw-r--r--  arch/x86/kernel/head_64.S | 4
-rw-r--r--  arch/x86/kernel/i386_ksyms_32.c | 10
-rw-r--r--  arch/x86/kernel/i8253.c | 19
-rw-r--r--  arch/x86/kernel/init_task.c | 5
-rw-r--r--  arch/x86/kernel/irq.c | 4
-rw-r--r--  arch/x86/kernel/irqinit.c | 40
-rw-r--r--  arch/x86/kernel/kvmclock.c | 11
-rw-r--r--  arch/x86/kernel/ldt.c | 4
-rw-r--r--  arch/x86/kernel/microcode_core.c | 6
-rw-r--r--  arch/x86/kernel/mpparse.c | 75
-rw-r--r--  arch/x86/kernel/mrst.c | 24
-rw-r--r--  arch/x86/kernel/msr.c | 4
-rw-r--r--  arch/x86/kernel/paravirt.c | 36
-rw-r--r--  arch/x86/kernel/pci-dma.c | 8
-rw-r--r--  arch/x86/kernel/pci-gart_64.c | 1
-rw-r--r--  arch/x86/kernel/pci-swiotlb.c | 5
-rw-r--r--  arch/x86/kernel/process.c | 31
-rw-r--r--  arch/x86/kernel/ptrace.c | 21
-rw-r--r--  arch/x86/kernel/quirks.c | 2
-rw-r--r--  arch/x86/kernel/reboot.c | 8
-rw-r--r--  arch/x86/kernel/rtc.c | 17
-rw-r--r--  arch/x86/kernel/setup.c | 131
-rw-r--r--  arch/x86/kernel/setup_percpu.c | 364
-rw-r--r--  arch/x86/kernel/sfi.c | 122
-rw-r--r--  arch/x86/kernel/signal.c | 2
-rw-r--r--  arch/x86/kernel/smpboot.c | 29
-rw-r--r--  arch/x86/kernel/syscall_table_32.S | 2
-rw-r--r--  arch/x86/kernel/tboot.c | 447
-rw-r--r--  arch/x86/kernel/time.c | 121
-rw-r--r--  arch/x86/kernel/time_32.c | 137
-rw-r--r--  arch/x86/kernel/time_64.c | 135
-rw-r--r--  arch/x86/kernel/trampoline.c | 12
-rw-r--r--  arch/x86/kernel/trampoline_32.S | 8
-rw-r--r--  arch/x86/kernel/trampoline_64.S | 7
-rw-r--r--  arch/x86/kernel/traps.c | 13
-rw-r--r--  arch/x86/kernel/tsc.c | 88
-rw-r--r--  arch/x86/kernel/tsc_sync.c | 2
-rw-r--r--  arch/x86/kernel/visws_quirks.c | 54
-rw-r--r--  arch/x86/kernel/vmi_32.c | 14
-rw-r--r--  arch/x86/kernel/vmiclock_32.c | 2
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S | 111
-rw-r--r--  arch/x86/kernel/vsyscall_64.c | 11
-rw-r--r--  arch/x86/kernel/x86_init.c | 75
92 files changed, 2469 insertions(+), 3267 deletions(-)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index bf04201b6575..4f2e66e29ecc 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -31,8 +31,8 @@ GCOV_PROFILE_paravirt.o := n
 
 obj-y			:= process_$(BITS).o signal.o entry_$(BITS).o
 obj-y			+= traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
-obj-y			+= time_$(BITS).o ioport.o ldt.o dumpstack.o
-obj-y			+= setup.o i8259.o irqinit.o
+obj-y			+= time.o ioport.o ldt.o dumpstack.o
+obj-y			+= setup.o x86_init.o i8259.o irqinit.o
 obj-$(CONFIG_X86_VISWS)	+= visws_quirks.o
 obj-$(CONFIG_X86_32)	+= probe_roms_32.o
 obj-$(CONFIG_X86_32)	+= sys_i386_32.o i386_ksyms_32.o
@@ -52,9 +52,11 @@ obj-$(CONFIG_X86_DS_SELFTEST) += ds_selftest.o
 obj-$(CONFIG_X86_32)		+= tls.o
 obj-$(CONFIG_IA32_EMULATION)	+= tls.o
 obj-y				+= step.o
+obj-$(CONFIG_INTEL_TXT)		+= tboot.o
 obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
 obj-y				+= cpu/
 obj-y				+= acpi/
+obj-$(CONFIG_SFI)		+= sfi.o
 obj-y				+= reboot.o
 obj-$(CONFIG_MCA)		+= mca_32.o
 obj-$(CONFIG_X86_MSR)		+= msr.o
@@ -104,6 +106,7 @@ obj-$(CONFIG_SCx200) += scx200.o
 scx200-y			+= scx200_32.o
 
 obj-$(CONFIG_OLPC)		+= olpc.o
+obj-$(CONFIG_X86_MRST)		+= mrst.o
 
 microcode-y			:= microcode_core.o
 microcode-$(CONFIG_MICROCODE_INTEL)	+= microcode_intel.o
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 8c44c232efcb..59cdfa4686b2 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -48,7 +48,7 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags,
 	 * P4, Core and beyond CPUs
 	 */
 	if (c->x86_vendor == X86_VENDOR_INTEL &&
-	    (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 14)))
+	    (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 14)))
 		flags->bm_control = 0;
 }
 EXPORT_SYMBOL(acpi_processor_power_init_bm_check);
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.lds.S b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
index 7da00b799cda..0e50e1e5c573 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.lds.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
@@ -56,6 +56,6 @@ SECTIONS
 	/DISCARD/ : {
 		*(.note*)
 	}
-
-	. = ASSERT(_end <= WAKEUP_SIZE, "Wakeup too big!");
 }
+
+ASSERT(_end <= WAKEUP_SIZE, "Wakeup too big!");
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 159740decc41..894aa97f0717 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -14,7 +14,7 @@
  *	Mikael Pettersson	:	PM converted to driver model.
  */
 
-#include <linux/perf_counter.h>
+#include <linux/perf_event.h>
 #include <linux/kernel_stat.h>
 #include <linux/mc146818rtc.h>
 #include <linux/acpi_pmtmr.h>
@@ -35,7 +35,8 @@
 #include <linux/smp.h>
 #include <linux/mm.h>
 
-#include <asm/perf_counter.h>
+#include <asm/perf_event.h>
+#include <asm/x86_init.h>
 #include <asm/pgalloc.h>
 #include <asm/atomic.h>
 #include <asm/mpspec.h>
@@ -61,7 +62,7 @@ unsigned int boot_cpu_physical_apicid = -1U;
 /*
  * The highest APIC ID seen during enumeration.
  *
- * This determines the messaging protocol we can use: if all APIC IDs
+ * On AMD, this determines the messaging protocol we can use: if all APIC IDs
  * are in the 0 ... 7 range, then we can use logical addressing which
  * has some performance advantages (better broadcasting).
  *
@@ -978,7 +979,7 @@ void lapic_shutdown(void)
 {
 	unsigned long flags;
 
-	if (!cpu_has_apic)
+	if (!cpu_has_apic && !apic_from_smp_config())
 		return;
 
 	local_irq_save(flags);
@@ -1188,7 +1189,7 @@ void __cpuinit setup_local_APIC(void)
 		apic_write(APIC_ESR, 0);
 	}
 #endif
-	perf_counters_lapic_init();
+	perf_events_lapic_init();
 
 	preempt_disable();
 
@@ -1196,8 +1197,7 @@ void __cpuinit setup_local_APIC(void)
 	 * Double-check whether this APIC is really registered.
 	 * This is meaningless in clustered apic mode, so we skip it.
 	 */
-	if (!apic->apic_id_registered())
-		BUG();
+	BUG_ON(!apic->apic_id_registered());
 
 	/*
 	 * Intel recommends to set DFR, LDR and TPR before enabling
@@ -1709,7 +1709,7 @@ int __init APIC_init_uniprocessor(void)
 	localise_nmi_watchdog();
 #endif
 
-	setup_boot_clock();
+	x86_init.timers.setup_percpu_clockev();
#ifdef CONFIG_X86_64
 	check_nmi_watchdog();
 #endif
@@ -1916,24 +1916,14 @@ void __cpuinit generic_processor_info(int apicid, int version)
 		max_physical_apicid = apicid;
 
 #ifdef CONFIG_X86_32
-	/*
-	 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
-	 * but we need to work other dependencies like SMP_SUSPEND etc
-	 * before this can be done without some confusion.
-	 * if (CPU_HOTPLUG_ENABLED || num_processors > 8)
-	 *       - Ashok Raj <ashok.raj@intel.com>
-	 */
-	if (max_physical_apicid >= 8) {
-		switch (boot_cpu_data.x86_vendor) {
-		case X86_VENDOR_INTEL:
-			if (!APIC_XAPIC(version)) {
-				def_to_bigsmp = 0;
-				break;
-			}
-			/* If P4 and above fall through */
-		case X86_VENDOR_AMD:
+	switch (boot_cpu_data.x86_vendor) {
+	case X86_VENDOR_INTEL:
+		if (num_processors > 8)
+			def_to_bigsmp = 1;
+		break;
+	case X86_VENDOR_AMD:
+		if (max_physical_apicid >= 8)
 			def_to_bigsmp = 1;
-		}
 	}
 #endif
 
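The setup_boot_clock() call above becomes an indirect call through the new x86_init ops table (added in this merge as arch/x86/kernel/x86_init.c). A minimal sketch of how a platform would override the per-CPU clockevent hook, assuming the x86_init layout shown in these hunks; the function names below are hypothetical, not from this diff:

	#include <asm/x86_init.h>

	/* Hypothetical platform hook; the x86_init.timers.setup_percpu_clockev
	 * member itself is taken from the hunks above. */
	static void __init my_setup_percpu_clockev(void)
	{
		/* platform-specific local APIC timer setup would go here */
	}

	static void __init my_platform_init(void)
	{
		x86_init.timers.setup_percpu_clockev = my_setup_percpu_clockev;
	}

The point of the indirection is that boot code keeps a single call site while quirky platforms swap in their own implementation at early-init time.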
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 676cdac385c0..77a06413b6b2 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -112,7 +112,7 @@ static physid_mask_t bigsmp_ioapic_phys_id_map(physid_mask_t phys_map)
 	return physids_promote(0xFFL);
 }
 
-static int bigsmp_check_phys_apicid_present(int boot_cpu_physical_apicid)
+static int bigsmp_check_phys_apicid_present(int phys_apicid)
 {
 	return 1;
 }
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 3c8f9e75d038..dc69f28489f5 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -96,6 +96,11 @@ struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
 /* # of MP IRQ source entries */
 int mp_irq_entries;
 
+/* Number of legacy interrupts */
+static int nr_legacy_irqs __read_mostly = NR_IRQS_LEGACY;
+/* GSI interrupts */
+static int nr_irqs_gsi = NR_IRQS_LEGACY;
+
 #if defined (CONFIG_MCA) || defined (CONFIG_EISA)
 int mp_bus_id_to_type[MAX_MP_BUSSES];
 #endif
@@ -173,6 +178,12 @@ static struct irq_cfg irq_cfgx[NR_IRQS] = {
 	[15] = { .vector = IRQ15_VECTOR, },
 };
 
+void __init io_apic_disable_legacy(void)
+{
+	nr_legacy_irqs = 0;
+	nr_irqs_gsi = 0;
+}
+
 int __init arch_early_irq_init(void)
 {
 	struct irq_cfg *cfg;
@@ -190,7 +201,7 @@ int __init arch_early_irq_init(void)
 		desc->chip_data = &cfg[i];
 		zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node);
 		zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
-		if (i < NR_IRQS_LEGACY)
+		if (i < nr_legacy_irqs)
 			cpumask_setall(cfg[i].domain);
 	}
 
@@ -216,17 +227,14 @@ static struct irq_cfg *get_one_free_irq_cfg(int node)
 
 	cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
 	if (cfg) {
-		if (!alloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) {
+		if (!zalloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) {
 			kfree(cfg);
 			cfg = NULL;
-		} else if (!alloc_cpumask_var_node(&cfg->old_domain,
+		} else if (!zalloc_cpumask_var_node(&cfg->old_domain,
 						   GFP_ATOMIC, node)) {
 			free_cpumask_var(cfg->domain);
 			kfree(cfg);
 			cfg = NULL;
-		} else {
-			cpumask_clear(cfg->domain);
-			cpumask_clear(cfg->old_domain);
 		}
 	}
 
@@ -867,7 +875,7 @@ static int __init find_isa_irq_apic(int irq, int type)
  */
 static int EISA_ELCR(unsigned int irq)
 {
-	if (irq < NR_IRQS_LEGACY) {
+	if (irq < nr_legacy_irqs) {
 		unsigned int port = 0x4d0 + (irq >> 3);
 		return (inb(port) >> (irq & 7)) & 1;
 	}
@@ -1464,7 +1472,7 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
 	}
 
 	ioapic_register_intr(irq, desc, trigger);
-	if (irq < NR_IRQS_LEGACY)
+	if (irq < nr_legacy_irqs)
 		disable_8259A_irq(irq);
 
 	ioapic_write_entry(apic_id, pin, entry);
@@ -1831,7 +1839,7 @@ __apicdebuginit(void) print_PIC(void)
 	unsigned int v;
 	unsigned long flags;
 
-	if (apic_verbosity == APIC_QUIET)
+	if (apic_verbosity == APIC_QUIET || !nr_legacy_irqs)
 		return;
 
 	printk(KERN_DEBUG "\nprinting PIC contents\n");
@@ -1863,7 +1871,7 @@ __apicdebuginit(int) print_all_ICs(void)
 	print_PIC();
 
 	/* don't print out if apic is not there */
-	if (!cpu_has_apic || disable_apic)
+	if (!cpu_has_apic && !apic_from_smp_config())
 		return 0;
 
 	print_all_local_APICs();
@@ -1894,6 +1902,10 @@ void __init enable_IO_APIC(void)
 		spin_unlock_irqrestore(&ioapic_lock, flags);
 		nr_ioapic_registers[apic] = reg_01.bits.entries+1;
 	}
+
+	if (!nr_legacy_irqs)
+		return;
+
 	for(apic = 0; apic < nr_ioapics; apic++) {
 		int pin;
 		/* See if any of the pins is in ExtINT mode */
@@ -1948,6 +1960,9 @@ void disable_IO_APIC(void)
 	 */
 	clear_IO_APIC();
 
+	if (!nr_legacy_irqs)
+		return;
+
 	/*
 	 * If the i8259 is routed through an IOAPIC
 	 * Put that IOAPIC in virtual wire mode
@@ -1981,7 +1996,7 @@ void disable_IO_APIC(void)
 	/*
 	 * Use virtual wire A mode when interrupt remapping is enabled.
 	 */
-	if (cpu_has_apic)
+	if (cpu_has_apic || apic_from_smp_config())
 		disconnect_bsp_APIC(!intr_remapping_enabled &&
 				ioapic_i8259.pin != -1);
 }
@@ -1994,7 +2009,7 @@ void disable_IO_APIC(void)
  *	by Matt Domsch <Matt_Domsch@dell.com>  Tue Dec 21 12:25:05 CST 1999
  */
 
-static void __init setup_ioapic_ids_from_mpc(void)
+void __init setup_ioapic_ids_from_mpc(void)
 {
 	union IO_APIC_reg_00 reg_00;
 	physid_mask_t phys_id_present_map;
@@ -2003,9 +2018,8 @@ static void __init setup_ioapic_ids_from_mpc(void)
 	unsigned char old_id;
 	unsigned long flags;
 
-	if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids())
+	if (acpi_ioapic)
 		return;
-
 	/*
 	 * Don't check I/O APIC IDs for xAPIC systems. They have
 	 * no meaning without the serial APIC bus.
@@ -2179,7 +2193,7 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
 	struct irq_cfg *cfg;
 
 	spin_lock_irqsave(&ioapic_lock, flags);
-	if (irq < NR_IRQS_LEGACY) {
+	if (irq < nr_legacy_irqs) {
 		disable_8259A_irq(irq);
 		if (i8259A_irq_pending(irq))
 			was_pending = 1;
@@ -2657,7 +2671,7 @@ static inline void init_IO_APIC_traps(void)
 		 * so default to an old-fashioned 8259
 		 * interrupt if we can..
 		 */
-		if (irq < NR_IRQS_LEGACY)
+		if (irq < nr_legacy_irqs)
 			make_8259A_irq(irq);
 		else
 			/* Strange. Oh, well.. */
@@ -2993,7 +3007,7 @@ out:
  * the I/O APIC in all cases now. No actual device should request
  * it anyway.  --macro
  */
-#define PIC_IRQS	(1 << PIC_CASCADE_IR)
+#define PIC_IRQS	(1UL << PIC_CASCADE_IR)
 
 void __init setup_IO_APIC(void)
 {
@@ -3001,21 +3015,19 @@ void __init setup_IO_APIC(void)
 	/*
 	 * calling enable_IO_APIC() is moved to setup_local_APIC for BP
 	 */
-
-	io_apic_irqs = ~PIC_IRQS;
+	io_apic_irqs = nr_legacy_irqs ? ~PIC_IRQS : ~0UL;
 
 	apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
 	/*
 	 * Set up IO-APIC IRQ routing.
 	 */
-#ifdef CONFIG_X86_32
-	if (!acpi_ioapic)
-		setup_ioapic_ids_from_mpc();
-#endif
+	x86_init.mpparse.setup_ioapic_ids();
+
 	sync_Arb_IDs();
 	setup_IO_APIC_irqs();
 	init_IO_APIC_traps();
-	check_timer();
+	if (nr_legacy_irqs)
+		check_timer();
 }
 
 /*
@@ -3116,7 +3128,6 @@ static int __init ioapic_init_sysfs(void)
 
 device_initcall(ioapic_init_sysfs);
 
-static int nr_irqs_gsi = NR_IRQS_LEGACY;
 /*
  * Dynamic irq allocate and deallocation
  */
@@ -3856,7 +3867,7 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq,
 	/*
 	 * IRQs < 16 are already in the irq_2_pin[] map
 	 */
-	if (irq >= NR_IRQS_LEGACY) {
+	if (irq >= nr_legacy_irqs) {
 		cfg = desc->chip_data;
 		if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) {
 			printk(KERN_INFO "can not add pin %d for irq %d\n",
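The compile-time NR_IRQS_LEGACY checks above give way to the runtime nr_legacy_irqs counter so that PIC-less platforms can opt out of all 8259 paths. A hedged sketch of the intended use; the caller below is hypothetical, only io_apic_disable_legacy() comes from the hunk above:

	/* Hypothetical early-boot hook for a platform without an i8259 PIC
	 * (e.g. the Moorestown support added elsewhere in this merge). */
	static void __init my_picless_platform_setup(void)
	{
		/* Zeroes nr_legacy_irqs and nr_irqs_gsi, so the 8259 setup,
		 * the ExtINT pin scan and check_timer() above are skipped. */
		io_apic_disable_legacy();
	}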
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index db7220220d09..7ff61d6a188a 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -66,7 +66,7 @@ static inline unsigned int get_nmi_count(int cpu)
 
 static inline int mce_in_progress(void)
 {
-#if defined(CONFIG_X86_NEW_MCE)
+#if defined(CONFIG_X86_MCE)
 	return atomic_read(&mce_entry) > 0;
 #endif
 	return 0;
@@ -508,14 +508,14 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
 /*
  * proc handler for /proc/sys/kernel/nmi
  */
-int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file,
+int proc_nmi_enabled(struct ctl_table *table, int write,
 			void __user *buffer, size_t *length, loff_t *ppos)
 {
 	int old_state;
 
 	nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
 	old_state = nmi_watchdog_enabled;
-	proc_dointvec(table, write, file, buffer, length, ppos);
+	proc_dointvec(table, write, buffer, length, ppos);
 	if (!!old_state == !!nmi_watchdog_enabled)
 		return 0;
 
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index ca96e68f0d23..efa00e2b8505 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -66,7 +66,6 @@ struct mpc_trans {
 	unsigned short trans_reserved;
 };
 
-/* x86_quirks member */
 static int mpc_record;
 
 static struct mpc_trans *translation_table[MAX_MPC_ENTRY];
@@ -130,10 +129,9 @@ void __cpuinit numaq_tsc_disable(void)
 	}
 }
 
-static int __init numaq_pre_time_init(void)
+static void __init numaq_tsc_init(void)
 {
 	numaq_tsc_disable();
-	return 0;
 }
 
 static inline int generate_logical_apicid(int quad, int phys_apicid)
@@ -177,6 +175,19 @@ static void mpc_oem_pci_bus(struct mpc_bus *m)
 	quad_local_to_mp_bus_id[quad][local] = m->busid;
 }
 
+/*
+ * Called from mpparse code.
+ * mode = 0: prescan
+ * mode = 1: one mpc entry scanned
+ */
+static void numaq_mpc_record(unsigned int mode)
+{
+	if (!mode)
+		mpc_record = 0;
+	else
+		mpc_record++;
+}
+
 static void __init MP_translation_info(struct mpc_trans *m)
 {
 	printk(KERN_INFO
@@ -206,9 +217,9 @@ static int __init mpf_checksum(unsigned char *mp, int len)
 /*
  * Read/parse the MPC oem tables
  */
-static void __init
- smp_read_mpc_oem(struct mpc_oemtable *oemtable, unsigned short oemsize)
+static void __init smp_read_mpc_oem(struct mpc_table *mpc)
 {
+	struct mpc_oemtable *oemtable = (void *)(long)mpc->oemptr;
 	int count = sizeof(*oemtable); /* the header size */
 	unsigned char *oemptr = ((unsigned char *)oemtable) + count;
 
@@ -250,29 +261,6 @@ static void __init
 	}
 }
 
-static int __init numaq_setup_ioapic_ids(void)
-{
-	/* so can skip it */
-	return 1;
-}
-
-static struct x86_quirks numaq_x86_quirks __initdata = {
-	.arch_pre_time_init	= numaq_pre_time_init,
-	.arch_time_init		= NULL,
-	.arch_pre_intr_init	= NULL,
-	.arch_memory_setup	= NULL,
-	.arch_intr_init		= NULL,
-	.arch_trap_init		= NULL,
-	.mach_get_smp_config	= NULL,
-	.mach_find_smp_config	= NULL,
-	.mpc_record		= &mpc_record,
-	.mpc_apic_id		= mpc_apic_id,
-	.mpc_oem_bus_info	= mpc_oem_bus_info,
-	.mpc_oem_pci_bus	= mpc_oem_pci_bus,
-	.smp_read_mpc_oem	= smp_read_mpc_oem,
-	.setup_ioapic_ids	= numaq_setup_ioapic_ids,
-};
-
 static __init void early_check_numaq(void)
 {
 	/*
@@ -286,8 +274,15 @@ static __init void early_check_numaq(void)
 	if (smp_found_config)
 		early_get_smp_config();
 
-	if (found_numaq)
-		x86_quirks = &numaq_x86_quirks;
+	if (found_numaq) {
+		x86_init.mpparse.mpc_record = numaq_mpc_record;
+		x86_init.mpparse.setup_ioapic_ids = x86_init_noop;
+		x86_init.mpparse.mpc_apic_id = mpc_apic_id;
+		x86_init.mpparse.smp_read_mpc_oem = smp_read_mpc_oem;
+		x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus;
+		x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info;
+		x86_init.timers.tsc_pre_init = numaq_tsc_init;
+	}
 }
 
 int __init get_memcfg_numaq(void)
@@ -418,7 +413,7 @@ static inline physid_mask_t numaq_apicid_to_cpu_present(int logical_apicid)
 /* Where the IO area was mapped on multiquad, always 0 otherwise */
 void *xquad_portio;
 
-static inline int numaq_check_phys_apicid_present(int boot_cpu_physical_apicid)
+static inline int numaq_check_phys_apicid_present(int phys_apicid)
 {
 	return 1;
 }
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index 65edc180fc82..c4cbd3080c1c 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -64,16 +64,23 @@ void __init default_setup_apic_routing(void)
 		apic = &apic_x2apic_phys;
 	else
 		apic = &apic_x2apic_cluster;
-	printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
 	}
 #endif
 
 	if (apic == &apic_flat) {
-		if (max_physical_apicid >= 8)
-			apic = &apic_physflat;
-		printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
+		switch (boot_cpu_data.x86_vendor) {
+		case X86_VENDOR_INTEL:
+			if (num_processors > 8)
+				apic = &apic_physflat;
+			break;
+		case X86_VENDOR_AMD:
+			if (max_physical_apicid >= 8)
+				apic = &apic_physflat;
+		}
 	}
 
+	printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
+
 	if (is_vsmp_box()) {
 		/* need to update phys_pkg_id */
 		apic->phys_pkg_id = apicid_phys_pkg_id;
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index eafdfbd1ea95..645ecc4ff0be 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -272,7 +272,7 @@ static physid_mask_t summit_apicid_to_cpu_present(int apicid)
272 return physid_mask_of_physid(0); 272 return physid_mask_of_physid(0);
273} 273}
274 274
275static int summit_check_phys_apicid_present(int boot_cpu_physical_apicid) 275static int summit_check_phys_apicid_present(int physical_apicid)
276{ 276{
277 return 1; 277 return 1;
278} 278}
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 601159374e87..f5f5886a6b53 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -389,6 +389,16 @@ static __init void map_gru_high(int max_pnode)
 	map_high("GRU", gru.s.base, shift, max_pnode, map_wb);
 }
 
+static __init void map_mmr_high(int max_pnode)
+{
+	union uvh_rh_gam_mmr_overlay_config_mmr_u mmr;
+	int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT;
+
+	mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
+	if (mmr.s.enable)
+		map_high("MMR", mmr.s.base, shift, max_pnode, map_uc);
+}
+
 static __init void map_mmioh_high(int max_pnode)
 {
 	union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
@@ -643,6 +653,7 @@ void __init uv_system_init(void)
 	}
 
 	map_gru_high(max_pnode);
+	map_mmr_high(max_pnode);
 	map_mmioh_high(max_pnode);
 
 	uv_cpu_init();
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index c1f253dac155..68537e957a9b 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -13,7 +13,7 @@ CFLAGS_common.o := $(nostackp)
 
 obj-y			:= intel_cacheinfo.o addon_cpuid_features.o
 obj-y			+= proc.o capflags.o powerflags.o common.o
-obj-y			+= vmware.o hypervisor.o
+obj-y			+= vmware.o hypervisor.o sched.o
 
 obj-$(CONFIG_X86_32)	+= bugs.o cmpxchg.o
 obj-$(CONFIG_X86_64)	+= bugs_64.o
@@ -27,7 +27,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o
 obj-$(CONFIG_CPU_SUP_TRANSMETA_32)	+= transmeta.o
 obj-$(CONFIG_CPU_SUP_UMC_32)		+= umc.o
 
-obj-$(CONFIG_PERF_COUNTERS)		+= perf_counter.o
+obj-$(CONFIG_PERF_EVENTS)		+= perf_event.o
 
 obj-$(CONFIG_X86_MCE)			+= mcheck/
 obj-$(CONFIG_MTRR)			+= mtrr/
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 22a47c82f3c0..c910a716a71c 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -184,7 +184,7 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
 	 * approved Athlon
 	 */
 	WARN_ONCE(1, "WARNING: This combination of AMD"
-		"processors is not suitable for SMP.\n");
+		" processors is not suitable for SMP.\n");
 	if (!test_taint(TAINT_UNSAFE_SMP))
 		add_taint(TAINT_UNSAFE_SMP);
 
@@ -333,6 +333,16 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
 #endif
 }
 
+int amd_get_nb_id(int cpu)
+{
+	int id = 0;
+#ifdef CONFIG_SMP
+	id = per_cpu(cpu_llc_id, cpu);
+#endif
+	return id;
+}
+EXPORT_SYMBOL_GPL(amd_get_nb_id);
+
 static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
 {
 #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
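The new amd_get_nb_id() export gives other kernel code a stable way to map a CPU to its AMD northbridge through the shared last-level-cache id. A hedged usage sketch; the caller below is hypothetical, only amd_get_nb_id() comes from the hunk above:

	/* Hypothetical consumer: test whether two CPUs sit behind the
	 * same northbridge. */
	static bool same_northbridge(int cpu_a, int cpu_b)
	{
		return amd_get_nb_id(cpu_a) == amd_get_nb_id(cpu_b);
	}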
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 2055fc2b2e6b..cc25c2b4a567 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -13,7 +13,7 @@
 #include <linux/io.h>
 
 #include <asm/stackprotector.h>
-#include <asm/perf_counter.h>
+#include <asm/perf_event.h>
 #include <asm/mmu_context.h>
 #include <asm/hypervisor.h>
 #include <asm/processor.h>
@@ -34,7 +34,6 @@
 #include <asm/mce.h>
 #include <asm/msr.h>
 #include <asm/pat.h>
-#include <linux/smp.h>
 
 #ifdef CONFIG_X86_LOCAL_APIC
 #include <asm/uv/uv.h>
@@ -870,7 +869,7 @@ void __init identify_boot_cpu(void)
 #else
 	vgetcpu_set_mode();
 #endif
-	init_hw_perf_counters();
+	init_hw_perf_events();
 }
 
 void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
index 6b2a52dd0403..dca325c03999 100644
--- a/arch/x86/kernel/cpu/cpu_debug.c
+++ b/arch/x86/kernel/cpu/cpu_debug.c
@@ -30,8 +30,8 @@
 #include <asm/apic.h>
 #include <asm/desc.h>
 
-static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]);
-static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]);
+static DEFINE_PER_CPU(struct cpu_cpuX_base [CPU_REG_ALL_BIT], cpu_arr);
+static DEFINE_PER_CPU(struct cpu_private * [MAX_CPU_FILES], priv_arr);
 static DEFINE_PER_CPU(int, cpu_priv_count);
 
 static DEFINE_MUTEX(cpu_debug_lock);
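The hunk above tracks a per-cpu API change in this merge window: for array variables, DEFINE_PER_CPU now wants the array bounds attached to the type argument rather than to the name. A minimal before/after sketch with a hypothetical variable:

	/* old spelling: static DEFINE_PER_CPU(int, my_vals[4]); */
	static DEFINE_PER_CPU(int [4], my_vals);	/* new spelling */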
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index ae9b503220ca..7d5c3b0ea8da 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -33,7 +33,7 @@
 #include <linux/cpufreq.h>
 #include <linux/compiler.h>
 #include <linux/dmi.h>
-#include <trace/power.h>
+#include <trace/events/power.h>
 
 #include <linux/acpi.h>
 #include <linux/io.h>
@@ -60,7 +60,6 @@ enum {
 };
 
 #define INTEL_MSR_RANGE		(0xffff)
-#define CPUID_6_ECX_APERFMPERF_CAPABILITY	(0x1)
 
 struct acpi_cpufreq_data {
 	struct acpi_processor_performance *acpi_data;
@@ -71,13 +70,7 @@ struct acpi_cpufreq_data {
 
 static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data);
 
-struct acpi_msr_data {
-	u64 saved_aperf, saved_mperf;
-};
-
-static DEFINE_PER_CPU(struct acpi_msr_data, msr_data);
-
-DEFINE_TRACE(power_mark);
+static DEFINE_PER_CPU(struct aperfmperf, old_perf);
 
 /* acpi_perf_data is a pointer to percpu data. */
 static struct acpi_processor_performance *acpi_perf_data;
@@ -244,23 +237,12 @@ static u32 get_cur_val(const struct cpumask *mask)
 	return cmd.val;
 }
 
-struct perf_pair {
-	union {
-		struct {
-			u32 lo;
-			u32 hi;
-		} split;
-		u64 whole;
-	} aperf, mperf;
-};
-
 /* Called via smp_call_function_single(), on the target CPU */
 static void read_measured_perf_ctrs(void *_cur)
 {
-	struct perf_pair *cur = _cur;
+	struct aperfmperf *am = _cur;
 
-	rdmsr(MSR_IA32_APERF, cur->aperf.split.lo, cur->aperf.split.hi);
-	rdmsr(MSR_IA32_MPERF, cur->mperf.split.lo, cur->mperf.split.hi);
+	get_aperfmperf(am);
 }
 
 /*
@@ -279,63 +261,17 @@ static void read_measured_perf_ctrs(void *_cur)
 static unsigned int get_measured_perf(struct cpufreq_policy *policy,
 				      unsigned int cpu)
 {
-	struct perf_pair readin, cur;
-	unsigned int perf_percent;
+	struct aperfmperf perf;
+	unsigned long ratio;
 	unsigned int retval;
 
-	if (smp_call_function_single(cpu, read_measured_perf_ctrs, &readin, 1))
+	if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
 		return 0;
 
-	cur.aperf.whole = readin.aperf.whole -
-				per_cpu(msr_data, cpu).saved_aperf;
-	cur.mperf.whole = readin.mperf.whole -
-				per_cpu(msr_data, cpu).saved_mperf;
-	per_cpu(msr_data, cpu).saved_aperf = readin.aperf.whole;
-	per_cpu(msr_data, cpu).saved_mperf = readin.mperf.whole;
-
-#ifdef __i386__
-	/*
-	 * We dont want to do 64 bit divide with 32 bit kernel
-	 * Get an approximate value. Return failure in case we cannot get
-	 * an approximate value.
-	 */
-	if (unlikely(cur.aperf.split.hi || cur.mperf.split.hi)) {
-		int shift_count;
-		u32 h;
-
-		h = max_t(u32, cur.aperf.split.hi, cur.mperf.split.hi);
-		shift_count = fls(h);
-
-		cur.aperf.whole >>= shift_count;
-		cur.mperf.whole >>= shift_count;
-	}
-
-	if (((unsigned long)(-1) / 100) < cur.aperf.split.lo) {
-		int shift_count = 7;
-		cur.aperf.split.lo >>= shift_count;
-		cur.mperf.split.lo >>= shift_count;
-	}
-
-	if (cur.aperf.split.lo && cur.mperf.split.lo)
-		perf_percent = (cur.aperf.split.lo * 100) / cur.mperf.split.lo;
-	else
-		perf_percent = 0;
-
-#else
-	if (unlikely(((unsigned long)(-1) / 100) < cur.aperf.whole)) {
-		int shift_count = 7;
-		cur.aperf.whole >>= shift_count;
-		cur.mperf.whole >>= shift_count;
-	}
-
-	if (cur.aperf.whole && cur.mperf.whole)
-		perf_percent = (cur.aperf.whole * 100) / cur.mperf.whole;
-	else
-		perf_percent = 0;
-
-#endif
+	ratio = calc_aperfmperf_ratio(&per_cpu(old_perf, cpu), &perf);
+	per_cpu(old_perf, cpu) = perf;
 
-	retval = (policy->cpuinfo.max_freq * perf_percent) / 100;
+	retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
 
 	return retval;
 }
@@ -394,7 +330,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
 	unsigned int next_perf_state = 0; /* Index into perf table */
 	unsigned int i;
 	int result = 0;
-	struct power_trace it;
 
 	dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu);
 
@@ -426,7 +361,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
 		}
 	}
 
-	trace_power_mark(&it, POWER_PSTATE, next_perf_state);
+	trace_power_frequency(POWER_PSTATE, data->freq_table[next_state].frequency);
 
 	switch (data->cpu_feature) {
 	case SYSTEM_INTEL_MSR_CAPABLE:
@@ -588,6 +523,21 @@ static const struct dmi_system_id sw_any_bug_dmi_table[] = {
 	},
 	{ }
 };
+
+static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c)
+{
+	/* http://www.intel.com/Assets/PDF/specupdate/314554.pdf
+	 * AL30: A Machine Check Exception (MCE) Occurring during an
+	 * Enhanced Intel SpeedStep Technology Ratio Change May Cause
+	 * Both Processor Cores to Lock Up when HT is enabled*/
+	if (c->x86_vendor == X86_VENDOR_INTEL) {
+		if ((c->x86 == 15) &&
+		    (c->x86_model == 6) &&
+		    (c->x86_mask == 8) && smt_capable())
+			return -ENODEV;
+	}
+	return 0;
+}
 #endif
 
 static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
@@ -602,6 +552,12 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
 
 	dprintk("acpi_cpufreq_cpu_init\n");
 
+#ifdef CONFIG_SMP
+	result = acpi_cpufreq_blacklist(c);
+	if (result)
+		return result;
+#endif
+
 	data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL);
 	if (!data)
 		return -ENOMEM;
@@ -731,12 +687,8 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	acpi_processor_notify_smm(THIS_MODULE);
 
 	/* Check for APERF/MPERF support in hardware */
-	if (c->x86_vendor == X86_VENDOR_INTEL && c->cpuid_level >= 6) {
-		unsigned int ecx;
-		ecx = cpuid_ecx(6);
-		if (ecx & CPUID_6_ECX_APERFMPERF_CAPABILITY)
-			acpi_cpufreq_driver.getavg = get_measured_perf;
-	}
+	if (cpu_has(c, X86_FEATURE_APERFMPERF))
+		acpi_cpufreq_driver.getavg = get_measured_perf;
 
 	dprintk("CPU%u - ACPI performance management activated.\n", cpu);
 	for (i = 0; i < perf->state_count; i++)
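The open-coded APERF/MPERF bookkeeping, including the 32-bit divide workaround, is folded into shared helpers, and the ratio now comes back as a fixed-point value scaled by APERFMPERF_SHIFT, so the average frequency falls out of a shift instead of a divide. A sketch of the arithmetic, assuming the helper semantics implied by the hunks above; the wrapper function itself is hypothetical:

	/* ratio is (delta_aperf / delta_mperf) << APERFMPERF_SHIFT, as
	 * returned by calc_aperfmperf_ratio() in the hunks above. */
	static unsigned int scale_freq(unsigned int max_freq_khz,
				       unsigned long ratio)
	{
		return (max_freq_khz * ratio) >> APERFMPERF_SHIFT;
	}

Keeping the ratio in fixed point sidesteps the old __i386__ special case entirely, since no 64-bit divide is ever needed.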
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 2a50ef891000..6394aa5c7985 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -605,9 +605,10 @@ static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst,
 	return 0;
 }
 
-static void invalidate_entry(struct powernow_k8_data *data, unsigned int entry)
+static void invalidate_entry(struct cpufreq_frequency_table *powernow_table,
+		unsigned int entry)
 {
-	data->powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID;
+	powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID;
 }
 
 static void print_basics(struct powernow_k8_data *data)
@@ -854,6 +855,10 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
 		goto err_out;
 	}
 
+	/* fill in data */
+	data->numps = data->acpi_data.state_count;
+	powernow_k8_acpi_pst_values(data, 0);
+
 	if (cpu_family == CPU_HW_PSTATE)
 		ret_val = fill_powernow_table_pstate(data, powernow_table);
 	else
@@ -866,11 +871,8 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
 	powernow_table[data->acpi_data.state_count].index = 0;
 	data->powernow_table = powernow_table;
 
-	/* fill in data */
-	data->numps = data->acpi_data.state_count;
 	if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu)
 		print_basics(data);
-	powernow_k8_acpi_pst_values(data, 0);
 
 	/* notify BIOS that we exist */
 	acpi_processor_notify_smm(THIS_MODULE);
@@ -914,13 +916,13 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data,
 				"bad value %d.\n", i, index);
 			printk(KERN_ERR PFX "Please report to BIOS "
 					"manufacturer\n");
-			invalidate_entry(data, i);
+			invalidate_entry(powernow_table, i);
 			continue;
 		}
 		rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi);
 		if (!(hi & HW_PSTATE_VALID_MASK)) {
 			dprintk("invalid pstate %d, ignoring\n", index);
-			invalidate_entry(data, i);
+			invalidate_entry(powernow_table, i);
 			continue;
 		}
 
@@ -941,7 +943,6 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data,
 		struct cpufreq_frequency_table *powernow_table)
 {
 	int i;
-	int cntlofreq = 0;
 
 	for (i = 0; i < data->acpi_data.state_count; i++) {
 		u32 fid;
@@ -970,7 +971,7 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data,
 		/* verify frequency is OK */
 		if ((freq > (MAX_FREQ * 1000)) || (freq < (MIN_FREQ * 1000))) {
 			dprintk("invalid freq %u kHz, ignoring\n", freq);
-			invalidate_entry(data, i);
+			invalidate_entry(powernow_table, i);
 			continue;
 		}
 
@@ -978,38 +979,17 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data,
 		 * BIOSs are using "off" to indicate invalid */
 		if (vid == VID_OFF) {
 			dprintk("invalid vid %u, ignoring\n", vid);
-			invalidate_entry(data, i);
+			invalidate_entry(powernow_table, i);
 			continue;
 		}
 
-		/* verify only 1 entry from the lo frequency table */
-		if (fid < HI_FID_TABLE_BOTTOM) {
-			if (cntlofreq) {
-				/* if both entries are the same,
-				 * ignore this one ... */
-				if ((freq != powernow_table[cntlofreq].frequency) ||
-				    (index != powernow_table[cntlofreq].index)) {
-					printk(KERN_ERR PFX
-						"Too many lo freq table "
-						"entries\n");
-					return 1;
-				}
-
-				dprintk("double low frequency table entry, "
-					"ignoring it.\n");
-				invalidate_entry(data, i);
-				continue;
-			} else
-				cntlofreq = i;
-		}
-
 		if (freq != (data->acpi_data.states[i].core_frequency * 1000)) {
 			printk(KERN_INFO PFX "invalid freq entries "
 				"%u kHz vs. %u kHz\n", freq,
 				(unsigned int)
 				(data->acpi_data.states[i].core_frequency
 				* 1000));
-			invalidate_entry(data, i);
+			invalidate_entry(powernow_table, i);
 			continue;
 		}
 	}
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 93ba8eeb100a..08be922de33a 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -34,13 +34,6 @@ detect_hypervisor_vendor(struct cpuinfo_x86 *c)
 	c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE;
 }
 
-unsigned long get_hypervisor_tsc_freq(void)
-{
-	if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE)
-		return vmware_get_tsc_khz();
-	return 0;
-}
-
 static inline void __cpuinit
 hypervisor_set_feature_bits(struct cpuinfo_x86 *c)
 {
@@ -55,3 +48,10 @@ void __cpuinit init_hypervisor(struct cpuinfo_x86 *c)
 	detect_hypervisor_vendor(c);
 	hypervisor_set_feature_bits(c);
 }
+
+void __init init_hypervisor_platform(void)
+{
+	init_hypervisor(&boot_cpu_data);
+	if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE)
+		vmware_platform_setup();
+}
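init_hypervisor_platform() bundles hypervisor detection with platform setup, so the TSC-frequency special case above can move behind VMware's own platform hooks. A hedged sketch of the expected call site; placing it in the setup_arch() path is an assumption on my part, not something this diff confirms:

	/* Hypothetical excerpt of early boot code (exact call site is
	 * an assumption): */
	static void __init my_boot_fragment(void)
	{
		init_hypervisor_platform();	/* detect, then
						 * vmware_platform_setup() */
	}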
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 80a722a071b5..40e1835b35e8 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -350,6 +350,12 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
350 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); 350 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
351 } 351 }
352 352
353 if (c->cpuid_level > 6) {
354 unsigned ecx = cpuid_ecx(6);
355 if (ecx & 0x01)
356 set_cpu_cap(c, X86_FEATURE_APERFMPERF);
357 }
358
353 if (cpu_has_xmm2) 359 if (cpu_has_xmm2)
354 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); 360 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
355 if (cpu_has_ds) { 361 if (cpu_has_ds) {
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index 188a1ca5ad2b..4ac6d48fe11b 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -1,11 +1,8 @@
-obj-y				=  mce.o
+obj-y				=  mce.o mce-severity.o
 
-obj-$(CONFIG_X86_NEW_MCE)	+= mce-severity.o
-obj-$(CONFIG_X86_OLD_MCE)	+= k7.o p4.o p6.o
 obj-$(CONFIG_X86_ANCIENT_MCE)	+= winchip.o p5.o
 obj-$(CONFIG_X86_MCE_INTEL)	+= mce_intel.o
 obj-$(CONFIG_X86_MCE_AMD)	+= mce_amd.o
-obj-$(CONFIG_X86_MCE_NONFATAL)	+= non-fatal.o
 obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
 obj-$(CONFIG_X86_MCE_INJECT)	+= mce-inject.o
 
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c
deleted file mode 100644
index b945d5dbc609..000000000000
--- a/arch/x86/kernel/cpu/mcheck/k7.c
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Athlon specific Machine Check Exception Reporting
- * (C) Copyright 2002 Dave Jones <davej@redhat.com>
- */
-#include <linux/interrupt.h>
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/smp.h>
-
-#include <asm/processor.h>
-#include <asm/system.h>
-#include <asm/mce.h>
-#include <asm/msr.h>
-
-/* Machine Check Handler For AMD Athlon/Duron: */
-static void k7_machine_check(struct pt_regs *regs, long error_code)
-{
-	u32 alow, ahigh, high, low;
-	u32 mcgstl, mcgsth;
-	int recover = 1;
-	int i;
-
-	rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
-	if (mcgstl & (1<<0)) /* Recoverable ? */
-		recover = 0;
-
-	printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
-		smp_processor_id(), mcgsth, mcgstl);
-
-	for (i = 1; i < nr_mce_banks; i++) {
-		rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
-		if (high & (1<<31)) {
-			char misc[20];
-			char addr[24];
-
-			misc[0] = '\0';
-			addr[0] = '\0';
-
-			if (high & (1<<29))
-				recover |= 1;
-			if (high & (1<<25))
-				recover |= 2;
-			high &= ~(1<<31);
-
-			if (high & (1<<27)) {
-				rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
-				snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
-			}
-			if (high & (1<<26)) {
-				rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
-				snprintf(addr, 24, " at %08x%08x", ahigh, alow);
-			}
-
-			printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
-				smp_processor_id(), i, high, low, misc, addr);
-
-			/* Clear it: */
-			wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
-			/* Serialize: */
-			wmb();
-			add_taint(TAINT_MACHINE_CHECK);
-		}
-	}
-
-	if (recover & 2)
-		panic("CPU context corrupt");
-	if (recover & 1)
-		panic("Unable to continue");
-
-	printk(KERN_EMERG "Attempting to continue.\n");
-
-	mcgstl &= ~(1<<2);
-	wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
-}
-
-
-/* AMD K7 machine check is Intel like: */
-void amd_mcheck_init(struct cpuinfo_x86 *c)
-{
-	u32 l, h;
-	int i;
-
-	if (!cpu_has(c, X86_FEATURE_MCE))
-		return;
-
-	machine_check_vector = k7_machine_check;
-	/* Make sure the vector pointer is visible before we enable MCEs: */
-	wmb();
-
-	printk(KERN_INFO "Intel machine check architecture supported.\n");
-
-	rdmsr(MSR_IA32_MCG_CAP, l, h);
-	if (l & (1<<8)) /* Control register present ? */
-		wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
-	nr_mce_banks = l & 0xff;
-
-	/*
-	 * Clear status for MC index 0 separately, we don't touch CTL,
-	 * as some K7 Athlons cause spurious MCEs when its enabled:
-	 */
-	if (boot_cpu_data.x86 == 6) {
-		wrmsr(MSR_IA32_MC0_STATUS, 0x0, 0x0);
-		i = 1;
-	} else
-		i = 0;
-
-	for (; i < nr_mce_banks; i++) {
-		wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
-		wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
-	}
-
-	set_in_cr4(X86_CR4_MCE);
-	printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
-		smp_processor_id());
-}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index a3a235a53f09..472763d92098 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -18,7 +18,12 @@
 #include <linux/string.h>
 #include <linux/fs.h>
 #include <linux/smp.h>
+#include <linux/notifier.h>
+#include <linux/kdebug.h>
+#include <linux/cpu.h>
+#include <linux/sched.h>
 #include <asm/mce.h>
+#include <asm/apic.h>
 
 /* Update fake mce registers on current CPU. */
 static void inject_mce(struct mce *m)
@@ -39,44 +44,142 @@ static void inject_mce(struct mce *m)
 	i->finished = 1;
 }
 
-struct delayed_mce {
-	struct timer_list timer;
-	struct mce m;
-};
+static void raise_poll(struct mce *m)
+{
+	unsigned long flags;
+	mce_banks_t b;
 
-/* Inject mce on current CPU */
-static void raise_mce(unsigned long data)
+	memset(&b, 0xff, sizeof(mce_banks_t));
+	local_irq_save(flags);
+	machine_check_poll(0, &b);
+	local_irq_restore(flags);
+	m->finished = 0;
+}
+
+static void raise_exception(struct mce *m, struct pt_regs *pregs)
 {
-	struct delayed_mce *dm = (struct delayed_mce *)data;
-	struct mce *m = &dm->m;
-	int cpu = m->extcpu;
+	struct pt_regs regs;
+	unsigned long flags;
 
-	inject_mce(m);
-	if (m->status & MCI_STATUS_UC) {
-		struct pt_regs regs;
+	if (!pregs) {
 		memset(&regs, 0, sizeof(struct pt_regs));
 		regs.ip = m->ip;
 		regs.cs = m->cs;
+		pregs = &regs;
+	}
+	/* in mcheck exeception handler, irq will be disabled */
+	local_irq_save(flags);
+	do_machine_check(pregs, 0);
+	local_irq_restore(flags);
+	m->finished = 0;
+}
+
+static cpumask_t mce_inject_cpumask;
+
+static int mce_raise_notify(struct notifier_block *self,
+			    unsigned long val, void *data)
+{
+	struct die_args *args = (struct die_args *)data;
+	int cpu = smp_processor_id();
+	struct mce *m = &__get_cpu_var(injectm);
+	if (val != DIE_NMI_IPI || !cpu_isset(cpu, mce_inject_cpumask))
+		return NOTIFY_DONE;
+	cpu_clear(cpu, mce_inject_cpumask);
+	if (m->inject_flags & MCJ_EXCEPTION)
+		raise_exception(m, args->regs);
+	else if (m->status)
+		raise_poll(m);
+	return NOTIFY_STOP;
+}
+
+static struct notifier_block mce_raise_nb = {
+	.notifier_call = mce_raise_notify,
+	.priority = 1000,
+};
+
+/* Inject mce on current CPU */
+static int raise_local(void)
+{
+	struct mce *m = &__get_cpu_var(injectm);
+	int context = MCJ_CTX(m->inject_flags);
+	int ret = 0;
+	int cpu = m->extcpu;
+
+	if (m->inject_flags & MCJ_EXCEPTION) {
 		printk(KERN_INFO "Triggering MCE exception on CPU %d\n", cpu);
-		do_machine_check(&regs, 0);
+		switch (context) {
+		case MCJ_CTX_IRQ:
+			/*
+			 * Could do more to fake interrupts like
+			 * calling irq_enter, but the necessary
+			 * machinery isn't exported currently.
+			 */
+			/*FALL THROUGH*/
+		case MCJ_CTX_PROCESS:
+			raise_exception(m, NULL);
+			break;
+		default:
+			printk(KERN_INFO "Invalid MCE context\n");
+			ret = -EINVAL;
+		}
 		printk(KERN_INFO "MCE exception done on CPU %d\n", cpu);
-	} else {
-		mce_banks_t b;
-		memset(&b, 0xff, sizeof(mce_banks_t));
+	} else if (m->status) {
 		printk(KERN_INFO "Starting machine check poll CPU %d\n", cpu);
-		machine_check_poll(0, &b);
+		raise_poll(m);
 		mce_notify_irq();
-		printk(KERN_INFO "Finished machine check poll on CPU %d\n",
-		       cpu);
-	}
-	kfree(dm);
+		printk(KERN_INFO "Machine check poll done on CPU %d\n", cpu);
+	} else
+		m->finished = 0;
+
+	return ret;
+}
+
+static void raise_mce(struct mce *m)
+{
+	int context = MCJ_CTX(m->inject_flags);
+
+	inject_mce(m);
+
+	if (context == MCJ_CTX_RANDOM)
+		return;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+	if (m->inject_flags & MCJ_NMI_BROADCAST) {
+		unsigned long start;
+		int cpu;
+		get_online_cpus();
+		mce_inject_cpumask = cpu_online_map;
+		cpu_clear(get_cpu(), mce_inject_cpumask);
+		for_each_online_cpu(cpu) {
+			struct mce *mcpu = &per_cpu(injectm, cpu);
+			if (!mcpu->finished ||
+			    MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM)
+				cpu_clear(cpu, mce_inject_cpumask);
+		}
+		if (!cpus_empty(mce_inject_cpumask))
+			apic->send_IPI_mask(&mce_inject_cpumask, NMI_VECTOR);
+		start = jiffies;
+		while (!cpus_empty(mce_inject_cpumask)) {
+			if (!time_before(jiffies, start + 2*HZ)) {
+				printk(KERN_ERR
+				"Timeout waiting for mce inject NMI %lx\n",
+					*cpus_addr(mce_inject_cpumask));
+				break;
+			}
+			cpu_relax();
+		}
+		raise_local();
+		put_cpu();
+		put_online_cpus();
+	} else
+#endif
+		raise_local();
 }
 
 /* Error injection interface */
 static ssize_t mce_write(struct file *filp, const char __user *ubuf,
 			 size_t usize, loff_t *off)
 {
-	struct delayed_mce *dm;
 	struct mce m;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -96,19 +199,12 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf,
 	if (m.extcpu >= num_possible_cpus() || !cpu_online(m.extcpu))
 		return -EINVAL;
 
-	dm = kmalloc(sizeof(struct delayed_mce), GFP_KERNEL);
-	if (!dm)
-		return -ENOMEM;
-
 	/*
 	 * Need to give user space some time to set everything up,
 	 * so do it a jiffie or two later everywhere.
-	 * Should we use a hrtimer here for better synchronization?
107 */ 205 */
108 memcpy(&dm->m, &m, sizeof(struct mce)); 206 schedule_timeout(2);
109 setup_timer(&dm->timer, raise_mce, (unsigned long)dm); 207 raise_mce(&m);
110 dm->timer.expires = jiffies + 2;
111 add_timer_on(&dm->timer, m.extcpu);
112 return usize; 208 return usize;
113} 209}
114 210
@@ -116,6 +212,7 @@ static int inject_init(void)
116{ 212{
117 printk(KERN_INFO "Machine check injector initialized\n"); 213 printk(KERN_INFO "Machine check injector initialized\n");
118 mce_chrdev_ops.write = mce_write; 214 mce_chrdev_ops.write = mce_write;
215 register_die_notifier(&mce_raise_nb);
119 return 0; 216 return 0;
120} 217}
121 218
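
With the timer-based injection gone, a write to the mcelog character device now runs raise_mce() synchronously after a short schedule_timeout(). A hypothetical user-space sketch of driving it; the struct mce layout and the MCJ_* flags are assumed to come from a local copy of the kernel's asm/mce.h, and /dev/mcelog is the usual node for the MISC_MCELOG_MINOR device:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include "mce.h"	/* assumed local copy of the kernel's struct mce */

/* Inject one fake corrected error on an online CPU (needs root). */
static int inject_one(int target_cpu)
{
	struct mce m;
	int fd = open("/dev/mcelog", O_WRONLY);

	if (fd < 0)
		return -1;

	memset(&m, 0, sizeof(m));
	m.status = 1ULL << 63;			/* MCI_STATUS_VAL: a valid event */
	m.bank = 1;
	m.extcpu = target_cpu;			/* must be an online CPU */
	m.inject_flags = MCJ_CTX_PROCESS;	/* polled, not an exception */

	/* mce_write() copies this in, then raise_mce() polls the banks */
	if (write(fd, &m, sizeof(m)) != (ssize_t)sizeof(m)) {
		close(fd);
		return -1;
	}
	return close(fd);
}
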
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 54dcb8ff12e5..32996f9fab67 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -1,3 +1,4 @@
1#include <linux/sysdev.h>
1#include <asm/mce.h> 2#include <asm/mce.h>
2 3
3enum severity_level { 4enum severity_level {
@@ -10,6 +11,20 @@ enum severity_level {
10 MCE_PANIC_SEVERITY, 11 MCE_PANIC_SEVERITY,
11}; 12};
12 13
14#define ATTR_LEN 16
15
16/* One object for each MCE bank, shared by all CPUs */
17struct mce_bank {
18 u64 ctl; /* subevents to enable */
19 unsigned char init; /* initialise bank? */
20 struct sysdev_attribute attr; /* sysdev attribute */
21 char attrname[ATTR_LEN]; /* attribute name */
22};
23
13int mce_severity(struct mce *a, int tolerant, char **msg); 24int mce_severity(struct mce *a, int tolerant, char **msg);
25struct dentry *mce_get_debugfs_dir(void);
14 26
15extern int mce_ser; 27extern int mce_ser;
28
29extern struct mce_bank *mce_banks;
30
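
The attr member embedded in struct mce_bank is what lets the sysdev show/store callbacks recover their bank with container_of() instead of indexing a separate bank_attrs array. The pattern in a self-contained, user-space form (the types here are illustrative):

#include <stddef.h>
#include <stdio.h>

struct attr { const char *name; };
struct bank { unsigned long long ctl; struct attr attr; };

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static struct bank *attr_to_bank(struct attr *a)
{
	return container_of(a, struct bank, attr);
}

int main(void)
{
	struct bank b = { .ctl = ~0ULL, .attr = { "bank0" } };

	/* Recovers the owning bank from the embedded attribute. */
	printf("%llx\n", attr_to_bank(&b.attr)->ctl);
	return 0;
}
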
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index ff0807f97056..8a85dd1b1aa1 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -139,6 +139,7 @@ int mce_severity(struct mce *a, int tolerant, char **msg)
139 } 139 }
140} 140}
141 141
142#ifdef CONFIG_DEBUG_FS
142static void *s_start(struct seq_file *f, loff_t *pos) 143static void *s_start(struct seq_file *f, loff_t *pos)
143{ 144{
144 if (*pos >= ARRAY_SIZE(severities)) 145 if (*pos >= ARRAY_SIZE(severities))
@@ -197,7 +198,7 @@ static int __init severities_debugfs_init(void)
197{ 198{
198 struct dentry *dmce = NULL, *fseverities_coverage = NULL; 199 struct dentry *dmce = NULL, *fseverities_coverage = NULL;
199 200
200 dmce = debugfs_create_dir("mce", NULL); 201 dmce = mce_get_debugfs_dir();
201 if (dmce == NULL) 202 if (dmce == NULL)
202 goto err_out; 203 goto err_out;
203 fseverities_coverage = debugfs_create_file("severities-coverage", 204 fseverities_coverage = debugfs_create_file("severities-coverage",
@@ -209,10 +210,7 @@ static int __init severities_debugfs_init(void)
209 return 0; 210 return 0;
210 211
211err_out: 212err_out:
212 if (fseverities_coverage)
213 debugfs_remove(fseverities_coverage);
214 if (dmce)
215 debugfs_remove(dmce);
216 return -ENOMEM; 213 return -ENOMEM;
217} 214}
218late_initcall(severities_debugfs_init); 215late_initcall(severities_debugfs_init);
216#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 9bfe9d2ea615..b1598a9436d0 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -34,6 +34,7 @@
34#include <linux/smp.h> 34#include <linux/smp.h>
35#include <linux/fs.h> 35#include <linux/fs.h>
36#include <linux/mm.h> 36#include <linux/mm.h>
37#include <linux/debugfs.h>
37 38
38#include <asm/processor.h> 39#include <asm/processor.h>
39#include <asm/hw_irq.h> 40#include <asm/hw_irq.h>
@@ -45,21 +46,8 @@
45 46
46#include "mce-internal.h" 47#include "mce-internal.h"
47 48
48/* Handle unconfigured int18 (should never happen) */
49static void unexpected_machine_check(struct pt_regs *regs, long error_code)
50{
51 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
52 smp_processor_id());
53}
54
55/* Call the installed machine check handler for this CPU setup. */
56void (*machine_check_vector)(struct pt_regs *, long error_code) =
57 unexpected_machine_check;
58
59int mce_disabled __read_mostly; 49int mce_disabled __read_mostly;
60 50
61#ifdef CONFIG_X86_NEW_MCE
62
63#define MISC_MCELOG_MINOR 227 51#define MISC_MCELOG_MINOR 227
64 52
65#define SPINUNIT 100 /* 100ns */ 53#define SPINUNIT 100 /* 100ns */
@@ -77,7 +65,6 @@ DEFINE_PER_CPU(unsigned, mce_exception_count);
77 */ 65 */
78static int tolerant __read_mostly = 1; 66static int tolerant __read_mostly = 1;
79static int banks __read_mostly; 67static int banks __read_mostly;
80static u64 *bank __read_mostly;
81static int rip_msr __read_mostly; 68static int rip_msr __read_mostly;
82static int mce_bootlog __read_mostly = -1; 69static int mce_bootlog __read_mostly = -1;
83static int monarch_timeout __read_mostly = -1; 70static int monarch_timeout __read_mostly = -1;
@@ -87,28 +74,35 @@ int mce_cmci_disabled __read_mostly;
87int mce_ignore_ce __read_mostly; 74int mce_ignore_ce __read_mostly;
88int mce_ser __read_mostly; 75int mce_ser __read_mostly;
89 76
77struct mce_bank *mce_banks __read_mostly;
78
90/* User mode helper program triggered by machine check event */ 79/* User mode helper program triggered by machine check event */
91static unsigned long mce_need_notify; 80static unsigned long mce_need_notify;
92static char mce_helper[128]; 81static char mce_helper[128];
93static char *mce_helper_argv[2] = { mce_helper, NULL }; 82static char *mce_helper_argv[2] = { mce_helper, NULL };
94 83
95static unsigned long dont_init_banks;
96
97static DECLARE_WAIT_QUEUE_HEAD(mce_wait); 84static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
98static DEFINE_PER_CPU(struct mce, mces_seen); 85static DEFINE_PER_CPU(struct mce, mces_seen);
99static int cpu_missing; 86static int cpu_missing;
100 87
88static void default_decode_mce(struct mce *m)
89{
90 pr_emerg("No human readable MCE decoding support on this CPU type.\n");
91 pr_emerg("Run the message through 'mcelog --ascii' to decode.\n");
92}
93
94/*
95 * CPU/chipset specific EDAC code can register a callback here to print
96 * MCE errors in a human-readable form:
97 */
98void (*x86_mce_decode_callback)(struct mce *m) = default_decode_mce;
99EXPORT_SYMBOL(x86_mce_decode_callback);
101 100
102/* MCA banks polled by the period polling timer for corrected events */ 101/* MCA banks polled by the period polling timer for corrected events */
103DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { 102DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
104 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL 103 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
105}; 104};
106 105
107static inline int skip_bank_init(int i)
108{
109 return i < BITS_PER_LONG && test_bit(i, &dont_init_banks);
110}
111
112static DEFINE_PER_CPU(struct work_struct, mce_work); 106static DEFINE_PER_CPU(struct work_struct, mce_work);
113 107
114/* Do initial initialization of a struct mce */ 108/* Do initial initialization of a struct mce */
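
Since x86_mce_decode_callback is exported, CPU/chipset-specific EDAC code can replace default_decode_mce() at runtime. A hedged sketch of how a decoder module might hook in, assuming the extern declaration is visible via asm/mce.h; my_decode and the save/restore of the old pointer are illustrative, not an API this patch prescribes:

#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/mce.h>

static void (*orig_decode)(struct mce *m);

static void my_decode(struct mce *m)
{
	pr_emerg("bank %d status %016llx\n", m->bank,
		 (unsigned long long)m->status);
}

static int __init my_decoder_init(void)
{
	orig_decode = x86_mce_decode_callback;
	x86_mce_decode_callback = my_decode;
	return 0;
}

static void __exit my_decoder_exit(void)
{
	x86_mce_decode_callback = orig_decode;
}

module_init(my_decoder_init);
module_exit(my_decoder_exit);
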
@@ -183,59 +177,60 @@ void mce_log(struct mce *mce)
183 set_bit(0, &mce_need_notify); 177 set_bit(0, &mce_need_notify);
184} 178}
185 179
186void __weak decode_mce(struct mce *m)
187{
188 return;
189}
190
191static void print_mce(struct mce *m) 180static void print_mce(struct mce *m)
192{ 181{
193 printk(KERN_EMERG 182 pr_emerg("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
194 "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
195 m->extcpu, m->mcgstatus, m->bank, m->status); 183 m->extcpu, m->mcgstatus, m->bank, m->status);
184
196 if (m->ip) { 185 if (m->ip) {
197 printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", 186 pr_emerg("RIP%s %02x:<%016Lx> ",
198 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", 187 !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
199 m->cs, m->ip); 188 m->cs, m->ip);
189
200 if (m->cs == __KERNEL_CS) 190 if (m->cs == __KERNEL_CS)
201 print_symbol("{%s}", m->ip); 191 print_symbol("{%s}", m->ip);
202 printk(KERN_CONT "\n"); 192 pr_cont("\n");
203 } 193 }
204 printk(KERN_EMERG "TSC %llx ", m->tsc); 194
195 pr_emerg("TSC %llx ", m->tsc);
205 if (m->addr) 196 if (m->addr)
206 printk(KERN_CONT "ADDR %llx ", m->addr); 197 pr_cont("ADDR %llx ", m->addr);
207 if (m->misc) 198 if (m->misc)
208 printk(KERN_CONT "MISC %llx ", m->misc); 199 pr_cont("MISC %llx ", m->misc);
209 printk(KERN_CONT "\n");
210 printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
211 m->cpuvendor, m->cpuid, m->time, m->socketid,
212 m->apicid);
213 200
214 decode_mce(m); 201 pr_cont("\n");
202 pr_emerg("PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
203 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid);
204
205 /*
206 * Print out human-readable details about the MCE error,
207 * (if the CPU has an implementation for that):
208 */
209 x86_mce_decode_callback(m);
215} 210}
216 211
217static void print_mce_head(void) 212static void print_mce_head(void)
218{ 213{
219 printk(KERN_EMERG "\nHARDWARE ERROR\n"); 214 pr_emerg("\nHARDWARE ERROR\n");
220} 215}
221 216
222static void print_mce_tail(void) 217static void print_mce_tail(void)
223{ 218{
224 printk(KERN_EMERG "This is not a software problem!\n" 219 pr_emerg("This is not a software problem!\n");
225#if (!defined(CONFIG_EDAC) || !defined(CONFIG_CPU_SUP_AMD))
226 "Run through mcelog --ascii to decode and contact your hardware vendor\n"
227#endif
228 );
229} 220}
230 221
231#define PANIC_TIMEOUT 5 /* 5 seconds */ 222#define PANIC_TIMEOUT 5 /* 5 seconds */
232 223
233static atomic_t mce_paniced; 224static atomic_t mce_paniced;
234 225
226static int fake_panic;
227static atomic_t mce_fake_paniced;
228
235/* Panic in progress. Enable interrupts and wait for final IPI */ 229/* Panic in progress. Enable interrupts and wait for final IPI */
236static void wait_for_panic(void) 230static void wait_for_panic(void)
237{ 231{
238 long timeout = PANIC_TIMEOUT*USEC_PER_SEC; 232 long timeout = PANIC_TIMEOUT*USEC_PER_SEC;
233
239 preempt_disable(); 234 preempt_disable();
240 local_irq_enable(); 235 local_irq_enable();
241 while (timeout-- > 0) 236 while (timeout-- > 0)
@@ -249,15 +244,21 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
249{ 244{
250 int i; 245 int i;
251 246
252 /* 247 if (!fake_panic) {
253 * Make sure only one CPU runs in machine check panic 248 /*
254 */ 249 * Make sure only one CPU runs in machine check panic
255 if (atomic_add_return(1, &mce_paniced) > 1) 250 */
256 wait_for_panic(); 251 if (atomic_inc_return(&mce_paniced) > 1)
257 barrier(); 252 wait_for_panic();
253 barrier();
258 254
259 bust_spinlocks(1); 255 bust_spinlocks(1);
260 console_verbose(); 256 console_verbose();
257 } else {
258 /* Don't log too much for fake panic */
259 if (atomic_inc_return(&mce_fake_paniced) > 1)
260 return;
261 }
261 print_mce_head(); 262 print_mce_head();
262 /* First print corrected ones that are still unlogged */ 263 /* First print corrected ones that are still unlogged */
263 for (i = 0; i < MCE_LOG_LEN; i++) { 264 for (i = 0; i < MCE_LOG_LEN; i++) {
@@ -284,9 +285,12 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
284 print_mce_tail(); 285 print_mce_tail();
285 if (exp) 286 if (exp)
286 printk(KERN_EMERG "Machine check: %s\n", exp); 287 printk(KERN_EMERG "Machine check: %s\n", exp);
287 if (panic_timeout == 0) 288 if (!fake_panic) {
288 panic_timeout = mce_panic_timeout; 289 if (panic_timeout == 0)
289 panic(msg); 290 panic_timeout = mce_panic_timeout;
291 panic(msg);
292 } else
293 printk(KERN_EMERG "Fake kernel panic: %s\n", msg);
290} 294}
291 295
292/* Support code for software error injection */ 296/* Support code for software error injection */
@@ -294,13 +298,14 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
294static int msr_to_offset(u32 msr) 298static int msr_to_offset(u32 msr)
295{ 299{
296 unsigned bank = __get_cpu_var(injectm.bank); 300 unsigned bank = __get_cpu_var(injectm.bank);
301
297 if (msr == rip_msr) 302 if (msr == rip_msr)
298 return offsetof(struct mce, ip); 303 return offsetof(struct mce, ip);
299 if (msr == MSR_IA32_MC0_STATUS + bank*4) 304 if (msr == MSR_IA32_MCx_STATUS(bank))
300 return offsetof(struct mce, status); 305 return offsetof(struct mce, status);
301 if (msr == MSR_IA32_MC0_ADDR + bank*4) 306 if (msr == MSR_IA32_MCx_ADDR(bank))
302 return offsetof(struct mce, addr); 307 return offsetof(struct mce, addr);
303 if (msr == MSR_IA32_MC0_MISC + bank*4) 308 if (msr == MSR_IA32_MCx_MISC(bank))
304 return offsetof(struct mce, misc); 309 return offsetof(struct mce, misc);
305 if (msr == MSR_IA32_MCG_STATUS) 310 if (msr == MSR_IA32_MCG_STATUS)
306 return offsetof(struct mce, mcgstatus); 311 return offsetof(struct mce, mcgstatus);
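
The MSR_IA32_MCx_*() helpers that replace the open-coded MC0 + bank*4 arithmetic encode the architectural MCA register layout: each bank owns four consecutive MSRs starting at 0x400. Equivalent definitions, shown for illustration (the real ones live in asm/msr-index.h):

#define MSR_IA32_MC0_CTL	0x00000400
#define MSR_IA32_MC0_STATUS	0x00000401
#define MSR_IA32_MC0_ADDR	0x00000402
#define MSR_IA32_MC0_MISC	0x00000403

#define MSR_IA32_MCx_CTL(x)	(MSR_IA32_MC0_CTL + 4*(x))
#define MSR_IA32_MCx_STATUS(x)	(MSR_IA32_MC0_STATUS + 4*(x))
#define MSR_IA32_MCx_ADDR(x)	(MSR_IA32_MC0_ADDR + 4*(x))
#define MSR_IA32_MCx_MISC(x)	(MSR_IA32_MC0_MISC + 4*(x))
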
@@ -311,13 +316,25 @@ static int msr_to_offset(u32 msr)
311static u64 mce_rdmsrl(u32 msr) 316static u64 mce_rdmsrl(u32 msr)
312{ 317{
313 u64 v; 318 u64 v;
319
314 if (__get_cpu_var(injectm).finished) { 320 if (__get_cpu_var(injectm).finished) {
315 int offset = msr_to_offset(msr); 321 int offset = msr_to_offset(msr);
322
316 if (offset < 0) 323 if (offset < 0)
317 return 0; 324 return 0;
318 return *(u64 *)((char *)&__get_cpu_var(injectm) + offset); 325 return *(u64 *)((char *)&__get_cpu_var(injectm) + offset);
319 } 326 }
320 rdmsrl(msr, v); 327
328 if (rdmsrl_safe(msr, &v)) {
329 WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr);
330 /*
331 * Return zero in case the access faulted. This should
332 * not happen normally but can happen if the CPU does
333 * something weird, or if the code is buggy.
334 */
335 v = 0;
336 }
337
321 return v; 338 return v;
322} 339}
323 340
@@ -325,6 +342,7 @@ static void mce_wrmsrl(u32 msr, u64 v)
325{ 342{
326 if (__get_cpu_var(injectm).finished) { 343 if (__get_cpu_var(injectm).finished) {
327 int offset = msr_to_offset(msr); 344 int offset = msr_to_offset(msr);
345
328 if (offset >= 0) 346 if (offset >= 0)
329 *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v; 347 *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v;
330 return; 348 return;
@@ -421,7 +439,7 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
421 m->ip = mce_rdmsrl(rip_msr); 439 m->ip = mce_rdmsrl(rip_msr);
422} 440}
423 441
424#ifdef CONFIG_X86_LOCAL_APIC 442#ifdef CONFIG_X86_LOCAL_APIC
425/* 443/*
426 * Called after interrupts have been reenabled again 444 * Called after interrupts have been reenabled again
427 * when a MCE happened during an interrupts off region 445 * when a MCE happened during an interrupts off region
@@ -505,7 +523,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
505 523
506 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); 524 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
507 for (i = 0; i < banks; i++) { 525 for (i = 0; i < banks; i++) {
508 if (!bank[i] || !test_bit(i, *b)) 526 if (!mce_banks[i].ctl || !test_bit(i, *b))
509 continue; 527 continue;
510 528
511 m.misc = 0; 529 m.misc = 0;
@@ -514,7 +532,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
514 m.tsc = 0; 532 m.tsc = 0;
515 533
516 barrier(); 534 barrier();
517 m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); 535 m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
518 if (!(m.status & MCI_STATUS_VAL)) 536 if (!(m.status & MCI_STATUS_VAL))
519 continue; 537 continue;
520 538
@@ -529,9 +547,9 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
529 continue; 547 continue;
530 548
531 if (m.status & MCI_STATUS_MISCV) 549 if (m.status & MCI_STATUS_MISCV)
532 m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); 550 m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
533 if (m.status & MCI_STATUS_ADDRV) 551 if (m.status & MCI_STATUS_ADDRV)
534 m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); 552 m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
535 553
536 if (!(flags & MCP_TIMESTAMP)) 554 if (!(flags & MCP_TIMESTAMP))
537 m.tsc = 0; 555 m.tsc = 0;
@@ -547,7 +565,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
547 /* 565 /*
548 * Clear state for this bank. 566 * Clear state for this bank.
549 */ 567 */
550 mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 568 mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
551 } 569 }
552 570
553 /* 571 /*
@@ -568,7 +586,7 @@ static int mce_no_way_out(struct mce *m, char **msg)
568 int i; 586 int i;
569 587
570 for (i = 0; i < banks; i++) { 588 for (i = 0; i < banks; i++) {
571 m->status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); 589 m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
572 if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) 590 if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
573 return 1; 591 return 1;
574 } 592 }
@@ -628,7 +646,7 @@ out:
 628 * This way we prevent any potential data corruption in an unrecoverable case 646
 629 * and also makes sure that all CPUs' errors are always examined. 647
630 * 648 *
631 * Also this detects the case of an machine check event coming from outer 649 * Also this detects the case of a machine check event coming from outer
632 * space (not detected by any CPUs) In this case some external agent wants 650 * space (not detected by any CPUs) In this case some external agent wants
633 * us to shut down, so panic too. 651 * us to shut down, so panic too.
634 * 652 *
@@ -681,7 +699,7 @@ static void mce_reign(void)
681 * No machine check event found. Must be some external 699 * No machine check event found. Must be some external
682 * source or one CPU is hung. Panic. 700 * source or one CPU is hung. Panic.
683 */ 701 */
684 if (!m && tolerant < 3) 702 if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3)
685 mce_panic("Machine check from unknown source", NULL, NULL); 703 mce_panic("Machine check from unknown source", NULL, NULL);
686 704
687 /* 705 /*
@@ -715,7 +733,7 @@ static int mce_start(int *no_way_out)
715 * global_nwo should be updated before mce_callin 733 * global_nwo should be updated before mce_callin
716 */ 734 */
717 smp_wmb(); 735 smp_wmb();
718 order = atomic_add_return(1, &mce_callin); 736 order = atomic_inc_return(&mce_callin);
719 737
720 /* 738 /*
721 * Wait for everyone. 739 * Wait for everyone.
@@ -852,7 +870,7 @@ static void mce_clear_state(unsigned long *toclear)
852 870
853 for (i = 0; i < banks; i++) { 871 for (i = 0; i < banks; i++) {
854 if (test_bit(i, toclear)) 872 if (test_bit(i, toclear))
855 mce_wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 873 mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
856 } 874 }
857} 875}
858 876
@@ -905,11 +923,11 @@ void do_machine_check(struct pt_regs *regs, long error_code)
905 mce_setup(&m); 923 mce_setup(&m);
906 924
907 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); 925 m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
908 no_way_out = mce_no_way_out(&m, &msg);
909
910 final = &__get_cpu_var(mces_seen); 926 final = &__get_cpu_var(mces_seen);
911 *final = m; 927 *final = m;
912 928
929 no_way_out = mce_no_way_out(&m, &msg);
930
913 barrier(); 931 barrier();
914 932
915 /* 933 /*
@@ -926,14 +944,14 @@ void do_machine_check(struct pt_regs *regs, long error_code)
926 order = mce_start(&no_way_out); 944 order = mce_start(&no_way_out);
927 for (i = 0; i < banks; i++) { 945 for (i = 0; i < banks; i++) {
928 __clear_bit(i, toclear); 946 __clear_bit(i, toclear);
929 if (!bank[i]) 947 if (!mce_banks[i].ctl)
930 continue; 948 continue;
931 949
932 m.misc = 0; 950 m.misc = 0;
933 m.addr = 0; 951 m.addr = 0;
934 m.bank = i; 952 m.bank = i;
935 953
936 m.status = mce_rdmsrl(MSR_IA32_MC0_STATUS + i*4); 954 m.status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
937 if ((m.status & MCI_STATUS_VAL) == 0) 955 if ((m.status & MCI_STATUS_VAL) == 0)
938 continue; 956 continue;
939 957
@@ -974,9 +992,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)
974 kill_it = 1; 992 kill_it = 1;
975 993
976 if (m.status & MCI_STATUS_MISCV) 994 if (m.status & MCI_STATUS_MISCV)
977 m.misc = mce_rdmsrl(MSR_IA32_MC0_MISC + i*4); 995 m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
978 if (m.status & MCI_STATUS_ADDRV) 996 if (m.status & MCI_STATUS_ADDRV)
979 m.addr = mce_rdmsrl(MSR_IA32_MC0_ADDR + i*4); 997 m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
980 998
981 /* 999 /*
982 * Action optional error. Queue address for later processing. 1000 * Action optional error. Queue address for later processing.
@@ -1101,7 +1119,7 @@ void mce_log_therm_throt_event(__u64 status)
1101 */ 1119 */
1102static int check_interval = 5 * 60; /* 5 minutes */ 1120static int check_interval = 5 * 60; /* 5 minutes */
1103 1121
1104static DEFINE_PER_CPU(int, next_interval); /* in jiffies */ 1122static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
1105static DEFINE_PER_CPU(struct timer_list, mce_timer); 1123static DEFINE_PER_CPU(struct timer_list, mce_timer);
1106 1124
1107static void mcheck_timer(unsigned long data) 1125static void mcheck_timer(unsigned long data)
@@ -1120,7 +1138,7 @@ static void mcheck_timer(unsigned long data)
1120 * Alert userspace if needed. If we logged an MCE, reduce the 1138 * Alert userspace if needed. If we logged an MCE, reduce the
1121 * polling interval, otherwise increase the polling interval. 1139 * polling interval, otherwise increase the polling interval.
1122 */ 1140 */
1123 n = &__get_cpu_var(next_interval); 1141 n = &__get_cpu_var(mce_next_interval);
1124 if (mce_notify_irq()) 1142 if (mce_notify_irq())
1125 *n = max(*n/2, HZ/100); 1143 *n = max(*n/2, HZ/100);
1126 else 1144 else
@@ -1169,10 +1187,26 @@ int mce_notify_irq(void)
1169} 1187}
1170EXPORT_SYMBOL_GPL(mce_notify_irq); 1188EXPORT_SYMBOL_GPL(mce_notify_irq);
1171 1189
1190static int mce_banks_init(void)
1191{
1192 int i;
1193
1194 mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL);
1195 if (!mce_banks)
1196 return -ENOMEM;
1197 for (i = 0; i < banks; i++) {
1198 struct mce_bank *b = &mce_banks[i];
1199
1200 b->ctl = -1ULL;
1201 b->init = 1;
1202 }
1203 return 0;
1204}
1205
1172/* 1206/*
1173 * Initialize Machine Checks for a CPU. 1207 * Initialize Machine Checks for a CPU.
1174 */ 1208 */
1175static int mce_cap_init(void) 1209static int __cpuinit mce_cap_init(void)
1176{ 1210{
1177 unsigned b; 1211 unsigned b;
1178 u64 cap; 1212 u64 cap;
@@ -1192,11 +1226,11 @@ static int mce_cap_init(void)
1192 /* Don't support asymmetric configurations today */ 1226 /* Don't support asymmetric configurations today */
1193 WARN_ON(banks != 0 && b != banks); 1227 WARN_ON(banks != 0 && b != banks);
1194 banks = b; 1228 banks = b;
1195 if (!bank) { 1229 if (!mce_banks) {
1196 bank = kmalloc(banks * sizeof(u64), GFP_KERNEL); 1230 int err = mce_banks_init();
1197 if (!bank) 1231
1198 return -ENOMEM; 1232 if (err)
1199 memset(bank, 0xff, banks * sizeof(u64)); 1233 return err;
1200 } 1234 }
1201 1235
1202 /* Use accurate RIP reporting if available. */ 1236 /* Use accurate RIP reporting if available. */
@@ -1228,15 +1262,17 @@ static void mce_init(void)
1228 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 1262 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1229 1263
1230 for (i = 0; i < banks; i++) { 1264 for (i = 0; i < banks; i++) {
1231 if (skip_bank_init(i)) 1265 struct mce_bank *b = &mce_banks[i];
1266
1267 if (!b->init)
1232 continue; 1268 continue;
1233 wrmsrl(MSR_IA32_MC0_CTL+4*i, bank[i]); 1269 wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
1234 wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); 1270 wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
1235 } 1271 }
1236} 1272}
1237 1273
1238/* Add per CPU specific workarounds here */ 1274/* Add per CPU specific workarounds here */
1239static int mce_cpu_quirks(struct cpuinfo_x86 *c) 1275static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
1240{ 1276{
1241 if (c->x86_vendor == X86_VENDOR_UNKNOWN) { 1277 if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1242 pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); 1278 pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
@@ -1251,7 +1287,7 @@ static int mce_cpu_quirks(struct cpuinfo_x86 *c)
1251 * trips off incorrectly with the IOMMU & 3ware 1287 * trips off incorrectly with the IOMMU & 3ware
1252 * & Cerberus: 1288 * & Cerberus:
1253 */ 1289 */
1254 clear_bit(10, (unsigned long *)&bank[4]); 1290 clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
1255 } 1291 }
1256 if (c->x86 <= 17 && mce_bootlog < 0) { 1292 if (c->x86 <= 17 && mce_bootlog < 0) {
1257 /* 1293 /*
@@ -1265,7 +1301,7 @@ static int mce_cpu_quirks(struct cpuinfo_x86 *c)
1265 * by default. 1301 * by default.
1266 */ 1302 */
1267 if (c->x86 == 6 && banks > 0) 1303 if (c->x86 == 6 && banks > 0)
1268 bank[0] = 0; 1304 mce_banks[0].ctl = 0;
1269 } 1305 }
1270 1306
1271 if (c->x86_vendor == X86_VENDOR_INTEL) { 1307 if (c->x86_vendor == X86_VENDOR_INTEL) {
@@ -1278,8 +1314,8 @@ static int mce_cpu_quirks(struct cpuinfo_x86 *c)
1278 * valid event later, merely don't write CTL0. 1314 * valid event later, merely don't write CTL0.
1279 */ 1315 */
1280 1316
1281 if (c->x86 == 6 && c->x86_model < 0x1A) 1317 if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0)
1282 __set_bit(0, &dont_init_banks); 1318 mce_banks[0].init = 0;
1283 1319
1284 /* 1320 /*
1285 * All newer Intel systems support MCE broadcasting. Enable 1321 * All newer Intel systems support MCE broadcasting. Enable
@@ -1335,7 +1371,7 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
1335static void mce_init_timer(void) 1371static void mce_init_timer(void)
1336{ 1372{
1337 struct timer_list *t = &__get_cpu_var(mce_timer); 1373 struct timer_list *t = &__get_cpu_var(mce_timer);
1338 int *n = &__get_cpu_var(next_interval); 1374 int *n = &__get_cpu_var(mce_next_interval);
1339 1375
1340 if (mce_ignore_ce) 1376 if (mce_ignore_ce)
1341 return; 1377 return;
@@ -1348,6 +1384,17 @@ static void mce_init_timer(void)
1348 add_timer_on(t, smp_processor_id()); 1384 add_timer_on(t, smp_processor_id());
1349} 1385}
1350 1386
1387/* Handle unconfigured int18 (should never happen) */
1388static void unexpected_machine_check(struct pt_regs *regs, long error_code)
1389{
1390 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
1391 smp_processor_id());
1392}
1393
1394/* Call the installed machine check handler for this CPU setup. */
1395void (*machine_check_vector)(struct pt_regs *, long error_code) =
1396 unexpected_machine_check;
1397
1351/* 1398/*
1352 * Called for each booted CPU to set up machine checks. 1399 * Called for each booted CPU to set up machine checks.
1353 * Must be called with preempt off: 1400 * Must be called with preempt off:
@@ -1561,8 +1608,10 @@ static struct miscdevice mce_log_device = {
1561 */ 1608 */
1562static int __init mcheck_enable(char *str) 1609static int __init mcheck_enable(char *str)
1563{ 1610{
1564 if (*str == 0) 1611 if (*str == 0) {
1565 enable_p5_mce(); 1612 enable_p5_mce();
1613 return 1;
1614 }
1566 if (*str == '=') 1615 if (*str == '=')
1567 str++; 1616 str++;
1568 if (!strcmp(str, "off")) 1617 if (!strcmp(str, "off"))
@@ -1603,8 +1652,10 @@ static int mce_disable(void)
1603 int i; 1652 int i;
1604 1653
1605 for (i = 0; i < banks; i++) { 1654 for (i = 0; i < banks; i++) {
1606 if (!skip_bank_init(i)) 1655 struct mce_bank *b = &mce_banks[i];
1607 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); 1656
1657 if (b->init)
1658 wrmsrl(MSR_IA32_MCx_CTL(i), 0);
1608 } 1659 }
1609 return 0; 1660 return 0;
1610} 1661}
@@ -1679,14 +1730,15 @@ DEFINE_PER_CPU(struct sys_device, mce_dev);
1679__cpuinitdata 1730__cpuinitdata
1680void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); 1731void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
1681 1732
1682static struct sysdev_attribute *bank_attrs; 1733static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr)
1734{
1735 return container_of(attr, struct mce_bank, attr);
1736}
1683 1737
1684static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, 1738static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
1685 char *buf) 1739 char *buf)
1686{ 1740{
1687 u64 b = bank[attr - bank_attrs]; 1741 return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
1688
1689 return sprintf(buf, "%llx\n", b);
1690} 1742}
1691 1743
1692static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, 1744static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
@@ -1697,7 +1749,7 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
1697 if (strict_strtoull(buf, 0, &new) < 0) 1749 if (strict_strtoull(buf, 0, &new) < 0)
1698 return -EINVAL; 1750 return -EINVAL;
1699 1751
1700 bank[attr - bank_attrs] = new; 1752 attr_to_bank(attr)->ctl = new;
1701 mce_restart(); 1753 mce_restart();
1702 1754
1703 return size; 1755 return size;
@@ -1839,7 +1891,7 @@ static __cpuinit int mce_create_device(unsigned int cpu)
1839 } 1891 }
1840 for (j = 0; j < banks; j++) { 1892 for (j = 0; j < banks; j++) {
1841 err = sysdev_create_file(&per_cpu(mce_dev, cpu), 1893 err = sysdev_create_file(&per_cpu(mce_dev, cpu),
1842 &bank_attrs[j]); 1894 &mce_banks[j].attr);
1843 if (err) 1895 if (err)
1844 goto error2; 1896 goto error2;
1845 } 1897 }
@@ -1848,10 +1900,10 @@ static __cpuinit int mce_create_device(unsigned int cpu)
1848 return 0; 1900 return 0;
1849error2: 1901error2:
1850 while (--j >= 0) 1902 while (--j >= 0)
1851 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[j]); 1903 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr);
1852error: 1904error:
1853 while (--i >= 0) 1905 while (--i >= 0)
1854 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1906 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr);
1855 1907
1856 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1908 sysdev_unregister(&per_cpu(mce_dev, cpu));
1857 1909
@@ -1869,7 +1921,7 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
1869 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); 1921 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1870 1922
1871 for (i = 0; i < banks; i++) 1923 for (i = 0; i < banks; i++)
1872 sysdev_remove_file(&per_cpu(mce_dev, cpu), &bank_attrs[i]); 1924 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr);
1873 1925
1874 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1926 sysdev_unregister(&per_cpu(mce_dev, cpu));
1875 cpumask_clear_cpu(cpu, mce_dev_initialized); 1927 cpumask_clear_cpu(cpu, mce_dev_initialized);
@@ -1886,8 +1938,10 @@ static void mce_disable_cpu(void *h)
1886 if (!(action & CPU_TASKS_FROZEN)) 1938 if (!(action & CPU_TASKS_FROZEN))
1887 cmci_clear(); 1939 cmci_clear();
1888 for (i = 0; i < banks; i++) { 1940 for (i = 0; i < banks; i++) {
1889 if (!skip_bank_init(i)) 1941 struct mce_bank *b = &mce_banks[i];
1890 wrmsrl(MSR_IA32_MC0_CTL + i*4, 0); 1942
1943 if (b->init)
1944 wrmsrl(MSR_IA32_MCx_CTL(i), 0);
1891 } 1945 }
1892} 1946}
1893 1947
@@ -1902,8 +1956,10 @@ static void mce_reenable_cpu(void *h)
1902 if (!(action & CPU_TASKS_FROZEN)) 1956 if (!(action & CPU_TASKS_FROZEN))
1903 cmci_reenable(); 1957 cmci_reenable();
1904 for (i = 0; i < banks; i++) { 1958 for (i = 0; i < banks; i++) {
1905 if (!skip_bank_init(i)) 1959 struct mce_bank *b = &mce_banks[i];
1906 wrmsrl(MSR_IA32_MC0_CTL + i*4, bank[i]); 1960
1961 if (b->init)
1962 wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl);
1907 } 1963 }
1908} 1964}
1909 1965
@@ -1935,7 +1991,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
1935 case CPU_DOWN_FAILED: 1991 case CPU_DOWN_FAILED:
1936 case CPU_DOWN_FAILED_FROZEN: 1992 case CPU_DOWN_FAILED_FROZEN:
1937 t->expires = round_jiffies(jiffies + 1993 t->expires = round_jiffies(jiffies +
1938 __get_cpu_var(next_interval)); 1994 __get_cpu_var(mce_next_interval));
1939 add_timer_on(t, cpu); 1995 add_timer_on(t, cpu);
1940 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); 1996 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1941 break; 1997 break;
@@ -1951,35 +2007,21 @@ static struct notifier_block mce_cpu_notifier __cpuinitdata = {
1951 .notifier_call = mce_cpu_callback, 2007 .notifier_call = mce_cpu_callback,
1952}; 2008};
1953 2009
1954static __init int mce_init_banks(void) 2010static __init void mce_init_banks(void)
1955{ 2011{
1956 int i; 2012 int i;
1957 2013
1958 bank_attrs = kzalloc(sizeof(struct sysdev_attribute) * banks,
1959 GFP_KERNEL);
1960 if (!bank_attrs)
1961 return -ENOMEM;
1962
1963 for (i = 0; i < banks; i++) { 2014 for (i = 0; i < banks; i++) {
1964 struct sysdev_attribute *a = &bank_attrs[i]; 2015 struct mce_bank *b = &mce_banks[i];
2016 struct sysdev_attribute *a = &b->attr;
1965 2017
1966 a->attr.name = kasprintf(GFP_KERNEL, "bank%d", i); 2018 a->attr.name = b->attrname;
1967 if (!a->attr.name) 2019 snprintf(b->attrname, ATTR_LEN, "bank%d", i);
1968 goto nomem;
1969 2020
1970 a->attr.mode = 0644; 2021 a->attr.mode = 0644;
1971 a->show = show_bank; 2022 a->show = show_bank;
1972 a->store = set_bank; 2023 a->store = set_bank;
1973 } 2024 }
1974 return 0;
1975
1976nomem:
1977 while (--i >= 0)
1978 kfree(bank_attrs[i].attr.name);
1979 kfree(bank_attrs);
1980 bank_attrs = NULL;
1981
1982 return -ENOMEM;
1983} 2025}
1984 2026
1985static __init int mce_init_device(void) 2027static __init int mce_init_device(void)
@@ -1992,9 +2034,7 @@ static __init int mce_init_device(void)
1992 2034
1993 zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL); 2035 zalloc_cpumask_var(&mce_dev_initialized, GFP_KERNEL);
1994 2036
1995 err = mce_init_banks(); 2037 mce_init_banks();
1996 if (err)
1997 return err;
1998 2038
1999 err = sysdev_class_register(&mce_sysclass); 2039 err = sysdev_class_register(&mce_sysclass);
2000 if (err) 2040 if (err)
@@ -2014,57 +2054,65 @@ static __init int mce_init_device(void)
2014 2054
2015device_initcall(mce_init_device); 2055device_initcall(mce_init_device);
2016 2056
2017#else /* CONFIG_X86_OLD_MCE: */ 2057/*
2018 2058 * Old style boot options parsing. Only for compatibility.
2019int nr_mce_banks; 2059 */
2020EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ 2060static int __init mcheck_disable(char *str)
2061{
2062 mce_disabled = 1;
2063 return 1;
2064}
2065__setup("nomce", mcheck_disable);
2021 2066
2022/* This has to be run for each processor */ 2067#ifdef CONFIG_DEBUG_FS
2023void mcheck_init(struct cpuinfo_x86 *c) 2068struct dentry *mce_get_debugfs_dir(void)
2024{ 2069{
2025 if (mce_disabled) 2070 static struct dentry *dmce;
2026 return;
2027 2071
2028 switch (c->x86_vendor) { 2072 if (!dmce)
2029 case X86_VENDOR_AMD: 2073 dmce = debugfs_create_dir("mce", NULL);
2030 amd_mcheck_init(c);
2031 break;
2032 2074
2033 case X86_VENDOR_INTEL: 2075 return dmce;
2034 if (c->x86 == 5) 2076}
2035 intel_p5_mcheck_init(c);
2036 if (c->x86 == 6)
2037 intel_p6_mcheck_init(c);
2038 if (c->x86 == 15)
2039 intel_p4_mcheck_init(c);
2040 break;
2041 2077
2042 case X86_VENDOR_CENTAUR: 2078static void mce_reset(void)
2043 if (c->x86 == 5) 2079{
2044 winchip_mcheck_init(c); 2080 cpu_missing = 0;
2045 break; 2081 atomic_set(&mce_fake_paniced, 0);
2082 atomic_set(&mce_executing, 0);
2083 atomic_set(&mce_callin, 0);
2084 atomic_set(&global_nwo, 0);
2085}
2046 2086
2047 default: 2087static int fake_panic_get(void *data, u64 *val)
2048 break; 2088{
2049 } 2089 *val = fake_panic;
2050 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", nr_mce_banks); 2090 return 0;
2051} 2091}
2052 2092
2053static int __init mcheck_enable(char *str) 2093static int fake_panic_set(void *data, u64 val)
2054{ 2094{
2055 mce_p5_enabled = 1; 2095 mce_reset();
2056 return 1; 2096 fake_panic = val;
2097 return 0;
2057} 2098}
2058__setup("mce", mcheck_enable);
2059 2099
2060#endif /* CONFIG_X86_OLD_MCE */ 2100DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
2101 fake_panic_set, "%llu\n");
2061 2102
2062/* 2103static int __init mce_debugfs_init(void)
2063 * Old style boot options parsing. Only for compatibility.
2064 */
2065static int __init mcheck_disable(char *str)
2066{ 2104{
2067 mce_disabled = 1; 2105 struct dentry *dmce, *ffake_panic;
2068 return 1; 2106
2107 dmce = mce_get_debugfs_dir();
2108 if (!dmce)
2109 return -ENOMEM;
2110 ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL,
2111 &fake_panic_fops);
2112 if (!ffake_panic)
2113 return -ENOMEM;
2114
2115 return 0;
2069} 2116}
2070__setup("nomce", mcheck_disable); 2117late_initcall(mce_debugfs_init);
2118#endif
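
With fake_panic set, mce_panic() prints the queued records and a "Fake kernel panic" line but returns instead of calling panic(), which makes the injector usable for exercising panic paths. A hypothetical user-space toggle; note the file is created 0444 above, so the write relies on root's permission override:

#include <fcntl.h>
#include <unistd.h>

static int set_fake_panic(int on)
{
	int fd = open("/sys/kernel/debug/mce/fake_panic", O_WRONLY);
	char c = on ? '1' : '0';

	if (fd < 0)
		return -1;	/* debugfs mounted? running as root? */
	if (write(fd, &c, 1) != 1) {
		close(fd);
		return -1;
	}
	return close(fd);
}
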
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 1fecba404fd8..83a3d1f4efca 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -69,7 +69,7 @@ struct threshold_bank {
69 struct threshold_block *blocks; 69 struct threshold_block *blocks;
70 cpumask_var_t cpus; 70 cpumask_var_t cpus;
71}; 71};
72static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]); 72static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks);
73 73
74#ifdef CONFIG_SMP 74#ifdef CONFIG_SMP
75static unsigned char shared_bank[NR_BANKS] = { 75static unsigned char shared_bank[NR_BANKS] = {
@@ -489,8 +489,9 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
489 int i, err = 0; 489 int i, err = 0;
490 struct threshold_bank *b = NULL; 490 struct threshold_bank *b = NULL;
491 char name[32]; 491 char name[32];
492#ifdef CONFIG_SMP
492 struct cpuinfo_x86 *c = &cpu_data(cpu); 493 struct cpuinfo_x86 *c = &cpu_data(cpu);
493 494#endif
494 495
495 sprintf(name, "threshold_bank%i", bank); 496 sprintf(name, "threshold_bank%i", bank);
496 497
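
The threshold_banks change moves the array dimension from the name into the type argument, which keeps DEFINE_PER_CPU's (type, name) pair well-formed for the newer per-CPU machinery; accesses are unchanged. A kernel-context sketch, with an illustrative accessor:

/* Declaration style adopted above: a per-CPU array of NR_BANKS pointers. */
DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks);

/* Access is the same as before: pick the CPU, then index the bank. */
static struct threshold_bank *get_threshold_bank(unsigned int cpu,
						 unsigned int bank)
{
	return per_cpu(threshold_banks, cpu)[bank];
}
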
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index e1acec0f7a32..7c785634af2b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -8,6 +8,7 @@
8#include <linux/init.h> 8#include <linux/init.h>
9#include <linux/interrupt.h> 9#include <linux/interrupt.h>
10#include <linux/percpu.h> 10#include <linux/percpu.h>
11#include <linux/sched.h>
11#include <asm/apic.h> 12#include <asm/apic.h>
12#include <asm/processor.h> 13#include <asm/processor.h>
13#include <asm/msr.h> 14#include <asm/msr.h>
@@ -90,7 +91,7 @@ static void cmci_discover(int banks, int boot)
90 if (test_bit(i, owned)) 91 if (test_bit(i, owned))
91 continue; 92 continue;
92 93
93 rdmsrl(MSR_IA32_MC0_CTL2 + i, val); 94 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
94 95
95 /* Already owned by someone else? */ 96 /* Already owned by someone else? */
96 if (val & CMCI_EN) { 97 if (val & CMCI_EN) {
@@ -101,8 +102,8 @@ static void cmci_discover(int banks, int boot)
101 } 102 }
102 103
103 val |= CMCI_EN | CMCI_THRESHOLD; 104 val |= CMCI_EN | CMCI_THRESHOLD;
104 wrmsrl(MSR_IA32_MC0_CTL2 + i, val); 105 wrmsrl(MSR_IA32_MCx_CTL2(i), val);
105 rdmsrl(MSR_IA32_MC0_CTL2 + i, val); 106 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
106 107
107 /* Did the enable bit stick? -- the bank supports CMCI */ 108 /* Did the enable bit stick? -- the bank supports CMCI */
108 if (val & CMCI_EN) { 109 if (val & CMCI_EN) {
@@ -152,9 +153,9 @@ void cmci_clear(void)
152 if (!test_bit(i, __get_cpu_var(mce_banks_owned))) 153 if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
153 continue; 154 continue;
154 /* Disable CMCI */ 155 /* Disable CMCI */
155 rdmsrl(MSR_IA32_MC0_CTL2 + i, val); 156 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
156 val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); 157 val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK);
157 wrmsrl(MSR_IA32_MC0_CTL2 + i, val); 158 wrmsrl(MSR_IA32_MCx_CTL2(i), val);
158 __clear_bit(i, __get_cpu_var(mce_banks_owned)); 159 __clear_bit(i, __get_cpu_var(mce_banks_owned));
159 } 160 }
160 spin_unlock_irqrestore(&cmci_discover_lock, flags); 161 spin_unlock_irqrestore(&cmci_discover_lock, flags);
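
cmci_discover() probes CMCI support per bank by testing whether the enable bit sticks: set CMCI_EN in MCi_CTL2 and read it back; banks without CMCI implement the bit as read-only zero. The probe distilled into a kernel-context sketch (the helper itself is illustrative; rdmsrl/wrmsrl and the CMCI_* constants come from the kernel):

static int bank_supports_cmci(int i)
{
	u64 val;

	rdmsrl(MSR_IA32_MCx_CTL2(i), val);
	wrmsrl(MSR_IA32_MCx_CTL2(i), val | CMCI_EN);
	rdmsrl(MSR_IA32_MCx_CTL2(i), val);

	return (val & CMCI_EN) != 0;	/* bit stuck => CMCI capable */
}
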
diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c
deleted file mode 100644
index f5f2d6f71fb6..000000000000
--- a/arch/x86/kernel/cpu/mcheck/non-fatal.c
+++ /dev/null
@@ -1,94 +0,0 @@
1/*
2 * Non Fatal Machine Check Exception Reporting
3 *
4 * (C) Copyright 2002 Dave Jones. <davej@redhat.com>
5 *
6 * This file contains routines to check for non-fatal MCEs every 15s
7 *
8 */
9#include <linux/interrupt.h>
10#include <linux/workqueue.h>
11#include <linux/jiffies.h>
12#include <linux/kernel.h>
13#include <linux/module.h>
14#include <linux/types.h>
15#include <linux/init.h>
16#include <linux/smp.h>
17
18#include <asm/processor.h>
19#include <asm/system.h>
20#include <asm/mce.h>
21#include <asm/msr.h>
22
23static int firstbank;
24
25#define MCE_RATE (15*HZ) /* timer rate is 15s */
26
27static void mce_checkregs(void *info)
28{
29 u32 low, high;
30 int i;
31
32 for (i = firstbank; i < nr_mce_banks; i++) {
33 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
34
35 if (!(high & (1<<31)))
36 continue;
37
38 printk(KERN_INFO "MCE: The hardware reports a non fatal, "
39 "correctable incident occurred on CPU %d.\n",
40 smp_processor_id());
41
42 printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low);
43
44 /*
45 * Scrub the error so we don't pick it up in MCE_RATE
46 * seconds time:
47 */
48 wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
49
50 /* Serialize: */
51 wmb();
52 add_taint(TAINT_MACHINE_CHECK);
53 }
54}
55
56static void mce_work_fn(struct work_struct *work);
57static DECLARE_DELAYED_WORK(mce_work, mce_work_fn);
58
59static void mce_work_fn(struct work_struct *work)
60{
61 on_each_cpu(mce_checkregs, NULL, 1);
62 schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE));
63}
64
65static int __init init_nonfatal_mce_checker(void)
66{
67 struct cpuinfo_x86 *c = &boot_cpu_data;
68
69 /* Check for MCE support */
70 if (!cpu_has(c, X86_FEATURE_MCE))
71 return -ENODEV;
72
73 /* Check for PPro style MCA */
74 if (!cpu_has(c, X86_FEATURE_MCA))
75 return -ENODEV;
76
77 /* Some Athlons misbehave when we frob bank 0 */
78 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
79 boot_cpu_data.x86 == 6)
80 firstbank = 1;
81 else
82 firstbank = 0;
83
84 /*
85 * Check for non-fatal errors every MCE_RATE s
86 */
87 schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE));
88 printk(KERN_INFO "Machine check exception polling timer started.\n");
89
90 return 0;
91}
92module_init(init_nonfatal_mce_checker);
93
94MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
deleted file mode 100644
index 4482aea9aa2e..000000000000
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ /dev/null
@@ -1,163 +0,0 @@
1/*
2 * P4 specific Machine Check Exception Reporting
3 */
4#include <linux/kernel.h>
5#include <linux/types.h>
6#include <linux/init.h>
7#include <linux/smp.h>
8
9#include <asm/processor.h>
10#include <asm/mce.h>
11#include <asm/msr.h>
12
13/* as supported by the P4/Xeon family */
14struct intel_mce_extended_msrs {
15 u32 eax;
16 u32 ebx;
17 u32 ecx;
18 u32 edx;
19 u32 esi;
20 u32 edi;
21 u32 ebp;
22 u32 esp;
23 u32 eflags;
24 u32 eip;
25 /* u32 *reserved[]; */
26};
27
28static int mce_num_extended_msrs;
29
30/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */
31static void intel_get_extended_msrs(struct intel_mce_extended_msrs *r)
32{
33 u32 h;
34
35 rdmsr(MSR_IA32_MCG_EAX, r->eax, h);
36 rdmsr(MSR_IA32_MCG_EBX, r->ebx, h);
37 rdmsr(MSR_IA32_MCG_ECX, r->ecx, h);
38 rdmsr(MSR_IA32_MCG_EDX, r->edx, h);
39 rdmsr(MSR_IA32_MCG_ESI, r->esi, h);
40 rdmsr(MSR_IA32_MCG_EDI, r->edi, h);
41 rdmsr(MSR_IA32_MCG_EBP, r->ebp, h);
42 rdmsr(MSR_IA32_MCG_ESP, r->esp, h);
43 rdmsr(MSR_IA32_MCG_EFLAGS, r->eflags, h);
44 rdmsr(MSR_IA32_MCG_EIP, r->eip, h);
45}
46
47static void intel_machine_check(struct pt_regs *regs, long error_code)
48{
49 u32 alow, ahigh, high, low;
50 u32 mcgstl, mcgsth;
51 int recover = 1;
52 int i;
53
54 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
55 if (mcgstl & (1<<0)) /* Recoverable ? */
56 recover = 0;
57
58 printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
59 smp_processor_id(), mcgsth, mcgstl);
60
61 if (mce_num_extended_msrs > 0) {
62 struct intel_mce_extended_msrs dbg;
63
64 intel_get_extended_msrs(&dbg);
65
66 printk(KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n"
67 "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n"
68 "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n",
69 smp_processor_id(), dbg.eip, dbg.eflags,
70 dbg.eax, dbg.ebx, dbg.ecx, dbg.edx,
71 dbg.esi, dbg.edi, dbg.ebp, dbg.esp);
72 }
73
74 for (i = 0; i < nr_mce_banks; i++) {
75 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
76 if (high & (1<<31)) {
77 char misc[20];
78 char addr[24];
79
80 misc[0] = addr[0] = '\0';
81 if (high & (1<<29))
82 recover |= 1;
83 if (high & (1<<25))
84 recover |= 2;
85 high &= ~(1<<31);
86 if (high & (1<<27)) {
87 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
88 snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
89 }
90 if (high & (1<<26)) {
91 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
92 snprintf(addr, 24, " at %08x%08x", ahigh, alow);
93 }
94 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
95 smp_processor_id(), i, high, low, misc, addr);
96 }
97 }
98
99 if (recover & 2)
100 panic("CPU context corrupt");
101 if (recover & 1)
102 panic("Unable to continue");
103
104 printk(KERN_EMERG "Attempting to continue.\n");
105
106 /*
107 * Do not clear the MSR_IA32_MCi_STATUS if the error is not
 108 * recoverable/continuable. This will allow BIOS to look at the MSRs
109 * for errors if the OS could not log the error.
110 */
111 for (i = 0; i < nr_mce_banks; i++) {
112 u32 msr;
113 msr = MSR_IA32_MC0_STATUS+i*4;
114 rdmsr(msr, low, high);
115 if (high&(1<<31)) {
116 /* Clear it */
117 wrmsr(msr, 0UL, 0UL);
118 /* Serialize */
119 wmb();
120 add_taint(TAINT_MACHINE_CHECK);
121 }
122 }
123 mcgstl &= ~(1<<2);
124 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
125}
126
127void intel_p4_mcheck_init(struct cpuinfo_x86 *c)
128{
129 u32 l, h;
130 int i;
131
132 machine_check_vector = intel_machine_check;
133 wmb();
134
135 printk(KERN_INFO "Intel machine check architecture supported.\n");
136 rdmsr(MSR_IA32_MCG_CAP, l, h);
137 if (l & (1<<8)) /* Control register present ? */
138 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
139 nr_mce_banks = l & 0xff;
140
141 for (i = 0; i < nr_mce_banks; i++) {
142 wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
143 wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
144 }
145
146 set_in_cr4(X86_CR4_MCE);
147 printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
148 smp_processor_id());
149
150 /* Check for P4/Xeon extended MCE MSRs */
151 rdmsr(MSR_IA32_MCG_CAP, l, h);
152 if (l & (1<<9)) {/* MCG_EXT_P */
153 mce_num_extended_msrs = (l >> 16) & 0xff;
154 printk(KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)"
155 " available\n",
156 smp_processor_id(), mce_num_extended_msrs);
157
158#ifdef CONFIG_X86_MCE_P4THERMAL
159 /* Check for P4/Xeon Thermal monitor */
160 intel_init_thermal(c);
161#endif
162 }
163}
diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c
deleted file mode 100644
index 01e4f8178183..000000000000
--- a/arch/x86/kernel/cpu/mcheck/p6.c
+++ /dev/null
@@ -1,127 +0,0 @@
1/*
2 * P6 specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Alan Cox <alan@lxorguk.ukuu.org.uk>
4 */
5#include <linux/interrupt.h>
6#include <linux/kernel.h>
7#include <linux/types.h>
8#include <linux/init.h>
9#include <linux/smp.h>
10
11#include <asm/processor.h>
12#include <asm/system.h>
13#include <asm/mce.h>
14#include <asm/msr.h>
15
16/* Machine Check Handler For PII/PIII */
17static void intel_machine_check(struct pt_regs *regs, long error_code)
18{
19 u32 alow, ahigh, high, low;
20 u32 mcgstl, mcgsth;
21 int recover = 1;
22 int i;
23
24 rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
25 if (mcgstl & (1<<0)) /* Recoverable ? */
26 recover = 0;
27
28 printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
29 smp_processor_id(), mcgsth, mcgstl);
30
31 for (i = 0; i < nr_mce_banks; i++) {
32 rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
33 if (high & (1<<31)) {
34 char misc[20];
35 char addr[24];
36
37 misc[0] = '\0';
38 addr[0] = '\0';
39
40 if (high & (1<<29))
41 recover |= 1;
42 if (high & (1<<25))
43 recover |= 2;
44 high &= ~(1<<31);
45
46 if (high & (1<<27)) {
47 rdmsr(MSR_IA32_MC0_MISC+i*4, alow, ahigh);
48 snprintf(misc, 20, "[%08x%08x]", ahigh, alow);
49 }
50 if (high & (1<<26)) {
51 rdmsr(MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
52 snprintf(addr, 24, " at %08x%08x", ahigh, alow);
53 }
54
55 printk(KERN_EMERG "CPU %d: Bank %d: %08x%08x%s%s\n",
56 smp_processor_id(), i, high, low, misc, addr);
57 }
58 }
59
60 if (recover & 2)
61 panic("CPU context corrupt");
62 if (recover & 1)
63 panic("Unable to continue");
64
65 printk(KERN_EMERG "Attempting to continue.\n");
66 /*
67 * Do not clear the MSR_IA32_MCi_STATUS if the error is not
 68 * recoverable/continuable. This will allow BIOS to look at the MSRs
69 * for errors if the OS could not log the error:
70 */
71 for (i = 0; i < nr_mce_banks; i++) {
72 unsigned int msr;
73
74 msr = MSR_IA32_MC0_STATUS+i*4;
75 rdmsr(msr, low, high);
76 if (high & (1<<31)) {
77 /* Clear it: */
78 wrmsr(msr, 0UL, 0UL);
79 /* Serialize: */
80 wmb();
81 add_taint(TAINT_MACHINE_CHECK);
82 }
83 }
84 mcgstl &= ~(1<<2);
85 wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
86}
87
88/* Set up machine check reporting for processors with Intel style MCE: */
89void intel_p6_mcheck_init(struct cpuinfo_x86 *c)
90{
91 u32 l, h;
92 int i;
93
94 /* Check for MCE support */
95 if (!cpu_has(c, X86_FEATURE_MCE))
96 return;
97
98 /* Check for PPro style MCA */
99 if (!cpu_has(c, X86_FEATURE_MCA))
100 return;
101
102 /* Ok machine check is available */
103 machine_check_vector = intel_machine_check;
104 /* Make sure the vector pointer is visible before we enable MCEs: */
105 wmb();
106
107 printk(KERN_INFO "Intel machine check architecture supported.\n");
108 rdmsr(MSR_IA32_MCG_CAP, l, h);
109 if (l & (1<<8)) /* Control register present ? */
110 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
111 nr_mce_banks = l & 0xff;
112
113 /*
114 * Following the example in IA-32 SDM Vol 3:
115 * - MC0_CTL should not be written
116 * - Status registers on all banks should be cleared on reset
117 */
118 for (i = 1; i < nr_mce_banks; i++)
119 wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
120
121 for (i = 0; i < nr_mce_banks; i++)
122 wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
123
124 set_in_cr4(X86_CR4_MCE);
125 printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
126 smp_processor_id());
127}
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 5957a93e5173..b3a1dba75330 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -34,20 +34,31 @@
34/* How long to wait between reporting thermal events */ 34/* How long to wait between reporting thermal events */
35#define CHECK_INTERVAL (300 * HZ) 35#define CHECK_INTERVAL (300 * HZ)
36 36
37static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; 37/*
38static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); 38 * Current thermal throttling state:
39static DEFINE_PER_CPU(bool, thermal_throttle_active); 39 */
40struct thermal_state {
41 bool is_throttled;
42
43 u64 next_check;
44 unsigned long throttle_count;
45 unsigned long last_throttle_count;
46};
47
48static DEFINE_PER_CPU(struct thermal_state, thermal_state);
40 49
41static atomic_t therm_throt_en = ATOMIC_INIT(0); 50static atomic_t therm_throt_en = ATOMIC_INIT(0);
42 51
43#ifdef CONFIG_SYSFS 52#ifdef CONFIG_SYSFS
44#define define_therm_throt_sysdev_one_ro(_name) \ 53#define define_therm_throt_sysdev_one_ro(_name) \
45 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) 54 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
46 55
47#define define_therm_throt_sysdev_show_func(name) \ 56#define define_therm_throt_sysdev_show_func(name) \
48static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ 57 \
49 struct sysdev_attribute *attr, \ 58static ssize_t therm_throt_sysdev_show_##name( \
50 char *buf) \ 59 struct sys_device *dev, \
60 struct sysdev_attribute *attr, \
61 char *buf) \
51{ \ 62{ \
52 unsigned int cpu = dev->id; \ 63 unsigned int cpu = dev->id; \
53 ssize_t ret; \ 64 ssize_t ret; \
@@ -55,7 +66,7 @@ static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \
55 preempt_disable(); /* CPU hotplug */ \ 66 preempt_disable(); /* CPU hotplug */ \
56 if (cpu_online(cpu)) \ 67 if (cpu_online(cpu)) \
57 ret = sprintf(buf, "%lu\n", \ 68 ret = sprintf(buf, "%lu\n", \
58 per_cpu(thermal_throttle_##name, cpu)); \ 69 per_cpu(thermal_state, cpu).name); \
59 else \ 70 else \
60 ret = 0; \ 71 ret = 0; \
61 preempt_enable(); \ 72 preempt_enable(); \
@@ -63,11 +74,11 @@ static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \
63 return ret; \ 74 return ret; \
64} 75}
65 76
66define_therm_throt_sysdev_show_func(count); 77define_therm_throt_sysdev_show_func(throttle_count);
67define_therm_throt_sysdev_one_ro(count); 78define_therm_throt_sysdev_one_ro(throttle_count);
68 79
69static struct attribute *thermal_throttle_attrs[] = { 80static struct attribute *thermal_throttle_attrs[] = {
70 &attr_count.attr, 81 &attr_throttle_count.attr,
71 NULL 82 NULL
72}; 83};
73 84
@@ -93,33 +104,39 @@ static struct attribute_group thermal_throttle_attr_group = {
93 * 1 : Event should be logged further, and a message has been 104 * 1 : Event should be logged further, and a message has been
94 * printed to the syslog. 105 * printed to the syslog.
95 */ 106 */
96static int therm_throt_process(int curr) 107static int therm_throt_process(bool is_throttled)
97{ 108{
98 unsigned int cpu = smp_processor_id(); 109 struct thermal_state *state;
99 __u64 tmp_jiffs = get_jiffies_64(); 110 unsigned int this_cpu;
100 bool was_throttled = __get_cpu_var(thermal_throttle_active); 111 bool was_throttled;
101 bool is_throttled = __get_cpu_var(thermal_throttle_active) = curr; 112 u64 now;
113
114 this_cpu = smp_processor_id();
115 now = get_jiffies_64();
116 state = &per_cpu(thermal_state, this_cpu);
117
118 was_throttled = state->is_throttled;
119 state->is_throttled = is_throttled;
102 120
103 if (is_throttled) 121 if (is_throttled)
104 __get_cpu_var(thermal_throttle_count)++; 122 state->throttle_count++;
105 123
106 if (!(was_throttled ^ is_throttled) && 124 if (time_before64(now, state->next_check) &&
107 time_before64(tmp_jiffs, __get_cpu_var(next_check))) 125 state->throttle_count != state->last_throttle_count)
108 return 0; 126 return 0;
109 127
110 __get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL; 128 state->next_check = now + CHECK_INTERVAL;
129 state->last_throttle_count = state->throttle_count;
111 130
112 /* if we just entered the thermal event */ 131 /* if we just entered the thermal event */
113 if (is_throttled) { 132 if (is_throttled) {
114 printk(KERN_CRIT "CPU%d: Temperature above threshold, " 133 printk(KERN_CRIT "CPU%d: Temperature above threshold, cpu clock throttled (total events = %lu)\n", this_cpu, state->throttle_count);
115 "cpu clock throttled (total events = %lu)\n",
116 cpu, __get_cpu_var(thermal_throttle_count));
117 134
118 add_taint(TAINT_MACHINE_CHECK); 135 add_taint(TAINT_MACHINE_CHECK);
119 return 1; 136 return 1;
120 } 137 }
121 if (was_throttled) { 138 if (was_throttled) {
122 printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu); 139 printk(KERN_INFO "CPU%d: Temperature/speed normal\n", this_cpu);
123 return 1; 140 return 1;
124 } 141 }
125 142
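
The rewritten therm_throt_process() above folds the old per-CPU variables into one thermal_state and rate-limits reporting: repeated throttle events inside CHECK_INTERVAL are suppressed (throttle_count != last_throttle_count), while a quiet interval lets the next transition be logged. A minimal userspace sketch of the same pattern; the names and the integer "jiffies" argument are illustrative, not kernel API:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define CHECK_INTERVAL 300              /* pretend HZ == 1 */

    struct thermal_state {
        bool is_throttled;
        uint64_t next_check;
        unsigned long throttle_count;
        unsigned long last_throttle_count;
    };

    /* Overflow-safe "a is before b", like the kernel's time_before64(). */
    static bool time_before64(uint64_t a, uint64_t b)
    {
        return (int64_t)(a - b) < 0;
    }

    /* Returns 1 when a message should be logged, 0 when rate-limited. */
    static int throttle_event(struct thermal_state *s, uint64_t now, bool throttled)
    {
        bool was_throttled = s->is_throttled;

        s->is_throttled = throttled;
        if (throttled)
            s->throttle_count++;

        /* Inside the interval, suppress anything that follows new events. */
        if (time_before64(now, s->next_check) &&
            s->throttle_count != s->last_throttle_count)
            return 0;

        s->next_check = now + CHECK_INTERVAL;
        s->last_throttle_count = s->throttle_count;

        if (throttled) {
            printf("throttled (total events = %lu)\n", s->throttle_count);
            return 1;
        }
        if (was_throttled) {
            printf("temperature/speed normal\n");
            return 1;
        }
        return 0;
    }

    int main(void)
    {
        struct thermal_state s = { 0 };

        throttle_event(&s, 0, true);    /* logged: first throttle event */
        throttle_event(&s, 1, true);    /* suppressed: within CHECK_INTERVAL */
        throttle_event(&s, 400, false); /* logged: interval over, back to normal */
        return 0;
    }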
@@ -213,7 +230,7 @@ static void intel_thermal_interrupt(void)
213 __u64 msr_val; 230 __u64 msr_val;
214 231
215 rdmsrl(MSR_IA32_THERM_STATUS, msr_val); 232 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
216 if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT)) 233 if (therm_throt_process((msr_val & THERM_STATUS_PROCHOT) != 0))
217 mce_log_therm_throt_event(msr_val); 234 mce_log_therm_throt_event(msr_val);
218} 235}
219 236
@@ -260,9 +277,6 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
260 return; 277 return;
261 } 278 }
262 279
263 if (cpu_has(c, X86_FEATURE_TM2) && (l & MSR_IA32_MISC_ENABLE_TM2))
264 tm2 = 1;
265
266 /* Check whether a vector already exists */ 280 /* Check whether a vector already exists */
267 if (h & APIC_VECTOR_MASK) { 281 if (h & APIC_VECTOR_MASK) {
268 printk(KERN_DEBUG 282 printk(KERN_DEBUG
@@ -271,6 +285,16 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
271 return; 285 return;
272 } 286 }
273 287
 288 /* early Pentium M models use a different method for enabling TM2 */
289 if (cpu_has(c, X86_FEATURE_TM2)) {
290 if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) {
291 rdmsr(MSR_THERM2_CTL, l, h);
292 if (l & MSR_THERM2_CTL_TM_SELECT)
293 tm2 = 1;
294 } else if (l & MSR_IA32_MISC_ENABLE_TM2)
295 tm2 = 1;
296 }
297
274 /* We'll mask the thermal vector in the lapic till we're ready: */ 298 /* We'll mask the thermal vector in the lapic till we're ready: */
275 h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED; 299 h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
276 apic_write(APIC_LVTTHMR, h); 300 apic_write(APIC_LVTTHMR, h);
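
One functional change rides along with the reshuffle above: on early Pentium M (family 6, models 9 and 13) TM2 availability is read from MSR_THERM2_CTL's TM_SELECT bit instead of MISC_ENABLE. The dispatch in isolation, with the MSR reads replaced by plain struct fields and bit positions that should be treated as illustrative:

    #include <stdbool.h>
    #include <stdio.h>

    #define MISC_ENABLE_TM2      (1 << 13)   /* illustrative bit positions */
    #define THERM2_CTL_TM_SELECT (1 << 16)

    struct cpuinfo {
        int family, model;
        unsigned int misc_enable;   /* stands in for MSR_IA32_MISC_ENABLE */
        unsigned int therm2_ctl;    /* stands in for MSR_THERM2_CTL */
    };

    static bool tm2_enabled(const struct cpuinfo *c)
    {
        /* Early Pentium M: family 6, models 9 and 13 use MSR_THERM2_CTL. */
        if (c->family == 6 && (c->model == 9 || c->model == 13))
            return c->therm2_ctl & THERM2_CTL_TM_SELECT;

        return c->misc_enable & MISC_ENABLE_TM2;
    }

    int main(void)
    {
        struct cpuinfo banias = { 6, 9,  0, THERM2_CTL_TM_SELECT };
        struct cpuinfo core2  = { 6, 15, MISC_ENABLE_TM2, 0 };

        printf("pentium-m tm2: %d\n", tm2_enabled(&banias));
        printf("core2 tm2:     %d\n", tm2_enabled(&core2));
        return 0;
    }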
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index 08b6ea4c62b4..3c1b12d461d1 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -96,17 +96,24 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
96 unsigned long long base, size; 96 unsigned long long base, size;
97 char *ptr; 97 char *ptr;
98 char line[LINE_SIZE]; 98 char line[LINE_SIZE];
99 int length;
99 size_t linelen; 100 size_t linelen;
100 101
101 if (!capable(CAP_SYS_ADMIN)) 102 if (!capable(CAP_SYS_ADMIN))
102 return -EPERM; 103 return -EPERM;
103 if (!len)
104 return -EINVAL;
105 104
106 memset(line, 0, LINE_SIZE); 105 memset(line, 0, LINE_SIZE);
107 if (len > LINE_SIZE) 106
108 len = LINE_SIZE; 107 length = len;
109 if (copy_from_user(line, buf, len - 1)) 108 length--;
109
110 if (length > LINE_SIZE - 1)
111 length = LINE_SIZE - 1;
112
113 if (length < 0)
114 return -EINVAL;
115
116 if (copy_from_user(line, buf, length))
110 return -EFAULT; 117 return -EFAULT;
111 118
112 linelen = strlen(line); 119 linelen = strlen(line);
@@ -126,8 +133,8 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
126 return -EINVAL; 133 return -EINVAL;
127 134
128 base = simple_strtoull(line + 5, &ptr, 0); 135 base = simple_strtoull(line + 5, &ptr, 0);
129 for (; isspace(*ptr); ++ptr) 136 while (isspace(*ptr))
130 ; 137 ptr++;
131 138
132 if (strncmp(ptr, "size=", 5)) 139 if (strncmp(ptr, "size=", 5))
133 return -EINVAL; 140 return -EINVAL;
@@ -135,14 +142,14 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
135 size = simple_strtoull(ptr + 5, &ptr, 0); 142 size = simple_strtoull(ptr + 5, &ptr, 0);
136 if ((base & 0xfff) || (size & 0xfff)) 143 if ((base & 0xfff) || (size & 0xfff))
137 return -EINVAL; 144 return -EINVAL;
138 for (; isspace(*ptr); ++ptr) 145 while (isspace(*ptr))
139 ; 146 ptr++;
140 147
141 if (strncmp(ptr, "type=", 5)) 148 if (strncmp(ptr, "type=", 5))
142 return -EINVAL; 149 return -EINVAL;
143 ptr += 5; 150 ptr += 5;
144 for (; isspace(*ptr); ++ptr) 151 while (isspace(*ptr))
145 ; 152 ptr++;
146 153
147 for (i = 0; i < MTRR_NUM_TYPES; ++i) { 154 for (i = 0; i < MTRR_NUM_TYPES; ++i) {
148 if (strcmp(ptr, mtrr_strings[i])) 155 if (strcmp(ptr, mtrr_strings[i]))
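
The new mtrr_write() prologue above replaces the old truncate-to-LINE_SIZE copy with an explicit clamp: drop the trailing byte, cap the copy at LINE_SIZE - 1 so the buffer stays NUL-terminated, and turn an empty write into -EINVAL via the length < 0 test. The same shape as a standalone function, with memcpy standing in for copy_from_user:

    #include <stdio.h>
    #include <string.h>

    #define LINE_SIZE 80

    /* Copy at most LINE_SIZE - 1 bytes of a user write, NUL-terminated. */
    static int read_line(char *line, const char *buf, size_t len)
    {
        int length;

        memset(line, 0, LINE_SIZE);

        length = len;
        length--;                       /* drop the trailing newline slot */

        if (length > LINE_SIZE - 1)
            length = LINE_SIZE - 1;     /* clamp to the buffer */

        if (length < 0)
            return -1;                  /* empty write: -EINVAL in the kernel */

        memcpy(line, buf, length);      /* copy_from_user() in the kernel */
        return length;
    }

    int main(void)
    {
        const char *req = "base=0x100000 size=0x1000 type=uncachable\n";
        char line[LINE_SIZE];

        if (read_line(line, req, strlen(req)) >= 0)
            printf("parsed: '%s'\n", line);
        return 0;
    }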
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 7af0f88a4163..84e83de54575 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -58,6 +58,7 @@ unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
58static DEFINE_MUTEX(mtrr_mutex); 58static DEFINE_MUTEX(mtrr_mutex);
59 59
60u64 size_or_mask, size_and_mask; 60u64 size_or_mask, size_and_mask;
61static bool mtrr_aps_delayed_init;
61 62
62static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; 63static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM];
63 64
@@ -163,7 +164,10 @@ static void ipi_handler(void *info)
163 if (data->smp_reg != ~0U) { 164 if (data->smp_reg != ~0U) {
164 mtrr_if->set(data->smp_reg, data->smp_base, 165 mtrr_if->set(data->smp_reg, data->smp_base,
165 data->smp_size, data->smp_type); 166 data->smp_size, data->smp_type);
166 } else { 167 } else if (mtrr_aps_delayed_init) {
168 /*
 169 * Initialize the MTRRs in addition to the synchronisation.
170 */
167 mtrr_if->set_all(); 171 mtrr_if->set_all();
168 } 172 }
169 173
@@ -265,6 +269,8 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
265 */ 269 */
266 if (reg != ~0U) 270 if (reg != ~0U)
267 mtrr_if->set(reg, base, size, type); 271 mtrr_if->set(reg, base, size, type);
272 else if (!mtrr_aps_delayed_init)
273 mtrr_if->set_all();
268 274
269 /* Wait for the others */ 275 /* Wait for the others */
270 while (atomic_read(&data.count)) 276 while (atomic_read(&data.count))
@@ -721,9 +727,7 @@ void __init mtrr_bp_init(void)
721 727
722void mtrr_ap_init(void) 728void mtrr_ap_init(void)
723{ 729{
724 unsigned long flags; 730 if (!use_intel() || mtrr_aps_delayed_init)
725
726 if (!mtrr_if || !use_intel())
727 return; 731 return;
728 /* 732 /*
729 * Ideally we should hold mtrr_mutex here to avoid mtrr entries 733 * Ideally we should hold mtrr_mutex here to avoid mtrr entries
@@ -738,11 +742,7 @@ void mtrr_ap_init(void)
738 * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug 742 * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug
739 * lock to prevent mtrr entry changes 743 * lock to prevent mtrr entry changes
740 */ 744 */
741 local_irq_save(flags); 745 set_mtrr(~0U, 0, 0, 0);
742
743 mtrr_if->set_all();
744
745 local_irq_restore(flags);
746} 746}
747 747
748/** 748/**
@@ -753,6 +753,34 @@ void mtrr_save_state(void)
753 smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1); 753 smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1);
754} 754}
755 755
756void set_mtrr_aps_delayed_init(void)
757{
758 if (!use_intel())
759 return;
760
761 mtrr_aps_delayed_init = true;
762}
763
764/*
765 * MTRR initialization for all APs
766 */
767void mtrr_aps_init(void)
768{
769 if (!use_intel())
770 return;
771
772 set_mtrr(~0U, 0, 0, 0);
773 mtrr_aps_delayed_init = false;
774}
775
776void mtrr_bp_restore(void)
777{
778 if (!use_intel())
779 return;
780
781 mtrr_if->set_all();
782}
783
756static int __init mtrr_init_finialize(void) 784static int __init mtrr_init_finialize(void)
757{ 785{
758 if (!mtrr_if) 786 if (!mtrr_if)
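
Taken together, the mtrr_aps_delayed_init changes above turn per-AP MTRR programming into one batched rendezvous: while the flag is set, mtrr_ap_init() returns early, and mtrr_aps_init() later issues a single set_mtrr(~0U, 0, 0, 0) that makes every online CPU replay the MTRR state. A runnable skeleton of that call order; the stubs only print, and set_mtrr() is reduced to its reg argument:

    #include <stdbool.h>
    #include <stdio.h>

    static bool mtrr_aps_delayed_init;

    /* Rendezvous: reg == ~0U means "replay the whole MTRR state". */
    static void set_mtrr(unsigned int reg)
    {
        printf("rendezvous: %s\n",
               reg == ~0U ? "set_all on every CPU" : "set one register");
    }

    static void mtrr_ap_init(int cpu)
    {
        if (mtrr_aps_delayed_init) {
            printf("cpu%d: init deferred\n", cpu);
            return;                     /* batched later in mtrr_aps_init() */
        }
        set_mtrr(~0U);                  /* immediate per-AP rendezvous */
    }

    static void mtrr_aps_init(void)
    {
        set_mtrr(~0U);                  /* one rendezvous for all APs */
        mtrr_aps_delayed_init = false;
    }

    int main(void)
    {
        int cpu;

        mtrr_aps_delayed_init = true;   /* set_mtrr_aps_delayed_init() at boot */
        for (cpu = 1; cpu < 4; cpu++)
            mtrr_ap_init(cpu);          /* each AP defers */
        mtrr_aps_init();                /* single batched init */
        return 0;
    }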
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_event.c
index f9cd0849bd42..2e20bca3cca1 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Performance counter x86 architecture code 2 * Performance events x86 architecture code
3 * 3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> 4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar 5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
@@ -11,7 +11,7 @@
11 * For licencing details see kernel-base/COPYING 11 * For licencing details see kernel-base/COPYING
12 */ 12 */
13 13
14#include <linux/perf_counter.h> 14#include <linux/perf_event.h>
15#include <linux/capability.h> 15#include <linux/capability.h>
16#include <linux/notifier.h> 16#include <linux/notifier.h>
17#include <linux/hardirq.h> 17#include <linux/hardirq.h>
@@ -27,19 +27,19 @@
27#include <asm/stacktrace.h> 27#include <asm/stacktrace.h>
28#include <asm/nmi.h> 28#include <asm/nmi.h>
29 29
30static u64 perf_counter_mask __read_mostly; 30static u64 perf_event_mask __read_mostly;
31 31
32/* The maximal number of PEBS counters: */ 32/* The maximal number of PEBS events: */
33#define MAX_PEBS_COUNTERS 4 33#define MAX_PEBS_EVENTS 4
34 34
35/* The size of a BTS record in bytes: */ 35/* The size of a BTS record in bytes: */
36#define BTS_RECORD_SIZE 24 36#define BTS_RECORD_SIZE 24
37 37
38/* The size of a per-cpu BTS buffer in bytes: */ 38/* The size of a per-cpu BTS buffer in bytes: */
39#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 1024) 39#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 2048)
40 40
41/* The BTS overflow threshold in bytes from the end of the buffer: */ 41/* The BTS overflow threshold in bytes from the end of the buffer: */
42#define BTS_OVFL_TH (BTS_RECORD_SIZE * 64) 42#define BTS_OVFL_TH (BTS_RECORD_SIZE * 128)
43 43
44 44
45/* 45/*
@@ -65,11 +65,11 @@ struct debug_store {
65 u64 pebs_index; 65 u64 pebs_index;
66 u64 pebs_absolute_maximum; 66 u64 pebs_absolute_maximum;
67 u64 pebs_interrupt_threshold; 67 u64 pebs_interrupt_threshold;
68 u64 pebs_counter_reset[MAX_PEBS_COUNTERS]; 68 u64 pebs_event_reset[MAX_PEBS_EVENTS];
69}; 69};
70 70
71struct cpu_hw_counters { 71struct cpu_hw_events {
72 struct perf_counter *counters[X86_PMC_IDX_MAX]; 72 struct perf_event *events[X86_PMC_IDX_MAX];
73 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 73 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
74 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 74 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
75 unsigned long interrupts; 75 unsigned long interrupts;
@@ -77,6 +77,18 @@ struct cpu_hw_counters {
77 struct debug_store *ds; 77 struct debug_store *ds;
78}; 78};
79 79
80struct event_constraint {
81 unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
82 int code;
83};
84
85#define EVENT_CONSTRAINT(c, m) { .code = (c), .idxmsk[0] = (m) }
86#define EVENT_CONSTRAINT_END { .code = 0, .idxmsk[0] = 0 }
87
88#define for_each_event_constraint(e, c) \
89 for ((e) = (c); (e)->idxmsk[0]; (e)++)
90
91
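
An event_constraint pairs an event-select code with a bitmask of the counters allowed to host it, and EVENT_CONSTRAINT_END's empty idxmsk terminates for_each_event_constraint(). A self-contained illustration of the lookup, with idxmsk shrunk to one word and made-up table entries:

    #include <stdio.h>

    struct event_constraint {
        unsigned long idxmsk;   /* one word is enough for this sketch */
        int code;
    };

    #define EVENT_CONSTRAINT(c, m)  { .code = (c), .idxmsk = (m) }
    #define EVENT_CONSTRAINT_END    { .code = 0, .idxmsk = 0 }

    #define for_each_event_constraint(e, c) \
        for ((e) = (c); (e)->idxmsk; (e)++)

    static const struct event_constraint constraints[] = {
        EVENT_CONSTRAINT(0x12, 0x2),    /* MUL: counter 1 only */
        EVENT_CONSTRAINT(0x13, 0x2),    /* DIV: counter 1 only */
        EVENT_CONSTRAINT_END
    };

    /* Return the first allowed counter for 'code', or -1 if unconstrained. */
    static int constrained_idx(int code)
    {
        const struct event_constraint *e;
        int i;

        for_each_event_constraint(e, constraints) {
            if (e->code != code)
                continue;
            for (i = 0; i < 8 * (int)sizeof(e->idxmsk); i++)
                if (e->idxmsk & (1UL << i))
                    return i;
        }
        return -1;      /* fall back to the generic allocator */
    }

    int main(void)
    {
        printf("MUL -> counter %d\n", constrained_idx(0x12));
        printf("unconstrained -> %d\n", constrained_idx(0xc0));
        return 0;
    }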
80/* 92/*
81 * struct x86_pmu - generic x86 pmu 93 * struct x86_pmu - generic x86 pmu
82 */ 94 */
@@ -86,30 +98,34 @@ struct x86_pmu {
86 int (*handle_irq)(struct pt_regs *); 98 int (*handle_irq)(struct pt_regs *);
87 void (*disable_all)(void); 99 void (*disable_all)(void);
88 void (*enable_all)(void); 100 void (*enable_all)(void);
89 void (*enable)(struct hw_perf_counter *, int); 101 void (*enable)(struct hw_perf_event *, int);
90 void (*disable)(struct hw_perf_counter *, int); 102 void (*disable)(struct hw_perf_event *, int);
91 unsigned eventsel; 103 unsigned eventsel;
92 unsigned perfctr; 104 unsigned perfctr;
93 u64 (*event_map)(int); 105 u64 (*event_map)(int);
94 u64 (*raw_event)(u64); 106 u64 (*raw_event)(u64);
95 int max_events; 107 int max_events;
96 int num_counters; 108 int num_events;
97 int num_counters_fixed; 109 int num_events_fixed;
98 int counter_bits; 110 int event_bits;
99 u64 counter_mask; 111 u64 event_mask;
100 int apic; 112 int apic;
101 u64 max_period; 113 u64 max_period;
102 u64 intel_ctrl; 114 u64 intel_ctrl;
103 void (*enable_bts)(u64 config); 115 void (*enable_bts)(u64 config);
104 void (*disable_bts)(void); 116 void (*disable_bts)(void);
117 int (*get_event_idx)(struct cpu_hw_events *cpuc,
118 struct hw_perf_event *hwc);
105}; 119};
106 120
107static struct x86_pmu x86_pmu __read_mostly; 121static struct x86_pmu x86_pmu __read_mostly;
108 122
109static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { 123static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
110 .enabled = 1, 124 .enabled = 1,
111}; 125};
112 126
127static const struct event_constraint *event_constraints;
128
113/* 129/*
114 * Not sure about some of these 130 * Not sure about some of these
115 */ 131 */
@@ -124,37 +140,47 @@ static const u64 p6_perfmon_event_map[] =
124 [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, 140 [PERF_COUNT_HW_BUS_CYCLES] = 0x0062,
125}; 141};
126 142
127static u64 p6_pmu_event_map(int event) 143static u64 p6_pmu_event_map(int hw_event)
128{ 144{
129 return p6_perfmon_event_map[event]; 145 return p6_perfmon_event_map[hw_event];
130} 146}
131 147
132/* 148/*
133 * Counter setting that is specified not to count anything. 149 * Event setting that is specified not to count anything.
134 * We use this to effectively disable a counter. 150 * We use this to effectively disable a counter.
135 * 151 *
136 * L2_RQSTS with 0 MESI unit mask. 152 * L2_RQSTS with 0 MESI unit mask.
137 */ 153 */
138#define P6_NOP_COUNTER 0x0000002EULL 154#define P6_NOP_EVENT 0x0000002EULL
139 155
140static u64 p6_pmu_raw_event(u64 event) 156static u64 p6_pmu_raw_event(u64 hw_event)
141{ 157{
142#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL 158#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL
143#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL 159#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL
144#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL 160#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL
145#define P6_EVNTSEL_INV_MASK 0x00800000ULL 161#define P6_EVNTSEL_INV_MASK 0x00800000ULL
146#define P6_EVNTSEL_COUNTER_MASK 0xFF000000ULL 162#define P6_EVNTSEL_REG_MASK 0xFF000000ULL
147 163
148#define P6_EVNTSEL_MASK \ 164#define P6_EVNTSEL_MASK \
149 (P6_EVNTSEL_EVENT_MASK | \ 165 (P6_EVNTSEL_EVENT_MASK | \
150 P6_EVNTSEL_UNIT_MASK | \ 166 P6_EVNTSEL_UNIT_MASK | \
151 P6_EVNTSEL_EDGE_MASK | \ 167 P6_EVNTSEL_EDGE_MASK | \
152 P6_EVNTSEL_INV_MASK | \ 168 P6_EVNTSEL_INV_MASK | \
153 P6_EVNTSEL_COUNTER_MASK) 169 P6_EVNTSEL_REG_MASK)
154 170
155 return event & P6_EVNTSEL_MASK; 171 return hw_event & P6_EVNTSEL_MASK;
156} 172}
157 173
174static const struct event_constraint intel_p6_event_constraints[] =
175{
176 EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
177 EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
178 EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */
179 EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
180 EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
181 EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
182 EVENT_CONSTRAINT_END
183};
158 184
159/* 185/*
160 * Intel PerfMon v3. Used on Core2 and later. 186 * Intel PerfMon v3. Used on Core2 and later.
@@ -170,16 +196,45 @@ static const u64 intel_perfmon_event_map[] =
170 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, 196 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
171}; 197};
172 198
173static u64 intel_pmu_event_map(int event) 199static const struct event_constraint intel_core_event_constraints[] =
200{
201 EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
202 EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
203 EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
204 EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
205 EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
206 EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */
207 EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
208 EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */
209 EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */
210 EVENT_CONSTRAINT_END
211};
212
213static const struct event_constraint intel_nehalem_event_constraints[] =
214{
215 EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
216 EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
217 EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
218 EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */
219 EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */
220 EVENT_CONSTRAINT(0x4c, 0x3), /* LOAD_HIT_PRE */
221 EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
222 EVENT_CONSTRAINT(0x52, 0x3), /* L1D_CACHE_PREFETCH_LOCK_FB_HIT */
223 EVENT_CONSTRAINT(0x53, 0x3), /* L1D_CACHE_LOCK_FB_HIT */
224 EVENT_CONSTRAINT(0xc5, 0x3), /* CACHE_LOCK_CYCLES */
225 EVENT_CONSTRAINT_END
226};
227
228static u64 intel_pmu_event_map(int hw_event)
174{ 229{
175 return intel_perfmon_event_map[event]; 230 return intel_perfmon_event_map[hw_event];
176} 231}
177 232
178/* 233/*
179 * Generalized hw caching related event table, filled 234 * Generalized hw caching related hw_event table, filled
180 * in on a per model basis. A value of 0 means 235 * in on a per model basis. A value of 0 means
181 * 'not supported', -1 means 'event makes no sense on 236 * 'not supported', -1 means 'hw_event makes no sense on
182 * this CPU', any other value means the raw event 237 * this CPU', any other value means the raw hw_event
183 * ID. 238 * ID.
184 */ 239 */
185 240
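
The sentinel convention spelled out in the comment above is what set_ext_hw_attr() later relies on: a 0 entry means the combination is unsupported, -1 means it is meaningless on this CPU, anything else is a raw event id. A reduced decoder showing those checks; the table contents and return codes here are invented for the example:

    #include <stdio.h>

    enum { C_L1D, C_MAX };
    enum { OP_READ, OP_MAX };
    enum { RES_ACCESS, RES_MISS, RES_MAX };

    static const long long cache_event_ids[C_MAX][OP_MAX][RES_MAX] = {
        [C_L1D][OP_READ][RES_ACCESS] = 0x0f40, /* raw event id */
        [C_L1D][OP_READ][RES_MISS]   = -1,     /* senseless on this CPU */
        /* unset entries default to 0: not supported */
    };

    /* Mirror of the validation: reject 0 and -1 entries. */
    static int cache_event_config(int type, int op, int result,
                                  unsigned long long *cfg)
    {
        long long val = cache_event_ids[type][op][result];

        if (val == 0)
            return -2;  /* -ENOENT in the kernel */
        if (val == -1)
            return -1;  /* -EINVAL in the kernel */
        *cfg = val;
        return 0;
    }

    int main(void)
    {
        unsigned long long cfg;

        if (cache_event_config(C_L1D, OP_READ, RES_ACCESS, &cfg) == 0)
            printf("config = %#llx\n", cfg);
        printf("miss -> %d\n", cache_event_config(C_L1D, OP_READ, RES_MISS, &cfg));
        return 0;
    }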
@@ -463,22 +518,22 @@ static const u64 atom_hw_cache_event_ids
463 }, 518 },
464}; 519};
465 520
466static u64 intel_pmu_raw_event(u64 event) 521static u64 intel_pmu_raw_event(u64 hw_event)
467{ 522{
468#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL 523#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
469#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL 524#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
470#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL 525#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
471#define CORE_EVNTSEL_INV_MASK 0x00800000ULL 526#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
472#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL 527#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL
473 528
474#define CORE_EVNTSEL_MASK \ 529#define CORE_EVNTSEL_MASK \
475 (CORE_EVNTSEL_EVENT_MASK | \ 530 (CORE_EVNTSEL_EVENT_MASK | \
476 CORE_EVNTSEL_UNIT_MASK | \ 531 CORE_EVNTSEL_UNIT_MASK | \
477 CORE_EVNTSEL_EDGE_MASK | \ 532 CORE_EVNTSEL_EDGE_MASK | \
478 CORE_EVNTSEL_INV_MASK | \ 533 CORE_EVNTSEL_INV_MASK | \
479 CORE_EVNTSEL_COUNTER_MASK) 534 CORE_EVNTSEL_REG_MASK)
480 535
481 return event & CORE_EVNTSEL_MASK; 536 return hw_event & CORE_EVNTSEL_MASK;
482} 537}
483 538
484static const u64 amd_hw_cache_event_ids 539static const u64 amd_hw_cache_event_ids
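
Each raw_event() helper keeps only the architectural EVNTSEL fields of a user-supplied config (event code, unit mask, edge, invert, and the counter-mask byte), so privileged bits such as the enable bit never survive. The masking on its own:

    #include <stdio.h>

    #define EVNTSEL_EVENT_MASK  0x000000FFULL
    #define EVNTSEL_UNIT_MASK   0x0000FF00ULL
    #define EVNTSEL_EDGE_MASK   0x00040000ULL
    #define EVNTSEL_INV_MASK    0x00800000ULL
    #define EVNTSEL_REG_MASK    0xFF000000ULL

    #define EVNTSEL_MASK (EVNTSEL_EVENT_MASK | EVNTSEL_UNIT_MASK | \
                          EVNTSEL_EDGE_MASK | EVNTSEL_INV_MASK | EVNTSEL_REG_MASK)

    int main(void)
    {
        /* User asks for event 0x3c, umask 0x01, but also smuggles in
         * the enable bit (bit 22): the mask strips it. */
        unsigned long long hw_event = 0x40013cULL;

        printf("raw %#llx -> sanitized %#llx\n",
               hw_event, hw_event & EVNTSEL_MASK);
        return 0;
    }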
@@ -585,39 +640,39 @@ static const u64 amd_perfmon_event_map[] =
585 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, 640 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
586}; 641};
587 642
588static u64 amd_pmu_event_map(int event) 643static u64 amd_pmu_event_map(int hw_event)
589{ 644{
590 return amd_perfmon_event_map[event]; 645 return amd_perfmon_event_map[hw_event];
591} 646}
592 647
593static u64 amd_pmu_raw_event(u64 event) 648static u64 amd_pmu_raw_event(u64 hw_event)
594{ 649{
595#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL 650#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL
596#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL 651#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL
597#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL 652#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL
598#define K7_EVNTSEL_INV_MASK 0x000800000ULL 653#define K7_EVNTSEL_INV_MASK 0x000800000ULL
599#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL 654#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL
600 655
601#define K7_EVNTSEL_MASK \ 656#define K7_EVNTSEL_MASK \
602 (K7_EVNTSEL_EVENT_MASK | \ 657 (K7_EVNTSEL_EVENT_MASK | \
603 K7_EVNTSEL_UNIT_MASK | \ 658 K7_EVNTSEL_UNIT_MASK | \
604 K7_EVNTSEL_EDGE_MASK | \ 659 K7_EVNTSEL_EDGE_MASK | \
605 K7_EVNTSEL_INV_MASK | \ 660 K7_EVNTSEL_INV_MASK | \
606 K7_EVNTSEL_COUNTER_MASK) 661 K7_EVNTSEL_REG_MASK)
607 662
608 return event & K7_EVNTSEL_MASK; 663 return hw_event & K7_EVNTSEL_MASK;
609} 664}
610 665
611/* 666/*
612 * Propagate counter elapsed time into the generic counter. 667 * Propagate event elapsed time into the generic event.
613 * Can only be executed on the CPU where the counter is active. 668 * Can only be executed on the CPU where the event is active.
614 * Returns the delta events processed. 669 * Returns the delta events processed.
615 */ 670 */
616static u64 671static u64
617x86_perf_counter_update(struct perf_counter *counter, 672x86_perf_event_update(struct perf_event *event,
618 struct hw_perf_counter *hwc, int idx) 673 struct hw_perf_event *hwc, int idx)
619{ 674{
620 int shift = 64 - x86_pmu.counter_bits; 675 int shift = 64 - x86_pmu.event_bits;
621 u64 prev_raw_count, new_raw_count; 676 u64 prev_raw_count, new_raw_count;
622 s64 delta; 677 s64 delta;
623 678
@@ -625,15 +680,15 @@ x86_perf_counter_update(struct perf_counter *counter,
625 return 0; 680 return 0;
626 681
627 /* 682 /*
628 * Careful: an NMI might modify the previous counter value. 683 * Careful: an NMI might modify the previous event value.
629 * 684 *
630 * Our tactic to handle this is to first atomically read and 685 * Our tactic to handle this is to first atomically read and
631 * exchange a new raw count - then add that new-prev delta 686 * exchange a new raw count - then add that new-prev delta
632 * count to the generic counter atomically: 687 * count to the generic event atomically:
633 */ 688 */
634again: 689again:
635 prev_raw_count = atomic64_read(&hwc->prev_count); 690 prev_raw_count = atomic64_read(&hwc->prev_count);
636 rdmsrl(hwc->counter_base + idx, new_raw_count); 691 rdmsrl(hwc->event_base + idx, new_raw_count);
637 692
638 if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, 693 if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
639 new_raw_count) != prev_raw_count) 694 new_raw_count) != prev_raw_count)
@@ -642,7 +697,7 @@ again:
642 /* 697 /*
643 * Now we have the new raw value and have updated the prev 698 * Now we have the new raw value and have updated the prev
644 * timestamp already. We can now calculate the elapsed delta 699 * timestamp already. We can now calculate the elapsed delta
645 * (counter-)time and add that to the generic counter. 700 * (event-)time and add that to the generic event.
646 * 701 *
647 * Careful, not all hw sign-extends above the physical width 702 * Careful, not all hw sign-extends above the physical width
648 * of the count. 703 * of the count.
@@ -650,13 +705,13 @@ again:
650 delta = (new_raw_count << shift) - (prev_raw_count << shift); 705 delta = (new_raw_count << shift) - (prev_raw_count << shift);
651 delta >>= shift; 706 delta >>= shift;
652 707
653 atomic64_add(delta, &counter->count); 708 atomic64_add(delta, &event->count);
654 atomic64_sub(delta, &hwc->period_left); 709 atomic64_sub(delta, &hwc->period_left);
655 710
656 return new_raw_count; 711 return new_raw_count;
657} 712}
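
Two details of x86_perf_event_update() above deserve isolation: the cmpxchg loop that tolerates an NMI racing on prev_count, and the shift pair that sign-extends a counter narrower than 64 bits before the delta is taken. A userspace rendition with C11 atomics; read_counter() is a stand-in for rdmsrl, and the 40-bit width is just an example:

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    #define EVENT_BITS 40   /* assumed width of the hardware counter */

    static _Atomic uint64_t prev_count;
    static _Atomic int64_t  event_count;

    static uint64_t read_counter(void)  /* rdmsrl() stand-in */
    {
        static uint64_t fake = 100;
        return fake += 7;               /* counter advanced by 7 each read */
    }

    static uint64_t event_update(void)
    {
        int shift = 64 - EVENT_BITS;
        uint64_t prev_raw, new_raw;
        int64_t delta;

    again:
        prev_raw = atomic_load(&prev_count);
        new_raw = read_counter();

        /* An NMI may have updated prev_count meanwhile: retry if so. */
        if (!atomic_compare_exchange_strong(&prev_count, &prev_raw, new_raw))
            goto again;

        /* Sign-extend both raw values from EVENT_BITS to 64 bits. */
        delta = ((int64_t)(new_raw << shift) -
                 (int64_t)(prev_raw << shift)) >> shift;

        atomic_fetch_add(&event_count, delta);
        return new_raw;
    }

    int main(void)
    {
        event_update();
        event_update();
        printf("accumulated: %lld\n", (long long)atomic_load(&event_count));
        return 0;
    }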
658 713
659static atomic_t active_counters; 714static atomic_t active_events;
660static DEFINE_MUTEX(pmc_reserve_mutex); 715static DEFINE_MUTEX(pmc_reserve_mutex);
661 716
662static bool reserve_pmc_hardware(void) 717static bool reserve_pmc_hardware(void)
@@ -667,12 +722,12 @@ static bool reserve_pmc_hardware(void)
667 if (nmi_watchdog == NMI_LOCAL_APIC) 722 if (nmi_watchdog == NMI_LOCAL_APIC)
668 disable_lapic_nmi_watchdog(); 723 disable_lapic_nmi_watchdog();
669 724
670 for (i = 0; i < x86_pmu.num_counters; i++) { 725 for (i = 0; i < x86_pmu.num_events; i++) {
671 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) 726 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
672 goto perfctr_fail; 727 goto perfctr_fail;
673 } 728 }
674 729
675 for (i = 0; i < x86_pmu.num_counters; i++) { 730 for (i = 0; i < x86_pmu.num_events; i++) {
676 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) 731 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
677 goto eventsel_fail; 732 goto eventsel_fail;
678 } 733 }
@@ -685,7 +740,7 @@ eventsel_fail:
685 for (i--; i >= 0; i--) 740 for (i--; i >= 0; i--)
686 release_evntsel_nmi(x86_pmu.eventsel + i); 741 release_evntsel_nmi(x86_pmu.eventsel + i);
687 742
688 i = x86_pmu.num_counters; 743 i = x86_pmu.num_events;
689 744
690perfctr_fail: 745perfctr_fail:
691 for (i--; i >= 0; i--) 746 for (i--; i >= 0; i--)
@@ -703,7 +758,7 @@ static void release_pmc_hardware(void)
703#ifdef CONFIG_X86_LOCAL_APIC 758#ifdef CONFIG_X86_LOCAL_APIC
704 int i; 759 int i;
705 760
706 for (i = 0; i < x86_pmu.num_counters; i++) { 761 for (i = 0; i < x86_pmu.num_events; i++) {
707 release_perfctr_nmi(x86_pmu.perfctr + i); 762 release_perfctr_nmi(x86_pmu.perfctr + i);
708 release_evntsel_nmi(x86_pmu.eventsel + i); 763 release_evntsel_nmi(x86_pmu.eventsel + i);
709 } 764 }
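
reserve_pmc_hardware() and release_pmc_hardware() above use the kernel's classic goto-unwind idiom: on the i-th failure, unwind the partially reserved class backwards, then fall into the label that releases all of the previous class. The control flow with hypothetical try_take()/give_back() stubs in place of the NMI-registry calls:

    #include <stdbool.h>
    #include <stdio.h>

    #define NUM 4

    static bool try_take(const char *what, int i)
    {
        printf("take %s %d\n", what, i);
        return i != 2 || what[0] != 'e';    /* force a failure at evntsel 2 */
    }

    static void give_back(const char *what, int i)
    {
        printf("release %s %d\n", what, i);
    }

    static bool reserve_hardware(void)
    {
        int i;

        for (i = 0; i < NUM; i++)
            if (!try_take("perfctr", i))
                goto perfctr_fail;

        for (i = 0; i < NUM; i++)
            if (!try_take("evntsel", i))
                goto eventsel_fail;

        return true;

    eventsel_fail:
        for (i--; i >= 0; i--)
            give_back("evntsel", i);
        i = NUM;                /* all perfctrs were taken: release them too */
    perfctr_fail:
        for (i--; i >= 0; i--)
            give_back("perfctr", i);
        return false;
    }

    int main(void)
    {
        printf("reserved: %d\n", reserve_hardware());
        return 0;
    }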
@@ -720,7 +775,7 @@ static inline bool bts_available(void)
720 775
721static inline void init_debug_store_on_cpu(int cpu) 776static inline void init_debug_store_on_cpu(int cpu)
722{ 777{
723 struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds; 778 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
724 779
725 if (!ds) 780 if (!ds)
726 return; 781 return;
@@ -732,7 +787,7 @@ static inline void init_debug_store_on_cpu(int cpu)
732 787
733static inline void fini_debug_store_on_cpu(int cpu) 788static inline void fini_debug_store_on_cpu(int cpu)
734{ 789{
735 if (!per_cpu(cpu_hw_counters, cpu).ds) 790 if (!per_cpu(cpu_hw_events, cpu).ds)
736 return; 791 return;
737 792
738 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); 793 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
@@ -751,12 +806,12 @@ static void release_bts_hardware(void)
751 fini_debug_store_on_cpu(cpu); 806 fini_debug_store_on_cpu(cpu);
752 807
753 for_each_possible_cpu(cpu) { 808 for_each_possible_cpu(cpu) {
754 struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds; 809 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
755 810
756 if (!ds) 811 if (!ds)
757 continue; 812 continue;
758 813
759 per_cpu(cpu_hw_counters, cpu).ds = NULL; 814 per_cpu(cpu_hw_events, cpu).ds = NULL;
760 815
761 kfree((void *)(unsigned long)ds->bts_buffer_base); 816 kfree((void *)(unsigned long)ds->bts_buffer_base);
762 kfree(ds); 817 kfree(ds);
@@ -796,7 +851,7 @@ static int reserve_bts_hardware(void)
796 ds->bts_interrupt_threshold = 851 ds->bts_interrupt_threshold =
797 ds->bts_absolute_maximum - BTS_OVFL_TH; 852 ds->bts_absolute_maximum - BTS_OVFL_TH;
798 853
799 per_cpu(cpu_hw_counters, cpu).ds = ds; 854 per_cpu(cpu_hw_events, cpu).ds = ds;
800 err = 0; 855 err = 0;
801 } 856 }
802 857
@@ -812,9 +867,9 @@ static int reserve_bts_hardware(void)
812 return err; 867 return err;
813} 868}
814 869
815static void hw_perf_counter_destroy(struct perf_counter *counter) 870static void hw_perf_event_destroy(struct perf_event *event)
816{ 871{
817 if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) { 872 if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
818 release_pmc_hardware(); 873 release_pmc_hardware();
819 release_bts_hardware(); 874 release_bts_hardware();
820 mutex_unlock(&pmc_reserve_mutex); 875 mutex_unlock(&pmc_reserve_mutex);
@@ -827,7 +882,7 @@ static inline int x86_pmu_initialized(void)
827} 882}
828 883
829static inline int 884static inline int
830set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr) 885set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
831{ 886{
832 unsigned int cache_type, cache_op, cache_result; 887 unsigned int cache_type, cache_op, cache_result;
833 u64 config, val; 888 u64 config, val;
@@ -880,7 +935,7 @@ static void intel_pmu_enable_bts(u64 config)
880 935
881static void intel_pmu_disable_bts(void) 936static void intel_pmu_disable_bts(void)
882{ 937{
883 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 938 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
884 unsigned long debugctlmsr; 939 unsigned long debugctlmsr;
885 940
886 if (!cpuc->ds) 941 if (!cpuc->ds)
@@ -898,10 +953,10 @@ static void intel_pmu_disable_bts(void)
898/* 953/*
899 * Setup the hardware configuration for a given attr_type 954 * Setup the hardware configuration for a given attr_type
900 */ 955 */
901static int __hw_perf_counter_init(struct perf_counter *counter) 956static int __hw_perf_event_init(struct perf_event *event)
902{ 957{
903 struct perf_counter_attr *attr = &counter->attr; 958 struct perf_event_attr *attr = &event->attr;
904 struct hw_perf_counter *hwc = &counter->hw; 959 struct hw_perf_event *hwc = &event->hw;
905 u64 config; 960 u64 config;
906 int err; 961 int err;
907 962
@@ -909,27 +964,31 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
909 return -ENODEV; 964 return -ENODEV;
910 965
911 err = 0; 966 err = 0;
912 if (!atomic_inc_not_zero(&active_counters)) { 967 if (!atomic_inc_not_zero(&active_events)) {
913 mutex_lock(&pmc_reserve_mutex); 968 mutex_lock(&pmc_reserve_mutex);
914 if (atomic_read(&active_counters) == 0) { 969 if (atomic_read(&active_events) == 0) {
915 if (!reserve_pmc_hardware()) 970 if (!reserve_pmc_hardware())
916 err = -EBUSY; 971 err = -EBUSY;
917 else 972 else
918 err = reserve_bts_hardware(); 973 err = reserve_bts_hardware();
919 } 974 }
920 if (!err) 975 if (!err)
921 atomic_inc(&active_counters); 976 atomic_inc(&active_events);
922 mutex_unlock(&pmc_reserve_mutex); 977 mutex_unlock(&pmc_reserve_mutex);
923 } 978 }
924 if (err) 979 if (err)
925 return err; 980 return err;
926 981
982 event->destroy = hw_perf_event_destroy;
983
927 /* 984 /*
928 * Generate PMC IRQs: 985 * Generate PMC IRQs:
929 * (keep 'enabled' bit clear for now) 986 * (keep 'enabled' bit clear for now)
930 */ 987 */
931 hwc->config = ARCH_PERFMON_EVENTSEL_INT; 988 hwc->config = ARCH_PERFMON_EVENTSEL_INT;
932 989
990 hwc->idx = -1;
991
933 /* 992 /*
934 * Count user and OS events unless requested not to. 993 * Count user and OS events unless requested not to.
935 */ 994 */
@@ -946,17 +1005,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
946 /* 1005 /*
947 * If we have a PMU initialized but no APIC 1006 * If we have a PMU initialized but no APIC
948 * interrupts, we cannot sample hardware 1007 * interrupts, we cannot sample hardware
949 * counters (user-space has to fall back and 1008 * events (user-space has to fall back and
950 * sample via a hrtimer based software counter): 1009 * sample via a hrtimer based software event):
951 */ 1010 */
952 if (!x86_pmu.apic) 1011 if (!x86_pmu.apic)
953 return -EOPNOTSUPP; 1012 return -EOPNOTSUPP;
954 } 1013 }
955 1014
956 counter->destroy = hw_perf_counter_destroy;
957
958 /* 1015 /*
959 * Raw event type provides the config in the event structure 1016 * Raw hw_event type provides the config in the hw_event structure
960 */ 1017 */
961 if (attr->type == PERF_TYPE_RAW) { 1018 if (attr->type == PERF_TYPE_RAW) {
962 hwc->config |= x86_pmu.raw_event(attr->config); 1019 hwc->config |= x86_pmu.raw_event(attr->config);
@@ -1001,7 +1058,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
1001 1058
1002static void p6_pmu_disable_all(void) 1059static void p6_pmu_disable_all(void)
1003{ 1060{
1004 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1061 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1005 u64 val; 1062 u64 val;
1006 1063
1007 if (!cpuc->enabled) 1064 if (!cpuc->enabled)
@@ -1018,7 +1075,7 @@ static void p6_pmu_disable_all(void)
1018 1075
1019static void intel_pmu_disable_all(void) 1076static void intel_pmu_disable_all(void)
1020{ 1077{
1021 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1078 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1022 1079
1023 if (!cpuc->enabled) 1080 if (!cpuc->enabled)
1024 return; 1081 return;
@@ -1034,7 +1091,7 @@ static void intel_pmu_disable_all(void)
1034 1091
1035static void amd_pmu_disable_all(void) 1092static void amd_pmu_disable_all(void)
1036{ 1093{
1037 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1094 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1038 int idx; 1095 int idx;
1039 1096
1040 if (!cpuc->enabled) 1097 if (!cpuc->enabled)
@@ -1043,12 +1100,12 @@ static void amd_pmu_disable_all(void)
1043 cpuc->enabled = 0; 1100 cpuc->enabled = 0;
1044 /* 1101 /*
1045 * ensure we write the disable before we start disabling the 1102 * ensure we write the disable before we start disabling the
1046 * counters proper, so that amd_pmu_enable_counter() does the 1103 * events proper, so that amd_pmu_enable_event() does the
1047 * right thing. 1104 * right thing.
1048 */ 1105 */
1049 barrier(); 1106 barrier();
1050 1107
1051 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1108 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1052 u64 val; 1109 u64 val;
1053 1110
1054 if (!test_bit(idx, cpuc->active_mask)) 1111 if (!test_bit(idx, cpuc->active_mask))
@@ -1070,7 +1127,7 @@ void hw_perf_disable(void)
1070 1127
1071static void p6_pmu_enable_all(void) 1128static void p6_pmu_enable_all(void)
1072{ 1129{
1073 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1130 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1074 unsigned long val; 1131 unsigned long val;
1075 1132
1076 if (cpuc->enabled) 1133 if (cpuc->enabled)
@@ -1087,7 +1144,7 @@ static void p6_pmu_enable_all(void)
1087 1144
1088static void intel_pmu_enable_all(void) 1145static void intel_pmu_enable_all(void)
1089{ 1146{
1090 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1147 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1091 1148
1092 if (cpuc->enabled) 1149 if (cpuc->enabled)
1093 return; 1150 return;
@@ -1098,19 +1155,19 @@ static void intel_pmu_enable_all(void)
1098 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); 1155 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
1099 1156
1100 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { 1157 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
1101 struct perf_counter *counter = 1158 struct perf_event *event =
1102 cpuc->counters[X86_PMC_IDX_FIXED_BTS]; 1159 cpuc->events[X86_PMC_IDX_FIXED_BTS];
1103 1160
1104 if (WARN_ON_ONCE(!counter)) 1161 if (WARN_ON_ONCE(!event))
1105 return; 1162 return;
1106 1163
1107 intel_pmu_enable_bts(counter->hw.config); 1164 intel_pmu_enable_bts(event->hw.config);
1108 } 1165 }
1109} 1166}
1110 1167
1111static void amd_pmu_enable_all(void) 1168static void amd_pmu_enable_all(void)
1112{ 1169{
1113 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1170 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1114 int idx; 1171 int idx;
1115 1172
1116 if (cpuc->enabled) 1173 if (cpuc->enabled)
@@ -1119,14 +1176,14 @@ static void amd_pmu_enable_all(void)
1119 cpuc->enabled = 1; 1176 cpuc->enabled = 1;
1120 barrier(); 1177 barrier();
1121 1178
1122 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1179 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1123 struct perf_counter *counter = cpuc->counters[idx]; 1180 struct perf_event *event = cpuc->events[idx];
1124 u64 val; 1181 u64 val;
1125 1182
1126 if (!test_bit(idx, cpuc->active_mask)) 1183 if (!test_bit(idx, cpuc->active_mask))
1127 continue; 1184 continue;
1128 1185
1129 val = counter->hw.config; 1186 val = event->hw.config;
1130 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 1187 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
1131 wrmsrl(MSR_K7_EVNTSEL0 + idx, val); 1188 wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
1132 } 1189 }
@@ -1153,19 +1210,19 @@ static inline void intel_pmu_ack_status(u64 ack)
1153 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); 1210 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
1154} 1211}
1155 1212
1156static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) 1213static inline void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx)
1157{ 1214{
1158 (void)checking_wrmsrl(hwc->config_base + idx, 1215 (void)checking_wrmsrl(hwc->config_base + idx,
1159 hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); 1216 hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
1160} 1217}
1161 1218
1162static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) 1219static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx)
1163{ 1220{
1164 (void)checking_wrmsrl(hwc->config_base + idx, hwc->config); 1221 (void)checking_wrmsrl(hwc->config_base + idx, hwc->config);
1165} 1222}
1166 1223
1167static inline void 1224static inline void
1168intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx) 1225intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx)
1169{ 1226{
1170 int idx = __idx - X86_PMC_IDX_FIXED; 1227 int idx = __idx - X86_PMC_IDX_FIXED;
1171 u64 ctrl_val, mask; 1228 u64 ctrl_val, mask;
@@ -1178,10 +1235,10 @@ intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx)
1178} 1235}
1179 1236
1180static inline void 1237static inline void
1181p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) 1238p6_pmu_disable_event(struct hw_perf_event *hwc, int idx)
1182{ 1239{
1183 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1240 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1184 u64 val = P6_NOP_COUNTER; 1241 u64 val = P6_NOP_EVENT;
1185 1242
1186 if (cpuc->enabled) 1243 if (cpuc->enabled)
1187 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 1244 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
@@ -1190,7 +1247,7 @@ p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
1190} 1247}
1191 1248
1192static inline void 1249static inline void
1193intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) 1250intel_pmu_disable_event(struct hw_perf_event *hwc, int idx)
1194{ 1251{
1195 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { 1252 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
1196 intel_pmu_disable_bts(); 1253 intel_pmu_disable_bts();
@@ -1202,24 +1259,24 @@ intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
1202 return; 1259 return;
1203 } 1260 }
1204 1261
1205 x86_pmu_disable_counter(hwc, idx); 1262 x86_pmu_disable_event(hwc, idx);
1206} 1263}
1207 1264
1208static inline void 1265static inline void
1209amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) 1266amd_pmu_disable_event(struct hw_perf_event *hwc, int idx)
1210{ 1267{
1211 x86_pmu_disable_counter(hwc, idx); 1268 x86_pmu_disable_event(hwc, idx);
1212} 1269}
1213 1270
1214static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); 1271static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
1215 1272
1216/* 1273/*
1217 * Set the next IRQ period, based on the hwc->period_left value. 1274 * Set the next IRQ period, based on the hwc->period_left value.
1218 * To be called with the counter disabled in hw: 1275 * To be called with the event disabled in hw:
1219 */ 1276 */
1220static int 1277static int
1221x86_perf_counter_set_period(struct perf_counter *counter, 1278x86_perf_event_set_period(struct perf_event *event,
1222 struct hw_perf_counter *hwc, int idx) 1279 struct hw_perf_event *hwc, int idx)
1223{ 1280{
1224 s64 left = atomic64_read(&hwc->period_left); 1281 s64 left = atomic64_read(&hwc->period_left);
1225 s64 period = hwc->sample_period; 1282 s64 period = hwc->sample_period;
@@ -1245,7 +1302,7 @@ x86_perf_counter_set_period(struct perf_counter *counter,
1245 ret = 1; 1302 ret = 1;
1246 } 1303 }
1247 /* 1304 /*
1248 * Quirk: certain CPUs don't like it if just 1 event is left: 1305 * Quirk: certain CPUs don't like it if just 1 hw_event is left:
1249 */ 1306 */
1250 if (unlikely(left < 2)) 1307 if (unlikely(left < 2))
1251 left = 2; 1308 left = 2;
@@ -1253,24 +1310,24 @@ x86_perf_counter_set_period(struct perf_counter *counter,
1253 if (left > x86_pmu.max_period) 1310 if (left > x86_pmu.max_period)
1254 left = x86_pmu.max_period; 1311 left = x86_pmu.max_period;
1255 1312
1256 per_cpu(prev_left[idx], smp_processor_id()) = left; 1313 per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
1257 1314
1258 /* 1315 /*
1259 * The hw counter starts counting from this counter offset, 1316 * The hw event starts counting from this event offset,
1260 * mark it to be able to extract future deltas: 1317 * mark it to be able to extract future deltas:
1261 */ 1318 */
1262 atomic64_set(&hwc->prev_count, (u64)-left); 1319 atomic64_set(&hwc->prev_count, (u64)-left);
1263 1320
1264 err = checking_wrmsrl(hwc->counter_base + idx, 1321 err = checking_wrmsrl(hwc->event_base + idx,
1265 (u64)(-left) & x86_pmu.counter_mask); 1322 (u64)(-left) & x86_pmu.event_mask);
1266 1323
1267 perf_counter_update_userpage(counter); 1324 perf_event_update_userpage(event);
1268 1325
1269 return ret; 1326 return ret;
1270} 1327}
1271 1328
1272static inline void 1329static inline void
1273intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx) 1330intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx)
1274{ 1331{
1275 int idx = __idx - X86_PMC_IDX_FIXED; 1332 int idx = __idx - X86_PMC_IDX_FIXED;
1276 u64 ctrl_val, bits, mask; 1333 u64 ctrl_val, bits, mask;
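
The key line in x86_perf_event_set_period() above is the write of (u64)(-left) & x86_pmu.event_mask: preloading the counter with the two's complement of the remaining period, truncated to the counter width, makes it overflow (and raise the PMI) after exactly 'left' increments. A few lines verifying that arithmetic for an assumed 40-bit counter:

    #include <stdint.h>
    #include <stdio.h>

    #define EVENT_BITS 40
    #define EVENT_MASK ((1ULL << EVENT_BITS) - 1)

    int main(void)
    {
        int64_t left = 100000;          /* remaining sample period */
        uint64_t start = (uint64_t)(-left) & EVENT_MASK;
        uint64_t ticks = 0, ctr = start;

        while (ctr != 0) {              /* count until 40-bit wraparound */
            ctr = (ctr + 1) & EVENT_MASK;
            ticks++;
        }
        /* The counter wraps, i.e. the PMI fires, after 'left' events: */
        printf("programmed %#llx, overflowed after %llu ticks\n",
               (unsigned long long)start, (unsigned long long)ticks);
        return 0;
    }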
@@ -1295,9 +1352,9 @@ intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx)
1295 err = checking_wrmsrl(hwc->config_base, ctrl_val); 1352 err = checking_wrmsrl(hwc->config_base, ctrl_val);
1296} 1353}
1297 1354
1298static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) 1355static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx)
1299{ 1356{
1300 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1357 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1301 u64 val; 1358 u64 val;
1302 1359
1303 val = hwc->config; 1360 val = hwc->config;
@@ -1308,10 +1365,10 @@ static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
1308} 1365}
1309 1366
1310 1367
1311static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) 1368static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx)
1312{ 1369{
1313 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { 1370 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
1314 if (!__get_cpu_var(cpu_hw_counters).enabled) 1371 if (!__get_cpu_var(cpu_hw_events).enabled)
1315 return; 1372 return;
1316 1373
1317 intel_pmu_enable_bts(hwc->config); 1374 intel_pmu_enable_bts(hwc->config);
@@ -1323,134 +1380,189 @@ static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
1323 return; 1380 return;
1324 } 1381 }
1325 1382
1326 x86_pmu_enable_counter(hwc, idx); 1383 x86_pmu_enable_event(hwc, idx);
1327} 1384}
1328 1385
1329static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) 1386static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx)
1330{ 1387{
1331 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1388 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1332 1389
1333 if (cpuc->enabled) 1390 if (cpuc->enabled)
1334 x86_pmu_enable_counter(hwc, idx); 1391 x86_pmu_enable_event(hwc, idx);
1335} 1392}
1336 1393
1337static int 1394static int fixed_mode_idx(struct hw_perf_event *hwc)
1338fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
1339{ 1395{
1340 unsigned int event; 1396 unsigned int hw_event;
1341 1397
1342 event = hwc->config & ARCH_PERFMON_EVENT_MASK; 1398 hw_event = hwc->config & ARCH_PERFMON_EVENT_MASK;
1343 1399
1344 if (unlikely((event == 1400 if (unlikely((hw_event ==
1345 x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && 1401 x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
1346 (hwc->sample_period == 1))) 1402 (hwc->sample_period == 1)))
1347 return X86_PMC_IDX_FIXED_BTS; 1403 return X86_PMC_IDX_FIXED_BTS;
1348 1404
1349 if (!x86_pmu.num_counters_fixed) 1405 if (!x86_pmu.num_events_fixed)
1350 return -1; 1406 return -1;
1351 1407
1352 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) 1408 /*
1409 * fixed counters do not take all possible filters
1410 */
1411 if (hwc->config & ARCH_PERFMON_EVENT_FILTER_MASK)
1412 return -1;
1413
1414 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
1353 return X86_PMC_IDX_FIXED_INSTRUCTIONS; 1415 return X86_PMC_IDX_FIXED_INSTRUCTIONS;
1354 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) 1416 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
1355 return X86_PMC_IDX_FIXED_CPU_CYCLES; 1417 return X86_PMC_IDX_FIXED_CPU_CYCLES;
1356 if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES))) 1418 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES)))
1357 return X86_PMC_IDX_FIXED_BUS_CYCLES; 1419 return X86_PMC_IDX_FIXED_BUS_CYCLES;
1358 1420
1359 return -1; 1421 return -1;
1360} 1422}
1361 1423
1362/* 1424/*
1363 * Find a PMC slot for the freshly enabled / scheduled in counter: 1425 * generic counter allocator: get next free counter
1364 */ 1426 */
1365static int x86_pmu_enable(struct perf_counter *counter) 1427static int
1428gen_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
1366{ 1429{
1367 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
1368 struct hw_perf_counter *hwc = &counter->hw;
1369 int idx; 1430 int idx;
1370 1431
1371 idx = fixed_mode_idx(counter, hwc); 1432 idx = find_first_zero_bit(cpuc->used_mask, x86_pmu.num_events);
1433 return idx == x86_pmu.num_events ? -1 : idx;
1434}
1435
1436/*
1437 * intel-specific counter allocator: check event constraints
1438 */
1439static int
1440intel_get_event_idx(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
1441{
1442 const struct event_constraint *event_constraint;
1443 int i, code;
1444
1445 if (!event_constraints)
1446 goto skip;
1447
1448 code = hwc->config & CORE_EVNTSEL_EVENT_MASK;
1449
1450 for_each_event_constraint(event_constraint, event_constraints) {
1451 if (code == event_constraint->code) {
1452 for_each_bit(i, event_constraint->idxmsk, X86_PMC_IDX_MAX) {
1453 if (!test_and_set_bit(i, cpuc->used_mask))
1454 return i;
1455 }
1456 return -1;
1457 }
1458 }
1459skip:
1460 return gen_get_event_idx(cpuc, hwc);
1461}
1462
1463static int
1464x86_schedule_event(struct cpu_hw_events *cpuc, struct hw_perf_event *hwc)
1465{
1466 int idx;
1467
1468 idx = fixed_mode_idx(hwc);
1372 if (idx == X86_PMC_IDX_FIXED_BTS) { 1469 if (idx == X86_PMC_IDX_FIXED_BTS) {
1373 /* BTS is already occupied. */ 1470 /* BTS is already occupied. */
1374 if (test_and_set_bit(idx, cpuc->used_mask)) 1471 if (test_and_set_bit(idx, cpuc->used_mask))
1375 return -EAGAIN; 1472 return -EAGAIN;
1376 1473
1377 hwc->config_base = 0; 1474 hwc->config_base = 0;
1378 hwc->counter_base = 0; 1475 hwc->event_base = 0;
1379 hwc->idx = idx; 1476 hwc->idx = idx;
1380 } else if (idx >= 0) { 1477 } else if (idx >= 0) {
1381 /* 1478 /*
1382 * Try to get the fixed counter, if that is already taken 1479 * Try to get the fixed event, if that is already taken
1383 * then try to get a generic counter: 1480 * then try to get a generic event:
1384 */ 1481 */
1385 if (test_and_set_bit(idx, cpuc->used_mask)) 1482 if (test_and_set_bit(idx, cpuc->used_mask))
1386 goto try_generic; 1483 goto try_generic;
1387 1484
1388 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; 1485 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
1389 /* 1486 /*
1390 * We set it so that counter_base + idx in wrmsr/rdmsr maps to 1487 * We set it so that event_base + idx in wrmsr/rdmsr maps to
1391 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2: 1488 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
1392 */ 1489 */
1393 hwc->counter_base = 1490 hwc->event_base =
1394 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; 1491 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
1395 hwc->idx = idx; 1492 hwc->idx = idx;
1396 } else { 1493 } else {
1397 idx = hwc->idx; 1494 idx = hwc->idx;
1398 /* Try to get the previous generic counter again */ 1495 /* Try to get the previous generic event again */
1399 if (test_and_set_bit(idx, cpuc->used_mask)) { 1496 if (idx == -1 || test_and_set_bit(idx, cpuc->used_mask)) {
1400try_generic: 1497try_generic:
1401 idx = find_first_zero_bit(cpuc->used_mask, 1498 idx = x86_pmu.get_event_idx(cpuc, hwc);
1402 x86_pmu.num_counters); 1499 if (idx == -1)
1403 if (idx == x86_pmu.num_counters)
1404 return -EAGAIN; 1500 return -EAGAIN;
1405 1501
1406 set_bit(idx, cpuc->used_mask); 1502 set_bit(idx, cpuc->used_mask);
1407 hwc->idx = idx; 1503 hwc->idx = idx;
1408 } 1504 }
1409 hwc->config_base = x86_pmu.eventsel; 1505 hwc->config_base = x86_pmu.eventsel;
1410 hwc->counter_base = x86_pmu.perfctr; 1506 hwc->event_base = x86_pmu.perfctr;
1411 } 1507 }
1412 1508
1413 perf_counters_lapic_init(); 1509 return idx;
1510}
1511
1512/*
1513 * Find a PMC slot for the freshly enabled / scheduled in event:
1514 */
1515static int x86_pmu_enable(struct perf_event *event)
1516{
1517 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1518 struct hw_perf_event *hwc = &event->hw;
1519 int idx;
1520
1521 idx = x86_schedule_event(cpuc, hwc);
1522 if (idx < 0)
1523 return idx;
1524
1525 perf_events_lapic_init();
1414 1526
1415 x86_pmu.disable(hwc, idx); 1527 x86_pmu.disable(hwc, idx);
1416 1528
1417 cpuc->counters[idx] = counter; 1529 cpuc->events[idx] = event;
1418 set_bit(idx, cpuc->active_mask); 1530 set_bit(idx, cpuc->active_mask);
1419 1531
1420 x86_perf_counter_set_period(counter, hwc, idx); 1532 x86_perf_event_set_period(event, hwc, idx);
1421 x86_pmu.enable(hwc, idx); 1533 x86_pmu.enable(hwc, idx);
1422 1534
1423 perf_counter_update_userpage(counter); 1535 perf_event_update_userpage(event);
1424 1536
1425 return 0; 1537 return 0;
1426} 1538}
1427 1539
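
x86_schedule_event() above tries slots in priority order: the fixed/BTS index from fixed_mode_idx(), then the event's previous generic slot, then whatever x86_pmu.get_event_idx() finds. The generic fallback is a find-first-zero scan over used_mask claimed with test_and_set_bit(); a single-word, non-atomic model of it:

    #include <stdio.h>

    #define NUM_EVENTS 4

    static unsigned long used_mask;     /* one bit per generic counter */

    /* Model of gen_get_event_idx(): first clear bit wins; the real code
     * claims it atomically via test_and_set_bit(). */
    static int gen_get_event_idx(void)
    {
        int idx;

        for (idx = 0; idx < NUM_EVENTS; idx++) {
            if (!(used_mask & (1UL << idx))) {
                used_mask |= 1UL << idx;
                return idx;
            }
        }
        return -1;      /* all counters busy: caller returns -EAGAIN */
    }

    int main(void)
    {
        int i;

        for (i = 0; i < NUM_EVENTS + 1; i++)
            printf("alloc -> %d\n", gen_get_event_idx());
        return 0;
    }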
1428static void x86_pmu_unthrottle(struct perf_counter *counter) 1540static void x86_pmu_unthrottle(struct perf_event *event)
1429{ 1541{
1430 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1542 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1431 struct hw_perf_counter *hwc = &counter->hw; 1543 struct hw_perf_event *hwc = &event->hw;
1432 1544
1433 if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX || 1545 if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX ||
1434 cpuc->counters[hwc->idx] != counter)) 1546 cpuc->events[hwc->idx] != event))
1435 return; 1547 return;
1436 1548
1437 x86_pmu.enable(hwc, hwc->idx); 1549 x86_pmu.enable(hwc, hwc->idx);
1438} 1550}
1439 1551
1440void perf_counter_print_debug(void) 1552void perf_event_print_debug(void)
1441{ 1553{
1442 u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; 1554 u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
1443 struct cpu_hw_counters *cpuc; 1555 struct cpu_hw_events *cpuc;
1444 unsigned long flags; 1556 unsigned long flags;
1445 int cpu, idx; 1557 int cpu, idx;
1446 1558
1447 if (!x86_pmu.num_counters) 1559 if (!x86_pmu.num_events)
1448 return; 1560 return;
1449 1561
1450 local_irq_save(flags); 1562 local_irq_save(flags);
1451 1563
1452 cpu = smp_processor_id(); 1564 cpu = smp_processor_id();
1453 cpuc = &per_cpu(cpu_hw_counters, cpu); 1565 cpuc = &per_cpu(cpu_hw_events, cpu);
1454 1566
1455 if (x86_pmu.version >= 2) { 1567 if (x86_pmu.version >= 2) {
1456 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); 1568 rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
@@ -1466,11 +1578,11 @@ void perf_counter_print_debug(void)
1466 } 1578 }
1467 pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask); 1579 pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask);
1468 1580
1469 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1581 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1470 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); 1582 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
1471 rdmsrl(x86_pmu.perfctr + idx, pmc_count); 1583 rdmsrl(x86_pmu.perfctr + idx, pmc_count);
1472 1584
1473 prev_left = per_cpu(prev_left[idx], cpu); 1585 prev_left = per_cpu(pmc_prev_left[idx], cpu);
1474 1586
1475 pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", 1587 pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
1476 cpu, idx, pmc_ctrl); 1588 cpu, idx, pmc_ctrl);
@@ -1479,7 +1591,7 @@ void perf_counter_print_debug(void)
1479 pr_info("CPU#%d: gen-PMC%d left: %016llx\n", 1591 pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
1480 cpu, idx, prev_left); 1592 cpu, idx, prev_left);
1481 } 1593 }
1482 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { 1594 for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
1483 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); 1595 rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
1484 1596
1485 pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", 1597 pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
@@ -1488,8 +1600,7 @@ void perf_counter_print_debug(void)
1488 local_irq_restore(flags); 1600 local_irq_restore(flags);
1489} 1601}
1490 1602
1491static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc, 1603static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc)
1492 struct perf_sample_data *data)
1493{ 1604{
1494 struct debug_store *ds = cpuc->ds; 1605 struct debug_store *ds = cpuc->ds;
1495 struct bts_record { 1606 struct bts_record {
@@ -1497,11 +1608,14 @@ static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc,
1497 u64 to; 1608 u64 to;
1498 u64 flags; 1609 u64 flags;
1499 }; 1610 };
1500 struct perf_counter *counter = cpuc->counters[X86_PMC_IDX_FIXED_BTS]; 1611 struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
1501 unsigned long orig_ip = data->regs->ip;
1502 struct bts_record *at, *top; 1612 struct bts_record *at, *top;
1613 struct perf_output_handle handle;
1614 struct perf_event_header header;
1615 struct perf_sample_data data;
1616 struct pt_regs regs;
1503 1617
1504 if (!counter) 1618 if (!event)
1505 return; 1619 return;
1506 1620
1507 if (!ds) 1621 if (!ds)
@@ -1510,26 +1624,45 @@ static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc,
1510 at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; 1624 at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
1511 top = (struct bts_record *)(unsigned long)ds->bts_index; 1625 top = (struct bts_record *)(unsigned long)ds->bts_index;
1512 1626
1627 if (top <= at)
1628 return;
1629
1513 ds->bts_index = ds->bts_buffer_base; 1630 ds->bts_index = ds->bts_buffer_base;
1514 1631
1632
1633 data.period = event->hw.last_period;
1634 data.addr = 0;
1635 regs.ip = 0;
1636
1637 /*
1638 * Prepare a generic sample, i.e. fill in the invariant fields.
1639 * We will overwrite the from and to address before we output
1640 * the sample.
1641 */
1642 perf_prepare_sample(&header, &data, event, &regs);
1643
1644 if (perf_output_begin(&handle, event,
1645 header.size * (top - at), 1, 1))
1646 return;
1647
1515 for (; at < top; at++) { 1648 for (; at < top; at++) {
1516 data->regs->ip = at->from; 1649 data.ip = at->from;
1517 data->addr = at->to; 1650 data.addr = at->to;
1518 1651
1519 perf_counter_output(counter, 1, data); 1652 perf_output_sample(&handle, &header, &data, event);
1520 } 1653 }
1521 1654
1522 data->regs->ip = orig_ip; 1655 perf_output_end(&handle);
1523 data->addr = 0;
1524 1656
1525 /* There's new data available. */ 1657 /* There's new data available. */
1526 counter->pending_kill = POLL_IN; 1658 event->hw.interrupts++;
1659 event->pending_kill = POLL_IN;
1527} 1660}
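
The reworked drain above sizes the perf output buffer up front (header.size * (top - at)), bails out early when nothing was buffered, rewinds bts_index, and then rewrites only the ip/addr fields of one pre-built sample per record. Stripped of the perf output machinery, the traversal reduces to this, with emit_sample() standing in for perf_output_sample():

    #include <stdint.h>
    #include <stdio.h>

    struct bts_record {         /* hardware branch-trace record */
        uint64_t from, to, flags;
    };

    static void emit_sample(uint64_t ip, uint64_t addr)
    {
        printf("branch %#llx -> %#llx\n",
               (unsigned long long)ip, (unsigned long long)addr);
    }

    static void drain_bts(struct bts_record *base, struct bts_record *index)
    {
        struct bts_record *at = base, *top = index;

        if (top <= at)          /* nothing buffered */
            return;

        /* bts_index is rewound before draining, as in the kernel. */
        for (; at < top; at++)
            emit_sample(at->from, at->to);
    }

    int main(void)
    {
        struct bts_record buf[3] = {
            { 0x1000, 0x2000, 0 },
            { 0x2004, 0x3000, 0 },
        };

        drain_bts(buf, buf + 2);    /* two valid records */
        return 0;
    }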
1528 1661
1529static void x86_pmu_disable(struct perf_counter *counter) 1662static void x86_pmu_disable(struct perf_event *event)
1530{ 1663{
1531 struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); 1664 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1532 struct hw_perf_counter *hwc = &counter->hw; 1665 struct hw_perf_event *hwc = &event->hw;
1533 int idx = hwc->idx; 1666 int idx = hwc->idx;
1534 1667
1535 /* 1668 /*
@@ -1541,67 +1674,63 @@ static void x86_pmu_disable(struct perf_counter *counter)
1541 1674
1542 /* 1675 /*
1543 * Make sure the cleared pointer becomes visible before we 1676 * Make sure the cleared pointer becomes visible before we
1544 * (potentially) free the counter: 1677 * (potentially) free the event:
1545 */ 1678 */
1546 barrier(); 1679 barrier();
1547 1680
1548 /* 1681 /*
1549 * Drain the remaining delta count out of a counter 1682 * Drain the remaining delta count out of an event
1550 * that we are disabling: 1683 * that we are disabling:
1551 */ 1684 */
1552 x86_perf_counter_update(counter, hwc, idx); 1685 x86_perf_event_update(event, hwc, idx);
1553 1686
1554 /* Drain the remaining BTS records. */ 1687 /* Drain the remaining BTS records. */
1555 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { 1688 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS))
1556 struct perf_sample_data data; 1689 intel_pmu_drain_bts_buffer(cpuc);
1557 struct pt_regs regs;
1558 1690
1559 data.regs = &regs; 1691 cpuc->events[idx] = NULL;
1560 intel_pmu_drain_bts_buffer(cpuc, &data);
1561 }
1562 cpuc->counters[idx] = NULL;
1563 clear_bit(idx, cpuc->used_mask); 1692 clear_bit(idx, cpuc->used_mask);
1564 1693
1565 perf_counter_update_userpage(counter); 1694 perf_event_update_userpage(event);
1566} 1695}
1567 1696
1568/* 1697/*
1569 * Save and restart an expired counter. Called by NMI contexts, 1698 * Save and restart an expired event. Called by NMI contexts,
1570 * so it has to be careful about preempting normal counter ops: 1699 * so it has to be careful about preempting normal event ops:
1571 */ 1700 */
1572static int intel_pmu_save_and_restart(struct perf_counter *counter) 1701static int intel_pmu_save_and_restart(struct perf_event *event)
1573{ 1702{
1574 struct hw_perf_counter *hwc = &counter->hw; 1703 struct hw_perf_event *hwc = &event->hw;
1575 int idx = hwc->idx; 1704 int idx = hwc->idx;
1576 int ret; 1705 int ret;
1577 1706
1578 x86_perf_counter_update(counter, hwc, idx); 1707 x86_perf_event_update(event, hwc, idx);
1579 ret = x86_perf_counter_set_period(counter, hwc, idx); 1708 ret = x86_perf_event_set_period(event, hwc, idx);
1580 1709
1581 if (counter->state == PERF_COUNTER_STATE_ACTIVE) 1710 if (event->state == PERF_EVENT_STATE_ACTIVE)
1582 intel_pmu_enable_counter(hwc, idx); 1711 intel_pmu_enable_event(hwc, idx);
1583 1712
1584 return ret; 1713 return ret;
1585} 1714}
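
The save-and-restart sequence above is small but order-sensitive: fold in the residual count, arm the next period, then re-enable only if the event is still active. A hypothetical model of that ordering; the event struct and helper bodies are invented for illustration.

#include <stdint.h>

enum state { ACTIVE, INACTIVE };

struct event {
	enum state state;
	uint64_t   count;	/* total accumulated count */
	uint64_t   period;	/* sampling period */
	int64_t    left;	/* counts remaining until next overflow */
};

static void update(struct event *e)     { e->count += e->period - e->left; }
static int  set_period(struct event *e) { e->left = e->period; return 1; }
static void enable(struct event *e)     { (void)e; /* would rewrite the eventsel MSR */ }

static int save_and_restart(struct event *e)
{
	int ret;

	update(e);		/* fold the expired period into ->count */
	ret = set_period(e);	/* arm the counter for the next period */
	if (e->state == ACTIVE)	/* skip re-enable if being torn down */
		enable(e);
	return ret;
}

int main(void)
{
	struct event e = { ACTIVE, 0, 1000, 0 };

	return !save_and_restart(&e);
}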
1586 1715
1587static void intel_pmu_reset(void) 1716static void intel_pmu_reset(void)
1588{ 1717{
1589 struct debug_store *ds = __get_cpu_var(cpu_hw_counters).ds; 1718 struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds;
1590 unsigned long flags; 1719 unsigned long flags;
1591 int idx; 1720 int idx;
1592 1721
1593 if (!x86_pmu.num_counters) 1722 if (!x86_pmu.num_events)
1594 return; 1723 return;
1595 1724
1596 local_irq_save(flags); 1725 local_irq_save(flags);
1597 1726
1598 printk("clearing PMU state on CPU#%d\n", smp_processor_id()); 1727 printk("clearing PMU state on CPU#%d\n", smp_processor_id());
1599 1728
1600 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1729 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1601 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); 1730 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
1602 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); 1731 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
1603 } 1732 }
1604 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { 1733 for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
1605 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); 1734 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
1606 } 1735 }
1607 if (ds) 1736 if (ds)
@@ -1613,39 +1742,38 @@ static void intel_pmu_reset(void)
1613static int p6_pmu_handle_irq(struct pt_regs *regs) 1742static int p6_pmu_handle_irq(struct pt_regs *regs)
1614{ 1743{
1615 struct perf_sample_data data; 1744 struct perf_sample_data data;
1616 struct cpu_hw_counters *cpuc; 1745 struct cpu_hw_events *cpuc;
1617 struct perf_counter *counter; 1746 struct perf_event *event;
1618 struct hw_perf_counter *hwc; 1747 struct hw_perf_event *hwc;
1619 int idx, handled = 0; 1748 int idx, handled = 0;
1620 u64 val; 1749 u64 val;
1621 1750
1622 data.regs = regs;
1623 data.addr = 0; 1751 data.addr = 0;
1624 1752
1625 cpuc = &__get_cpu_var(cpu_hw_counters); 1753 cpuc = &__get_cpu_var(cpu_hw_events);
1626 1754
1627 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1755 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1628 if (!test_bit(idx, cpuc->active_mask)) 1756 if (!test_bit(idx, cpuc->active_mask))
1629 continue; 1757 continue;
1630 1758
1631 counter = cpuc->counters[idx]; 1759 event = cpuc->events[idx];
1632 hwc = &counter->hw; 1760 hwc = &event->hw;
1633 1761
1634 val = x86_perf_counter_update(counter, hwc, idx); 1762 val = x86_perf_event_update(event, hwc, idx);
1635 if (val & (1ULL << (x86_pmu.counter_bits - 1))) 1763 if (val & (1ULL << (x86_pmu.event_bits - 1)))
1636 continue; 1764 continue;
1637 1765
1638 /* 1766 /*
1639 * counter overflow 1767 * event overflow
1640 */ 1768 */
1641 handled = 1; 1769 handled = 1;
1642 data.period = counter->hw.last_period; 1770 data.period = event->hw.last_period;
1643 1771
1644 if (!x86_perf_counter_set_period(counter, hwc, idx)) 1772 if (!x86_perf_event_set_period(event, hwc, idx))
1645 continue; 1773 continue;
1646 1774
1647 if (perf_counter_overflow(counter, 1, &data)) 1775 if (perf_event_overflow(event, 1, &data, regs))
1648 p6_pmu_disable_counter(hwc, idx); 1776 p6_pmu_disable_event(hwc, idx);
1649 } 1777 }
1650 1778
1651 if (handled) 1779 if (handled)
@@ -1661,17 +1789,16 @@ static int p6_pmu_handle_irq(struct pt_regs *regs)
1661static int intel_pmu_handle_irq(struct pt_regs *regs) 1789static int intel_pmu_handle_irq(struct pt_regs *regs)
1662{ 1790{
1663 struct perf_sample_data data; 1791 struct perf_sample_data data;
1664 struct cpu_hw_counters *cpuc; 1792 struct cpu_hw_events *cpuc;
1665 int bit, loops; 1793 int bit, loops;
1666 u64 ack, status; 1794 u64 ack, status;
1667 1795
1668 data.regs = regs;
1669 data.addr = 0; 1796 data.addr = 0;
1670 1797
1671 cpuc = &__get_cpu_var(cpu_hw_counters); 1798 cpuc = &__get_cpu_var(cpu_hw_events);
1672 1799
1673 perf_disable(); 1800 perf_disable();
1674 intel_pmu_drain_bts_buffer(cpuc, &data); 1801 intel_pmu_drain_bts_buffer(cpuc);
1675 status = intel_pmu_get_status(); 1802 status = intel_pmu_get_status();
1676 if (!status) { 1803 if (!status) {
1677 perf_enable(); 1804 perf_enable();
@@ -1681,8 +1808,8 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
1681 loops = 0; 1808 loops = 0;
1682again: 1809again:
1683 if (++loops > 100) { 1810 if (++loops > 100) {
1684 WARN_ONCE(1, "perfcounters: irq loop stuck!\n"); 1811 WARN_ONCE(1, "perfevents: irq loop stuck!\n");
1685 perf_counter_print_debug(); 1812 perf_event_print_debug();
1686 intel_pmu_reset(); 1813 intel_pmu_reset();
1687 perf_enable(); 1814 perf_enable();
1688 return 1; 1815 return 1;
@@ -1691,19 +1818,19 @@ again:
1691 inc_irq_stat(apic_perf_irqs); 1818 inc_irq_stat(apic_perf_irqs);
1692 ack = status; 1819 ack = status;
1693 for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { 1820 for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
1694 struct perf_counter *counter = cpuc->counters[bit]; 1821 struct perf_event *event = cpuc->events[bit];
1695 1822
1696 clear_bit(bit, (unsigned long *) &status); 1823 clear_bit(bit, (unsigned long *) &status);
1697 if (!test_bit(bit, cpuc->active_mask)) 1824 if (!test_bit(bit, cpuc->active_mask))
1698 continue; 1825 continue;
1699 1826
1700 if (!intel_pmu_save_and_restart(counter)) 1827 if (!intel_pmu_save_and_restart(event))
1701 continue; 1828 continue;
1702 1829
1703 data.period = counter->hw.last_period; 1830 data.period = event->hw.last_period;
1704 1831
1705 if (perf_counter_overflow(counter, 1, &data)) 1832 if (perf_event_overflow(event, 1, &data, regs))
1706 intel_pmu_disable_counter(&counter->hw, bit); 1833 intel_pmu_disable_event(&event->hw, bit);
1707 } 1834 }
1708 1835
1709 intel_pmu_ack_status(ack); 1836 intel_pmu_ack_status(ack);
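
The again: loop in intel_pmu_handle_irq re-reads the status MSR until it drains, capped at 100 iterations before declaring the PMU wedged and resetting it. A sketch of just that control flow, with get_status() and reset_pmu() as invented stand-ins for the MSR access and intel_pmu_reset().

#include <stdint.h>
#include <stdio.h>

static int fake_pending = 3;

static uint64_t get_status(void) { return fake_pending ? (uint64_t)fake_pending-- : 0; }
static void reset_pmu(void)      { puts("PMU reset"); }

static int handle_irq(void)
{
	int loops = 0, handled = 0;
	uint64_t status;

	status = get_status();
	if (!status)
		return 0;	/* not our interrupt */
again:
	if (++loops > 100) {	/* stuck: something keeps re-raising bits */
		reset_pmu();
		return 1;
	}
	handled = 1;
	/* ... ack and service each set bit here ... */
	status = get_status();
	if (status)
		goto again;	/* new overflows arrived meanwhile */
	return handled;
}

int main(void) { return !handle_irq(); }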
@@ -1723,39 +1850,38 @@ again:
1723static int amd_pmu_handle_irq(struct pt_regs *regs) 1850static int amd_pmu_handle_irq(struct pt_regs *regs)
1724{ 1851{
1725 struct perf_sample_data data; 1852 struct perf_sample_data data;
1726 struct cpu_hw_counters *cpuc; 1853 struct cpu_hw_events *cpuc;
1727 struct perf_counter *counter; 1854 struct perf_event *event;
1728 struct hw_perf_counter *hwc; 1855 struct hw_perf_event *hwc;
1729 int idx, handled = 0; 1856 int idx, handled = 0;
1730 u64 val; 1857 u64 val;
1731 1858
1732 data.regs = regs;
1733 data.addr = 0; 1859 data.addr = 0;
1734 1860
1735 cpuc = &__get_cpu_var(cpu_hw_counters); 1861 cpuc = &__get_cpu_var(cpu_hw_events);
1736 1862
1737 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1863 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1738 if (!test_bit(idx, cpuc->active_mask)) 1864 if (!test_bit(idx, cpuc->active_mask))
1739 continue; 1865 continue;
1740 1866
1741 counter = cpuc->counters[idx]; 1867 event = cpuc->events[idx];
1742 hwc = &counter->hw; 1868 hwc = &event->hw;
1743 1869
1744 val = x86_perf_counter_update(counter, hwc, idx); 1870 val = x86_perf_event_update(event, hwc, idx);
1745 if (val & (1ULL << (x86_pmu.counter_bits - 1))) 1871 if (val & (1ULL << (x86_pmu.event_bits - 1)))
1746 continue; 1872 continue;
1747 1873
1748 /* 1874 /*
1749 * counter overflow 1875 * event overflow
1750 */ 1876 */
1751 handled = 1; 1877 handled = 1;
1752 data.period = counter->hw.last_period; 1878 data.period = event->hw.last_period;
1753 1879
1754 if (!x86_perf_counter_set_period(counter, hwc, idx)) 1880 if (!x86_perf_event_set_period(event, hwc, idx))
1755 continue; 1881 continue;
1756 1882
1757 if (perf_counter_overflow(counter, 1, &data)) 1883 if (perf_event_overflow(event, 1, &data, regs))
1758 amd_pmu_disable_counter(hwc, idx); 1884 amd_pmu_disable_event(hwc, idx);
1759 } 1885 }
1760 1886
1761 if (handled) 1887 if (handled)
@@ -1769,18 +1895,21 @@ void smp_perf_pending_interrupt(struct pt_regs *regs)
1769 irq_enter(); 1895 irq_enter();
1770 ack_APIC_irq(); 1896 ack_APIC_irq();
1771 inc_irq_stat(apic_pending_irqs); 1897 inc_irq_stat(apic_pending_irqs);
1772 perf_counter_do_pending(); 1898 perf_event_do_pending();
1773 irq_exit(); 1899 irq_exit();
1774} 1900}
1775 1901
1776void set_perf_counter_pending(void) 1902void set_perf_event_pending(void)
1777{ 1903{
1778#ifdef CONFIG_X86_LOCAL_APIC 1904#ifdef CONFIG_X86_LOCAL_APIC
1905 if (!x86_pmu.apic || !x86_pmu_initialized())
1906 return;
1907
1779 apic->send_IPI_self(LOCAL_PENDING_VECTOR); 1908 apic->send_IPI_self(LOCAL_PENDING_VECTOR);
1780#endif 1909#endif
1781} 1910}
1782 1911
1783void perf_counters_lapic_init(void) 1912void perf_events_lapic_init(void)
1784{ 1913{
1785#ifdef CONFIG_X86_LOCAL_APIC 1914#ifdef CONFIG_X86_LOCAL_APIC
1786 if (!x86_pmu.apic || !x86_pmu_initialized()) 1915 if (!x86_pmu.apic || !x86_pmu_initialized())
@@ -1794,13 +1923,13 @@ void perf_counters_lapic_init(void)
1794} 1923}
1795 1924
1796static int __kprobes 1925static int __kprobes
1797perf_counter_nmi_handler(struct notifier_block *self, 1926perf_event_nmi_handler(struct notifier_block *self,
1798 unsigned long cmd, void *__args) 1927 unsigned long cmd, void *__args)
1799{ 1928{
1800 struct die_args *args = __args; 1929 struct die_args *args = __args;
1801 struct pt_regs *regs; 1930 struct pt_regs *regs;
1802 1931
1803 if (!atomic_read(&active_counters)) 1932 if (!atomic_read(&active_events))
1804 return NOTIFY_DONE; 1933 return NOTIFY_DONE;
1805 1934
1806 switch (cmd) { 1935 switch (cmd) {
@@ -1819,7 +1948,7 @@ perf_counter_nmi_handler(struct notifier_block *self,
1819#endif 1948#endif
1820 /* 1949 /*
1821 * Can't rely on the handled return value to say it was our NMI, two 1950 * Can't rely on the handled return value to say it was our NMI, two
1822 * counters could trigger 'simultaneously' raising two back-to-back NMIs. 1951 * events could trigger 'simultaneously' raising two back-to-back NMIs.
1823 * 1952 *
1824 * If the first NMI handles both, the latter will be empty and daze 1953 * If the first NMI handles both, the latter will be empty and daze
1825 * the CPU. 1954 * the CPU.
@@ -1829,8 +1958,8 @@ perf_counter_nmi_handler(struct notifier_block *self,
1829 return NOTIFY_STOP; 1958 return NOTIFY_STOP;
1830} 1959}
1831 1960
1832static __read_mostly struct notifier_block perf_counter_nmi_notifier = { 1961static __read_mostly struct notifier_block perf_event_nmi_notifier = {
1833 .notifier_call = perf_counter_nmi_handler, 1962 .notifier_call = perf_event_nmi_handler,
1834 .next = NULL, 1963 .next = NULL,
1835 .priority = 1 1964 .priority = 1
1836}; 1965};
@@ -1840,8 +1969,8 @@ static struct x86_pmu p6_pmu = {
1840 .handle_irq = p6_pmu_handle_irq, 1969 .handle_irq = p6_pmu_handle_irq,
1841 .disable_all = p6_pmu_disable_all, 1970 .disable_all = p6_pmu_disable_all,
1842 .enable_all = p6_pmu_enable_all, 1971 .enable_all = p6_pmu_enable_all,
1843 .enable = p6_pmu_enable_counter, 1972 .enable = p6_pmu_enable_event,
1844 .disable = p6_pmu_disable_counter, 1973 .disable = p6_pmu_disable_event,
1845 .eventsel = MSR_P6_EVNTSEL0, 1974 .eventsel = MSR_P6_EVNTSEL0,
1846 .perfctr = MSR_P6_PERFCTR0, 1975 .perfctr = MSR_P6_PERFCTR0,
1847 .event_map = p6_pmu_event_map, 1976 .event_map = p6_pmu_event_map,
@@ -1850,16 +1979,17 @@ static struct x86_pmu p6_pmu = {
1850 .apic = 1, 1979 .apic = 1,
1851 .max_period = (1ULL << 31) - 1, 1980 .max_period = (1ULL << 31) - 1,
1852 .version = 0, 1981 .version = 0,
1853 .num_counters = 2, 1982 .num_events = 2,
1854 /* 1983 /*
1855 * Counters have 40 bits implemented. However they are designed such 1984 * Events have 40 bits implemented. However they are designed such
1856 * that bits [32-39] are sign extensions of bit 31. As such the 1985 * that bits [32-39] are sign extensions of bit 31. As such the
1857 * effective width of a counter for P6-like PMU is 32 bits only. 1986 * effective width of an event for P6-like PMU is 32 bits only.
1858 * 1987 *
1859 * See IA-32 Intel Architecture Software developer manual Vol 3B 1988 * See IA-32 Intel Architecture Software developer manual Vol 3B
1860 */ 1989 */
1861 .counter_bits = 32, 1990 .event_bits = 32,
1862 .counter_mask = (1ULL << 32) - 1, 1991 .event_mask = (1ULL << 32) - 1,
1992 .get_event_idx = intel_get_event_idx,
1863}; 1993};
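
The 32-bit effective width the comment describes is easy to check numerically: once bit 31 is set, bits 32-39 mirror it, so masking with (1ULL << 32) - 1 loses no information. A small demonstration; the sign extension is simulated here since the real one happens in hardware.

#include <stdint.h>
#include <stdio.h>

/* Simulate a P6 counter: 40 bits, bits 32-39 sign-extend bit 31. */
static uint64_t p6_read(uint32_t low32)
{
	uint64_t v = low32;

	if (v & (1ULL << 31))
		v |= 0xffULL << 32;	/* bits 32-39 copy bit 31 */
	return v & ((1ULL << 40) - 1);
}

int main(void)
{
	uint64_t raw  = p6_read(0x80000001u);
	uint64_t mask = (1ULL << 32) - 1;	/* event_mask from the diff */

	printf("raw=%#llx masked=%#llx\n",
	       (unsigned long long)raw, (unsigned long long)(raw & mask));
	return 0;
}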
1864 1994
1865static struct x86_pmu intel_pmu = { 1995static struct x86_pmu intel_pmu = {
@@ -1867,8 +1997,8 @@ static struct x86_pmu intel_pmu = {
1867 .handle_irq = intel_pmu_handle_irq, 1997 .handle_irq = intel_pmu_handle_irq,
1868 .disable_all = intel_pmu_disable_all, 1998 .disable_all = intel_pmu_disable_all,
1869 .enable_all = intel_pmu_enable_all, 1999 .enable_all = intel_pmu_enable_all,
1870 .enable = intel_pmu_enable_counter, 2000 .enable = intel_pmu_enable_event,
1871 .disable = intel_pmu_disable_counter, 2001 .disable = intel_pmu_disable_event,
1872 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, 2002 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
1873 .perfctr = MSR_ARCH_PERFMON_PERFCTR0, 2003 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
1874 .event_map = intel_pmu_event_map, 2004 .event_map = intel_pmu_event_map,
@@ -1878,11 +2008,12 @@ static struct x86_pmu intel_pmu = {
1878 /* 2008 /*
1879 * Intel PMCs cannot be accessed sanely above 32 bit width, 2009 * Intel PMCs cannot be accessed sanely above 32 bit width,
1880 * so we install an artificial 1<<31 period regardless of 2010 * so we install an artificial 1<<31 period regardless of
1881 * the generic counter period: 2011 * the generic event period:
1882 */ 2012 */
1883 .max_period = (1ULL << 31) - 1, 2013 .max_period = (1ULL << 31) - 1,
1884 .enable_bts = intel_pmu_enable_bts, 2014 .enable_bts = intel_pmu_enable_bts,
1885 .disable_bts = intel_pmu_disable_bts, 2015 .disable_bts = intel_pmu_disable_bts,
2016 .get_event_idx = intel_get_event_idx,
1886}; 2017};
1887 2018
1888static struct x86_pmu amd_pmu = { 2019static struct x86_pmu amd_pmu = {
@@ -1890,19 +2021,20 @@ static struct x86_pmu amd_pmu = {
1890 .handle_irq = amd_pmu_handle_irq, 2021 .handle_irq = amd_pmu_handle_irq,
1891 .disable_all = amd_pmu_disable_all, 2022 .disable_all = amd_pmu_disable_all,
1892 .enable_all = amd_pmu_enable_all, 2023 .enable_all = amd_pmu_enable_all,
1893 .enable = amd_pmu_enable_counter, 2024 .enable = amd_pmu_enable_event,
1894 .disable = amd_pmu_disable_counter, 2025 .disable = amd_pmu_disable_event,
1895 .eventsel = MSR_K7_EVNTSEL0, 2026 .eventsel = MSR_K7_EVNTSEL0,
1896 .perfctr = MSR_K7_PERFCTR0, 2027 .perfctr = MSR_K7_PERFCTR0,
1897 .event_map = amd_pmu_event_map, 2028 .event_map = amd_pmu_event_map,
1898 .raw_event = amd_pmu_raw_event, 2029 .raw_event = amd_pmu_raw_event,
1899 .max_events = ARRAY_SIZE(amd_perfmon_event_map), 2030 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
1900 .num_counters = 4, 2031 .num_events = 4,
1901 .counter_bits = 48, 2032 .event_bits = 48,
1902 .counter_mask = (1ULL << 48) - 1, 2033 .event_mask = (1ULL << 48) - 1,
1903 .apic = 1, 2034 .apic = 1,
1904 /* use highest bit to detect overflow */ 2035 /* use highest bit to detect overflow */
1905 .max_period = (1ULL << 47) - 1, 2036 .max_period = (1ULL << 47) - 1,
2037 .get_event_idx = gen_get_event_idx,
1906}; 2038};
1907 2039
1908static int p6_pmu_init(void) 2040static int p6_pmu_init(void)
@@ -1915,10 +2047,12 @@ static int p6_pmu_init(void)
1915 case 7: 2047 case 7:
1916 case 8: 2048 case 8:
1917 case 11: /* Pentium III */ 2049 case 11: /* Pentium III */
2050 event_constraints = intel_p6_event_constraints;
1918 break; 2051 break;
1919 case 9: 2052 case 9:
1920 case 13: 2053 case 13:
1921 /* Pentium M */ 2054 /* Pentium M */
2055 event_constraints = intel_p6_event_constraints;
1922 break; 2056 break;
1923 default: 2057 default:
1924 pr_cont("unsupported p6 CPU model %d ", 2058 pr_cont("unsupported p6 CPU model %d ",
@@ -1956,7 +2090,7 @@ static int intel_pmu_init(void)
1956 2090
1957 /* 2091 /*
1958 * Check whether the Architectural PerfMon supports 2092 * Check whether the Architectural PerfMon supports
1959 * Branch Misses Retired Event or not. 2093 * Branch Misses Retired hw_event or not.
1960 */ 2094 */
1961 cpuid(10, &eax.full, &ebx, &unused, &edx.full); 2095 cpuid(10, &eax.full, &ebx, &unused, &edx.full);
1962 if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) 2096 if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
@@ -1968,15 +2102,15 @@ static int intel_pmu_init(void)
1968 2102
1969 x86_pmu = intel_pmu; 2103 x86_pmu = intel_pmu;
1970 x86_pmu.version = version; 2104 x86_pmu.version = version;
1971 x86_pmu.num_counters = eax.split.num_counters; 2105 x86_pmu.num_events = eax.split.num_events;
1972 x86_pmu.counter_bits = eax.split.bit_width; 2106 x86_pmu.event_bits = eax.split.bit_width;
1973 x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1; 2107 x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1;
1974 2108
1975 /* 2109 /*
1976 * Quirk: v2 perfmon does not report fixed-purpose counters, so 2110 * Quirk: v2 perfmon does not report fixed-purpose events, so
1977 * assume at least 3 counters: 2111 * assume at least 3 events:
1978 */ 2112 */
1979 x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); 2113 x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3);
1980 2114
1981 /* 2115 /*
1982 * Install the hw-cache-events table: 2116 * Install the hw-cache-events table:
@@ -1990,12 +2124,14 @@ static int intel_pmu_init(void)
1990 sizeof(hw_cache_event_ids)); 2124 sizeof(hw_cache_event_ids));
1991 2125
1992 pr_cont("Core2 events, "); 2126 pr_cont("Core2 events, ");
2127 event_constraints = intel_core_event_constraints;
1993 break; 2128 break;
1994 default: 2129 default:
1995 case 26: 2130 case 26:
1996 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, 2131 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
1997 sizeof(hw_cache_event_ids)); 2132 sizeof(hw_cache_event_ids));
1998 2133
2134 event_constraints = intel_nehalem_event_constraints;
1999 pr_cont("Nehalem/Corei7 events, "); 2135 pr_cont("Nehalem/Corei7 events, ");
2000 break; 2136 break;
2001 case 28: 2137 case 28:
@@ -2023,11 +2159,11 @@ static int amd_pmu_init(void)
2023 return 0; 2159 return 0;
2024} 2160}
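
intel_pmu_init() above fills num_events, event_bits and the fixed-counter count from CPUID leaf 0xA. On a Linux/GCC userspace build the same fields can be read with <cpuid.h>; the bit positions below follow the Intel SDM layout, which is what the kernel's eax.split/edx.split unions decode.

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(0xa, &eax, &ebx, &ecx, &edx))
		return 1;	/* leaf not supported on this CPU */

	printf("version:      %u\n",  eax        & 0xff);
	printf("generic ctrs: %u\n", (eax >> 8)  & 0xff);
	printf("bit width:    %u\n", (eax >> 16) & 0xff);
	printf("fixed ctrs:   %u\n",  edx        & 0x1f);
	printf("fixed width:  %u\n", (edx >> 5)  & 0xff);
	return 0;
}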
2025 2161
2026void __init init_hw_perf_counters(void) 2162void __init init_hw_perf_events(void)
2027{ 2163{
2028 int err; 2164 int err;
2029 2165
2030 pr_info("Performance Counters: "); 2166 pr_info("Performance Events: ");
2031 2167
2032 switch (boot_cpu_data.x86_vendor) { 2168 switch (boot_cpu_data.x86_vendor) {
2033 case X86_VENDOR_INTEL: 2169 case X86_VENDOR_INTEL:
@@ -2040,45 +2176,45 @@ void __init init_hw_perf_counters(void)
2040 return; 2176 return;
2041 } 2177 }
2042 if (err != 0) { 2178 if (err != 0) {
2043 pr_cont("no PMU driver, software counters only.\n"); 2179 pr_cont("no PMU driver, software events only.\n");
2044 return; 2180 return;
2045 } 2181 }
2046 2182
2047 pr_cont("%s PMU driver.\n", x86_pmu.name); 2183 pr_cont("%s PMU driver.\n", x86_pmu.name);
2048 2184
2049 if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { 2185 if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) {
2050 WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", 2186 WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
2051 x86_pmu.num_counters, X86_PMC_MAX_GENERIC); 2187 x86_pmu.num_events, X86_PMC_MAX_GENERIC);
2052 x86_pmu.num_counters = X86_PMC_MAX_GENERIC; 2188 x86_pmu.num_events = X86_PMC_MAX_GENERIC;
2053 } 2189 }
2054 perf_counter_mask = (1 << x86_pmu.num_counters) - 1; 2190 perf_event_mask = (1 << x86_pmu.num_events) - 1;
2055 perf_max_counters = x86_pmu.num_counters; 2191 perf_max_events = x86_pmu.num_events;
2056 2192
2057 if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { 2193 if (x86_pmu.num_events_fixed > X86_PMC_MAX_FIXED) {
2058 WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", 2194 WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
2059 x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); 2195 x86_pmu.num_events_fixed, X86_PMC_MAX_FIXED);
2060 x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; 2196 x86_pmu.num_events_fixed = X86_PMC_MAX_FIXED;
2061 } 2197 }
2062 2198
2063 perf_counter_mask |= 2199 perf_event_mask |=
2064 ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; 2200 ((1LL << x86_pmu.num_events_fixed)-1) << X86_PMC_IDX_FIXED;
2065 x86_pmu.intel_ctrl = perf_counter_mask; 2201 x86_pmu.intel_ctrl = perf_event_mask;
2066 2202
2067 perf_counters_lapic_init(); 2203 perf_events_lapic_init();
2068 register_die_notifier(&perf_counter_nmi_notifier); 2204 register_die_notifier(&perf_event_nmi_notifier);
2069 2205
2070 pr_info("... version: %d\n", x86_pmu.version); 2206 pr_info("... version: %d\n", x86_pmu.version);
2071 pr_info("... bit width: %d\n", x86_pmu.counter_bits); 2207 pr_info("... bit width: %d\n", x86_pmu.event_bits);
2072 pr_info("... generic counters: %d\n", x86_pmu.num_counters); 2208 pr_info("... generic registers: %d\n", x86_pmu.num_events);
2073 pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask); 2209 pr_info("... value mask: %016Lx\n", x86_pmu.event_mask);
2074 pr_info("... max period: %016Lx\n", x86_pmu.max_period); 2210 pr_info("... max period: %016Lx\n", x86_pmu.max_period);
2075 pr_info("... fixed-purpose counters: %d\n", x86_pmu.num_counters_fixed); 2211 pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed);
2076 pr_info("... counter mask: %016Lx\n", perf_counter_mask); 2212 pr_info("... event mask: %016Lx\n", perf_event_mask);
2077} 2213}
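
The perf_event_mask assembled in init_hw_perf_events() packs the generic counters into the low bits and the fixed counters starting at X86_PMC_IDX_FIXED (bit 32). For the common 4-generic/3-fixed case the arithmetic works out as follows; the counts are hard-coded for illustration.

#include <stdint.h>
#include <stdio.h>

#define X86_PMC_IDX_FIXED 32	/* as in the kernel headers */

int main(void)
{
	int num_events = 4, num_events_fixed = 3;
	uint64_t mask;

	mask  = (1ULL << num_events) - 1;			/* 0xf */
	mask |= ((1ULL << num_events_fixed) - 1) << X86_PMC_IDX_FIXED;

	printf("event mask: %#llx\n", (unsigned long long)mask);
	/* prints 0x70000000f: bits 0-3 generic, bits 32-34 fixed */
	return 0;
}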
2078 2214
2079static inline void x86_pmu_read(struct perf_counter *counter) 2215static inline void x86_pmu_read(struct perf_event *event)
2080{ 2216{
2081 x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); 2217 x86_perf_event_update(event, &event->hw, event->hw.idx);
2082} 2218}
2083 2219
2084static const struct pmu pmu = { 2220static const struct pmu pmu = {
@@ -2088,13 +2224,52 @@ static const struct pmu pmu = {
2088 .unthrottle = x86_pmu_unthrottle, 2224 .unthrottle = x86_pmu_unthrottle,
2089}; 2225};
2090 2226
2091const struct pmu *hw_perf_counter_init(struct perf_counter *counter) 2227static int
2228validate_event(struct cpu_hw_events *cpuc, struct perf_event *event)
2229{
2230 struct hw_perf_event fake_event = event->hw;
2231
2232 if (event->pmu != &pmu)
2233 return 0;
2234
2235 return x86_schedule_event(cpuc, &fake_event);
2236}
2237
2238static int validate_group(struct perf_event *event)
2239{
2240 struct perf_event *sibling, *leader = event->group_leader;
2241 struct cpu_hw_events fake_pmu;
2242
2243 memset(&fake_pmu, 0, sizeof(fake_pmu));
2244
2245 if (!validate_event(&fake_pmu, leader))
2246 return -ENOSPC;
2247
2248 list_for_each_entry(sibling, &leader->sibling_list, group_entry) {
2249 if (!validate_event(&fake_pmu, sibling))
2250 return -ENOSPC;
2251 }
2252
2253 if (!validate_event(&fake_pmu, event))
2254 return -ENOSPC;
2255
2256 return 0;
2257}
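
validate_group() dry-runs the whole group against a zeroed fake cpu_hw_events, so a leader plus siblings that could never be co-scheduled is rejected at creation time instead of failing silently later. A toy version with a slot-counting scheduler standing in for x86_schedule_event(); the two-counter limit is an invented example.

#include <string.h>
#include <stdio.h>

#define NSLOTS 2	/* pretend PMU with two generic counters */

struct fake_pmu { int used; };

static int schedule_event(struct fake_pmu *p)
{
	if (p->used >= NSLOTS)
		return 0;	/* no counter left */
	p->used++;
	return 1;
}

/* returns 0 if a group of 'n' events fits on an empty PMU */
static int validate_group(int n)
{
	struct fake_pmu fake;
	int i;

	memset(&fake, 0, sizeof(fake));
	for (i = 0; i < n; i++)
		if (!schedule_event(&fake))
			return -1;	/* stands in for -ENOSPC */
	return 0;
}

int main(void)
{
	printf("group of 2: %d\n", validate_group(2));	/* fits:  0 */
	printf("group of 3: %d\n", validate_group(3));	/* fails: -1 */
	return 0;
}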
2258
2259const struct pmu *hw_perf_event_init(struct perf_event *event)
2092{ 2260{
2093 int err; 2261 int err;
2094 2262
2095 err = __hw_perf_counter_init(counter); 2263 err = __hw_perf_event_init(event);
2096 if (err) 2264 if (!err) {
2265 if (event->group_leader != event)
2266 err = validate_group(event);
2267 }
2268 if (err) {
2269 if (event->destroy)
2270 event->destroy(event);
2097 return ERR_PTR(err); 2271 return ERR_PTR(err);
2272 }
2098 2273
2099 return &pmu; 2274 return &pmu;
2100} 2275}
@@ -2110,8 +2285,8 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip)
2110 entry->ip[entry->nr++] = ip; 2285 entry->ip[entry->nr++] = ip;
2111} 2286}
2112 2287
2113static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); 2288static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
2114static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); 2289static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
2115static DEFINE_PER_CPU(int, in_nmi_frame); 2290static DEFINE_PER_CPU(int, in_nmi_frame);
2116 2291
2117 2292
@@ -2264,9 +2439,9 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2264 struct perf_callchain_entry *entry; 2439 struct perf_callchain_entry *entry;
2265 2440
2266 if (in_nmi()) 2441 if (in_nmi())
2267 entry = &__get_cpu_var(nmi_entry); 2442 entry = &__get_cpu_var(pmc_nmi_entry);
2268 else 2443 else
2269 entry = &__get_cpu_var(irq_entry); 2444 entry = &__get_cpu_var(pmc_irq_entry);
2270 2445
2271 entry->nr = 0; 2446 entry->nr = 0;
2272 2447
@@ -2275,7 +2450,7 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2275 return entry; 2450 return entry;
2276} 2451}
2277 2452
2278void hw_perf_counter_setup_online(int cpu) 2453void hw_perf_event_setup_online(int cpu)
2279{ 2454{
2280 init_debug_store_on_cpu(cpu); 2455 init_debug_store_on_cpu(cpu);
2281} 2456}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 392bea43b890..fab786f60ed6 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -20,7 +20,7 @@
20#include <linux/kprobes.h> 20#include <linux/kprobes.h>
21 21
22#include <asm/apic.h> 22#include <asm/apic.h>
23#include <asm/perf_counter.h> 23#include <asm/perf_event.h>
24 24
25struct nmi_watchdog_ctlblk { 25struct nmi_watchdog_ctlblk {
26 unsigned int cccr_msr; 26 unsigned int cccr_msr;
diff --git a/arch/x86/kernel/cpu/sched.c b/arch/x86/kernel/cpu/sched.c
new file mode 100644
index 000000000000..a640ae5ad201
--- /dev/null
+++ b/arch/x86/kernel/cpu/sched.c
@@ -0,0 +1,55 @@
1#include <linux/sched.h>
2#include <linux/math64.h>
3#include <linux/percpu.h>
4#include <linux/irqflags.h>
5
6#include <asm/cpufeature.h>
7#include <asm/processor.h>
8
9#ifdef CONFIG_SMP
10
11static DEFINE_PER_CPU(struct aperfmperf, old_perf_sched);
12
13static unsigned long scale_aperfmperf(void)
14{
15 struct aperfmperf val, *old = &__get_cpu_var(old_perf_sched);
16 unsigned long ratio, flags;
17
18 local_irq_save(flags);
19 get_aperfmperf(&val);
20 local_irq_restore(flags);
21
22 ratio = calc_aperfmperf_ratio(old, &val);
23 *old = val;
24
25 return ratio;
26}
27
28unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu)
29{
30 /*
31 * do aperf/mperf on the cpu level because it includes things
32 * like turbo mode, which are relevant to full cores.
33 */
34 if (boot_cpu_has(X86_FEATURE_APERFMPERF))
35 return scale_aperfmperf();
36
37 /*
38 * maybe have something cpufreq here
39 */
40
41 return default_scale_freq_power(sd, cpu);
42}
43
44unsigned long arch_scale_smt_power(struct sched_domain *sd, int cpu)
45{
46 /*
47 * aperf/mperf already includes the smt gain
48 */
49 if (boot_cpu_has(X86_FEATURE_APERFMPERF))
50 return SCHED_LOAD_SCALE;
51
52 return default_scale_smt_power(sd, cpu);
53}
54
55#endif
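
The new sched.c leans on the APERF/MPERF MSR pair: APERF advances at the actual clock, MPERF at the invariant base clock, so the delta ratio since the last snapshot is the effective frequency scale, turbo included. A userspace model of that ratio math; calc_aperfmperf_ratio's exact fixed-point scaling is an assumption here, with SCHED_LOAD_SCALE taken as 1024.

#include <stdint.h>
#include <stdio.h>

#define SCHED_LOAD_SCALE 1024

struct aperfmperf { uint64_t aperf, mperf; };

static unsigned long scale_ratio(const struct aperfmperf *old,
				 const struct aperfmperf *cur)
{
	uint64_t da = cur->aperf - old->aperf;
	uint64_t dm = cur->mperf - old->mperf;

	if (!dm)
		return SCHED_LOAD_SCALE;	/* no time passed: neutral */
	return (unsigned long)(da * SCHED_LOAD_SCALE / dm);
}

int main(void)
{
	struct aperfmperf old = { 1000, 1000 };
	struct aperfmperf cur = { 2500, 2000 };	/* ran at 1.5x base clock */

	printf("scale: %lu\n", scale_ratio(&old, &cur));	/* 1536 */
	return 0;
}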
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index bc24f514ec93..1cbed97b59cf 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -24,6 +24,7 @@
24#include <linux/dmi.h> 24#include <linux/dmi.h>
25#include <asm/div64.h> 25#include <asm/div64.h>
26#include <asm/vmware.h> 26#include <asm/vmware.h>
27#include <asm/x86_init.h>
27 28
28#define CPUID_VMWARE_INFO_LEAF 0x40000000 29#define CPUID_VMWARE_INFO_LEAF 0x40000000
29#define VMWARE_HYPERVISOR_MAGIC 0x564D5868 30#define VMWARE_HYPERVISOR_MAGIC 0x564D5868
@@ -47,21 +48,35 @@ static inline int __vmware_platform(void)
47 return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC; 48 return eax != (uint32_t)-1 && ebx == VMWARE_HYPERVISOR_MAGIC;
48} 49}
49 50
50static unsigned long __vmware_get_tsc_khz(void) 51static unsigned long vmware_get_tsc_khz(void)
51{ 52{
52 uint64_t tsc_hz; 53 uint64_t tsc_hz;
53 uint32_t eax, ebx, ecx, edx; 54 uint32_t eax, ebx, ecx, edx;
54 55
55 VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); 56 VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
56 57
57 if (ebx == UINT_MAX)
58 return 0;
59 tsc_hz = eax | (((uint64_t)ebx) << 32); 58 tsc_hz = eax | (((uint64_t)ebx) << 32);
60 do_div(tsc_hz, 1000); 59 do_div(tsc_hz, 1000);
61 BUG_ON(tsc_hz >> 32); 60 BUG_ON(tsc_hz >> 32);
61 printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n",
62 (unsigned long) tsc_hz / 1000,
63 (unsigned long) tsc_hz % 1000);
62 return tsc_hz; 64 return tsc_hz;
63} 65}
64 66
67void __init vmware_platform_setup(void)
68{
69 uint32_t eax, ebx, ecx, edx;
70
71 VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
72
73 if (ebx != UINT_MAX)
74 x86_platform.calibrate_tsc = vmware_get_tsc_khz;
75 else
76 printk(KERN_WARNING
77 "Failed to get TSC freq from the hypervisor\n");
78}
79
65/* 80/*
66 * While checking the dmi string information, just checking the product 81
67 * serial key should be enough, as this will always have a VMware 82 * serial key should be enough, as this will always have a VMware
@@ -87,12 +102,6 @@ int vmware_platform(void)
87 return 0; 102 return 0;
88} 103}
89 104
90unsigned long vmware_get_tsc_khz(void)
91{
92 BUG_ON(!vmware_platform());
93 return __vmware_get_tsc_khz();
94}
95
96/* 105/*
97 * VMware hypervisor takes care of exporting a reliable TSC to the guest. 106 * VMware hypervisor takes care of exporting a reliable TSC to the guest.
98 * Still, due to timing difference when running on virtual cpus, the TSC can 107 * Still, due to timing difference when running on virtual cpus, the TSC can
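
The GETHZ backdoor call above returns the TSC rate in Hz split across EAX (low half) and EBX (high half), with EBX == UINT_MAX signalling failure. The conversion to the kHz value the printk reports is a 64-bit assemble and a divide; a worked example with made-up register values follows.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* pretend VMWARE_PORT(GETHZ, ...) returned a 2.4 GHz TSC */
	uint32_t eax = 2400000000u;	/* low 32 bits of Hz */
	uint32_t ebx = 0;		/* high 32 bits of Hz */
	uint64_t tsc_hz;

	if (ebx == UINT32_MAX)
		return 1;		/* hypervisor could not report it */

	tsc_hz = eax | ((uint64_t)ebx << 32);
	tsc_hz /= 1000;			/* do_div(tsc_hz, 1000) in the diff */

	printf("TSC freq read from hypervisor : %lu.%03lu MHz\n",
	       (unsigned long)(tsc_hz / 1000), (unsigned long)(tsc_hz % 1000));
	return 0;
}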
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index b07af8861244..6a52d4b36a30 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -182,7 +182,7 @@ static struct notifier_block __refdata cpuid_class_cpu_notifier =
182 .notifier_call = cpuid_class_cpu_callback, 182 .notifier_call = cpuid_class_cpu_callback,
183}; 183};
184 184
185static char *cpuid_nodename(struct device *dev) 185static char *cpuid_devnode(struct device *dev, mode_t *mode)
186{ 186{
187 return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); 187 return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt));
188} 188}
@@ -203,7 +203,7 @@ static int __init cpuid_init(void)
203 err = PTR_ERR(cpuid_class); 203 err = PTR_ERR(cpuid_class);
204 goto out_chrdev; 204 goto out_chrdev;
205 } 205 }
206 cpuid_class->nodename = cpuid_nodename; 206 cpuid_class->devnode = cpuid_devnode;
207 for_each_online_cpu(i) { 207 for_each_online_cpu(i) {
208 err = cpuid_device_create(i); 208 err = cpuid_device_create(i);
209 if (err != 0) 209 if (err != 0)
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index bca5fba91c9e..f7dd2a7c3bf4 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -5,7 +5,6 @@
5#include <linux/kallsyms.h> 5#include <linux/kallsyms.h>
6#include <linux/kprobes.h> 6#include <linux/kprobes.h>
7#include <linux/uaccess.h> 7#include <linux/uaccess.h>
8#include <linux/utsname.h>
9#include <linux/hardirq.h> 8#include <linux/hardirq.h>
10#include <linux/kdebug.h> 9#include <linux/kdebug.h>
11#include <linux/module.h> 10#include <linux/module.h>
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 54b0a3276766..a071e6be177e 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -5,7 +5,6 @@
5#include <linux/kallsyms.h> 5#include <linux/kallsyms.h>
6#include <linux/kprobes.h> 6#include <linux/kprobes.h>
7#include <linux/uaccess.h> 7#include <linux/uaccess.h>
8#include <linux/utsname.h>
9#include <linux/hardirq.h> 8#include <linux/hardirq.h>
10#include <linux/kdebug.h> 9#include <linux/kdebug.h>
11#include <linux/module.h> 10#include <linux/module.h>
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 147005a1cc3c..d17d482a04f4 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1331,7 +1331,7 @@ void __init e820_reserve_resources(void)
1331 struct resource *res; 1331 struct resource *res;
1332 u64 end; 1332 u64 end;
1333 1333
1334 res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map); 1334 res = alloc_bootmem(sizeof(struct resource) * e820.nr_map);
1335 e820_res = res; 1335 e820_res = res;
1336 for (i = 0; i < e820.nr_map; i++) { 1336 for (i = 0; i < e820.nr_map; i++) {
1337 end = e820.map[i].addr + e820.map[i].size - 1; 1337 end = e820.map[i].addr + e820.map[i].size - 1;
@@ -1378,8 +1378,8 @@ static unsigned long ram_alignment(resource_size_t pos)
1378 if (mb < 16) 1378 if (mb < 16)
1379 return 1024*1024; 1379 return 1024*1024;
1380 1380
1381 /* To 32MB for anything above that */ 1381 /* To 64MB for anything above that */
1382 return 32*1024*1024; 1382 return 64*1024*1024;
1383} 1383}
1384 1384
1385#define MAX_RESOURCE_SIZE ((resource_size_t)-1) 1385#define MAX_RESOURCE_SIZE ((resource_size_t)-1)
@@ -1455,28 +1455,11 @@ char *__init default_machine_specific_memory_setup(void)
1455 return who; 1455 return who;
1456} 1456}
1457 1457
1458char *__init __attribute__((weak)) machine_specific_memory_setup(void)
1459{
1460 if (x86_quirks->arch_memory_setup) {
1461 char *who = x86_quirks->arch_memory_setup();
1462
1463 if (who)
1464 return who;
1465 }
1466 return default_machine_specific_memory_setup();
1467}
1468
1469/* Overridden in paravirt.c if CONFIG_PARAVIRT */
1470char * __init __attribute__((weak)) memory_setup(void)
1471{
1472 return machine_specific_memory_setup();
1473}
1474
1475void __init setup_memory_map(void) 1458void __init setup_memory_map(void)
1476{ 1459{
1477 char *who; 1460 char *who;
1478 1461
1479 who = memory_setup(); 1462 who = x86_init.resources.memory_setup();
1480 memcpy(&e820_saved, &e820, sizeof(struct e820map)); 1463 memcpy(&e820_saved, &e820, sizeof(struct e820map));
1481 printk(KERN_INFO "BIOS-provided physical RAM map:\n"); 1464 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
1482 e820_print_map(who); 1465 e820_print_map(who);
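
ram_alignment(), touched in the hunk above, picks a round-up granularity from the address: 1 MB below the 16 MB mark and, after this change, 64 MB above it (the function also has a low-memory tier not shown in this hunk). A sketch of the visible tiers plus the round-up they feed.

#include <stdint.h>
#include <stdio.h>

/* only the two tiers visible in the hunk are modelled here */
static uint64_t ram_alignment(uint64_t pos)
{
	uint64_t mb = pos >> 20;

	if (mb < 16)
		return 1024 * 1024;	/* 1 MB below 16 MB */
	return 64 * 1024 * 1024;	/* 64 MB above that */
}

static uint64_t round_up_to(uint64_t pos, uint64_t align)
{
	return (pos + align - 1) & ~(align - 1);
}

int main(void)
{
	uint64_t pos = 0x4123456;	/* roughly 65 MB */

	printf("aligned: %#llx\n",
	       (unsigned long long)round_up_to(pos, ram_alignment(pos)));
	return 0;
}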
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 335f049d110f..b9c830c12b4a 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -160,721 +160,6 @@ static struct console early_serial_console = {
160 .index = -1, 160 .index = -1,
161}; 161};
162 162
163#ifdef CONFIG_EARLY_PRINTK_DBGP
164
165static struct ehci_caps __iomem *ehci_caps;
166static struct ehci_regs __iomem *ehci_regs;
167static struct ehci_dbg_port __iomem *ehci_debug;
168static unsigned int dbgp_endpoint_out;
169
170struct ehci_dev {
171 u32 bus;
172 u32 slot;
173 u32 func;
174};
175
176static struct ehci_dev ehci_dev;
177
178#define USB_DEBUG_DEVNUM 127
179
180#define DBGP_DATA_TOGGLE 0x8800
181
182static inline u32 dbgp_pid_update(u32 x, u32 tok)
183{
184 return ((x ^ DBGP_DATA_TOGGLE) & 0xffff00) | (tok & 0xff);
185}
186
187static inline u32 dbgp_len_update(u32 x, u32 len)
188{
189 return (x & ~0x0f) | (len & 0x0f);
190}
191
192/*
193 * USB Packet IDs (PIDs)
194 */
195
196/* token */
197#define USB_PID_OUT 0xe1
198#define USB_PID_IN 0x69
199#define USB_PID_SOF 0xa5
200#define USB_PID_SETUP 0x2d
201/* handshake */
202#define USB_PID_ACK 0xd2
203#define USB_PID_NAK 0x5a
204#define USB_PID_STALL 0x1e
205#define USB_PID_NYET 0x96
206/* data */
207#define USB_PID_DATA0 0xc3
208#define USB_PID_DATA1 0x4b
209#define USB_PID_DATA2 0x87
210#define USB_PID_MDATA 0x0f
211/* Special */
212#define USB_PID_PREAMBLE 0x3c
213#define USB_PID_ERR 0x3c
214#define USB_PID_SPLIT 0x78
215#define USB_PID_PING 0xb4
216#define USB_PID_UNDEF_0 0xf0
217
218#define USB_PID_DATA_TOGGLE 0x88
219#define DBGP_CLAIM (DBGP_OWNER | DBGP_ENABLED | DBGP_INUSE)
220
221#define PCI_CAP_ID_EHCI_DEBUG 0xa
222
223#define HUB_ROOT_RESET_TIME 50 /* times are in msec */
224#define HUB_SHORT_RESET_TIME 10
225#define HUB_LONG_RESET_TIME 200
226#define HUB_RESET_TIMEOUT 500
227
228#define DBGP_MAX_PACKET 8
229
230static int dbgp_wait_until_complete(void)
231{
232 u32 ctrl;
233 int loop = 0x100000;
234
235 do {
236 ctrl = readl(&ehci_debug->control);
237 /* Stop when the transaction is finished */
238 if (ctrl & DBGP_DONE)
239 break;
240 } while (--loop > 0);
241
242 if (!loop)
243 return -1;
244
245 /*
246 * Now that we have observed the completed transaction,
247 * clear the done bit.
248 */
249 writel(ctrl | DBGP_DONE, &ehci_debug->control);
250 return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl);
251}
252
253static void __init dbgp_mdelay(int ms)
254{
255 int i;
256
257 while (ms--) {
258 for (i = 0; i < 1000; i++)
259 outb(0x1, 0x80);
260 }
261}
262
263static void dbgp_breath(void)
264{
265 /* Sleep to give the debug port a chance to breathe */
266}
267
268static int dbgp_wait_until_done(unsigned ctrl)
269{
270 u32 pids, lpid;
271 int ret;
272 int loop = 3;
273
274retry:
275 writel(ctrl | DBGP_GO, &ehci_debug->control);
276 ret = dbgp_wait_until_complete();
277 pids = readl(&ehci_debug->pids);
278 lpid = DBGP_PID_GET(pids);
279
280 if (ret < 0)
281 return ret;
282
283 /*
284 * If the port is getting full or it has dropped data
285 * start pacing ourselves, not necessary but it's friendly.
286 */
287 if ((lpid == USB_PID_NAK) || (lpid == USB_PID_NYET))
288 dbgp_breath();
289
290 /* If I get a NACK reissue the transmission */
291 if (lpid == USB_PID_NAK) {
292 if (--loop > 0)
293 goto retry;
294 }
295
296 return ret;
297}
298
299static void dbgp_set_data(const void *buf, int size)
300{
301 const unsigned char *bytes = buf;
302 u32 lo, hi;
303 int i;
304
305 lo = hi = 0;
306 for (i = 0; i < 4 && i < size; i++)
307 lo |= bytes[i] << (8*i);
308 for (; i < 8 && i < size; i++)
309 hi |= bytes[i] << (8*(i - 4));
310 writel(lo, &ehci_debug->data03);
311 writel(hi, &ehci_debug->data47);
312}
313
314static void __init dbgp_get_data(void *buf, int size)
315{
316 unsigned char *bytes = buf;
317 u32 lo, hi;
318 int i;
319
320 lo = readl(&ehci_debug->data03);
321 hi = readl(&ehci_debug->data47);
322 for (i = 0; i < 4 && i < size; i++)
323 bytes[i] = (lo >> (8*i)) & 0xff;
324 for (; i < 8 && i < size; i++)
325 bytes[i] = (hi >> (8*(i - 4))) & 0xff;
326}
327
328static int dbgp_bulk_write(unsigned devnum, unsigned endpoint,
329 const char *bytes, int size)
330{
331 u32 pids, addr, ctrl;
332 int ret;
333
334 if (size > DBGP_MAX_PACKET)
335 return -1;
336
337 addr = DBGP_EPADDR(devnum, endpoint);
338
339 pids = readl(&ehci_debug->pids);
340 pids = dbgp_pid_update(pids, USB_PID_OUT);
341
342 ctrl = readl(&ehci_debug->control);
343 ctrl = dbgp_len_update(ctrl, size);
344 ctrl |= DBGP_OUT;
345 ctrl |= DBGP_GO;
346
347 dbgp_set_data(bytes, size);
348 writel(addr, &ehci_debug->address);
349 writel(pids, &ehci_debug->pids);
350
351 ret = dbgp_wait_until_done(ctrl);
352 if (ret < 0)
353 return ret;
354
355 return ret;
356}
357
358static int __init dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
359 int size)
360{
361 u32 pids, addr, ctrl;
362 int ret;
363
364 if (size > DBGP_MAX_PACKET)
365 return -1;
366
367 addr = DBGP_EPADDR(devnum, endpoint);
368
369 pids = readl(&ehci_debug->pids);
370 pids = dbgp_pid_update(pids, USB_PID_IN);
371
372 ctrl = readl(&ehci_debug->control);
373 ctrl = dbgp_len_update(ctrl, size);
374 ctrl &= ~DBGP_OUT;
375 ctrl |= DBGP_GO;
376
377 writel(addr, &ehci_debug->address);
378 writel(pids, &ehci_debug->pids);
379 ret = dbgp_wait_until_done(ctrl);
380 if (ret < 0)
381 return ret;
382
383 if (size > ret)
384 size = ret;
385 dbgp_get_data(data, size);
386 return ret;
387}
388
389static int __init dbgp_control_msg(unsigned devnum, int requesttype,
390 int request, int value, int index, void *data, int size)
391{
392 u32 pids, addr, ctrl;
393 struct usb_ctrlrequest req;
394 int read;
395 int ret;
396
397 read = (requesttype & USB_DIR_IN) != 0;
398 if (size > (read ? DBGP_MAX_PACKET:0))
399 return -1;
400
401 /* Compute the control message */
402 req.bRequestType = requesttype;
403 req.bRequest = request;
404 req.wValue = cpu_to_le16(value);
405 req.wIndex = cpu_to_le16(index);
406 req.wLength = cpu_to_le16(size);
407
408 pids = DBGP_PID_SET(USB_PID_DATA0, USB_PID_SETUP);
409 addr = DBGP_EPADDR(devnum, 0);
410
411 ctrl = readl(&ehci_debug->control);
412 ctrl = dbgp_len_update(ctrl, sizeof(req));
413 ctrl |= DBGP_OUT;
414 ctrl |= DBGP_GO;
415
416 /* Send the setup message */
417 dbgp_set_data(&req, sizeof(req));
418 writel(addr, &ehci_debug->address);
419 writel(pids, &ehci_debug->pids);
420 ret = dbgp_wait_until_done(ctrl);
421 if (ret < 0)
422 return ret;
423
424 /* Read the result */
425 return dbgp_bulk_read(devnum, 0, data, size);
426}
427
428
429/* Find a PCI capability */
430static u32 __init find_cap(u32 num, u32 slot, u32 func, int cap)
431{
432 u8 pos;
433 int bytes;
434
435 if (!(read_pci_config_16(num, slot, func, PCI_STATUS) &
436 PCI_STATUS_CAP_LIST))
437 return 0;
438
439 pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST);
440 for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) {
441 u8 id;
442
443 pos &= ~3;
444 id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID);
445 if (id == 0xff)
446 break;
447 if (id == cap)
448 return pos;
449
450 pos = read_pci_config_byte(num, slot, func,
451 pos+PCI_CAP_LIST_NEXT);
452 }
453 return 0;
454}
455
456static u32 __init __find_dbgp(u32 bus, u32 slot, u32 func)
457{
458 u32 class;
459
460 class = read_pci_config(bus, slot, func, PCI_CLASS_REVISION);
461 if ((class >> 8) != PCI_CLASS_SERIAL_USB_EHCI)
462 return 0;
463
464 return find_cap(bus, slot, func, PCI_CAP_ID_EHCI_DEBUG);
465}
466
467static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc)
468{
469 u32 bus, slot, func;
470
471 for (bus = 0; bus < 256; bus++) {
472 for (slot = 0; slot < 32; slot++) {
473 for (func = 0; func < 8; func++) {
474 unsigned cap;
475
476 cap = __find_dbgp(bus, slot, func);
477
478 if (!cap)
479 continue;
480 if (ehci_num-- != 0)
481 continue;
482 *rbus = bus;
483 *rslot = slot;
484 *rfunc = func;
485 return cap;
486 }
487 }
488 }
489 return 0;
490}
491
492static int __init ehci_reset_port(int port)
493{
494 u32 portsc;
495 u32 delay_time, delay;
496 int loop;
497
498 /* Reset the usb debug port */
499 portsc = readl(&ehci_regs->port_status[port - 1]);
500 portsc &= ~PORT_PE;
501 portsc |= PORT_RESET;
502 writel(portsc, &ehci_regs->port_status[port - 1]);
503
504 delay = HUB_ROOT_RESET_TIME;
505 for (delay_time = 0; delay_time < HUB_RESET_TIMEOUT;
506 delay_time += delay) {
507 dbgp_mdelay(delay);
508
509 portsc = readl(&ehci_regs->port_status[port - 1]);
510 if (portsc & PORT_RESET) {
511 /* force reset to complete */
512 loop = 2;
513 writel(portsc & ~(PORT_RWC_BITS | PORT_RESET),
514 &ehci_regs->port_status[port - 1]);
515 do {
516 portsc = readl(&ehci_regs->port_status[port-1]);
517 } while ((portsc & PORT_RESET) && (--loop > 0));
518 }
519
520 /* Device went away? */
521 if (!(portsc & PORT_CONNECT))
522 return -ENOTCONN;
523
524 /* bomb out completely if something weird happened */
525 if ((portsc & PORT_CSC))
526 return -EINVAL;
527
528 /* If we've finished resetting, then break out of the loop */
529 if (!(portsc & PORT_RESET) && (portsc & PORT_PE))
530 return 0;
531 }
532 return -EBUSY;
533}
534
535static int __init ehci_wait_for_port(int port)
536{
537 u32 status;
538 int ret, reps;
539
540 for (reps = 0; reps < 3; reps++) {
541 dbgp_mdelay(100);
542 status = readl(&ehci_regs->status);
543 if (status & STS_PCD) {
544 ret = ehci_reset_port(port);
545 if (ret == 0)
546 return 0;
547 }
548 }
549 return -ENOTCONN;
550}
551
552#ifdef DBGP_DEBUG
553# define dbgp_printk early_printk
554#else
555static inline void dbgp_printk(const char *fmt, ...) { }
556#endif
557
558typedef void (*set_debug_port_t)(int port);
559
560static void __init default_set_debug_port(int port)
561{
562}
563
564static set_debug_port_t __initdata set_debug_port = default_set_debug_port;
565
566static void __init nvidia_set_debug_port(int port)
567{
568 u32 dword;
569 dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
570 0x74);
571 dword &= ~(0x0f<<12);
572 dword |= ((port & 0x0f)<<12);
573 write_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, 0x74,
574 dword);
575 dbgp_printk("set debug port to %d\n", port);
576}
577
578static void __init detect_set_debug_port(void)
579{
580 u32 vendorid;
581
582 vendorid = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
583 0x00);
584
585 if ((vendorid & 0xffff) == 0x10de) {
586 dbgp_printk("using nvidia set_debug_port\n");
587 set_debug_port = nvidia_set_debug_port;
588 }
589}
590
591static int __init ehci_setup(void)
592{
593 struct usb_debug_descriptor dbgp_desc;
594 u32 cmd, ctrl, status, portsc, hcs_params;
595 u32 debug_port, new_debug_port = 0, n_ports;
596 u32 devnum;
597 int ret, i;
598 int loop;
599 int port_map_tried;
600 int playtimes = 3;
601
602try_next_time:
603 port_map_tried = 0;
604
605try_next_port:
606
607 hcs_params = readl(&ehci_caps->hcs_params);
608 debug_port = HCS_DEBUG_PORT(hcs_params);
609 n_ports = HCS_N_PORTS(hcs_params);
610
611 dbgp_printk("debug_port: %d\n", debug_port);
612 dbgp_printk("n_ports: %d\n", n_ports);
613
614 for (i = 1; i <= n_ports; i++) {
615 portsc = readl(&ehci_regs->port_status[i-1]);
616 dbgp_printk("portstatus%d: %08x\n", i, portsc);
617 }
618
619 if (port_map_tried && (new_debug_port != debug_port)) {
620 if (--playtimes) {
621 set_debug_port(new_debug_port);
622 goto try_next_time;
623 }
624 return -1;
625 }
626
627 loop = 10;
628 /* Reset the EHCI controller */
629 cmd = readl(&ehci_regs->command);
630 cmd |= CMD_RESET;
631 writel(cmd, &ehci_regs->command);
632 do {
633 cmd = readl(&ehci_regs->command);
634 } while ((cmd & CMD_RESET) && (--loop > 0));
635
636 if (!loop) {
637 dbgp_printk("can not reset ehci\n");
638 return -1;
639 }
640 dbgp_printk("ehci reset done\n");
641
642 /* Claim ownership, but do not enable yet */
643 ctrl = readl(&ehci_debug->control);
644 ctrl |= DBGP_OWNER;
645 ctrl &= ~(DBGP_ENABLED | DBGP_INUSE);
646 writel(ctrl, &ehci_debug->control);
647
648 /* Start the ehci running */
649 cmd = readl(&ehci_regs->command);
650 cmd &= ~(CMD_LRESET | CMD_IAAD | CMD_PSE | CMD_ASE | CMD_RESET);
651 cmd |= CMD_RUN;
652 writel(cmd, &ehci_regs->command);
653
654 /* Ensure everything is routed to the EHCI */
655 writel(FLAG_CF, &ehci_regs->configured_flag);
656
657 /* Wait until the controller is no longer halted */
658 loop = 10;
659 do {
660 status = readl(&ehci_regs->status);
661 } while ((status & STS_HALT) && (--loop > 0));
662
663 if (!loop) {
664 dbgp_printk("ehci can not be started\n");
665 return -1;
666 }
667 dbgp_printk("ehci started\n");
668
669 /* Wait for a device to show up in the debug port */
670 ret = ehci_wait_for_port(debug_port);
671 if (ret < 0) {
672 dbgp_printk("No device found in debug port\n");
673 goto next_debug_port;
674 }
675 dbgp_printk("ehci wait for port done\n");
676
677 /* Enable the debug port */
678 ctrl = readl(&ehci_debug->control);
679 ctrl |= DBGP_CLAIM;
680 writel(ctrl, &ehci_debug->control);
681 ctrl = readl(&ehci_debug->control);
682 if ((ctrl & DBGP_CLAIM) != DBGP_CLAIM) {
683 dbgp_printk("No device in debug port\n");
684 writel(ctrl & ~DBGP_CLAIM, &ehci_debug->control);
685 goto err;
686 }
687 dbgp_printk("debug port enabled\n");
688
689 /* Completely transfer the debug device to the debug controller */
690 portsc = readl(&ehci_regs->port_status[debug_port - 1]);
691 portsc &= ~PORT_PE;
692 writel(portsc, &ehci_regs->port_status[debug_port - 1]);
693
694 dbgp_mdelay(100);
695
696 /* Find the debug device and make it device number 127 */
697 for (devnum = 0; devnum <= 127; devnum++) {
698 ret = dbgp_control_msg(devnum,
699 USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
700 USB_REQ_GET_DESCRIPTOR, (USB_DT_DEBUG << 8), 0,
701 &dbgp_desc, sizeof(dbgp_desc));
702 if (ret > 0)
703 break;
704 }
705 if (devnum > 127) {
706 dbgp_printk("Could not find attached debug device\n");
707 goto err;
708 }
709 if (ret < 0) {
710 dbgp_printk("Attached device is not a debug device\n");
711 goto err;
712 }
713 dbgp_endpoint_out = dbgp_desc.bDebugOutEndpoint;
714
715 /* Move the device to 127 if it isn't already there */
716 if (devnum != USB_DEBUG_DEVNUM) {
717 ret = dbgp_control_msg(devnum,
718 USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
719 USB_REQ_SET_ADDRESS, USB_DEBUG_DEVNUM, 0, NULL, 0);
720 if (ret < 0) {
721 dbgp_printk("Could not move attached device to %d\n",
722 USB_DEBUG_DEVNUM);
723 goto err;
724 }
725 devnum = USB_DEBUG_DEVNUM;
726 dbgp_printk("debug device renamed to 127\n");
727 }
728
729 /* Enable the debug interface */
730 ret = dbgp_control_msg(USB_DEBUG_DEVNUM,
731 USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
732 USB_REQ_SET_FEATURE, USB_DEVICE_DEBUG_MODE, 0, NULL, 0);
733 if (ret < 0) {
734 dbgp_printk(" Could not enable the debug device\n");
735 goto err;
736 }
737 dbgp_printk("debug interface enabled\n");
738
739 /* Perform a small write to get the even/odd data state in sync
740 */
741 ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, dbgp_endpoint_out, " ", 1);
742 if (ret < 0) {
743 dbgp_printk("dbgp_bulk_write failed: %d\n", ret);
744 goto err;
745 }
746 dbgp_printk("small write done\n");
747
748 return 0;
749err:
750 /* Things didn't work so remove my claim */
751 ctrl = readl(&ehci_debug->control);
752 ctrl &= ~(DBGP_CLAIM | DBGP_OUT);
753 writel(ctrl, &ehci_debug->control);
754 return -1;
755
756next_debug_port:
757 port_map_tried |= (1<<(debug_port - 1));
758 new_debug_port = ((debug_port-1+1)%n_ports) + 1;
759 if (port_map_tried != ((1<<n_ports) - 1)) {
760 set_debug_port(new_debug_port);
761 goto try_next_port;
762 }
763 if (--playtimes) {
764 set_debug_port(new_debug_port);
765 goto try_next_time;
766 }
767
768 return -1;
769}
770
771static int __init early_dbgp_init(char *s)
772{
773 u32 debug_port, bar, offset;
774 u32 bus, slot, func, cap;
775 void __iomem *ehci_bar;
776 u32 dbgp_num;
777 u32 bar_val;
778 char *e;
779 int ret;
780 u8 byte;
781
782 if (!early_pci_allowed())
783 return -1;
784
785 dbgp_num = 0;
786 if (*s)
787 dbgp_num = simple_strtoul(s, &e, 10);
788 dbgp_printk("dbgp_num: %d\n", dbgp_num);
789
790 cap = find_dbgp(dbgp_num, &bus, &slot, &func);
791 if (!cap)
792 return -1;
793
794 dbgp_printk("Found EHCI debug port on %02x:%02x.%1x\n", bus, slot,
795 func);
796
797 debug_port = read_pci_config(bus, slot, func, cap);
798 bar = (debug_port >> 29) & 0x7;
799 bar = (bar * 4) + 0xc;
800 offset = (debug_port >> 16) & 0xfff;
801 dbgp_printk("bar: %02x offset: %03x\n", bar, offset);
802 if (bar != PCI_BASE_ADDRESS_0) {
803 dbgp_printk("only debug ports on bar 1 handled.\n");
804
805 return -1;
806 }
807
808 bar_val = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0);
809 dbgp_printk("bar_val: %02x offset: %03x\n", bar_val, offset);
810 if (bar_val & ~PCI_BASE_ADDRESS_MEM_MASK) {
811 dbgp_printk("only simple 32bit mmio bars supported\n");
812
813 return -1;
814 }
815
816 /* double check if the mem space is enabled */
817 byte = read_pci_config_byte(bus, slot, func, 0x04);
818 if (!(byte & 0x2)) {
819 byte |= 0x02;
820 write_pci_config_byte(bus, slot, func, 0x04, byte);
821 dbgp_printk("mmio for ehci enabled\n");
822 }
823
824 /*
825 * FIXME I don't have the bar size so just guess PAGE_SIZE is more
826 * than enough. 1K is the biggest I have seen.
827 */
828 set_fixmap_nocache(FIX_DBGP_BASE, bar_val & PAGE_MASK);
829 ehci_bar = (void __iomem *)__fix_to_virt(FIX_DBGP_BASE);
830 ehci_bar += bar_val & ~PAGE_MASK;
831 dbgp_printk("ehci_bar: %p\n", ehci_bar);
832
833 ehci_caps = ehci_bar;
834 ehci_regs = ehci_bar + HC_LENGTH(readl(&ehci_caps->hc_capbase));
835 ehci_debug = ehci_bar + offset;
836 ehci_dev.bus = bus;
837 ehci_dev.slot = slot;
838 ehci_dev.func = func;
839
840 detect_set_debug_port();
841
842 ret = ehci_setup();
843 if (ret < 0) {
844 dbgp_printk("ehci_setup failed\n");
845 ehci_debug = NULL;
846
847 return -1;
848 }
849
850 return 0;
851}
852
853static void early_dbgp_write(struct console *con, const char *str, u32 n)
854{
855 int chunk, ret;
856
857 if (!ehci_debug)
858 return;
859 while (n > 0) {
860 chunk = n;
861 if (chunk > DBGP_MAX_PACKET)
862 chunk = DBGP_MAX_PACKET;
863 ret = dbgp_bulk_write(USB_DEBUG_DEVNUM,
864 dbgp_endpoint_out, str, chunk);
865 str += chunk;
866 n -= chunk;
867 }
868}
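
early_dbgp_write() above has to respect the debug port's 8-byte transaction limit, so every console write is sliced into DBGP_MAX_PACKET chunks. The chunking loop in isolation, with a stub in place of dbgp_bulk_write().

#include <stdio.h>
#include <string.h>

#define DBGP_MAX_PACKET 8

static int bulk_write_stub(const char *s, int n)
{
	printf("xfer %d bytes: %.*s\n", n, n, s);
	return n;
}

static void dbgp_write(const char *str, unsigned n)
{
	int chunk;

	while (n > 0) {
		chunk = n > DBGP_MAX_PACKET ? DBGP_MAX_PACKET : (int)n;
		bulk_write_stub(str, chunk);
		str += chunk;
		n   -= chunk;
	}
}

int main(void)
{
	const char *msg = "hello from the debug port\n";

	dbgp_write(msg, (unsigned)strlen(msg));
	return 0;
}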
869
870static struct console early_dbgp_console = {
871 .name = "earlydbg",
872 .write = early_dbgp_write,
873 .flags = CON_PRINTBUFFER,
874 .index = -1,
875};
876#endif
877
878/* Direct interface for emergencies */ 163/* Direct interface for emergencies */
879static struct console *early_console = &early_vga_console; 164static struct console *early_console = &early_vga_console;
880static int __initdata early_console_initialized; 165static int __initdata early_console_initialized;
@@ -891,10 +176,24 @@ asmlinkage void early_printk(const char *fmt, ...)
891 va_end(ap); 176 va_end(ap);
892} 177}
893 178
179static inline void early_console_register(struct console *con, int keep_early)
180{
181 if (early_console->index != -1) {
182 printk(KERN_CRIT "ERROR: earlyprintk= %s already used\n",
183 con->name);
184 return;
185 }
186 early_console = con;
187 if (keep_early)
188 early_console->flags &= ~CON_BOOT;
189 else
190 early_console->flags |= CON_BOOT;
191 register_console(early_console);
192}
894 193
895static int __init setup_early_printk(char *buf) 194static int __init setup_early_printk(char *buf)
896{ 195{
897 int keep_early; 196 int keep;
898 197
899 if (!buf) 198 if (!buf)
900 return 0; 199 return 0;
@@ -903,42 +202,37 @@ static int __init setup_early_printk(char *buf)
903 return 0; 202 return 0;
904 early_console_initialized = 1; 203 early_console_initialized = 1;
905 204
906 keep_early = (strstr(buf, "keep") != NULL); 205 keep = (strstr(buf, "keep") != NULL);
907 206
908 if (!strncmp(buf, "serial", 6)) { 207 while (*buf != '\0') {
909 early_serial_init(buf + 6); 208 if (!strncmp(buf, "serial", 6)) {
910 early_console = &early_serial_console; 209 buf += 6;
911 } else if (!strncmp(buf, "ttyS", 4)) { 210 early_serial_init(buf);
912 early_serial_init(buf); 211 early_console_register(&early_serial_console, keep);
913 early_console = &early_serial_console; 212 if (!strncmp(buf, ",ttyS", 5))
914 } else if (!strncmp(buf, "vga", 3) 213 buf += 5;
915 && boot_params.screen_info.orig_video_isVGA == 1) { 214 }
916 max_xpos = boot_params.screen_info.orig_video_cols; 215 if (!strncmp(buf, "ttyS", 4)) {
917 max_ypos = boot_params.screen_info.orig_video_lines; 216 early_serial_init(buf + 4);
918 current_ypos = boot_params.screen_info.orig_y; 217 early_console_register(&early_serial_console, keep);
919 early_console = &early_vga_console; 218 }
219 if (!strncmp(buf, "vga", 3) &&
220 boot_params.screen_info.orig_video_isVGA == 1) {
221 max_xpos = boot_params.screen_info.orig_video_cols;
222 max_ypos = boot_params.screen_info.orig_video_lines;
223 current_ypos = boot_params.screen_info.orig_y;
224 early_console_register(&early_vga_console, keep);
225 }
920#ifdef CONFIG_EARLY_PRINTK_DBGP 226#ifdef CONFIG_EARLY_PRINTK_DBGP
921 } else if (!strncmp(buf, "dbgp", 4)) { 227 if (!strncmp(buf, "dbgp", 4) && !early_dbgp_init(buf + 4))
922 if (early_dbgp_init(buf+4) < 0) 228 early_console_register(&early_dbgp_console, keep);
923 return 0;
924 early_console = &early_dbgp_console;
925 /*
926 * usb subsys will reset ehci controller, so don't keep
927 * that early console
928 */
929 keep_early = 0;
930#endif 229#endif
931#ifdef CONFIG_HVC_XEN 230#ifdef CONFIG_HVC_XEN
932 } else if (!strncmp(buf, "xen", 3)) { 231 if (!strncmp(buf, "xen", 3))
933 early_console = &xenboot_console; 232 early_console_register(&xenboot_console, keep);
934#endif 233#endif
234 buf++;
935 } 235 }
936
937 if (keep_early)
938 early_console->flags &= ~CON_BOOT;
939 else
940 early_console->flags |= CON_BOOT;
941 register_console(early_console);
942 return 0; 236 return 0;
943} 237}
944 238
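
The rewritten setup_early_printk() no longer picks a single console: it walks the whole option string one character at a time, registering every console whose token it finds, with "keep" applying to all of them. A userspace model of that scan, with console registration reduced to a printf; in the kernel, early_console_register() rejects a second registration, which this stub does not model.

#include <stdio.h>
#include <string.h>

static void register_console_stub(const char *name, int keep)
{
	printf("register %s%s\n", name, keep ? " (keep)" : "");
}

static void setup_early_printk(const char *buf)
{
	int keep = strstr(buf, "keep") != NULL;

	while (*buf != '\0') {
		if (!strncmp(buf, "serial", 6) || !strncmp(buf, "ttyS", 4))
			register_console_stub("serial", keep);
		if (!strncmp(buf, "vga", 3))
			register_console_stub("vga", keep);
		if (!strncmp(buf, "dbgp", 4))
			register_console_stub("dbgp", keep);
		buf++;		/* advance one char, as the new code does */
	}
}

int main(void)
{
	setup_early_printk("vga,keep");
	return 0;
}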
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index fe26ba3e3451..ad5bd988fb79 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -42,6 +42,7 @@
 #include <asm/time.h>
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
+#include <asm/x86_init.h>
 
 #define EFI_DEBUG 1
 #define PFX "EFI: "
@@ -453,6 +454,9 @@ void __init efi_init(void)
 	if (add_efi_memmap)
 		do_add_efi_memmap();
 
+	x86_platform.get_wallclock = efi_get_time;
+	x86_platform.set_wallclock = efi_set_rtc_mmss;
+
 	/* Setup for EFI runtime service */
 	reboot_type = BOOT_EFI;
 
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c097e7d607c6..7d52e9da5e0c 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1185,17 +1185,14 @@ END(ftrace_graph_caller)
 
 .globl return_to_handler
 return_to_handler:
-	pushl $0
 	pushl %eax
-	pushl %ecx
 	pushl %edx
 	movl %ebp, %eax
 	call ftrace_return_to_handler
-	movl %eax, 0xc(%esp)
+	movl %eax, %ecx
 	popl %edx
-	popl %ecx
 	popl %eax
-	ret
+	jmp *%ecx
 #endif
 
 .section .rodata,"a"
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index c251be745107..bd5bbddddf91 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -146,7 +146,7 @@ ENTRY(ftrace_graph_caller)
 END(ftrace_graph_caller)
 
 GLOBAL(return_to_handler)
-	subq $80, %rsp
+	subq $24, %rsp
 
 	/* Save the return values */
 	movq %rax, (%rsp)
@@ -155,11 +155,11 @@ GLOBAL(return_to_handler)
 
 	call ftrace_return_to_handler
 
-	movq %rax, 72(%rsp)
+	movq %rax, %rdi
 	movq 8(%rsp), %rdx
 	movq (%rsp), %rax
-	addq $72, %rsp
-	retq
+	addq $24, %rsp
+	jmp *%rdi
 #endif
 
 
@@ -536,20 +536,13 @@ sysret_signal:
 	bt $TIF_SYSCALL_AUDIT,%edx
 	jc sysret_audit
 #endif
-	/* edx: work flags (arg3) */
-	leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
-	xorl %esi,%esi # oldset -> arg2
-	SAVE_REST
-	FIXUP_TOP_OF_STACK %r11
-	call do_notify_resume
-	RESTORE_TOP_OF_STACK %r11
-	RESTORE_REST
-	movl $_TIF_WORK_MASK,%edi
-	/* Use IRET because user could have changed frame. This
-	   works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
-	DISABLE_INTERRUPTS(CLBR_NONE)
-	TRACE_IRQS_OFF
-	jmp int_with_check
+	/*
+	 * We have a signal, or exit tracing or single-step.
+	 * These all wind up with the iret return path anyway,
+	 * so just join that path right now.
+	 */
+	FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
+	jmp int_check_syscall_exit_work
 
 badsys:
 	movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
@@ -654,6 +647,7 @@ int_careful:
 int_very_careful:
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
+int_check_syscall_exit_work:
 	SAVE_REST
 	/* Check for syscall exit trace */
 	testl $_TIF_WORK_SYSCALL_EXIT,%edx
@@ -1021,7 +1015,7 @@ apicinterrupt ERROR_APIC_VECTOR \
 apicinterrupt SPURIOUS_APIC_VECTOR \
 	spurious_interrupt smp_spurious_interrupt
 
-#ifdef CONFIG_PERF_COUNTERS
+#ifdef CONFIG_PERF_EVENTS
 apicinterrupt LOCAL_PENDING_VECTOR \
 	perf_pending_interrupt smp_perf_pending_interrupt
 #endif
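Both return_to_handler rewrites above follow the same pattern: ftrace_return_to_handler() now hands the original return address back in a register (%ecx on 32-bit, %rdi on 64-bit) and the trampoline jumps through it, rather than patching a stack slot and executing ret. A hedged C model of that control transfer (the _model names and the stolen-address variable are invented; the real mechanism is the assembly in the diff):

#include <stdio.h>

typedef void (*ret_addr_t)(void);

/* Invented stand-in for the return address the graph tracer stole when
 * the traced function was entered. */
static ret_addr_t stolen_return_address;

static void original_caller(void)
{
	puts("back in the original caller");
}

/* Models ftrace_return_to_handler(): return the saved address to the
 * trampoline instead of writing it into the trampoline's stack frame. */
static ret_addr_t ftrace_return_to_handler_model(void)
{
	puts("traced function returned");
	return stolen_return_address;
}

int main(void)
{
	ret_addr_t target;

	stolen_return_address = original_caller;
	target = ftrace_return_to_handler_model();
	target();	/* the asm equivalent: movq %rax, %rdi ... jmp *%rdi */
	return 0;
}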
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 9dbb527e1652..5a1b9758fd62 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -9,6 +9,8 @@
  * the dangers of modifying code on the run.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/spinlock.h>
 #include <linux/hardirq.h>
 #include <linux/uaccess.h>
@@ -336,15 +338,15 @@ int __init ftrace_dyn_arch_init(void *data)
 
 	switch (faulted) {
 	case 0:
-		pr_info("ftrace: converting mcount calls to 0f 1f 44 00 00\n");
+		pr_info("converting mcount calls to 0f 1f 44 00 00\n");
 		memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE);
 		break;
 	case 1:
-		pr_info("ftrace: converting mcount calls to 66 66 66 66 90\n");
+		pr_info("converting mcount calls to 66 66 66 66 90\n");
 		memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE);
 		break;
 	case 2:
-		pr_info("ftrace: converting mcount calls to jmp . + 5\n");
+		pr_info("converting mcount calls to jmp . + 5\n");
 		memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE);
 		break;
 	}
@@ -468,82 +470,10 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
 
 #ifdef CONFIG_FTRACE_SYSCALLS
 
-extern unsigned long __start_syscalls_metadata[];
-extern unsigned long __stop_syscalls_metadata[];
 extern unsigned long *sys_call_table;
 
-static struct syscall_metadata **syscalls_metadata;
-
-static struct syscall_metadata *find_syscall_meta(unsigned long *syscall)
-{
-	struct syscall_metadata *start;
-	struct syscall_metadata *stop;
-	char str[KSYM_SYMBOL_LEN];
-
-
-	start = (struct syscall_metadata *)__start_syscalls_metadata;
-	stop = (struct syscall_metadata *)__stop_syscalls_metadata;
-	kallsyms_lookup((unsigned long) syscall, NULL, NULL, NULL, str);
-
-	for ( ; start < stop; start++) {
-		if (start->name && !strcmp(start->name, str))
-			return start;
-	}
-	return NULL;
-}
-
-struct syscall_metadata *syscall_nr_to_meta(int nr)
-{
-	if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
-		return NULL;
-
-	return syscalls_metadata[nr];
-}
-
-int syscall_name_to_nr(char *name)
+unsigned long __init arch_syscall_addr(int nr)
 {
-	int i;
-
-	if (!syscalls_metadata)
-		return -1;
-
-	for (i = 0; i < NR_syscalls; i++) {
-		if (syscalls_metadata[i]) {
-			if (!strcmp(syscalls_metadata[i]->name, name))
-				return i;
-		}
-	}
-	return -1;
-}
-
-void set_syscall_enter_id(int num, int id)
-{
-	syscalls_metadata[num]->enter_id = id;
-}
-
-void set_syscall_exit_id(int num, int id)
-{
-	syscalls_metadata[num]->exit_id = id;
-}
-
-static int __init arch_init_ftrace_syscalls(void)
-{
-	int i;
-	struct syscall_metadata *meta;
-	unsigned long **psys_syscall_table = &sys_call_table;
-
-	syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
-					NR_syscalls, GFP_KERNEL);
-	if (!syscalls_metadata) {
-		WARN_ON(1);
-		return -ENOMEM;
-	}
-
-	for (i = 0; i < NR_syscalls; i++) {
-		meta = find_syscall_meta(psys_syscall_table[i]);
-		syscalls_metadata[i] = meta;
-	}
-	return 0;
+	return (unsigned long)(&sys_call_table)[nr];
 }
-arch_initcall(arch_init_ftrace_syscalls);
 #endif
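With the syscall-metadata matching moved into generic ftrace code, the only per-arch piece left above is arch_syscall_addr(), a plain sys_call_table lookup. A userspace analogue with stand-in addresses (the table contents are fake, for illustration only):

#include <stdio.h>

/* Fake "entry point addresses"; the kernel indexes the real sys_call_table. */
static unsigned long sys_call_table_model[] = { 0x1000, 0x1010, 0x1020 };

static unsigned long arch_syscall_addr_model(int nr)
{
	return sys_call_table_model[nr];
}

int main(void)
{
	printf("syscall 2 lives at %#lx\n", arch_syscall_addr_model(2));
	return 0;
}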
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 3f8579f8d42c..4f8e2507e8f3 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -11,8 +11,21 @@
 #include <asm/setup.h>
 #include <asm/sections.h>
 #include <asm/e820.h>
-#include <asm/bios_ebda.h>
+#include <asm/page.h>
 #include <asm/trampoline.h>
+#include <asm/apic.h>
+#include <asm/io_apic.h>
+#include <asm/bios_ebda.h>
+
+static void __init i386_default_early_setup(void)
+{
+	/* Initialize 32-bit specific setup functions */
+	x86_init.resources.probe_roms = probe_roms;
+	x86_init.resources.reserve_resources = i386_reserve_resources;
+	x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc;
+
+	reserve_ebda_region();
+}
 
 void __init i386_start_kernel(void)
 {
@@ -29,7 +42,16 @@ void __init i386_start_kernel(void)
 		reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
 	}
 #endif
-	reserve_ebda_region();
+
+	/* Call the subarch specific early setup function */
+	switch (boot_params.hdr.hardware_subarch) {
+	case X86_SUBARCH_MRST:
+		x86_mrst_early_setup();
+		break;
+	default:
+		i386_default_early_setup();
+		break;
+	}
 
 	/*
 	 * At this point everything still needed from the boot loader
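i386_start_kernel() now picks its early setup path from boot_params.hdr.hardware_subarch, matching the new Moorestown entry added to the head_32.S subarch table further below. A compilable sketch of the dispatch (the enum values mirror asm/bootparam.h but are assumed here):

#include <stdio.h>

/* Assumed constants; the authoritative values live in asm/bootparam.h. */
enum {
	X86_SUBARCH_PC = 0,
	X86_SUBARCH_LGUEST,
	X86_SUBARCH_XEN,
	X86_SUBARCH_MRST,
};

static void x86_mrst_early_setup_model(void)     { puts("Moorestown early setup"); }
static void i386_default_early_setup_model(void) { puts("default PC early setup"); }

static void early_setup_dispatch(unsigned int hardware_subarch)
{
	switch (hardware_subarch) {
	case X86_SUBARCH_MRST:
		x86_mrst_early_setup_model();
		break;
	default:
		i386_default_early_setup_model();
		break;
	}
}

int main(void)
{
	early_setup_dispatch(X86_SUBARCH_PC);
	early_setup_dispatch(X86_SUBARCH_MRST);
	return 0;
}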
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 70eaa852c732..0b06cd778fd9 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -23,8 +23,8 @@
 #include <asm/sections.h>
 #include <asm/kdebug.h>
 #include <asm/e820.h>
-#include <asm/bios_ebda.h>
 #include <asm/trampoline.h>
+#include <asm/bios_ebda.h>
 
 static void __init zap_identity_mappings(void)
 {
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 7ffec6b3b331..050c278481b1 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -79,7 +79,7 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE)
  * any particular GDT layout, because we load our own as soon as we
  * can.
  */
-.section .text.head,"ax",@progbits
+__HEAD
 ENTRY(startup_32)
 	/* test KEEP_SEGMENTS flag to see if the bootloader is asking
 	   us to not reload segments */
@@ -157,6 +157,7 @@ subarch_entries:
 	.long default_entry		/* normal x86/PC */
 	.long lguest_entry		/* lguest hypervisor */
 	.long xen_entry			/* Xen hypervisor */
+	.long default_entry		/* Moorestown MID */
 num_subarch_entries = (. - subarch_entries) / 4
 .previous
 #endif /* CONFIG_PARAVIRT */
@@ -607,7 +608,7 @@ ENTRY(initial_code)
 /*
  * BSS section
  */
-.section ".bss.page_aligned","wa"
+__PAGE_ALIGNED_BSS
 	.align PAGE_SIZE_asm
 #ifdef CONFIG_X86_PAE
 swapper_pg_pmd:
@@ -625,7 +626,7 @@ ENTRY(empty_zero_page)
  * This starts the data section.
  */
 #ifdef CONFIG_X86_PAE
-.section ".data.page_aligned","wa"
+__PAGE_ALIGNED_DATA
 	/* Page-aligned for the benefit of paravirt? */
 	.align PAGE_SIZE_asm
 ENTRY(swapper_pg_dir)
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index fa54f78e2a05..780cd928fcd5 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -40,7 +40,7 @@ L4_START_KERNEL = pgd_index(__START_KERNEL_map)
 L3_START_KERNEL = pud_index(__START_KERNEL_map)
 
 	.text
-	.section .text.head
+	__HEAD
 	.code64
 	.globl startup_64
 startup_64:
@@ -418,7 +418,7 @@ ENTRY(phys_base)
 ENTRY(idt_table)
 	.skip IDT_ENTRIES * 16
 
-	.section .bss.page_aligned, "aw", @nobits
+	__PAGE_ALIGNED_BSS
 	.align PAGE_SIZE
 ENTRY(empty_zero_page)
 	.skip PAGE_SIZE
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
index 43cec6bdda63..9c3bd4a2050e 100644
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -10,6 +10,16 @@
 EXPORT_SYMBOL(mcount);
 #endif
 
+/*
+ * Note, this is a prototype to get at the symbol for
+ * the export, but don't use it from C code, it is used
+ * by assembly code and is not using C calling convention!
+ */
+#ifndef CONFIG_X86_CMPXCHG64
+extern void cmpxchg8b_emu(void);
+EXPORT_SYMBOL(cmpxchg8b_emu);
+#endif
+
 /* Networking helper routines. */
 EXPORT_SYMBOL(csum_partial_copy_generic);
 
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 5cf36c053ac4..23c167925a5c 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -19,12 +19,6 @@
 DEFINE_SPINLOCK(i8253_lock);
 EXPORT_SYMBOL(i8253_lock);
 
-#ifdef CONFIG_X86_32
-static void pit_disable_clocksource(void);
-#else
-static inline void pit_disable_clocksource(void) { }
-#endif
-
 /*
  * HPET replaces the PIT, when enabled. So we need to know, which of
  * the two timers is used
@@ -57,12 +51,10 @@ static void init_pit_timer(enum clock_event_mode mode,
 			outb_pit(0, PIT_CH0);
 			outb_pit(0, PIT_CH0);
 		}
-		pit_disable_clocksource();
 		break;
 
 	case CLOCK_EVT_MODE_ONESHOT:
 		/* One shot setup */
-		pit_disable_clocksource();
 		outb_pit(0x38, PIT_MODE);
 		break;
 
@@ -200,17 +192,6 @@ static struct clocksource pit_cs = {
 	.shift		= 20,
 };
 
-static void pit_disable_clocksource(void)
-{
-	/*
-	 * Use mult to check whether it is registered or not
-	 */
-	if (pit_cs.mult) {
-		clocksource_unregister(&pit_cs);
-		pit_cs.mult = 0;
-	}
-}
-
 static int __init init_pit_clocksource(void)
 {
 	/*
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index 270ff83efc11..3a54dcb9cd0e 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -20,9 +20,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
  * way process stacks are handled. This is done by having a special
  * "init_task" linker map entry..
  */
-union thread_union init_thread_union
-	__attribute__((__section__(".data.init_task"))) =
-		{ INIT_THREAD_INFO(init_task) };
+union thread_union init_thread_union __init_task_data =
+	{ INIT_THREAD_INFO(init_task) };
 
 /*
  * Initial task structure.
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index b0cdde6932f5..74656d1d4e30 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -104,7 +104,7 @@ static int show_other_interrupts(struct seq_file *p, int prec)
 		seq_printf(p, "  Threshold APIC interrupts\n");
 # endif
 #endif
-#ifdef CONFIG_X86_NEW_MCE
+#ifdef CONFIG_X86_MCE
 	seq_printf(p, "%*s: ", prec, "MCE");
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", per_cpu(mce_exception_count, j));
@@ -200,7 +200,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
 	sum += irq_stats(cpu)->irq_threshold_count;
 # endif
 #endif
-#ifdef CONFIG_X86_NEW_MCE
+#ifdef CONFIG_X86_MCE
 	sum += per_cpu(mce_exception_count, cpu);
 	sum += per_cpu(mce_poll_count, cpu);
 #endif
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 92b7703d3d58..40f30773fb29 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -116,7 +116,7 @@ int vector_used_by_percpu_irq(unsigned int vector)
 	return 0;
 }
 
-static void __init init_ISA_irqs(void)
+void __init init_ISA_irqs(void)
 {
 	int i;
 
@@ -140,8 +140,10 @@ static void __init init_ISA_irqs(void)
 	}
 }
 
-/* Overridden in paravirt.c */
-void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
+void __init init_IRQ(void)
+{
+	x86_init.irqs.intr_init();
+}
 
 static void __init smp_intr_init(void)
 {
@@ -190,7 +192,7 @@ static void __init apic_intr_init(void)
 #ifdef CONFIG_X86_MCE_THRESHOLD
 	alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
 #endif
-#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC)
+#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_LOCAL_APIC)
 	alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt);
 #endif
 
@@ -206,39 +208,19 @@ static void __init apic_intr_init(void)
 	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
 
 	/* Performance monitoring interrupts: */
-# ifdef CONFIG_PERF_COUNTERS
+# ifdef CONFIG_PERF_EVENTS
 	alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
 # endif
 
 #endif
 }
 
-/**
- * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
- *
- * Description:
- *	Perform any necessary interrupt initialisation prior to setting up
- *	the "ordinary" interrupt call gates. For legacy reasons, the ISA
- *	interrupts should be initialised here if the machine emulates a PC
- *	in any way.
- **/
-static void __init x86_quirk_pre_intr_init(void)
-{
-#ifdef CONFIG_X86_32
-	if (x86_quirks->arch_pre_intr_init) {
-		if (x86_quirks->arch_pre_intr_init())
-			return;
-	}
-#endif
-	init_ISA_irqs();
-}
-
 void __init native_init_IRQ(void)
 {
 	int i;
 
 	/* Execute any quirks before the call gates are initialised: */
-	x86_quirk_pre_intr_init();
+	x86_init.irqs.pre_vector_init();
 
 	apic_intr_init();
 
@@ -258,12 +240,6 @@ void __init native_init_IRQ(void)
 
 #ifdef CONFIG_X86_32
 	/*
-	 * Call quirks after call gates are initialised (usually add in
-	 * the architecture specific gates):
-	 */
-	x86_quirk_intr_init();
-
-	/*
 	 * External FPU? Set up irq13 if so, for
 	 * original braindamaged IBM FERR coupling.
 	 */
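init_IRQ() stops being a weak alias for native_init_IRQ() and becomes a one-line call through x86_init.irqs.intr_init, so a paravirt platform overrides a function pointer instead of replacing a symbol at link time. A small userspace model of that ops-table pattern (struct and function names here are invented):

#include <stdio.h>

/* Invented miniature of the x86_init ops table: defaults are filled in
 * statically, and a platform overwrites the pointer before it is used. */
struct irq_ops {
	void (*intr_init)(void);
};

static void native_init_IRQ_model(void) { puts("native interrupt setup"); }
static void xen_init_IRQ_model(void)    { puts("xen interrupt setup"); }

static struct irq_ops irqs = {
	.intr_init = native_init_IRQ_model,
};

static void init_IRQ_model(void)
{
	irqs.intr_init();	/* always a valid pointer, no NULL check */
}

int main(void)
{
	init_IRQ_model();			/* default path */
	irqs.intr_init = xen_init_IRQ_model;	/* platform override */
	init_IRQ_model();
	return 0;
}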
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index e5efcdcca31b..feaeb0d3aa4f 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -22,6 +22,8 @@
 #include <asm/msr.h>
 #include <asm/apic.h>
 #include <linux/percpu.h>
+
+#include <asm/x86_init.h>
 #include <asm/reboot.h>
 
 #define KVM_SCALE 22
@@ -182,12 +184,13 @@ void __init kvmclock_init(void)
 	if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
 		if (kvm_register_clock("boot clock"))
 			return;
-		pv_time_ops.get_wallclock = kvm_get_wallclock;
-		pv_time_ops.set_wallclock = kvm_set_wallclock;
 		pv_time_ops.sched_clock = kvm_clock_read;
-		pv_time_ops.get_tsc_khz = kvm_get_tsc_khz;
+		x86_platform.calibrate_tsc = kvm_get_tsc_khz;
+		x86_platform.get_wallclock = kvm_get_wallclock;
+		x86_platform.set_wallclock = kvm_set_wallclock;
 #ifdef CONFIG_X86_LOCAL_APIC
-		pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
+		x86_cpuinit.setup_percpu_clockev =
+			kvm_setup_secondary_clock;
 #endif
 #ifdef CONFIG_SMP
 		smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 71f1d99a635d..ec6ef60cbd17 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -67,8 +67,8 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
 #ifdef CONFIG_SMP
 	preempt_disable();
 	load_LDT(pc);
-	if (!cpus_equal(current->mm->cpu_vm_mask,
-			cpumask_of_cpu(smp_processor_id())))
+	if (!cpumask_equal(mm_cpumask(current->mm),
+			   cpumask_of(smp_processor_id())))
 		smp_call_function(flush_ldt, current->mm, 1);
 	preempt_enable();
 #else
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 9371448290ac..378e9a8f1bf8 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -210,8 +210,8 @@ static ssize_t microcode_write(struct file *file, const char __user *buf,
 {
 	ssize_t ret = -EINVAL;
 
-	if ((len >> PAGE_SHIFT) > num_physpages) {
-		pr_err("microcode: too much data (max %ld pages)\n", num_physpages);
+	if ((len >> PAGE_SHIFT) > totalram_pages) {
+		pr_err("microcode: too much data (max %ld pages)\n", totalram_pages);
 		return ret;
 	}
 
@@ -236,7 +236,7 @@ static const struct file_operations microcode_fops = {
 static struct miscdevice microcode_dev = {
 	.minor		= MICROCODE_MINOR,
 	.name		= "microcode",
-	.devnode	= "cpu/microcode",
+	.nodename	= "cpu/microcode",
 	.fops		= &microcode_fops,
 };
 
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index fcd513bf2846..5be95ef4ffec 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -45,6 +45,11 @@ static int __init mpf_checksum(unsigned char *mp, int len)
 	return sum & 0xFF;
 }
 
+int __init default_mpc_apic_id(struct mpc_cpu *m)
+{
+	return m->apicid;
+}
+
 static void __init MP_processor_info(struct mpc_cpu *m)
 {
 	int apicid;
@@ -55,10 +60,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
 		return;
 	}
 
-	if (x86_quirks->mpc_apic_id)
-		apicid = x86_quirks->mpc_apic_id(m);
-	else
-		apicid = m->apicid;
+	apicid = x86_init.mpparse.mpc_apic_id(m);
 
 	if (m->cpuflag & CPU_BOOTPROCESSOR) {
 		bootup_cpu = " (Bootup-CPU)";
@@ -70,16 +72,18 @@ static void __init MP_processor_info(struct mpc_cpu *m)
 }
 
 #ifdef CONFIG_X86_IO_APIC
-static void __init MP_bus_info(struct mpc_bus *m)
+void __init default_mpc_oem_bus_info(struct mpc_bus *m, char *str)
 {
-	char str[7];
 	memcpy(str, m->bustype, 6);
 	str[6] = 0;
+	apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str);
+}
 
-	if (x86_quirks->mpc_oem_bus_info)
-		x86_quirks->mpc_oem_bus_info(m, str);
-	else
-		apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->busid, str);
+static void __init MP_bus_info(struct mpc_bus *m)
+{
+	char str[7];
+
+	x86_init.mpparse.mpc_oem_bus_info(m, str);
 
 #if MAX_MP_BUSSES < 256
 	if (m->busid >= MAX_MP_BUSSES) {
@@ -96,8 +100,8 @@ static void __init MP_bus_info(struct mpc_bus *m)
 		mp_bus_id_to_type[m->busid] = MP_BUS_ISA;
 #endif
 	} else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
-		if (x86_quirks->mpc_oem_pci_bus)
-			x86_quirks->mpc_oem_pci_bus(m);
+		if (x86_init.mpparse.mpc_oem_pci_bus)
+			x86_init.mpparse.mpc_oem_pci_bus(m);
 
 		clear_bit(m->busid, mp_bus_not_pci);
 #if defined(CONFIG_EISA) || defined(CONFIG_MCA)
@@ -291,6 +295,8 @@ static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt)
 			1, mpc, mpc->length, 1);
 }
 
+void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { }
+
 static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
 {
 	char str[16];
@@ -312,16 +318,13 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
 	if (early)
 		return 1;
 
-	if (mpc->oemptr && x86_quirks->smp_read_mpc_oem) {
-		struct mpc_oemtable *oem_table = (void *)(long)mpc->oemptr;
-		x86_quirks->smp_read_mpc_oem(oem_table, mpc->oemsize);
-	}
+	if (mpc->oemptr)
+		x86_init.mpparse.smp_read_mpc_oem(mpc);
 
 	/*
 	 * Now process the configuration blocks.
 	 */
-	if (x86_quirks->mpc_record)
-		*x86_quirks->mpc_record = 0;
+	x86_init.mpparse.mpc_record(0);
 
 	while (count < mpc->length) {
 		switch (*mpt) {
@@ -353,8 +356,7 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
 			count = mpc->length;
 			break;
 		}
-		if (x86_quirks->mpc_record)
-			(*x86_quirks->mpc_record)++;
+		x86_init.mpparse.mpc_record(1);
 	}
 
 #ifdef CONFIG_X86_BIGSMP
@@ -608,7 +610,7 @@ static int __init check_physptr(struct mpf_intel *mpf, unsigned int early)
 /*
  * Scan the memory blocks for an SMP configuration block.
  */
-static void __init __get_smp_config(unsigned int early)
+void __init default_get_smp_config(unsigned int early)
 {
 	struct mpf_intel *mpf = mpf_found;
 
@@ -625,11 +627,6 @@ static void __init __get_smp_config(unsigned int early)
 	if (acpi_lapic && acpi_ioapic)
 		return;
 
-	if (x86_quirks->mach_get_smp_config) {
-		if (x86_quirks->mach_get_smp_config(early))
-			return;
-	}
-
 	printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
 	       mpf->specification);
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
@@ -670,16 +667,6 @@ static void __init __get_smp_config(unsigned int early)
 	 */
 }
 
-void __init early_get_smp_config(void)
-{
-	__get_smp_config(1);
-}
-
-void __init get_smp_config(void)
-{
-	__get_smp_config(0);
-}
-
 static void __init smp_reserve_bootmem(struct mpf_intel *mpf)
 {
 	unsigned long size = get_mpc_size(mpf->physptr);
@@ -745,14 +732,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
 	return 0;
 }
 
-static void __init __find_smp_config(unsigned int reserve)
+void __init default_find_smp_config(unsigned int reserve)
 {
 	unsigned int address;
 
-	if (x86_quirks->mach_find_smp_config) {
-		if (x86_quirks->mach_find_smp_config(reserve))
-			return;
-	}
 	/*
 	 * FIXME: Linux assumes you have 640K of base ram..
 	 * this continues the error...
@@ -787,16 +770,6 @@ void __init default_find_smp_config(unsigned int reserve)
 	smp_scan_config(address, 0x400, reserve);
 }
 
-void __init early_find_smp_config(void)
-{
-	__find_smp_config(0);
-}
-
-void __init find_smp_config(void)
-{
-	__find_smp_config(1);
-}
-
 #ifdef CONFIG_X86_IO_APIC
 static u8 __initdata irq_used[MAX_IRQ_SOURCES];
 
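default_mpc_oem_bus_info() above copies the fixed-width, non-NUL-terminated bustype field into a caller-supplied seven-byte buffer and terminates it before printing. The same operation against a mock MP-table record (struct mpc_bus_model is a stand-in for the real struct mpc_bus):

#include <stdio.h>
#include <string.h>

/* Stand-in for struct mpc_bus: bustype is 6 bytes, space padded,
 * and carries no NUL terminator of its own. */
struct mpc_bus_model {
	unsigned char busid;
	char bustype[6];
};

int main(void)
{
	struct mpc_bus_model m = { 0, { 'P', 'C', 'I', ' ', ' ', ' ' } };
	char str[7];

	memcpy(str, m.bustype, 6);	/* copy the raw field ... */
	str[6] = 0;			/* ... and terminate it ourselves */
	printf("Bus #%d is %s\n", m.busid, str);
	return 0;
}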
diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c
new file mode 100644
index 000000000000..3b7078abc871
--- /dev/null
+++ b/arch/x86/kernel/mrst.c
@@ -0,0 +1,24 @@
+/*
+ * mrst.c: Intel Moorestown platform specific setup code
+ *
+ * (C) Copyright 2008 Intel Corporation
+ * Author: Jacob Pan (jacob.jun.pan@intel.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#include <linux/init.h>
+
+#include <asm/setup.h>
+
+/*
+ * Moorestown specific x86_init function overrides and early setup
+ * calls.
+ */
+void __init x86_mrst_early_setup(void)
+{
+	x86_init.resources.probe_roms = x86_init_noop;
+	x86_init.resources.reserve_resources = x86_init_noop;
+}
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 7dd950094178..6a3cefc7dda1 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -241,7 +241,7 @@ static struct notifier_block __refdata msr_class_cpu_notifier = {
 	.notifier_call = msr_class_cpu_callback,
 };
 
-static char *msr_nodename(struct device *dev)
+static char *msr_devnode(struct device *dev, mode_t *mode)
 {
 	return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));
 }
@@ -262,7 +262,7 @@ static int __init msr_init(void)
 		err = PTR_ERR(msr_class);
 		goto out_chrdev;
 	}
-	msr_class->nodename = msr_nodename;
+	msr_class->devnode = msr_devnode;
 	for_each_online_cpu(i) {
 		err = msr_device_create(i);
 		if (err != 0)
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index f5b0b4a01fb2..1b1739d16310 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -54,17 +54,12 @@ u64 _paravirt_ident_64(u64 x)
 	return x;
 }
 
-static void __init default_banner(void)
+void __init default_banner(void)
 {
 	printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
 	       pv_info.name);
 }
 
-char *memory_setup(void)
-{
-	return pv_init_ops.memory_setup();
-}
-
 /* Simple instruction patching code. */
 #define DEF_NATIVE(ops, name, code)					\
 	extern const char start_##ops##_##name[], end_##ops##_##name[];	\
@@ -188,11 +183,6 @@ unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
 	return insn_len;
 }
 
-void init_IRQ(void)
-{
-	pv_irq_ops.init_IRQ();
-}
-
 static void native_flush_tlb(void)
 {
 	__native_flush_tlb();
@@ -218,13 +208,6 @@ extern void native_irq_enable_sysexit(void);
 extern void native_usergs_sysret32(void);
 extern void native_usergs_sysret64(void);
 
-static int __init print_banner(void)
-{
-	pv_init_ops.banner();
-	return 0;
-}
-core_initcall(print_banner);
-
 static struct resource reserve_ioports = {
 	.start = 0,
 	.end = IO_SPACE_LIMIT,
@@ -320,21 +303,13 @@ struct pv_info pv_info = {
 
 struct pv_init_ops pv_init_ops = {
 	.patch = native_patch,
-	.banner = default_banner,
-	.arch_setup = paravirt_nop,
-	.memory_setup = machine_specific_memory_setup,
 };
 
 struct pv_time_ops pv_time_ops = {
-	.time_init = hpet_time_init,
-	.get_wallclock = native_get_wallclock,
-	.set_wallclock = native_set_wallclock,
 	.sched_clock = native_sched_clock,
-	.get_tsc_khz = native_calibrate_tsc,
 };
 
 struct pv_irq_ops pv_irq_ops = {
-	.init_IRQ = native_init_IRQ,
 	.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
 	.restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
 	.irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable),
@@ -409,8 +384,6 @@ struct pv_cpu_ops pv_cpu_ops = {
 
 struct pv_apic_ops pv_apic_ops = {
 #ifdef CONFIG_X86_LOCAL_APIC
-	.setup_boot_clock = setup_boot_APIC_clock,
-	.setup_secondary_clock = setup_secondary_APIC_clock,
 	.startup_ipi_hook = paravirt_nop,
 #endif
 };
@@ -424,13 +397,6 @@ struct pv_apic_ops pv_apic_ops = {
 #endif
 
 struct pv_mmu_ops pv_mmu_ops = {
-#ifndef CONFIG_X86_64
-	.pagetable_setup_start = native_pagetable_setup_start,
-	.pagetable_setup_done = native_pagetable_setup_done,
-#else
-	.pagetable_setup_start = paravirt_nop,
-	.pagetable_setup_done = paravirt_nop,
-#endif
 
 	.read_cr2 = native_read_cr2,
 	.write_cr2 = native_write_cr2,
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index d71c8655905b..b2a71dca5642 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -35,7 +35,7 @@ int iommu_detected __read_mostly = 0;
 
 /*
  * This variable becomes 1 if iommu=pt is passed on the kernel command line.
- * If this variable is 1, IOMMU implementations do no DMA ranslation for
+ * If this variable is 1, IOMMU implementations do no DMA translation for
  * devices and allow every device to access to whole physical memory. This is
  * useful if a user want to use an IOMMU only for KVM device assignment to
  * guests and not for driver dma translation.
@@ -225,10 +225,8 @@ static __init int iommu_setup(char *p)
 	if (!strncmp(p, "soft", 4))
 		swiotlb = 1;
 #endif
-	if (!strncmp(p, "pt", 2)) {
+	if (!strncmp(p, "pt", 2))
 		iommu_pass_through = 1;
-		return 1;
-	}
 
 	gart_parse_options(p);
 
@@ -313,7 +311,7 @@ void pci_iommu_shutdown(void)
 		amd_iommu_shutdown();
 }
 /* Must execute after PCI subsystem */
-fs_initcall(pci_iommu_init);
+rootfs_initcall(pci_iommu_init);
 
 #ifdef CONFIG_PCI
 /* Many VIA bridges seem to corrupt data for DAC. Disable it here */
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 98a827ee9ed7..a7f1b64f86e0 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -16,6 +16,7 @@
 #include <linux/agp_backend.h>
 #include <linux/init.h>
 #include <linux/mm.h>
+#include <linux/sched.h>
 #include <linux/string.h>
 #include <linux/spinlock.h>
 #include <linux/pci.h>
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index e8a35016115f..aaa6b7839f1e 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -46,9 +46,8 @@ void __init pci_swiotlb_init(void)
 {
 	/* don't initialize swiotlb if iommu=off (no_iommu=1) */
 #ifdef CONFIG_X86_64
-	if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) ||
-	    iommu_pass_through)
-		swiotlb = 1;
+	if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN))
+		swiotlb = 1;
 #endif
 	if (swiotlb_force)
 		swiotlb = 1;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 1092a1a2fbe6..2275ce5776de 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -9,7 +9,7 @@
 #include <linux/pm.h>
 #include <linux/clockchips.h>
 #include <linux/random.h>
-#include <trace/power.h>
+#include <trace/events/power.h>
 #include <asm/system.h>
 #include <asm/apic.h>
 #include <asm/syscalls.h>
@@ -27,9 +27,6 @@ EXPORT_SYMBOL(idle_nomwait);
 
 struct kmem_cache *task_xstate_cachep;
 
-DEFINE_TRACE(power_start);
-DEFINE_TRACE(power_end);
-
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
 	*dst = *src;
@@ -289,9 +286,7 @@ static inline int hlt_use_halt(void)
 void default_idle(void)
 {
 	if (hlt_use_halt()) {
-		struct power_trace it;
-
-		trace_power_start(&it, POWER_CSTATE, 1);
+		trace_power_start(POWER_CSTATE, 1);
 		current_thread_info()->status &= ~TS_POLLING;
 		/*
 		 * TS_POLLING-cleared state must be visible before we
@@ -304,7 +299,6 @@ void default_idle(void)
 		else
 			local_irq_enable();
 		current_thread_info()->status |= TS_POLLING;
-		trace_power_end(&it);
 	} else {
 		local_irq_enable();
 		/* loop is done by the caller */
@@ -362,9 +356,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
  */
 void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
 {
-	struct power_trace it;
-
-	trace_power_start(&it, POWER_CSTATE, (ax>>4)+1);
+	trace_power_start(POWER_CSTATE, (ax>>4)+1);
 	if (!need_resched()) {
 		if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
 			clflush((void *)&current_thread_info()->flags);
@@ -374,15 +366,13 @@ void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
 		if (!need_resched())
 			__mwait(ax, cx);
 	}
-	trace_power_end(&it);
 }
 
 /* Default MONITOR/MWAIT with no hints, used for default C1 state */
 static void mwait_idle(void)
 {
-	struct power_trace it;
 	if (!need_resched()) {
-		trace_power_start(&it, POWER_CSTATE, 1);
+		trace_power_start(POWER_CSTATE, 1);
 		if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
 			clflush((void *)&current_thread_info()->flags);
 
@@ -392,7 +382,6 @@ static void mwait_idle(void)
 			__sti_mwait(0, 0);
 		else
 			local_irq_enable();
-		trace_power_end(&it);
 	} else
 		local_irq_enable();
 }
@@ -404,13 +393,11 @@ static void mwait_idle(void)
  */
 static void poll_idle(void)
 {
-	struct power_trace it;
-
-	trace_power_start(&it, POWER_CSTATE, 0);
+	trace_power_start(POWER_CSTATE, 0);
 	local_irq_enable();
 	while (!need_resched())
 		cpu_relax();
-	trace_power_end(&it);
+	trace_power_end(0);
 }
 
 /*
@@ -558,10 +545,8 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
 void __init init_c1e_mask(void)
 {
 	/* If we're using c1e_idle, we need to allocate c1e_mask. */
-	if (pm_idle == c1e_idle) {
-		alloc_cpumask_var(&c1e_mask, GFP_KERNEL);
-		cpumask_clear(c1e_mask);
-	}
+	if (pm_idle == c1e_idle)
+		zalloc_cpumask_var(&c1e_mask, GFP_KERNEL);
 }
 
 static int __init idle_setup(char *str)
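The init_c1e_mask() hunk folds an allocate-then-clear pair into the zeroing allocator zalloc_cpumask_var(). The calloc() analogue of that cleanup, as a plain C sketch:

#include <stdlib.h>
#include <string.h>

int main(void)
{
	size_t nbytes = 256 / 8;	/* a 256-bit mask, as raw bytes */

	/* before: allocate, then clear in a second step */
	unsigned char *old_style = malloc(nbytes);
	if (old_style)
		memset(old_style, 0, nbytes);

	/* after: one call that hands back zeroed memory */
	unsigned char *new_style = calloc(1, nbytes);

	free(old_style);
	free(new_style);
	return 0;
}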
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 113b8927c822..267cb85b479c 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -312,16 +312,6 @@ static int putreg(struct task_struct *child,
 		return set_flags(child, value);
 
 #ifdef CONFIG_X86_64
-	/*
-	 * Orig_ax is really just a flag with small positive and
-	 * negative values, so make sure to always sign-extend it
-	 * from 32 bits so that it works correctly regardless of
-	 * whether we come from a 32-bit environment or not.
-	 */
-	case offsetof(struct user_regs_struct, orig_ax):
-		value = (long) (s32) value;
-		break;
-
 	case offsetof(struct user_regs_struct,fs_base):
 		if (value >= TASK_SIZE_OF(child))
 			return -EIO;
@@ -1177,10 +1167,15 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value)
 
 	case offsetof(struct user32, regs.orig_eax):
 		/*
-		 * Sign-extend the value so that orig_eax = -1
-		 * causes (long)orig_ax < 0 tests to fire correctly.
+		 * A 32-bit debugger setting orig_eax means to restore
+		 * the state of the task restarting a 32-bit syscall.
+		 * Make sure we interpret the -ERESTART* codes correctly
+		 * in case the task is not actually still sitting at the
+		 * exit from a 32-bit syscall with TS_COMPAT still set.
 		 */
-		regs->orig_ax = (long) (s32) value;
+		regs->orig_ax = value;
+		if (syscall_get_nr(child, regs) >= 0)
+			task_thread_info(child)->status |= TS_COMPAT;
 		break;
 
 	case offsetof(struct user32, regs.eflags):
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index af71d06624bf..6c3b2c6fd772 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -508,7 +508,7 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev)
 
 	pci_read_config_dword(nb_ht, 0x60, &val);
 	set_dev_node(&dev->dev, val & 7);
-	pci_dev_put(dev);
+	pci_dev_put(nb_ht);
 }
 
 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
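The one-line quirk fix pairs the put with the reference that was actually taken: the lookup produced nb_ht, so pci_dev_put() must drop nb_ht's count, not dev's. A toy refcount model of that get/put pairing (struct dev and its helpers are invented for illustration):

#include <stdio.h>

struct dev {
	const char *name;
	int refcount;
};

static struct dev *dev_get(struct dev *d) { d->refcount++; return d; }
static void dev_put(struct dev *d)        { d->refcount--; }

int main(void)
{
	struct dev fixup_target = { "dev",   1 };
	struct dev northbridge  = { "nb_ht", 1 };

	dev_get(&northbridge);	/* the lookup referenced nb_ht ... */
	dev_put(&northbridge);	/* ... so the release must target nb_ht */

	printf("dev=%d nb_ht=%d (both balanced)\n",
	       fixup_target.refcount, northbridge.refcount);
	return 0;
}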
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index a06e8d101844..a1a3cdda06e1 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -4,6 +4,8 @@
 #include <linux/pm.h>
 #include <linux/efi.h>
 #include <linux/dmi.h>
+#include <linux/sched.h>
+#include <linux/tboot.h>
 #include <acpi/reboot.h>
 #include <asm/io.h>
 #include <asm/apic.h>
@@ -508,6 +510,8 @@ static void native_machine_emergency_restart(void)
 	if (reboot_emergency)
 		emergency_vmx_disable_all();
 
+	tboot_shutdown(TB_SHUTDOWN_REBOOT);
+
 	/* Tell the BIOS if we want cold or warm reboot */
 	*((unsigned short *)__va(0x472)) = reboot_mode;
 
@@ -634,6 +638,8 @@ static void native_machine_halt(void)
 	/* stop other cpus and apics */
 	machine_shutdown();
 
+	tboot_shutdown(TB_SHUTDOWN_HALT);
+
 	/* stop this cpu */
 	stop_this_cpu(NULL);
 }
@@ -645,6 +651,8 @@ static void native_machine_power_off(void)
 		machine_shutdown();
 		pm_power_off();
 	}
+	/* a fallback in case there is no PM info available */
+	tboot_shutdown(TB_SHUTDOWN_HALT);
 }
 
 struct machine_ops machine_ops = {
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 5d465b207e72..1cfbbfc3ae26 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -8,6 +8,7 @@
 #include <linux/pnp.h>
 
 #include <asm/vsyscall.h>
+#include <asm/x86_init.h>
 #include <asm/time.h>
 
 #ifdef CONFIG_X86_32
@@ -165,33 +166,29 @@ void rtc_cmos_write(unsigned char val, unsigned char addr)
 }
 EXPORT_SYMBOL(rtc_cmos_write);
 
-static int set_rtc_mmss(unsigned long nowtime)
+int update_persistent_clock(struct timespec now)
 {
 	unsigned long flags;
 	int retval;
 
 	spin_lock_irqsave(&rtc_lock, flags);
-	retval = set_wallclock(nowtime);
+	retval = x86_platform.set_wallclock(now.tv_sec);
 	spin_unlock_irqrestore(&rtc_lock, flags);
 
 	return retval;
 }
 
 /* not static: needed by APM */
-unsigned long read_persistent_clock(void)
+void read_persistent_clock(struct timespec *ts)
 {
 	unsigned long retval, flags;
 
 	spin_lock_irqsave(&rtc_lock, flags);
-	retval = get_wallclock();
+	retval = x86_platform.get_wallclock();
 	spin_unlock_irqrestore(&rtc_lock, flags);
 
-	return retval;
-}
-
-int update_persistent_clock(struct timespec now)
-{
-	return set_rtc_mmss(now.tv_sec);
+	ts->tv_sec = retval;
+	ts->tv_nsec = 0;
 }
 
 unsigned long long native_read_tsc(void)
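read_persistent_clock() changes signature here: it fills a struct timespec instead of returning bare seconds, and since the CMOS RTC only resolves whole seconds, tv_nsec is pinned to zero. A userspace model of the new interface (time(NULL) stands in for x86_platform.get_wallclock()):

#include <stdio.h>
#include <time.h>

static void read_persistent_clock_model(struct timespec *ts)
{
	/* time(NULL) plays the role of x86_platform.get_wallclock() */
	ts->tv_sec = time(NULL);
	ts->tv_nsec = 0;	/* the RTC has one-second resolution */
}

int main(void)
{
	struct timespec ts;

	read_persistent_clock_model(&ts);
	printf("persistent clock: %ld.%09ld\n", (long)ts.tv_sec, (long)ts.tv_nsec);
	return 0;
}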
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 63f32d220ef2..e09f0e2c14b5 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -27,6 +27,7 @@
 #include <linux/screen_info.h>
 #include <linux/ioport.h>
 #include <linux/acpi.h>
+#include <linux/sfi.h>
 #include <linux/apm_bios.h>
 #include <linux/initrd.h>
 #include <linux/bootmem.h>
@@ -66,6 +67,7 @@
 
 #include <linux/percpu.h>
 #include <linux/crash_dump.h>
+#include <linux/tboot.h>
 
 #include <video/edid.h>
 
@@ -108,10 +110,6 @@
 #include <asm/numa_64.h>
 #endif
 
-#ifndef ARCH_SETUP
-#define ARCH_SETUP
-#endif
-
 /*
  * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
  * The direct mapping extends to max_pfn_mapped, so that we can directly access
@@ -133,9 +131,9 @@ int default_cpu_present_to_apicid(int mps_cpu)
 	return __default_cpu_present_to_apicid(mps_cpu);
 }
 
-int default_check_phys_apicid_present(int boot_cpu_physical_apicid)
+int default_check_phys_apicid_present(int phys_apicid)
 {
-	return __default_check_phys_apicid_present(boot_cpu_physical_apicid);
+	return __default_check_phys_apicid_present(phys_apicid);
 }
 #endif
 
@@ -171,13 +169,6 @@ static struct resource bss_resource = {
 
 
 #ifdef CONFIG_X86_32
-static struct resource video_ram_resource = {
-	.name	= "Video RAM area",
-	.start	= 0xa0000,
-	.end	= 0xbffff,
-	.flags	= IORESOURCE_BUSY | IORESOURCE_MEM
-};
-
 /* cpu data as detected by the assembly code in head.S */
 struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1};
 /* common cpu data for all cpus */
@@ -605,7 +596,7 @@ static struct resource standard_io_resources[] = {
 		.flags	= IORESOURCE_BUSY | IORESOURCE_IO }
 };
 
-static void __init reserve_standard_io_resources(void)
+void __init reserve_standard_io_resources(void)
 {
 	int i;
 
@@ -637,10 +628,6 @@ static int __init setup_elfcorehdr(char *arg)
 early_param("elfcorehdr", setup_elfcorehdr);
 #endif
 
-static struct x86_quirks default_x86_quirks __initdata;
-
-struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
-
 #ifdef CONFIG_X86_RESERVE_LOW_64K
 static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
 {
@@ -757,7 +744,7 @@ void __init setup_arch(char **cmdline_p)
 	}
 #endif
 
-	ARCH_SETUP
+	x86_init.oem.arch_setup();
 
 	setup_memory_map();
 	parse_setup_data();
@@ -796,6 +783,16 @@ void __init setup_arch(char **cmdline_p)
 	strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
 	*cmdline_p = command_line;
 
+#ifdef CONFIG_X86_64
+	/*
+	 * Must call this twice: once just to detect whether hardware doesn't
+	 * support NX (so that the early EHCI debug console setup can safely
+	 * call set_fixmap()), and then again after parsing early parameters to
+	 * honor the respective command line option.
+	 */
+	check_efer();
+#endif
+
 	parse_early_param();
 
 #ifdef CONFIG_X86_64
@@ -833,11 +830,9 @@ void __init setup_arch(char **cmdline_p)
 	 * VMware detection requires dmi to be available, so this
 	 * needs to be done after dmi_scan_machine, for the BP.
 	 */
-	init_hypervisor(&boot_cpu_data);
+	init_hypervisor_platform();
 
-#ifdef CONFIG_X86_32
-	probe_roms();
-#endif
+	x86_init.resources.probe_roms();
 
 	/* after parse_early_param, so could debug it */
 	insert_resource(&iomem_resource, &code_resource);
@@ -972,10 +967,11 @@ void __init setup_arch(char **cmdline_p)
972 kvmclock_init(); 967 kvmclock_init();
973#endif 968#endif
974 969
975 paravirt_pagetable_setup_start(swapper_pg_dir); 970 x86_init.paging.pagetable_setup_start(swapper_pg_dir);
976 paging_init(); 971 paging_init();
977 paravirt_pagetable_setup_done(swapper_pg_dir); 972 x86_init.paging.pagetable_setup_done(swapper_pg_dir);
978 paravirt_post_allocator_init(); 973
974 tboot_probe();
979 975
980#ifdef CONFIG_X86_64 976#ifdef CONFIG_X86_64
981 map_vsyscall(); 977 map_vsyscall();
@@ -990,13 +986,13 @@ void __init setup_arch(char **cmdline_p)
990 */ 986 */
991 acpi_boot_init(); 987 acpi_boot_init();
992 988
993#if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS) 989 sfi_init();
990
994 /* 991 /*
995 * get boot-time SMP configuration: 992 * get boot-time SMP configuration:
996 */ 993 */
997 if (smp_found_config) 994 if (smp_found_config)
998 get_smp_config(); 995 get_smp_config();
999#endif
1000 996
1001 prefill_possible_map(); 997 prefill_possible_map();
1002 998
@@ -1015,10 +1011,7 @@ void __init setup_arch(char **cmdline_p)
1015 e820_reserve_resources(); 1011 e820_reserve_resources();
1016 e820_mark_nosave_regions(max_low_pfn); 1012 e820_mark_nosave_regions(max_low_pfn);
1017 1013
1018#ifdef CONFIG_X86_32 1014 x86_init.resources.reserve_resources();
1019 request_resource(&iomem_resource, &video_ram_resource);
1020#endif
1021 reserve_standard_io_resources();
1022 1015
1023 e820_setup_gap(); 1016 e820_setup_gap();
1024 1017
@@ -1030,78 +1023,22 @@ void __init setup_arch(char **cmdline_p)
1030 conswitchp = &dummy_con; 1023 conswitchp = &dummy_con;
1031#endif 1024#endif
1032#endif 1025#endif
1026 x86_init.oem.banner();
1033} 1027}
1034 1028
1035#ifdef CONFIG_X86_32 1029#ifdef CONFIG_X86_32
1036 1030
1037/** 1031static struct resource video_ram_resource = {
1038 * x86_quirk_intr_init - post gate setup interrupt initialisation 1032 .name = "Video RAM area",
1039 * 1033 .start = 0xa0000,
1040 * Description: 1034 .end = 0xbffff,
1041 * Fill in any interrupts that may have been left out by the general 1035 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
1042 * init_IRQ() routine. Interrupts having to do with the machine rather
1043 * than the devices on the I/O bus (like APIC interrupts in Intel MP
1044 * systems) are started here.
1045 **/
1046void __init x86_quirk_intr_init(void)
1047{
1048 if (x86_quirks->arch_intr_init) {
1049 if (x86_quirks->arch_intr_init())
1050 return;
1051 }
1052}
1053
1054/**
1055 * x86_quirk_trap_init - initialise system specific traps
1056 *
1057 * Description:
1058 * Called as the final act of trap_init(). Used in VISWS to initialise
1059 * the various board specific APIC traps.
1060 **/
1061void __init x86_quirk_trap_init(void)
1062{
1063 if (x86_quirks->arch_trap_init) {
1064 if (x86_quirks->arch_trap_init())
1065 return;
1066 }
1067}
1068
1069static struct irqaction irq0 = {
1070 .handler = timer_interrupt,
1071 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER,
1072 .name = "timer"
1073}; 1036};
1074 1037
1075/** 1038void __init i386_reserve_resources(void)
1076 * x86_quirk_pre_time_init - do any system-specific initialisations before timer setup.
1077 *
1078 **/
1079void __init x86_quirk_pre_time_init(void)
1080{ 1039{
1081 if (x86_quirks->arch_pre_time_init) 1040 request_resource(&iomem_resource, &video_ram_resource);
1082 x86_quirks->arch_pre_time_init(); 1041 reserve_standard_io_resources();
1083} 1042}
1084 1043
1085/**
1086 * x86_quirk_time_init - do any specific initialisations for the system timer.
1087 *
1088 * Description:
1089 * Must plug the system timer interrupt source at HZ into the IRQ listed
1090 * in irq_vectors.h:TIMER_IRQ
1091 **/
1092void __init x86_quirk_time_init(void)
1093{
1094 if (x86_quirks->arch_time_init) {
1095 /*
1096 * A nonzero return code does not mean failure, it means
1097 * that the architecture quirk does not want any
1098 * generic (timer) setup to be performed after this:
1099 */
1100 if (x86_quirks->arch_time_init())
1101 return;
1102 }
1103
1104 irq0.mask = cpumask_of_cpu(0);
1105 setup_irq(0, &irq0);
1106}
1107#endif /* CONFIG_X86_32 */ 1044#endif /* CONFIG_X86_32 */
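
The setup.c hunks above replace the old ARCH_SETUP macro and the x86_quirks
hooks with calls through a single x86_init ops table. A minimal sketch of
that table's shape, listing only the callbacks this diff actually invokes
(the authoritative definition lives in asm/x86_init.h; everything else is
elided):

/* Sketch: only the hooks dispatched in the hunks above are shown. */
struct x86_init_ops {
	struct {
		void (*probe_roms)(void);
		void (*reserve_resources)(void);
	} resources;
	struct {
		void (*arch_setup)(void);
		void (*banner)(void);
	} oem;
	struct {
		void (*pagetable_setup_start)(pgd_t *base);
		void (*pagetable_setup_done)(pgd_t *base);
	} paging;
};

A platform then overrides individual hooks instead of sprinkling #ifdefs
over the call sites; e.g. a hypothetical board file could assign
x86_init.oem.banner = my_board_banner;
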
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 07d81916f212..d559af913e1f 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -55,6 +55,7 @@ EXPORT_SYMBOL(__per_cpu_offset);
55#define PERCPU_FIRST_CHUNK_RESERVE 0 55#define PERCPU_FIRST_CHUNK_RESERVE 0
56#endif 56#endif
57 57
58#ifdef CONFIG_X86_32
58/** 59/**
59 * pcpu_need_numa - determine percpu allocation needs to consider NUMA 60 * pcpu_need_numa - determine percpu allocation needs to consider NUMA
60 * 61 *
@@ -83,6 +84,7 @@ static bool __init pcpu_need_numa(void)
83#endif 84#endif
84 return false; 85 return false;
85} 86}
87#endif
86 88
87/** 89/**
88 * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu 90 * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
@@ -124,308 +126,35 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
124} 126}
125 127
126/* 128/*
127 * Large page remap allocator 129 * Helpers for first chunk memory allocation
128 *
129 * This allocator uses PMD page as unit. A PMD page is allocated for
130 * each cpu and each is remapped into vmalloc area using PMD mapping.
131 * As PMD page is quite large, only part of it is used for the first
132 * chunk. Unused part is returned to the bootmem allocator.
133 *
 134 * So, the PMD pages are mapped twice - once into the physical mapping
 135 * and once into the vmalloc area for the first percpu chunk. The double
136 * mapping does add one more PMD TLB entry pressure but still is much
137 * better than only using 4k mappings while still being NUMA friendly.
138 */ 130 */
139#ifdef CONFIG_NEED_MULTIPLE_NODES 131static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
140struct pcpul_ent {
141 unsigned int cpu;
142 void *ptr;
143};
144
145static size_t pcpul_size;
146static struct pcpul_ent *pcpul_map;
147static struct vm_struct pcpul_vm;
148
149static struct page * __init pcpul_get_page(unsigned int cpu, int pageno)
150{ 132{
151 size_t off = (size_t)pageno << PAGE_SHIFT; 133 return pcpu_alloc_bootmem(cpu, size, align);
152
153 if (off >= pcpul_size)
154 return NULL;
155
156 return virt_to_page(pcpul_map[cpu].ptr + off);
157} 134}
158 135
159static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) 136static void __init pcpu_fc_free(void *ptr, size_t size)
160{ 137{
161 size_t map_size, dyn_size; 138 free_bootmem(__pa(ptr), size);
162 unsigned int cpu;
163 int i, j;
164 ssize_t ret;
165
166 if (!chosen) {
167 size_t vm_size = VMALLOC_END - VMALLOC_START;
168 size_t tot_size = nr_cpu_ids * PMD_SIZE;
169
170 /* on non-NUMA, embedding is better */
171 if (!pcpu_need_numa())
172 return -EINVAL;
173
174 /* don't consume more than 20% of vmalloc area */
175 if (tot_size > vm_size / 5) {
176 pr_info("PERCPU: too large chunk size %zuMB for "
177 "large page remap\n", tot_size >> 20);
178 return -EINVAL;
179 }
180 }
181
182 /* need PSE */
183 if (!cpu_has_pse) {
184 pr_warning("PERCPU: lpage allocator requires PSE\n");
185 return -EINVAL;
186 }
187
188 /*
189 * Currently supports only single page. Supporting multiple
190 * pages won't be too difficult if it ever becomes necessary.
191 */
192 pcpul_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
193 PERCPU_DYNAMIC_RESERVE);
194 if (pcpul_size > PMD_SIZE) {
195 pr_warning("PERCPU: static data is larger than large page, "
196 "can't use large page\n");
197 return -EINVAL;
198 }
199 dyn_size = pcpul_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;
200
201 /* allocate pointer array and alloc large pages */
202 map_size = PFN_ALIGN(nr_cpu_ids * sizeof(pcpul_map[0]));
203 pcpul_map = alloc_bootmem(map_size);
204
205 for_each_possible_cpu(cpu) {
206 pcpul_map[cpu].cpu = cpu;
207 pcpul_map[cpu].ptr = pcpu_alloc_bootmem(cpu, PMD_SIZE,
208 PMD_SIZE);
209 if (!pcpul_map[cpu].ptr) {
210 pr_warning("PERCPU: failed to allocate large page "
211 "for cpu%u\n", cpu);
212 goto enomem;
213 }
214
215 /*
216 * Only use pcpul_size bytes and give back the rest.
217 *
218 * Ingo: The 2MB up-rounding bootmem is needed to make
219 * sure the partial 2MB page is still fully RAM - it's
220 * not well-specified to have a PAT-incompatible area
221 * (unmapped RAM, device memory, etc.) in that hole.
222 */
223 free_bootmem(__pa(pcpul_map[cpu].ptr + pcpul_size),
224 PMD_SIZE - pcpul_size);
225
226 memcpy(pcpul_map[cpu].ptr, __per_cpu_load, static_size);
227 }
228
229 /* allocate address and map */
230 pcpul_vm.flags = VM_ALLOC;
231 pcpul_vm.size = nr_cpu_ids * PMD_SIZE;
232 vm_area_register_early(&pcpul_vm, PMD_SIZE);
233
234 for_each_possible_cpu(cpu) {
235 pmd_t *pmd, pmd_v;
236
237 pmd = populate_extra_pmd((unsigned long)pcpul_vm.addr +
238 cpu * PMD_SIZE);
239 pmd_v = pfn_pmd(page_to_pfn(virt_to_page(pcpul_map[cpu].ptr)),
240 PAGE_KERNEL_LARGE);
241 set_pmd(pmd, pmd_v);
242 }
243
244 /* we're ready, commit */
245 pr_info("PERCPU: Remapped at %p with large pages, static data "
246 "%zu bytes\n", pcpul_vm.addr, static_size);
247
248 ret = pcpu_setup_first_chunk(pcpul_get_page, static_size,
249 PERCPU_FIRST_CHUNK_RESERVE, dyn_size,
250 PMD_SIZE, pcpul_vm.addr, NULL);
251
252 /* sort pcpul_map array for pcpu_lpage_remapped() */
253 for (i = 0; i < nr_cpu_ids - 1; i++)
254 for (j = i + 1; j < nr_cpu_ids; j++)
255 if (pcpul_map[i].ptr > pcpul_map[j].ptr) {
256 struct pcpul_ent tmp = pcpul_map[i];
257 pcpul_map[i] = pcpul_map[j];
258 pcpul_map[j] = tmp;
259 }
260
261 return ret;
262
263enomem:
264 for_each_possible_cpu(cpu)
265 if (pcpul_map[cpu].ptr)
266 free_bootmem(__pa(pcpul_map[cpu].ptr), pcpul_size);
267 free_bootmem(__pa(pcpul_map), map_size);
268 return -ENOMEM;
269} 139}
270 140
271/** 141static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
272 * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
273 * @kaddr: the kernel address in question
274 *
275 * Determine whether @kaddr falls in the pcpul recycled area. This is
276 * used by pageattr to detect VM aliases and break up the pcpu PMD
277 * mapping such that the same physical page is not mapped under
278 * different attributes.
279 *
280 * The recycled area is always at the tail of a partially used PMD
281 * page.
282 *
283 * RETURNS:
284 * Address of corresponding remapped pcpu address if match is found;
285 * otherwise, NULL.
286 */
287void *pcpu_lpage_remapped(void *kaddr)
288{ 142{
289 void *pmd_addr = (void *)((unsigned long)kaddr & PMD_MASK); 143#ifdef CONFIG_NEED_MULTIPLE_NODES
290 unsigned long offset = (unsigned long)kaddr & ~PMD_MASK; 144 if (early_cpu_to_node(from) == early_cpu_to_node(to))
291 int left = 0, right = nr_cpu_ids - 1; 145 return LOCAL_DISTANCE;
292 int pos; 146 else
293 147 return REMOTE_DISTANCE;
294 /* pcpul in use at all? */
295 if (!pcpul_map)
296 return NULL;
297
298 /* okay, perform binary search */
299 while (left <= right) {
300 pos = (left + right) / 2;
301
302 if (pcpul_map[pos].ptr < pmd_addr)
303 left = pos + 1;
304 else if (pcpul_map[pos].ptr > pmd_addr)
305 right = pos - 1;
306 else {
307 /* it shouldn't be in the area for the first chunk */
308 WARN_ON(offset < pcpul_size);
309
310 return pcpul_vm.addr +
311 pcpul_map[pos].cpu * PMD_SIZE + offset;
312 }
313 }
314
315 return NULL;
316}
317#else 148#else
318static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen) 149 return LOCAL_DISTANCE;
319{
320 return -EINVAL;
321}
322#endif 150#endif
323
324/*
325 * Embedding allocator
326 *
327 * The first chunk is sized to just contain the static area plus
328 * module and dynamic reserves and embedded into linear physical
329 * mapping so that it can use PMD mapping without additional TLB
330 * pressure.
331 */
332static ssize_t __init setup_pcpu_embed(size_t static_size, bool chosen)
333{
334 size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
335
336 /*
337 * If large page isn't supported, there's no benefit in doing
338 * this. Also, embedding allocation doesn't play well with
339 * NUMA.
340 */
341 if (!chosen && (!cpu_has_pse || pcpu_need_numa()))
342 return -EINVAL;
343
344 return pcpu_embed_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
345 reserve - PERCPU_FIRST_CHUNK_RESERVE, -1);
346} 151}
347 152
348/* 153static void __init pcpup_populate_pte(unsigned long addr)
349 * 4k page allocator
350 *
351 * This is the basic allocator. Static percpu area is allocated
352 * page-by-page and most of initialization is done by the generic
353 * setup function.
354 */
355static struct page **pcpu4k_pages __initdata;
356static int pcpu4k_nr_static_pages __initdata;
357
358static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
359{
360 if (pageno < pcpu4k_nr_static_pages)
361 return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno];
362 return NULL;
363}
364
365static void __init pcpu4k_populate_pte(unsigned long addr)
366{ 154{
367 populate_extra_pte(addr); 155 populate_extra_pte(addr);
368} 156}
369 157
370static ssize_t __init setup_pcpu_4k(size_t static_size)
371{
372 size_t pages_size;
373 unsigned int cpu;
374 int i, j;
375 ssize_t ret;
376
377 pcpu4k_nr_static_pages = PFN_UP(static_size);
378
379 /* unaligned allocations can't be freed, round up to page size */
380 pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * nr_cpu_ids
381 * sizeof(pcpu4k_pages[0]));
382 pcpu4k_pages = alloc_bootmem(pages_size);
383
384 /* allocate and copy */
385 j = 0;
386 for_each_possible_cpu(cpu)
387 for (i = 0; i < pcpu4k_nr_static_pages; i++) {
388 void *ptr;
389
390 ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
391 if (!ptr) {
392 pr_warning("PERCPU: failed to allocate "
393 "4k page for cpu%u\n", cpu);
394 goto enomem;
395 }
396
397 memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
398 pcpu4k_pages[j++] = virt_to_page(ptr);
399 }
400
401 /* we're ready, commit */
402 pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
403 pcpu4k_nr_static_pages, static_size);
404
405 ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
406 PERCPU_FIRST_CHUNK_RESERVE, -1,
407 -1, NULL, pcpu4k_populate_pte);
408 goto out_free_ar;
409
410enomem:
411 while (--j >= 0)
412 free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE);
413 ret = -ENOMEM;
414out_free_ar:
415 free_bootmem(__pa(pcpu4k_pages), pages_size);
416 return ret;
417}
418
419/* for explicit first chunk allocator selection */
420static char pcpu_chosen_alloc[16] __initdata;
421
422static int __init percpu_alloc_setup(char *str)
423{
424 strncpy(pcpu_chosen_alloc, str, sizeof(pcpu_chosen_alloc) - 1);
425 return 0;
426}
427early_param("percpu_alloc", percpu_alloc_setup);
428
429static inline void setup_percpu_segment(int cpu) 158static inline void setup_percpu_segment(int cpu)
430{ 159{
431#ifdef CONFIG_X86_32 160#ifdef CONFIG_X86_32
@@ -441,52 +170,49 @@ static inline void setup_percpu_segment(int cpu)
441 170
442void __init setup_per_cpu_areas(void) 171void __init setup_per_cpu_areas(void)
443{ 172{
444 size_t static_size = __per_cpu_end - __per_cpu_start;
445 unsigned int cpu; 173 unsigned int cpu;
446 unsigned long delta; 174 unsigned long delta;
447 size_t pcpu_unit_size; 175 int rc;
448 ssize_t ret;
449 176
450 pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n", 177 pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
451 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids); 178 NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
452 179
453 /* 180 /*
454 * Allocate percpu area. If PSE is supported, try to make use 181 * Allocate percpu area. Embedding allocator is our favorite;
455 * of large page mappings. Please read comments on top of 182 * however, on NUMA configurations, it can result in very
456 * each allocator for details. 183 * sparse unit mapping and vmalloc area isn't spacious enough
184 * on 32bit. Use page in that case.
457 */ 185 */
458 ret = -EINVAL; 186#ifdef CONFIG_X86_32
459 if (strlen(pcpu_chosen_alloc)) { 187 if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa())
460 if (strcmp(pcpu_chosen_alloc, "4k")) { 188 pcpu_chosen_fc = PCPU_FC_PAGE;
461 if (!strcmp(pcpu_chosen_alloc, "lpage")) 189#endif
462 ret = setup_pcpu_lpage(static_size, true); 190 rc = -EINVAL;
463 else if (!strcmp(pcpu_chosen_alloc, "embed")) 191 if (pcpu_chosen_fc != PCPU_FC_PAGE) {
464 ret = setup_pcpu_embed(static_size, true); 192 const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE;
465 else 193 const size_t dyn_size = PERCPU_MODULE_RESERVE +
466 pr_warning("PERCPU: unknown allocator %s " 194 PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;
467 "specified\n", pcpu_chosen_alloc); 195
468 if (ret < 0) 196 rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
469 pr_warning("PERCPU: %s allocator failed (%zd), " 197 dyn_size, atom_size,
470 "falling back to 4k\n", 198 pcpu_cpu_distance,
471 pcpu_chosen_alloc, ret); 199 pcpu_fc_alloc, pcpu_fc_free);
472 } 200 if (rc < 0)
473 } else { 201 pr_warning("PERCPU: %s allocator failed (%d), "
474 ret = setup_pcpu_lpage(static_size, false); 202 "falling back to page size\n",
475 if (ret < 0) 203 pcpu_fc_names[pcpu_chosen_fc], rc);
476 ret = setup_pcpu_embed(static_size, false);
477 } 204 }
478 if (ret < 0) 205 if (rc < 0)
479 ret = setup_pcpu_4k(static_size); 206 rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
480 if (ret < 0) 207 pcpu_fc_alloc, pcpu_fc_free,
481 panic("cannot allocate static percpu area (%zu bytes, err=%zd)", 208 pcpup_populate_pte);
482 static_size, ret); 209 if (rc < 0)
483 210 panic("cannot initialize percpu area (err=%d)", rc);
484 pcpu_unit_size = ret;
485 211
486 /* alrighty, percpu areas up and running */ 212 /* alrighty, percpu areas up and running */
487 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; 213 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
488 for_each_possible_cpu(cpu) { 214 for_each_possible_cpu(cpu) {
489 per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size; 215 per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu];
490 per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu); 216 per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
491 per_cpu(cpu_number, cpu) = cpu; 217 per_cpu(cpu_number, cpu) = cpu;
492 setup_percpu_segment(cpu); 218 setup_percpu_segment(cpu);
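
With the bespoke lpage/embed/4k allocators gone, setup_per_cpu_areas() only
hands the generic percpu code the three callbacks above and records the
per-unit offsets it gets back. A hypothetical demo (not part of the patch)
of how a static percpu variable resolves once per_cpu_offset() is filled in:

/* Hypothetical demo: a static percpu variable is reached by adding the
 * cpu's unit offset to its linker-assigned address. */
static DEFINE_PER_CPU(unsigned long, demo_counter);

static void demo_show_offsets(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		unsigned long *p = &per_cpu(demo_counter, cpu);

		pr_info("cpu%d: demo_counter at %p (offset 0x%lx)\n",
			cpu, p, per_cpu_offset(cpu));
	}
}
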
diff --git a/arch/x86/kernel/sfi.c b/arch/x86/kernel/sfi.c
new file mode 100644
index 000000000000..34e099382651
--- /dev/null
+++ b/arch/x86/kernel/sfi.c
@@ -0,0 +1,122 @@
1/*
2 * sfi.c - x86 architecture SFI support.
3 *
4 * Copyright (c) 2009, Intel Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 */
20
21#define KMSG_COMPONENT "SFI"
22#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
23
24#include <linux/acpi.h>
25#include <linux/init.h>
26#include <linux/sfi.h>
27#include <linux/io.h>
28
29#include <asm/io_apic.h>
30#include <asm/mpspec.h>
31#include <asm/setup.h>
32#include <asm/apic.h>
33
34#ifdef CONFIG_X86_LOCAL_APIC
35static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
36
37void __init mp_sfi_register_lapic_address(unsigned long address)
38{
39 mp_lapic_addr = address;
40
41 set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
42 if (boot_cpu_physical_apicid == -1U)
43 boot_cpu_physical_apicid = read_apic_id();
44
45 pr_info("Boot CPU = %d\n", boot_cpu_physical_apicid);
46}
47
48/* All CPUs enumerated by SFI must be present and enabled */
49void __cpuinit mp_sfi_register_lapic(u8 id)
50{
51 if (MAX_APICS - id <= 0) {
52 pr_warning("Processor #%d invalid (max %d)\n",
53 id, MAX_APICS);
54 return;
55 }
56
57 pr_info("registering lapic[%d]\n", id);
58
59 generic_processor_info(id, GET_APIC_VERSION(apic_read(APIC_LVR)));
60}
61
62static int __init sfi_parse_cpus(struct sfi_table_header *table)
63{
64 struct sfi_table_simple *sb;
65 struct sfi_cpu_table_entry *pentry;
66 int i;
67 int cpu_num;
68
69 sb = (struct sfi_table_simple *)table;
70 cpu_num = SFI_GET_NUM_ENTRIES(sb, struct sfi_cpu_table_entry);
71 pentry = (struct sfi_cpu_table_entry *)sb->pentry;
72
73 for (i = 0; i < cpu_num; i++) {
74 mp_sfi_register_lapic(pentry->apic_id);
75 pentry++;
76 }
77
78 smp_found_config = 1;
79 return 0;
80}
81#endif /* CONFIG_X86_LOCAL_APIC */
82
83#ifdef CONFIG_X86_IO_APIC
84static u32 gsi_base;
85
86static int __init sfi_parse_ioapic(struct sfi_table_header *table)
87{
88 struct sfi_table_simple *sb;
89 struct sfi_apic_table_entry *pentry;
90 int i, num;
91
92 sb = (struct sfi_table_simple *)table;
93 num = SFI_GET_NUM_ENTRIES(sb, struct sfi_apic_table_entry);
94 pentry = (struct sfi_apic_table_entry *)sb->pentry;
95
96 for (i = 0; i < num; i++) {
97 mp_register_ioapic(i, pentry->phys_addr, gsi_base);
98 gsi_base += io_apic_get_redir_entries(i);
99 pentry++;
100 }
101
102 WARN(pic_mode, KERN_WARNING
103 "SFI: pic_mod shouldn't be 1 when IOAPIC table is present\n");
104 pic_mode = 0;
105 return 0;
106}
107#endif /* CONFIG_X86_IO_APIC */
108
109/*
110 * sfi_platform_init(): register lapics & io-apics
111 */
112int __init sfi_platform_init(void)
113{
114#ifdef CONFIG_X86_LOCAL_APIC
115 mp_sfi_register_lapic_address(sfi_lapic_addr);
116 sfi_table_parse(SFI_SIG_CPUS, NULL, NULL, sfi_parse_cpus);
117#endif
118#ifdef CONFIG_X86_IO_APIC
119 sfi_table_parse(SFI_SIG_APIC, NULL, NULL, sfi_parse_ioapic);
120#endif
121 return 0;
122}
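
Both parsers above size their loops with SFI_GET_NUM_ENTRIES(). Assuming the
simple-table layout from linux/sfi.h (a common header whose len field counts
the whole table, followed by packed entries), the arithmetic is roughly the
following; the helper name is hypothetical, the real macro lives in the
header:

static int sfi_count_entries(struct sfi_table_simple *sb, size_t entry_size)
{
	/* header.len covers the header itself plus all entries */
	return (sb->header.len - sizeof(struct sfi_table_header)) / entry_size;
}
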
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index baaf8052f355..fbf3b07c8567 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -847,7 +847,7 @@ static void do_signal(struct pt_regs *regs)
847void 847void
848do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) 848do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
849{ 849{
850#ifdef CONFIG_X86_NEW_MCE 850#ifdef CONFIG_X86_MCE
851 /* notify userspace of pending MCEs */ 851 /* notify userspace of pending MCEs */
852 if (thread_info_flags & _TIF_MCE_NOTIFY) 852 if (thread_info_flags & _TIF_MCE_NOTIFY)
853 mce_notify_process(); 853 mce_notify_process();
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index ec7b64c2df82..213a7a3e4562 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -47,6 +47,7 @@
47#include <linux/bootmem.h> 47#include <linux/bootmem.h>
48#include <linux/err.h> 48#include <linux/err.h>
49#include <linux/nmi.h> 49#include <linux/nmi.h>
50#include <linux/tboot.h>
50 51
51#include <asm/acpi.h> 52#include <asm/acpi.h>
52#include <asm/desc.h> 53#include <asm/desc.h>
@@ -324,7 +325,7 @@ notrace static void __cpuinit start_secondary(void *unused)
324 /* enable local interrupts */ 325 /* enable local interrupts */
325 local_irq_enable(); 326 local_irq_enable();
326 327
327 setup_secondary_clock(); 328 x86_cpuinit.setup_percpu_clockev();
328 329
329 wmb(); 330 wmb();
330 load_debug_registers(); 331 load_debug_registers();
@@ -1060,12 +1061,9 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1060#endif 1061#endif
1061 current_thread_info()->cpu = 0; /* needed? */ 1062 current_thread_info()->cpu = 0; /* needed? */
1062 for_each_possible_cpu(i) { 1063 for_each_possible_cpu(i) {
1063 alloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); 1064 zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
1064 alloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); 1065 zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
1065 alloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL); 1066 zalloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL);
1066 cpumask_clear(per_cpu(cpu_core_map, i));
1067 cpumask_clear(per_cpu(cpu_sibling_map, i));
1068 cpumask_clear(cpu_data(i).llc_shared_map);
1069 } 1067 }
1070 set_cpu_sibling_map(0); 1068 set_cpu_sibling_map(0);
1071 1069
@@ -1115,13 +1113,26 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1115 1113
1116 printk(KERN_INFO "CPU%d: ", 0); 1114 printk(KERN_INFO "CPU%d: ", 0);
1117 print_cpu_info(&cpu_data(0)); 1115 print_cpu_info(&cpu_data(0));
1118 setup_boot_clock(); 1116 x86_init.timers.setup_percpu_clockev();
1119 1117
1120 if (is_uv_system()) 1118 if (is_uv_system())
1121 uv_system_init(); 1119 uv_system_init();
1120
1121 set_mtrr_aps_delayed_init();
1122out: 1122out:
1123 preempt_enable(); 1123 preempt_enable();
1124} 1124}
1125
1126void arch_enable_nonboot_cpus_begin(void)
1127{
1128 set_mtrr_aps_delayed_init();
1129}
1130
1131void arch_enable_nonboot_cpus_end(void)
1132{
1133 mtrr_aps_init();
1134}
1135
1125/* 1136/*
1126 * Early setup to make printk work. 1137 * Early setup to make printk work.
1127 */ 1138 */
@@ -1143,6 +1154,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
1143 setup_ioapic_dest(); 1154 setup_ioapic_dest();
1144#endif 1155#endif
1145 check_nmi_watchdog(); 1156 check_nmi_watchdog();
1157 mtrr_aps_init();
1146} 1158}
1147 1159
1148static int __initdata setup_possible_cpus = -1; 1160static int __initdata setup_possible_cpus = -1;
@@ -1321,6 +1333,7 @@ void play_dead_common(void)
1321void native_play_dead(void) 1333void native_play_dead(void)
1322{ 1334{
1323 play_dead_common(); 1335 play_dead_common();
1336 tboot_shutdown(TB_SHUTDOWN_WFS);
1324 wbinvd_halt(); 1337 wbinvd_halt();
1325} 1338}
1326 1339
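
Two patterns in the smpboot.c hunks are worth spelling out. First,
zalloc_cpumask_var() returns an already-zeroed mask, so the explicit
cpumask_clear() calls go away; the two forms should be equivalent:

	/* before: allocate, then clear by hand */
	alloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
	cpumask_clear(per_cpu(cpu_sibling_map, i));

	/* after: one call hands back a cleared mask */
	zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);

Second, the MTRR hooks pair up: set_mtrr_aps_delayed_init() defers AP MTRR
programming while CPUs are being brought online (both at boot and, via the
arch_enable_nonboot_cpus_begin/end pair, on resume), and mtrr_aps_init()
performs it once all of them are up.
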
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index d51321ddafda..0157cd26d7cc 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -335,4 +335,4 @@ ENTRY(sys_call_table)
335 .long sys_preadv 335 .long sys_preadv
336 .long sys_pwritev 336 .long sys_pwritev
337 .long sys_rt_tgsigqueueinfo /* 335 */ 337 .long sys_rt_tgsigqueueinfo /* 335 */
338 .long sys_perf_counter_open 338 .long sys_perf_event_open
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
new file mode 100644
index 000000000000..86c9f91b48ae
--- /dev/null
+++ b/arch/x86/kernel/tboot.c
@@ -0,0 +1,447 @@
1/*
2 * tboot.c: main implementation of helper functions used by kernel for
3 * runtime support of Intel(R) Trusted Execution Technology
4 *
5 * Copyright (c) 2006-2009, Intel Corporation
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms and conditions of the GNU General Public License,
9 * version 2, as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 *
16 * You should have received a copy of the GNU General Public License along with
17 * this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 */
21
22#include <linux/dma_remapping.h>
23#include <linux/init_task.h>
24#include <linux/spinlock.h>
25#include <linux/delay.h>
26#include <linux/sched.h>
27#include <linux/init.h>
28#include <linux/dmar.h>
29#include <linux/cpu.h>
30#include <linux/pfn.h>
31#include <linux/mm.h>
32#include <linux/tboot.h>
33
34#include <asm/trampoline.h>
35#include <asm/processor.h>
36#include <asm/bootparam.h>
37#include <asm/pgtable.h>
38#include <asm/pgalloc.h>
39#include <asm/fixmap.h>
40#include <asm/proto.h>
41#include <asm/setup.h>
42#include <asm/e820.h>
43#include <asm/io.h>
44
45#include "acpi/realmode/wakeup.h"
46
47/* Global pointer to shared data; NULL means no measured launch. */
48struct tboot *tboot __read_mostly;
49
50/* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */
51#define AP_WAIT_TIMEOUT 1
52
53#undef pr_fmt
54#define pr_fmt(fmt) "tboot: " fmt
55
56static u8 tboot_uuid[16] __initdata = TBOOT_UUID;
57
58void __init tboot_probe(void)
59{
60 /* Look for valid page-aligned address for shared page. */
61 if (!boot_params.tboot_addr)
62 return;
63 /*
64 * also verify that it is mapped as we expect it before calling
65 * set_fixmap(), to reduce chance of garbage value causing crash
66 */
67 if (!e820_any_mapped(boot_params.tboot_addr,
68 boot_params.tboot_addr, E820_RESERVED)) {
69 pr_warning("non-0 tboot_addr but it is not of type E820_RESERVED\n");
70 return;
71 }
72
73 /* only a natively booted kernel should be using TXT */
74 if (paravirt_enabled()) {
75 pr_warning("non-0 tboot_addr but pv_ops is enabled\n");
76 return;
77 }
78
79 /* Map and check for tboot UUID. */
80 set_fixmap(FIX_TBOOT_BASE, boot_params.tboot_addr);
81 tboot = (struct tboot *)fix_to_virt(FIX_TBOOT_BASE);
82 if (memcmp(&tboot_uuid, &tboot->uuid, sizeof(tboot->uuid))) {
83 pr_warning("tboot at 0x%llx is invalid\n",
84 boot_params.tboot_addr);
85 tboot = NULL;
86 return;
87 }
88 if (tboot->version < 5) {
89 pr_warning("tboot version is invalid: %u\n", tboot->version);
90 tboot = NULL;
91 return;
92 }
93
94 pr_info("found shared page at phys addr 0x%llx:\n",
95 boot_params.tboot_addr);
96 pr_debug("version: %d\n", tboot->version);
97 pr_debug("log_addr: 0x%08x\n", tboot->log_addr);
98 pr_debug("shutdown_entry: 0x%x\n", tboot->shutdown_entry);
99 pr_debug("tboot_base: 0x%08x\n", tboot->tboot_base);
100 pr_debug("tboot_size: 0x%x\n", tboot->tboot_size);
101}
102
103static pgd_t *tboot_pg_dir;
104static struct mm_struct tboot_mm = {
105 .mm_rb = RB_ROOT,
106 .pgd = swapper_pg_dir,
107 .mm_users = ATOMIC_INIT(2),
108 .mm_count = ATOMIC_INIT(1),
109 .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
110 .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
111 .mmlist = LIST_HEAD_INIT(init_mm.mmlist),
112 .cpu_vm_mask = CPU_MASK_ALL,
113};
114
115static inline void switch_to_tboot_pt(void)
116{
117 write_cr3(virt_to_phys(tboot_pg_dir));
118}
119
120static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
121 pgprot_t prot)
122{
123 pgd_t *pgd;
124 pud_t *pud;
125 pmd_t *pmd;
126 pte_t *pte;
127
128 pgd = pgd_offset(&tboot_mm, vaddr);
129 pud = pud_alloc(&tboot_mm, pgd, vaddr);
130 if (!pud)
131 return -1;
132 pmd = pmd_alloc(&tboot_mm, pud, vaddr);
133 if (!pmd)
134 return -1;
135 pte = pte_alloc_map(&tboot_mm, pmd, vaddr);
136 if (!pte)
137 return -1;
138 set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));
139 pte_unmap(pte);
140 return 0;
141}
142
143static int map_tboot_pages(unsigned long vaddr, unsigned long start_pfn,
144 unsigned long nr)
145{
146 /* Reuse the original kernel mapping */
147 tboot_pg_dir = pgd_alloc(&tboot_mm);
148 if (!tboot_pg_dir)
149 return -1;
150
151 for (; nr > 0; nr--, vaddr += PAGE_SIZE, start_pfn++) {
152 if (map_tboot_page(vaddr, start_pfn, PAGE_KERNEL_EXEC))
153 return -1;
154 }
155
156 return 0;
157}
158
159static void tboot_create_trampoline(void)
160{
161 u32 map_base, map_size;
162
163 /* Create identity map for tboot shutdown code. */
164 map_base = PFN_DOWN(tboot->tboot_base);
165 map_size = PFN_UP(tboot->tboot_size);
166 if (map_tboot_pages(map_base << PAGE_SHIFT, map_base, map_size))
167 panic("tboot: Error mapping tboot pages (mfns) @ 0x%x, 0x%x\n",
168 map_base, map_size);
169}
170
171#ifdef CONFIG_ACPI_SLEEP
172
173static void add_mac_region(phys_addr_t start, unsigned long size)
174{
175 struct tboot_mac_region *mr;
176 phys_addr_t end = start + size;
177
178 if (start && size) {
179 mr = &tboot->mac_regions[tboot->num_mac_regions++];
180 mr->start = round_down(start, PAGE_SIZE);
181 mr->size = round_up(end, PAGE_SIZE) - mr->start;
182 }
183}
184
185static int tboot_setup_sleep(void)
186{
187 tboot->num_mac_regions = 0;
188
189 /* S3 resume code */
190 add_mac_region(acpi_wakeup_address, WAKEUP_SIZE);
191
192#ifdef CONFIG_X86_TRAMPOLINE
193 /* AP trampoline code */
194 add_mac_region(virt_to_phys(trampoline_base), TRAMPOLINE_SIZE);
195#endif
196
197 /* kernel code + data + bss */
198 add_mac_region(virt_to_phys(_text), _end - _text);
199
200 tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address;
201
202 return 0;
203}
204
205#else /* no CONFIG_ACPI_SLEEP */
206
207static int tboot_setup_sleep(void)
208{
209 /* S3 shutdown requested, but S3 not supported by the kernel... */
210 BUG();
211 return -1;
212}
213
214#endif
215
216void tboot_shutdown(u32 shutdown_type)
217{
218 void (*shutdown)(void);
219
220 if (!tboot_enabled())
221 return;
222
223 /*
224 * if we're being called before the 1:1 mapping is set up then just
225 * return and let the normal shutdown happen; this should only be
226 * due to very early panic()
227 */
228 if (!tboot_pg_dir)
229 return;
230
231 /* if this is S3 then set regions to MAC */
232 if (shutdown_type == TB_SHUTDOWN_S3)
233 if (tboot_setup_sleep())
234 return;
235
236 tboot->shutdown_type = shutdown_type;
237
238 switch_to_tboot_pt();
239
240 shutdown = (void(*)(void))(unsigned long)tboot->shutdown_entry;
241 shutdown();
242
243 /* should not reach here */
244 while (1)
245 halt();
246}
247
248static void tboot_copy_fadt(const struct acpi_table_fadt *fadt)
249{
250#define TB_COPY_GAS(tbg, g) \
251 tbg.space_id = g.space_id; \
252 tbg.bit_width = g.bit_width; \
253 tbg.bit_offset = g.bit_offset; \
254 tbg.access_width = g.access_width; \
255 tbg.address = g.address;
256
257 TB_COPY_GAS(tboot->acpi_sinfo.pm1a_cnt_blk, fadt->xpm1a_control_block);
258 TB_COPY_GAS(tboot->acpi_sinfo.pm1b_cnt_blk, fadt->xpm1b_control_block);
259 TB_COPY_GAS(tboot->acpi_sinfo.pm1a_evt_blk, fadt->xpm1a_event_block);
260 TB_COPY_GAS(tboot->acpi_sinfo.pm1b_evt_blk, fadt->xpm1b_event_block);
261
262 /*
263 * We need phys addr of waking vector, but can't use virt_to_phys() on
264 * &acpi_gbl_FACS because it is ioremap'ed, so calc from FACS phys
265 * addr.
266 */
267 tboot->acpi_sinfo.wakeup_vector = fadt->facs +
268 offsetof(struct acpi_table_facs, firmware_waking_vector);
269}
270
271void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
272{
273 static u32 acpi_shutdown_map[ACPI_S_STATE_COUNT] = {
274 /* S0,1,2: */ -1, -1, -1,
275 /* S3: */ TB_SHUTDOWN_S3,
276 /* S4: */ TB_SHUTDOWN_S4,
277 /* S5: */ TB_SHUTDOWN_S5 };
278
279 if (!tboot_enabled())
280 return;
281
282 tboot_copy_fadt(&acpi_gbl_FADT);
283 tboot->acpi_sinfo.pm1a_cnt_val = pm1a_control;
284 tboot->acpi_sinfo.pm1b_cnt_val = pm1b_control;
285 /* we always use the 32b wakeup vector */
286 tboot->acpi_sinfo.vector_width = 32;
287
288 if (sleep_state >= ACPI_S_STATE_COUNT ||
289 acpi_shutdown_map[sleep_state] == -1) {
290 pr_warning("unsupported sleep state 0x%x\n", sleep_state);
291 return;
292 }
293
294 tboot_shutdown(acpi_shutdown_map[sleep_state]);
295}
296
297static atomic_t ap_wfs_count;
298
299static int tboot_wait_for_aps(int num_aps)
300{
301 unsigned long timeout;
302
 303 timeout = AP_WAIT_TIMEOUT*1000; /* loop below waits in 1ms steps */
304 while (atomic_read((atomic_t *)&tboot->num_in_wfs) != num_aps &&
305 timeout) {
306 mdelay(1);
307 timeout--;
308 }
309
 310 if (!timeout)
311 pr_warning("tboot wait for APs timeout\n");
312
313 return !(atomic_read((atomic_t *)&tboot->num_in_wfs) == num_aps);
314}
315
316static int __cpuinit tboot_cpu_callback(struct notifier_block *nfb,
317 unsigned long action, void *hcpu)
318{
319 switch (action) {
320 case CPU_DYING:
321 atomic_inc(&ap_wfs_count);
322 if (num_online_cpus() == 1)
323 if (tboot_wait_for_aps(atomic_read(&ap_wfs_count)))
324 return NOTIFY_BAD;
325 break;
326 }
327 return NOTIFY_OK;
328}
329
330static struct notifier_block tboot_cpu_notifier __cpuinitdata =
331{
332 .notifier_call = tboot_cpu_callback,
333};
334
335static __init int tboot_late_init(void)
336{
337 if (!tboot_enabled())
338 return 0;
339
340 tboot_create_trampoline();
341
342 atomic_set(&ap_wfs_count, 0);
343 register_hotcpu_notifier(&tboot_cpu_notifier);
344 return 0;
345}
346
347late_initcall(tboot_late_init);
348
349/*
350 * TXT configuration registers (offsets from TXT_{PUB, PRIV}_CONFIG_REGS_BASE)
351 */
352
353#define TXT_PUB_CONFIG_REGS_BASE 0xfed30000
354#define TXT_PRIV_CONFIG_REGS_BASE 0xfed20000
355
356/* # pages for each config regs space - used by fixmap */
357#define NR_TXT_CONFIG_PAGES ((TXT_PUB_CONFIG_REGS_BASE - \
358 TXT_PRIV_CONFIG_REGS_BASE) >> PAGE_SHIFT)
359
360/* offsets from pub/priv config space */
361#define TXTCR_HEAP_BASE 0x0300
362#define TXTCR_HEAP_SIZE 0x0308
363
364#define SHA1_SIZE 20
365
366struct sha1_hash {
367 u8 hash[SHA1_SIZE];
368};
369
370struct sinit_mle_data {
371 u32 version; /* currently 6 */
372 struct sha1_hash bios_acm_id;
373 u32 edx_senter_flags;
374 u64 mseg_valid;
375 struct sha1_hash sinit_hash;
376 struct sha1_hash mle_hash;
377 struct sha1_hash stm_hash;
378 struct sha1_hash lcp_policy_hash;
379 u32 lcp_policy_control;
380 u32 rlp_wakeup_addr;
381 u32 reserved;
382 u32 num_mdrs;
383 u32 mdrs_off;
384 u32 num_vtd_dmars;
385 u32 vtd_dmars_off;
386} __packed;
387
388struct acpi_table_header *tboot_get_dmar_table(struct acpi_table_header *dmar_tbl)
389{
390 void *heap_base, *heap_ptr, *config;
391
392 if (!tboot_enabled())
393 return dmar_tbl;
394
395 /*
 396 * ACPI tables may not be DMA protected by tboot, so use the DMAR copy
 397 * that SINIT saved in SinitMleData in the TXT heap (which is DMA protected)
398 */
399
400 /* map config space in order to get heap addr */
401 config = ioremap(TXT_PUB_CONFIG_REGS_BASE, NR_TXT_CONFIG_PAGES *
402 PAGE_SIZE);
403 if (!config)
404 return NULL;
405
406 /* now map TXT heap */
407 heap_base = ioremap(*(u64 *)(config + TXTCR_HEAP_BASE),
408 *(u64 *)(config + TXTCR_HEAP_SIZE));
409 iounmap(config);
410 if (!heap_base)
411 return NULL;
412
413 /* walk heap to SinitMleData */
414 /* skip BiosData */
415 heap_ptr = heap_base + *(u64 *)heap_base;
416 /* skip OsMleData */
417 heap_ptr += *(u64 *)heap_ptr;
418 /* skip OsSinitData */
419 heap_ptr += *(u64 *)heap_ptr;
420 /* now points to SinitMleDataSize; set to SinitMleData */
421 heap_ptr += sizeof(u64);
422 /* get addr of DMAR table */
423 dmar_tbl = (struct acpi_table_header *)(heap_ptr +
424 ((struct sinit_mle_data *)heap_ptr)->vtd_dmars_off -
425 sizeof(u64));
426
427 /* don't unmap heap because dmar.c needs access to this */
428
429 return dmar_tbl;
430}
431
432int tboot_force_iommu(void)
433{
434 if (!tboot_enabled())
435 return 0;
436
437 if (no_iommu || swiotlb || dmar_disabled)
438 pr_warning("Forcing Intel-IOMMU to enabled\n");
439
440 dmar_disabled = 0;
441#ifdef CONFIG_SWIOTLB
442 swiotlb = 0;
443#endif
444 no_iommu = 0;
445
446 return 1;
447}
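
tboot_get_dmar_table() depends on every TXT heap region starting with a u64
length that counts the length field itself; that is what makes the bare
'heap_ptr += *(u64 *)heap_ptr' hops work. A sketch of the assumed layout
(region names are taken from the comments in the code, not from a header):

/*
 * Assumed TXT heap layout; each size prefix includes itself, so adding
 * it advances to the next region:
 *
 *   heap_base: [u64 size][BiosData .................]
 *              [u64 size][OsMleData ................]
 *              [u64 size][OsSinitData ..............]
 *              [u64 size][struct sinit_mle_data ....]
 *
 * Three hops land on the SinitMleData size field, one more sizeof(u64)
 * step reaches the structure, and the DMAR table sits vtd_dmars_off
 * bytes past that size field.
 */
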
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
new file mode 100644
index 000000000000..be2573448ed9
--- /dev/null
+++ b/arch/x86/kernel/time.c
@@ -0,0 +1,121 @@
1/*
2 * Copyright (c) 1991,1992,1995 Linus Torvalds
3 * Copyright (c) 1994 Alan Modra
4 * Copyright (c) 1995 Markus Kuhn
5 * Copyright (c) 1996 Ingo Molnar
6 * Copyright (c) 1998 Andrea Arcangeli
7 * Copyright (c) 2002,2006 Vojtech Pavlik
8 * Copyright (c) 2003 Andi Kleen
9 *
10 */
11
12#include <linux/clockchips.h>
13#include <linux/interrupt.h>
14#include <linux/time.h>
15#include <linux/mca.h>
16
17#include <asm/vsyscall.h>
18#include <asm/x86_init.h>
19#include <asm/i8259.h>
20#include <asm/i8253.h>
21#include <asm/timer.h>
22#include <asm/hpet.h>
23#include <asm/time.h>
24
25#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC)
26int timer_ack;
27#endif
28
29#ifdef CONFIG_X86_64
30volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
31#endif
32
33unsigned long profile_pc(struct pt_regs *regs)
34{
35 unsigned long pc = instruction_pointer(regs);
36
37 if (!user_mode_vm(regs) && in_lock_functions(pc)) {
38#ifdef CONFIG_FRAME_POINTER
39 return *(unsigned long *)(regs->bp + sizeof(long));
40#else
41 unsigned long *sp =
42 (unsigned long *)kernel_stack_pointer(regs);
43 /*
44 * Return address is either directly at stack pointer
45 * or above a saved flags. Eflags has bits 22-31 zero,
46 * kernel addresses don't.
47 */
48 if (sp[0] >> 22)
49 return sp[0];
50 if (sp[1] >> 22)
51 return sp[1];
52#endif
53 }
54 return pc;
55}
56EXPORT_SYMBOL(profile_pc);
57
58/*
59 * Default timer interrupt handler for PIT/HPET
60 */
61static irqreturn_t timer_interrupt(int irq, void *dev_id)
62{
63 /* Keep nmi watchdog up to date */
64 inc_irq_stat(irq0_irqs);
65
66 /* Optimized out for !IO_APIC and x86_64 */
67 if (timer_ack) {
68 /*
69 * Subtle, when I/O APICs are used we have to ack timer IRQ
70 * manually to deassert NMI lines for the watchdog if run
71 * on an 82489DX-based system.
72 */
73 spin_lock(&i8259A_lock);
74 outb(0x0c, PIC_MASTER_OCW3);
75 /* Ack the IRQ; AEOI will end it automatically. */
76 inb(PIC_MASTER_POLL);
77 spin_unlock(&i8259A_lock);
78 }
79
80 global_clock_event->event_handler(global_clock_event);
81
82 /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */
83 if (MCA_bus)
 84 outb_p(inb_p(0x61) | 0x80, 0x61);
85
86 return IRQ_HANDLED;
87}
88
89static struct irqaction irq0 = {
90 .handler = timer_interrupt,
91 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER,
92 .name = "timer"
93};
94
95void __init setup_default_timer_irq(void)
96{
97 setup_irq(0, &irq0);
98}
99
100/* Default timer init function */
101void __init hpet_time_init(void)
102{
103 if (!hpet_enable())
104 setup_pit_timer();
105 setup_default_timer_irq();
106}
107
108static __init void x86_late_time_init(void)
109{
110 x86_init.timers.timer_init();
111 tsc_init();
112}
113
114/*
 115 * Delay TSC and periodic timer initialization to
 116 * x86_late_time_init(), which runs late enough for ioremap() to work.
117 */
118void __init time_init(void)
119{
120 late_time_init = x86_late_time_init;
121}
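
The frame-pointer-less branch of profile_pc() leans on an architectural
fact: EFLAGS bits 22-31 are always zero, while kernel text addresses always
have bits in that range set (on 32-bit they sit at or above PAGE_OFFSET,
0xc0000000 by default). A worked example with illustrative values:

/*
 * saved EFLAGS  0x00000246  ->  0x00000246 >> 22 == 0      (skip)
 * kernel text   0xc01234ab  ->  0xc01234ab >> 22 == 0x300  (take)
 *
 * Whichever of sp[0]/sp[1] survives the shift is treated as the return
 * address inside the lock function.
 */
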
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
deleted file mode 100644
index 5c5d87f0b2e1..000000000000
--- a/arch/x86/kernel/time_32.c
+++ /dev/null
@@ -1,137 +0,0 @@
1/*
2 * Copyright (C) 1991, 1992, 1995 Linus Torvalds
3 *
4 * This file contains the PC-specific time handling details:
5 * reading the RTC at bootup, etc..
6 * 1994-07-02 Alan Modra
7 * fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
8 * 1995-03-26 Markus Kuhn
9 * fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
10 * precision CMOS clock update
11 * 1996-05-03 Ingo Molnar
12 * fixed time warps in do_[slow|fast]_gettimeoffset()
13 * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
14 * "A Kernel Model for Precision Timekeeping" by Dave Mills
15 * 1998-09-05 (Various)
16 * More robust do_fast_gettimeoffset() algorithm implemented
17 * (works with APM, Cyrix 6x86MX and Centaur C6),
18 * monotonic gettimeofday() with fast_get_timeoffset(),
19 * drift-proof precision TSC calibration on boot
20 * (C. Scott Ananian <cananian@alumni.princeton.edu>, Andrew D.
21 * Balsa <andrebalsa@altern.org>, Philip Gladstone <philip@raptor.com>;
22 * ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@tu-harburg.de>).
23 * 1998-12-16 Andrea Arcangeli
24 * Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy
25 * because was not accounting lost_ticks.
26 * 1998-12-24 Copyright (C) 1998 Andrea Arcangeli
27 * Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
28 * serialize accesses to xtime/lost_ticks).
29 */
30
31#include <linux/init.h>
32#include <linux/interrupt.h>
33#include <linux/time.h>
34#include <linux/mca.h>
35
36#include <asm/setup.h>
37#include <asm/hpet.h>
38#include <asm/time.h>
39#include <asm/timer.h>
40
41#include <asm/do_timer.h>
42
43int timer_ack;
44
45unsigned long profile_pc(struct pt_regs *regs)
46{
47 unsigned long pc = instruction_pointer(regs);
48
49#ifdef CONFIG_SMP
50 if (!user_mode_vm(regs) && in_lock_functions(pc)) {
51#ifdef CONFIG_FRAME_POINTER
52 return *(unsigned long *)(regs->bp + sizeof(long));
53#else
54 unsigned long *sp = (unsigned long *)&regs->sp;
55
56 /* Return address is either directly at stack pointer
57 or above a saved flags. Eflags has bits 22-31 zero,
58 kernel addresses don't. */
59 if (sp[0] >> 22)
60 return sp[0];
61 if (sp[1] >> 22)
62 return sp[1];
63#endif
64 }
65#endif
66 return pc;
67}
68EXPORT_SYMBOL(profile_pc);
69
70/*
71 * This is the same as the above, except we _also_ save the current
72 * Time Stamp Counter value at the time of the timer interrupt, so that
73 * we later on can estimate the time of day more exactly.
74 */
75irqreturn_t timer_interrupt(int irq, void *dev_id)
76{
77 /* Keep nmi watchdog up to date */
78 inc_irq_stat(irq0_irqs);
79
80#ifdef CONFIG_X86_IO_APIC
81 if (timer_ack) {
82 /*
83 * Subtle, when I/O APICs are used we have to ack timer IRQ
84 * manually to deassert NMI lines for the watchdog if run
85 * on an 82489DX-based system.
86 */
87 spin_lock(&i8259A_lock);
88 outb(0x0c, PIC_MASTER_OCW3);
89 /* Ack the IRQ; AEOI will end it automatically. */
90 inb(PIC_MASTER_POLL);
91 spin_unlock(&i8259A_lock);
92 }
93#endif
94
95 do_timer_interrupt_hook();
96
97#ifdef CONFIG_MCA
98 if (MCA_bus) {
99 /* The PS/2 uses level-triggered interrupts. You can't
100 turn them off, nor would you want to (any attempt to
101 enable edge-triggered interrupts usually gets intercepted by a
102 special hardware circuit). Hence we have to acknowledge
103 the timer interrupt. Through some incredibly stupid
104 design idea, the reset for IRQ 0 is done by setting the
105 high bit of the PPI port B (0x61). Note that some PS/2s,
106 notably the 55SX, work fine if this is removed. */
107
108 u8 irq_v = inb_p(0x61); /* read the current state */
109 outb_p(irq_v | 0x80, 0x61); /* reset the IRQ */
110 }
111#endif
112
113 return IRQ_HANDLED;
114}
115
116/* Duplicate of time_init() below, with hpet_enable part added */
117void __init hpet_time_init(void)
118{
119 if (!hpet_enable())
120 setup_pit_timer();
121 x86_quirk_time_init();
122}
123
124/*
125 * This is called directly from init code; we must delay timer setup in the
126 * HPET case as we can't make the decision to turn on HPET this early in the
127 * boot process.
128 *
129 * The chosen time_init function will usually be hpet_time_init, above, but
130 * in the case of virtual hardware, an alternative function may be substituted.
131 */
132void __init time_init(void)
133{
134 x86_quirk_pre_time_init();
135 tsc_init();
136 late_time_init = choose_time_init();
137}
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
deleted file mode 100644
index 5ba343e61844..000000000000
--- a/arch/x86/kernel/time_64.c
+++ /dev/null
@@ -1,135 +0,0 @@
1/*
2 * "High Precision Event Timer" based timekeeping.
3 *
4 * Copyright (c) 1991,1992,1995 Linus Torvalds
5 * Copyright (c) 1994 Alan Modra
6 * Copyright (c) 1995 Markus Kuhn
7 * Copyright (c) 1996 Ingo Molnar
8 * Copyright (c) 1998 Andrea Arcangeli
9 * Copyright (c) 2002,2006 Vojtech Pavlik
10 * Copyright (c) 2003 Andi Kleen
11 * RTC support code taken from arch/i386/kernel/timers/time_hpet.c
12 */
13
14#include <linux/clockchips.h>
15#include <linux/init.h>
16#include <linux/interrupt.h>
17#include <linux/module.h>
18#include <linux/time.h>
19#include <linux/mca.h>
20#include <linux/nmi.h>
21
22#include <asm/i8253.h>
23#include <asm/hpet.h>
24#include <asm/vgtod.h>
25#include <asm/time.h>
26#include <asm/timer.h>
27
28volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
29
30unsigned long profile_pc(struct pt_regs *regs)
31{
32 unsigned long pc = instruction_pointer(regs);
33
 34 /* Assume the lock function has either no stack frame or a copy
 35 of flags from PUSHF.
 36 Eflags always has bits 22 and up cleared, unlike kernel addresses. */
37 if (!user_mode_vm(regs) && in_lock_functions(pc)) {
38#ifdef CONFIG_FRAME_POINTER
39 return *(unsigned long *)(regs->bp + sizeof(long));
40#else
41 unsigned long *sp = (unsigned long *)regs->sp;
42 if (sp[0] >> 22)
43 return sp[0];
44 if (sp[1] >> 22)
45 return sp[1];
46#endif
47 }
48 return pc;
49}
50EXPORT_SYMBOL(profile_pc);
51
52static irqreturn_t timer_interrupt(int irq, void *dev_id)
53{
54 inc_irq_stat(irq0_irqs);
55
56 global_clock_event->event_handler(global_clock_event);
57
58#ifdef CONFIG_MCA
59 if (MCA_bus) {
60 u8 irq_v = inb_p(0x61); /* read the current state */
61 outb_p(irq_v|0x80, 0x61); /* reset the IRQ */
62 }
63#endif
64
65 return IRQ_HANDLED;
66}
67
68/* calibrate_cpu is used on systems with fixed rate TSCs to determine
69 * processor frequency */
70#define TICK_COUNT 100000000
71unsigned long __init calibrate_cpu(void)
72{
73 int tsc_start, tsc_now;
74 int i, no_ctr_free;
75 unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0;
76 unsigned long flags;
77
78 for (i = 0; i < 4; i++)
79 if (avail_to_resrv_perfctr_nmi_bit(i))
80 break;
81 no_ctr_free = (i == 4);
82 if (no_ctr_free) {
83 WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... "
84 "cpu_khz value may be incorrect.\n");
85 i = 3;
86 rdmsrl(MSR_K7_EVNTSEL3, evntsel3);
87 wrmsrl(MSR_K7_EVNTSEL3, 0);
88 rdmsrl(MSR_K7_PERFCTR3, pmc3);
89 } else {
90 reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i);
91 reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
92 }
93 local_irq_save(flags);
94 /* start measuring cycles, incrementing from 0 */
95 wrmsrl(MSR_K7_PERFCTR0 + i, 0);
96 wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76);
97 rdtscl(tsc_start);
98 do {
99 rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now);
100 tsc_now = get_cycles();
101 } while ((tsc_now - tsc_start) < TICK_COUNT);
102
103 local_irq_restore(flags);
104 if (no_ctr_free) {
105 wrmsrl(MSR_K7_EVNTSEL3, 0);
106 wrmsrl(MSR_K7_PERFCTR3, pmc3);
107 wrmsrl(MSR_K7_EVNTSEL3, evntsel3);
108 } else {
109 release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
110 release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
111 }
112
113 return pmc_now * tsc_khz / (tsc_now - tsc_start);
114}
115
116static struct irqaction irq0 = {
117 .handler = timer_interrupt,
118 .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING | IRQF_TIMER,
119 .name = "timer"
120};
121
122void __init hpet_time_init(void)
123{
124 if (!hpet_enable())
125 setup_pit_timer();
126
127 setup_irq(0, &irq0);
128}
129
130void __init time_init(void)
131{
132 tsc_init();
133
134 late_time_init = choose_time_init();
135}
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index 808031a5ba19..cd022121cab6 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -3,8 +3,16 @@
3#include <asm/trampoline.h> 3#include <asm/trampoline.h>
4#include <asm/e820.h> 4#include <asm/e820.h>
5 5
6#if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP)
7#define __trampinit
8#define __trampinitdata
9#else
10#define __trampinit __cpuinit
11#define __trampinitdata __cpuinitdata
12#endif
13
6/* ready for x86_64 and x86 */ 14/* ready for x86_64 and x86 */
7unsigned char *trampoline_base = __va(TRAMPOLINE_BASE); 15unsigned char *__trampinitdata trampoline_base = __va(TRAMPOLINE_BASE);
8 16
9void __init reserve_trampoline_memory(void) 17void __init reserve_trampoline_memory(void)
10{ 18{
@@ -26,7 +34,7 @@ void __init reserve_trampoline_memory(void)
26 * bootstrap into the page concerned. The caller 34 * bootstrap into the page concerned. The caller
27 * has made sure it's suitably aligned. 35 * has made sure it's suitably aligned.
28 */ 36 */
29unsigned long setup_trampoline(void) 37unsigned long __trampinit setup_trampoline(void)
30{ 38{
31 memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); 39 memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE);
32 return virt_to_phys(trampoline_base); 40 return virt_to_phys(trampoline_base);
diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S
index 66d874e5404c..8508237e8e43 100644
--- a/arch/x86/kernel/trampoline_32.S
+++ b/arch/x86/kernel/trampoline_32.S
@@ -28,16 +28,12 @@
28 */ 28 */
29 29
30#include <linux/linkage.h> 30#include <linux/linkage.h>
31#include <linux/init.h>
31#include <asm/segment.h> 32#include <asm/segment.h>
32#include <asm/page_types.h> 33#include <asm/page_types.h>
33 34
34/* We can free up trampoline after bootup if cpu hotplug is not supported. */ 35/* We can free up trampoline after bootup if cpu hotplug is not supported. */
35#ifndef CONFIG_HOTPLUG_CPU 36__CPUINITRODATA
36.section ".cpuinit.data","aw",@progbits
37#else
38.section .rodata,"a",@progbits
39#endif
40
41.code16 37.code16
42 38
43ENTRY(trampoline_data) 39ENTRY(trampoline_data)
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S
index cddfb8d386b9..3af2dff58b21 100644
--- a/arch/x86/kernel/trampoline_64.S
+++ b/arch/x86/kernel/trampoline_64.S
@@ -25,14 +25,19 @@
25 */ 25 */
26 26
27#include <linux/linkage.h> 27#include <linux/linkage.h>
28#include <linux/init.h>
28#include <asm/pgtable_types.h> 29#include <asm/pgtable_types.h>
29#include <asm/page_types.h> 30#include <asm/page_types.h>
30#include <asm/msr.h> 31#include <asm/msr.h>
31#include <asm/segment.h> 32#include <asm/segment.h>
32#include <asm/processor-flags.h> 33#include <asm/processor-flags.h>
33 34
35#ifdef CONFIG_ACPI_SLEEP
34.section .rodata, "a", @progbits 36.section .rodata, "a", @progbits
35 37#else
38/* We can free up the trampoline after bootup if cpu hotplug is not supported. */
39__CPUINITRODATA
40#endif
36.code16 41.code16
37 42
38ENTRY(trampoline_data) 43ENTRY(trampoline_data)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 49a401b1d4d7..33399176512a 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -14,7 +14,6 @@
14#include <linux/spinlock.h> 14#include <linux/spinlock.h>
15#include <linux/kprobes.h> 15#include <linux/kprobes.h>
16#include <linux/uaccess.h> 16#include <linux/uaccess.h>
17#include <linux/utsname.h>
18#include <linux/kdebug.h> 17#include <linux/kdebug.h>
19#include <linux/kernel.h> 18#include <linux/kernel.h>
20#include <linux/module.h> 19#include <linux/module.h>
@@ -59,12 +58,12 @@
59#include <asm/mach_traps.h> 58#include <asm/mach_traps.h>
60 59
61#ifdef CONFIG_X86_64 60#ifdef CONFIG_X86_64
61#include <asm/x86_init.h>
62#include <asm/pgalloc.h> 62#include <asm/pgalloc.h>
63#include <asm/proto.h> 63#include <asm/proto.h>
64#else 64#else
65#include <asm/processor-flags.h> 65#include <asm/processor-flags.h>
66#include <asm/setup.h> 66#include <asm/setup.h>
67#include <asm/traps.h>
68 67
69asmlinkage int system_call(void); 68asmlinkage int system_call(void);
70 69
@@ -73,11 +72,9 @@ char ignore_fpu_irq;
73 72
74/* 73/*
75 * The IDT has to be page-aligned to simplify the Pentium 74 * The IDT has to be page-aligned to simplify the Pentium
76 * F0 0F bug workaround.. We have a special link segment 75 * F0 0F bug workaround.
77 * for this.
78 */ 76 */
79gate_desc idt_table[NR_VECTORS] 77gate_desc idt_table[NR_VECTORS] __page_aligned_data = { { { { 0, 0 } } }, };
80 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
81#endif 78#endif
82 79
83DECLARE_BITMAP(used_vectors, NR_VECTORS); 80DECLARE_BITMAP(used_vectors, NR_VECTORS);
@@ -951,7 +948,5 @@ void __init trap_init(void)
951 */ 948 */
952 cpu_init(); 949 cpu_init();
953 950
954#ifdef CONFIG_X86_32 951 x86_init.irqs.trap_init();
955 x86_quirk_trap_init();
956#endif
957} 952}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 71f4368b357e..cd982f48e23e 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -17,6 +17,8 @@
17#include <asm/time.h> 17#include <asm/time.h>
18#include <asm/delay.h> 18#include <asm/delay.h>
19#include <asm/hypervisor.h> 19#include <asm/hypervisor.h>
20#include <asm/nmi.h>
21#include <asm/x86_init.h>
20 22
21unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ 23unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */
22EXPORT_SYMBOL(cpu_khz); 24EXPORT_SYMBOL(cpu_khz);
@@ -400,15 +402,9 @@ unsigned long native_calibrate_tsc(void)
400{ 402{
401 u64 tsc1, tsc2, delta, ref1, ref2; 403 u64 tsc1, tsc2, delta, ref1, ref2;
402 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; 404 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
403 unsigned long flags, latch, ms, fast_calibrate, hv_tsc_khz; 405 unsigned long flags, latch, ms, fast_calibrate;
404 int hpet = is_hpet_enabled(), i, loopmin; 406 int hpet = is_hpet_enabled(), i, loopmin;
405 407
406 hv_tsc_khz = get_hypervisor_tsc_freq();
407 if (hv_tsc_khz) {
408 printk(KERN_INFO "TSC: Frequency read from the hypervisor\n");
409 return hv_tsc_khz;
410 }
411
412 local_irq_save(flags); 408 local_irq_save(flags);
413 fast_calibrate = quick_pit_calibrate(); 409 fast_calibrate = quick_pit_calibrate();
414 local_irq_restore(flags); 410 local_irq_restore(flags);
@@ -566,7 +562,7 @@ int recalibrate_cpu_khz(void)
566 unsigned long cpu_khz_old = cpu_khz; 562 unsigned long cpu_khz_old = cpu_khz;
567 563
568 if (cpu_has_tsc) { 564 if (cpu_has_tsc) {
569 tsc_khz = calibrate_tsc(); 565 tsc_khz = x86_platform.calibrate_tsc();
570 cpu_khz = tsc_khz; 566 cpu_khz = tsc_khz;
571 cpu_data(0).loops_per_jiffy = 567 cpu_data(0).loops_per_jiffy =
572 cpufreq_scale(cpu_data(0).loops_per_jiffy, 568 cpufreq_scale(cpu_data(0).loops_per_jiffy,
@@ -670,7 +666,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
670 if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || 666 if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
671 (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || 667 (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
672 (val == CPUFREQ_RESUMECHANGE)) { 668 (val == CPUFREQ_RESUMECHANGE)) {
673 *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); 669 *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
674 670
675 tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); 671 tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
676 if (!(freq->flags & CPUFREQ_CONST_LOOPS)) 672 if (!(freq->flags & CPUFREQ_CONST_LOOPS))
@@ -744,10 +740,16 @@ static cycle_t __vsyscall_fn vread_tsc(void)
744} 740}
745#endif 741#endif
746 742
743static void resume_tsc(void)
744{
745 clocksource_tsc.cycle_last = 0;
746}
747
747static struct clocksource clocksource_tsc = { 748static struct clocksource clocksource_tsc = {
748 .name = "tsc", 749 .name = "tsc",
749 .rating = 300, 750 .rating = 300,
750 .read = read_tsc, 751 .read = read_tsc,
752 .resume = resume_tsc,
751 .mask = CLOCKSOURCE_MASK(64), 753 .mask = CLOCKSOURCE_MASK(64),
752 .shift = 22, 754 .shift = 22,
753 .flags = CLOCK_SOURCE_IS_CONTINUOUS | 755 .flags = CLOCK_SOURCE_IS_CONTINUOUS |
@@ -761,12 +763,14 @@ void mark_tsc_unstable(char *reason)
 {
 	if (!tsc_unstable) {
 		tsc_unstable = 1;
-		printk("Marking TSC unstable due to %s\n", reason);
+		printk(KERN_INFO "Marking TSC unstable due to %s\n", reason);
 		/* Change only the rating, when not registered */
 		if (clocksource_tsc.mult)
-			clocksource_change_rating(&clocksource_tsc, 0);
-		else
+			clocksource_mark_unstable(&clocksource_tsc);
+		else {
+			clocksource_tsc.flags |= CLOCK_SOURCE_UNSTABLE;
 			clocksource_tsc.rating = 0;
+		}
 	}
 }
 
@@ -852,15 +856,71 @@ static void __init init_tsc_clocksource(void)
 	clocksource_register(&clocksource_tsc);
 }
 
+#ifdef CONFIG_X86_64
+/*
+ * calibrate_cpu is used on systems with fixed rate TSCs to determine
+ * processor frequency
+ */
+#define TICK_COUNT 100000000
+static unsigned long __init calibrate_cpu(void)
+{
+	int tsc_start, tsc_now;
+	int i, no_ctr_free;
+	unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0;
+	unsigned long flags;
+
+	for (i = 0; i < 4; i++)
+		if (avail_to_resrv_perfctr_nmi_bit(i))
+			break;
+	no_ctr_free = (i == 4);
+	if (no_ctr_free) {
+		WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... "
+		     "cpu_khz value may be incorrect.\n");
+		i = 3;
+		rdmsrl(MSR_K7_EVNTSEL3, evntsel3);
+		wrmsrl(MSR_K7_EVNTSEL3, 0);
+		rdmsrl(MSR_K7_PERFCTR3, pmc3);
+	} else {
+		reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i);
+		reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
+	}
+	local_irq_save(flags);
+	/* start measuring cycles, incrementing from 0 */
+	wrmsrl(MSR_K7_PERFCTR0 + i, 0);
+	wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76);
+	rdtscl(tsc_start);
+	do {
+		rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now);
+		tsc_now = get_cycles();
+	} while ((tsc_now - tsc_start) < TICK_COUNT);
+
+	local_irq_restore(flags);
+	if (no_ctr_free) {
+		wrmsrl(MSR_K7_EVNTSEL3, 0);
+		wrmsrl(MSR_K7_PERFCTR3, pmc3);
+		wrmsrl(MSR_K7_EVNTSEL3, evntsel3);
+	} else {
+		release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
+		release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
+	}
+
+	return pmc_now * tsc_khz / (tsc_now - tsc_start);
+}
+#else
+static inline unsigned long calibrate_cpu(void) { return cpu_khz; }
+#endif
+
 void __init tsc_init(void)
 {
 	u64 lpj;
 	int cpu;
 
+	x86_init.timers.tsc_pre_init();
+
 	if (!cpu_has_tsc)
 		return;
 
-	tsc_khz = calibrate_tsc();
+	tsc_khz = x86_platform.calibrate_tsc();
 	cpu_khz = tsc_khz;
 
 	if (!tsc_khz) {
@@ -868,11 +928,9 @@ void __init tsc_init(void)
 		return;
 	}
 
-#ifdef CONFIG_X86_64
 	if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
 	    (boot_cpu_data.x86_vendor == X86_VENDOR_AMD))
 		cpu_khz = calibrate_cpu();
-#endif
 
 	printk("Detected %lu.%03lu MHz processor.\n",
 		(unsigned long)cpu_khz / 1000,
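
The hunks above are the flip side of removing the get_hypervisor_tsc_freq() special case: calibration now always goes through the x86_platform.calibrate_tsc pointer, so a guest that already knows its TSC frequency overrides the hook rather than patching native_calibrate_tsc(). A minimal sketch, assuming hypothetical example_* names (the real VMI conversion appears further down in this diff):

static unsigned long __init example_calibrate_tsc(void)
{
	/* e.g. read the frequency from a paravirt interface; fixed here */
	return 2000000;	/* kHz */
}

static void __init example_guest_init(void)
{
	x86_platform.calibrate_tsc = example_calibrate_tsc;
}
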
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 027b5b498993..f37930954d15 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -114,7 +114,7 @@ void __cpuinit check_tsc_sync_source(int cpu)
 		return;
 
 	if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
-		pr_info("Skipping synchronization checks as TSC is reliable.\n");
+		printk_once(KERN_INFO "Skipping synchronization checks as TSC is reliable.\n");
 		return;
 	}
 
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index 31ffc24eec4d..f068553a1b17 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -30,6 +30,7 @@
 #include <asm/setup.h>
 #include <asm/apic.h>
 #include <asm/e820.h>
+#include <asm/time.h>
 #include <asm/io.h>
 
 #include <linux/kernel_stat.h>
@@ -53,7 +54,7 @@ int is_visws_box(void)
 	return visws_board_type >= 0;
 }
 
-static int __init visws_time_init(void)
+static void __init visws_time_init(void)
 {
 	printk(KERN_INFO "Starting Cobalt Timer system clock\n");
 
@@ -66,21 +67,13 @@ static int __init visws_time_init(void)
 	/* Enable (unmask) the timer interrupt */
 	co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK);
 
-	/*
-	 * Zero return means the generic timer setup code will set up
-	 * the standard vector:
-	 */
-	return 0;
+	setup_default_timer_irq();
 }
 
-static int __init visws_pre_intr_init(void)
+/* Replaces the default init_ISA_irqs in the generic setup */
+static void __init visws_pre_intr_init(void)
 {
 	init_VISWS_APIC_irqs();
-
-	/*
-	 * We dont want ISA irqs to be set up by the generic code:
-	 */
-	return 1;
 }
 
 /* Quirk for machine specific memory setup. */
@@ -156,12 +149,8 @@ static void visws_machine_power_off(void)
 	outl(PIIX_SPECIAL_STOP, 0xCFC);
 }
 
-static int __init visws_get_smp_config(unsigned int early)
+static void __init visws_get_smp_config(unsigned int early)
 {
-	/*
-	 * Prevent MP-table parsing by the generic code:
-	 */
-	return 1;
 }
 
 /*
@@ -208,7 +197,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
 	apic_version[m->apicid] = ver;
 }
 
-static int __init visws_find_smp_config(unsigned int reserve)
+static void __init visws_find_smp_config(unsigned int reserve)
 {
 	struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS);
 	unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
@@ -230,21 +219,9 @@ static int __init visws_find_smp_config(unsigned int reserve)
 		MP_processor_info(mp++);
 
 	mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-
-	return 1;
 }
 
-static int visws_trap_init(void);
-
-static struct x86_quirks visws_x86_quirks __initdata = {
-	.arch_time_init		= visws_time_init,
-	.arch_pre_intr_init	= visws_pre_intr_init,
-	.arch_memory_setup	= visws_memory_setup,
-	.arch_intr_init		= NULL,
-	.arch_trap_init		= visws_trap_init,
-	.mach_get_smp_config	= visws_get_smp_config,
-	.mach_find_smp_config	= visws_find_smp_config,
-};
+static void visws_trap_init(void);
 
 void __init visws_early_detect(void)
 {
@@ -257,11 +234,14 @@ void __init visws_early_detect(void)
 		return;
 
 	/*
-	 * Install special quirks for timer, interrupt and memory setup:
-	 * Fall back to generic behavior for traps:
-	 * Override generic MP-table parsing:
+	 * Override the default platform setup functions
 	 */
-	x86_quirks = &visws_x86_quirks;
+	x86_init.resources.memory_setup = visws_memory_setup;
+	x86_init.mpparse.get_smp_config = visws_get_smp_config;
+	x86_init.mpparse.find_smp_config = visws_find_smp_config;
+	x86_init.irqs.pre_vector_init = visws_pre_intr_init;
+	x86_init.irqs.trap_init = visws_trap_init;
+	x86_init.timers.timer_init = visws_time_init;
 
 	/*
 	 * Install reboot quirks:
@@ -400,12 +380,10 @@ static __init void cobalt_init(void)
 			co_apic_read(CO_APIC_ID));
 }
 
-static int __init visws_trap_init(void)
+static void __init visws_trap_init(void)
 {
 	lithium_init();
 	cobalt_init();
-
-	return 1;
 }
 
 /*
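
The visws conversion above illustrates the pattern this series establishes: instead of registering one monolithic x86_quirks structure, a board replaces only the x86_init hooks it needs at early-detect time and inherits the PC defaults for everything else. A hedged sketch with hypothetical example_* names:

static void __init example_board_time_init(void)
{
	/* program the board timer, then claim the standard timer IRQ */
	setup_default_timer_irq();
}

static void __init example_board_early_detect(void)
{
	/* untouched hooks keep the defaults from x86_init.c */
	x86_init.timers.timer_init = example_board_time_init;
	x86_init.mpparse.find_smp_config = x86_init_uint_noop;
}
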
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 95a7289e4b0c..d430e4c30193 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -648,7 +648,7 @@ static inline int __init activate_vmi(void)
 
 	pv_info.paravirt_enabled = 1;
 	pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
-	pv_info.name = "vmi";
+	pv_info.name = "vmi [deprecated]";
 
 	pv_init_ops.patch = vmi_patch;
 
@@ -817,15 +817,15 @@ static inline int __init activate_vmi(void)
 		vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm);
 		vmi_timer_ops.cancel_alarm =
 			 vmi_get_function(VMI_CALL_CancelAlarm);
-		pv_time_ops.time_init = vmi_time_init;
-		pv_time_ops.get_wallclock = vmi_get_wallclock;
-		pv_time_ops.set_wallclock = vmi_set_wallclock;
+		x86_init.timers.timer_init = vmi_time_init;
 #ifdef CONFIG_X86_LOCAL_APIC
-		pv_apic_ops.setup_boot_clock = vmi_time_bsp_init;
-		pv_apic_ops.setup_secondary_clock = vmi_time_ap_init;
+		x86_init.timers.setup_percpu_clockev = vmi_time_bsp_init;
+		x86_cpuinit.setup_percpu_clockev = vmi_time_ap_init;
 #endif
 		pv_time_ops.sched_clock = vmi_sched_clock;
-		pv_time_ops.get_tsc_khz = vmi_tsc_khz;
+		x86_platform.calibrate_tsc = vmi_tsc_khz;
+		x86_platform.get_wallclock = vmi_get_wallclock;
+		x86_platform.set_wallclock = vmi_set_wallclock;
 
 		/* We have true wallclock functions; disable CMOS clock sync */
 		no_sync_cmos_clock = 1;
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index 2b3eb82efeeb..611b9e2360d3 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -68,7 +68,7 @@ unsigned long long vmi_sched_clock(void)
 	return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));
 }
 
-/* paravirt_ops.get_tsc_khz = vmi_tsc_khz */
+/* x86_platform.calibrate_tsc = vmi_tsc_khz */
 unsigned long vmi_tsc_khz(void)
 {
 	unsigned long long khz;
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 9fc178255c04..8d6001ad8d8d 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -45,9 +45,9 @@ PHDRS {
 	text PT_LOAD FLAGS(5);          /* R_E */
 	data PT_LOAD FLAGS(7);          /* RWE */
 #ifdef CONFIG_X86_64
-	user PT_LOAD FLAGS(7);          /* RWE */
+	user PT_LOAD FLAGS(5);          /* R_E */
 #ifdef CONFIG_SMP
-	percpu PT_LOAD FLAGS(7);        /* RWE */
+	percpu PT_LOAD FLAGS(6);        /* RW_ */
 #endif
 	init PT_LOAD FLAGS(7);          /* RWE */
 #endif
@@ -65,17 +65,11 @@ SECTIONS
 #endif
 
 	/* Text and read-only data */
-
-	/* bootstrapping code */
-	.text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
-		_text = .;
-		*(.text.head)
-	} :text = 0x9090
-
-	/* The rest of the text */
 	.text : AT(ADDR(.text) - LOAD_OFFSET) {
+		_text = .;
+		/* bootstrapping code */
+		HEAD_TEXT
 #ifdef CONFIG_X86_32
-		/* not really needed, already page aligned */
 		. = ALIGN(PAGE_SIZE);
 		*(.text.page_aligned)
 #endif
@@ -94,13 +88,7 @@ SECTIONS
 
 	NOTES :text :note
 
-	/* Exception table */
-	. = ALIGN(16);
-	__ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
-		__start___ex_table = .;
-		*(__ex_table)
-		__stop___ex_table = .;
-	} :text = 0x9090
+	EXCEPTION_TABLE(16) :text = 0x9090
 
 	RO_DATA(PAGE_SIZE)
 
@@ -118,7 +106,6 @@ SECTIONS
 #endif
 
 	PAGE_ALIGNED_DATA(PAGE_SIZE)
-	*(.data.idt)
 
 	CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES)
 
@@ -135,24 +122,21 @@ SECTIONS
 #ifdef CONFIG_X86_64
 
 #define VSYSCALL_ADDR (-10*1024*1024)
-#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data) + SIZEOF(.data) + \
-			PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
-#define VSYSCALL_VIRT_ADDR ((ADDR(.data) + SIZEOF(.data) + \
-			PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
 
-#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
+#define VLOAD_OFFSET (VSYSCALL_ADDR - __vsyscall_0 + LOAD_OFFSET)
 #define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
 
-#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
+#define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0)
 #define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
 
+	. = ALIGN(4096);
+	__vsyscall_0 = .;
+
 	. = VSYSCALL_ADDR;
-	.vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) {
+	.vsyscall_0 : AT(VLOAD(.vsyscall_0)) {
 		*(.vsyscall_0)
 	} :user
 
-	__vsyscall_0 = VSYSCALL_VIRT_ADDR;
-
 	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
 	.vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
 		*(.vsyscall_fn)
@@ -192,11 +176,9 @@ SECTIONS
 		*(.vsyscall_3)
 	}
 
-	. = VSYSCALL_VIRT_ADDR + PAGE_SIZE;
+	. = __vsyscall_0 + PAGE_SIZE;
 
 #undef VSYSCALL_ADDR
-#undef VSYSCALL_PHYS_ADDR
-#undef VSYSCALL_VIRT_ADDR
 #undef VLOAD_OFFSET
 #undef VLOAD
 #undef VVIRT_OFFSET
@@ -219,36 +201,12 @@ SECTIONS
 	PERCPU_VADDR(0, :percpu)
 #endif
 
-	.init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
-		_sinittext = .;
-		INIT_TEXT
-		_einittext = .;
-	}
+	INIT_TEXT_SECTION(PAGE_SIZE)
 #ifdef CONFIG_X86_64
 	:init
 #endif
 
-	.init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
-		INIT_DATA
-	}
-
-	. = ALIGN(16);
-	.init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
-		__setup_start = .;
-		*(.init.setup)
-		__setup_end = .;
-	}
-	.initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
-		__initcall_start = .;
-		INITCALLS
-		__initcall_end = .;
-	}
-
-	.con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
-		__con_initcall_start = .;
-		*(.con_initcall.init)
-		__con_initcall_end = .;
-	}
+	INIT_DATA_SECTION(16)
 
 	.x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
 		__x86_cpu_dev_start = .;
@@ -256,8 +214,6 @@ SECTIONS
 		__x86_cpu_dev_end = .;
 	}
 
-	SECURITY_INIT
-
 	. = ALIGN(8);
 	.parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
 		__parainstructions = .;
@@ -288,15 +244,6 @@ SECTIONS
 		EXIT_DATA
 	}
 
-#ifdef CONFIG_BLK_DEV_INITRD
-	. = ALIGN(PAGE_SIZE);
-	.init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
-		__initramfs_start = .;
-		*(.init.ramfs)
-		__initramfs_end = .;
-	}
-#endif
-
 #if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP)
 	PERCPU(PAGE_SIZE)
 #endif
@@ -348,21 +295,18 @@ SECTIONS
 		_end = .;
 	}
 
-	/* Sections to be discarded */
-	/DISCARD/ : {
-		*(.exitcall.exit)
-		*(.eh_frame)
-		*(.discard)
-	}
-
 	STABS_DEBUG
 	DWARF_DEBUG
+
+	/* Sections to be discarded */
+	DISCARDS
+	/DISCARD/ : { *(.eh_frame) }
 }
 
 
 #ifdef CONFIG_X86_32
-. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
+ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
 	   "kernel image bigger than KERNEL_IMAGE_SIZE");
 #else
 /*
  * Per-cpu symbols which need to be offset from __per_cpu_load
@@ -375,12 +319,12 @@ INIT_PER_CPU(irq_stack_union);
 /*
  * Build-time check on the image size:
  */
-. = ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
+ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
 	   "kernel image bigger than KERNEL_IMAGE_SIZE");
 
 #ifdef CONFIG_SMP
-. = ASSERT((per_cpu__irq_stack_union == 0),
+ASSERT((per_cpu__irq_stack_union == 0),
            "irq_stack_union is not at start of per-cpu area");
 #endif
 
 #endif /* CONFIG_X86_32 */
@@ -388,7 +332,6 @@ INIT_PER_CPU(irq_stack_union);
 #ifdef CONFIG_KEXEC
 #include <asm/kexec.h>
 
-. = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
+ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
        "kexec control code size is too big");
 #endif
-
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 25ee06a80aad..8cb4974ff599 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -87,6 +87,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
 	vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
 	vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
 	vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
+	vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
 	write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
 }
 
@@ -227,19 +228,11 @@ static long __vsyscall(3) venosys_1(void)
 }
 
 #ifdef CONFIG_SYSCTL
-
-static int
-vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp,
-		       void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-	return proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
-}
-
 static ctl_table kernel_table2[] = {
 	{ .procname = "vsyscall64",
 	  .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int),
 	  .mode = 0644,
-	  .proc_handler = vsyscall_sysctl_change },
+	  .proc_handler = proc_dointvec },
 	{}
 };
 
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
new file mode 100644
index 000000000000..4449a4a2c2ed
--- /dev/null
+++ b/arch/x86/kernel/x86_init.c
@@ -0,0 +1,75 @@
+/*
+ * Copyright (C) 2009 Thomas Gleixner <tglx@linutronix.de>
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+#include <linux/init.h>
+
+#include <asm/bios_ebda.h>
+#include <asm/paravirt.h>
+#include <asm/mpspec.h>
+#include <asm/setup.h>
+#include <asm/apic.h>
+#include <asm/e820.h>
+#include <asm/time.h>
+#include <asm/irq.h>
+#include <asm/tsc.h>
+
+void __cpuinit x86_init_noop(void) { }
+void __init x86_init_uint_noop(unsigned int unused) { }
+void __init x86_init_pgd_noop(pgd_t *unused) { }
+
+/*
+ * The platform setup functions are preset with the default functions
+ * for standard PC hardware.
+ */
+struct x86_init_ops x86_init __initdata = {
+
+	.resources = {
+		.probe_roms		= x86_init_noop,
+		.reserve_resources	= reserve_standard_io_resources,
+		.memory_setup		= default_machine_specific_memory_setup,
+	},
+
+	.mpparse = {
+		.mpc_record		= x86_init_uint_noop,
+		.setup_ioapic_ids	= x86_init_noop,
+		.mpc_apic_id		= default_mpc_apic_id,
+		.smp_read_mpc_oem	= default_smp_read_mpc_oem,
+		.mpc_oem_bus_info	= default_mpc_oem_bus_info,
+		.find_smp_config	= default_find_smp_config,
+		.get_smp_config		= default_get_smp_config,
+	},
+
+	.irqs = {
+		.pre_vector_init	= init_ISA_irqs,
+		.intr_init		= native_init_IRQ,
+		.trap_init		= x86_init_noop,
+	},
+
+	.oem = {
+		.arch_setup		= x86_init_noop,
+		.banner			= default_banner,
+	},
+
+	.paging = {
+		.pagetable_setup_start	= native_pagetable_setup_start,
+		.pagetable_setup_done	= native_pagetable_setup_done,
+	},
+
+	.timers = {
+		.setup_percpu_clockev	= setup_boot_APIC_clock,
+		.tsc_pre_init		= x86_init_noop,
+		.timer_init		= hpet_time_init,
+	},
+};
+
+struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
+	.setup_percpu_clockev		= setup_secondary_APIC_clock,
+};
+
+struct x86_platform_ops x86_platform = {
+	.calibrate_tsc			= native_calibrate_tsc,
+	.get_wallclock			= mach_get_cmos_time,
+	.set_wallclock			= mach_set_rtc_mmss,
+};
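
x86_init.c collects the default implementations that the rest of this patch calls through; the consumer side is symmetrical, as in the tsc_init() hunk above, which invokes x86_init.timers.tsc_pre_init() unconditionally. A sketch of such a call site, under a hypothetical name:

static void __init example_late_time_init(void)
{
	x86_init.timers.timer_init();		/* hpet_time_init by default */
	x86_init.timers.setup_percpu_clockev();	/* setup_boot_APIC_clock */
}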