Diffstat (limited to 'arch/x86/kernel')
 arch/x86/kernel/Makefile | 13
 arch/x86/kernel/acpi/cstate.c | 23
 arch/x86/kernel/apic/apic.c | 66
 arch/x86/kernel/apic/apic_flat_64.c | 1
 arch/x86/kernel/apic/apic_noop.c | 1
 arch/x86/kernel/apic/io_apic.c | 20
 arch/x86/kernel/apic/ipi.c | 1
 arch/x86/kernel/apic/summit_32.c | 1
 arch/x86/kernel/apic/x2apic_cluster.c | 1
 arch/x86/kernel/apic/x2apic_phys.c | 1
 arch/x86/kernel/check.c | 2
 arch/x86/kernel/cpu/Makefile | 3
 arch/x86/kernel/cpu/amd.c | 23
 arch/x86/kernel/cpu/centaur.c | 1
 arch/x86/kernel/cpu/common.c | 7
 arch/x86/kernel/cpu/cyrix.c | 1
 arch/x86/kernel/cpu/intel.c | 32
 arch/x86/kernel/cpu/mcheck/mce-apei.c | 14
 arch/x86/kernel/cpu/mcheck/mce.c | 12
 arch/x86/kernel/cpu/mcheck/mce_intel.c | 1
 arch/x86/kernel/cpu/mcheck/p5.c | 1
 arch/x86/kernel/cpu/mcheck/winchip.c | 1
 arch/x86/kernel/cpu/microcode/Makefile | 7
 arch/x86/kernel/cpu/microcode/amd.c (renamed from arch/x86/kernel/microcode_amd.c) | 15
 arch/x86/kernel/cpu/microcode/amd_early.c (renamed from arch/x86/kernel/microcode_amd_early.c) | 239
 arch/x86/kernel/cpu/microcode/core.c (renamed from arch/x86/kernel/microcode_core.c) | 0
 arch/x86/kernel/cpu/microcode/core_early.c (renamed from arch/x86/kernel/microcode_core_early.c) | 0
 arch/x86/kernel/cpu/microcode/intel.c (renamed from arch/x86/kernel/microcode_intel.c) | 2
 arch/x86/kernel/cpu/microcode/intel_early.c (renamed from arch/x86/kernel/microcode_intel_early.c) | 10
 arch/x86/kernel/cpu/microcode/intel_lib.c (renamed from arch/x86/kernel/microcode_intel_lib.c) | 0
 arch/x86/kernel/cpu/perf_event.c | 16
 arch/x86/kernel/cpu/perf_event.h | 15
 arch/x86/kernel/cpu/perf_event_amd_ibs.c | 53
 arch/x86/kernel/cpu/perf_event_intel_rapl.c | 679
 arch/x86/kernel/cpu/rdrand.c | 14
 arch/x86/kernel/cpu/transmeta.c | 1
 arch/x86/kernel/cpu/umc.c | 1
 arch/x86/kernel/crash.c | 1
 arch/x86/kernel/doublefault.c | 1
 arch/x86/kernel/e820.c | 2
 arch/x86/kernel/entry_32.S | 4
 arch/x86/kernel/entry_64.S | 2
 arch/x86/kernel/hw_breakpoint.c | 1
 arch/x86/kernel/iosf_mbi.c | 226
 arch/x86/kernel/irq.c | 89
 arch/x86/kernel/irqinit.c | 4
 arch/x86/kernel/kgdb.c | 1
 arch/x86/kernel/ksysfs.c | 340
 arch/x86/kernel/machine_kexec_32.c | 1
 arch/x86/kernel/pci-nommu.c | 1
 arch/x86/kernel/process_32.c | 1
 arch/x86/kernel/reboot.c | 11
 arch/x86/kernel/setup.c | 54
 arch/x86/kernel/smpboot.c | 8
 arch/x86/kernel/traps.c | 22
 arch/x86/kernel/tsc.c | 328
 arch/x86/kernel/tsc_msr.c | 127
 arch/x86/kernel/tsc_sync.c | 1
 arch/x86/kernel/xsave.c | 10
 59 files changed, 2189 insertions(+), 324 deletions(-)
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 9b0a34e2cd79..cb648c84b327 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -29,10 +29,11 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
 obj-y += syscall_$(BITS).o
 obj-$(CONFIG_X86_64) += vsyscall_64.o
 obj-$(CONFIG_X86_64) += vsyscall_emu_64.o
+obj-$(CONFIG_SYSFS) += ksysfs.o
 obj-y += bootflag.o e820.o
 obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
 obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
-obj-y += tsc.o io_delay.o rtc.o
+obj-y += tsc.o tsc_msr.o io_delay.o rtc.o
 obj-y += pci-iommu_table.o
 obj-y += resource.o
 
@@ -91,15 +92,6 @@ obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
 
 obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
 
-obj-$(CONFIG_MICROCODE_EARLY) += microcode_core_early.o
-obj-$(CONFIG_MICROCODE_INTEL_EARLY) += microcode_intel_early.o
-obj-$(CONFIG_MICROCODE_INTEL_LIB) += microcode_intel_lib.o
-microcode-y := microcode_core.o
-microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o
-microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o
-obj-$(CONFIG_MICROCODE_AMD_EARLY) += microcode_amd_early.o
-obj-$(CONFIG_MICROCODE) += microcode.o
-
 obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
 
 obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
@@ -111,6 +103,7 @@ obj-$(CONFIG_EFI) += sysfb_efi.o
 
 obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
 obj-$(CONFIG_TRACING) += tracepoint.o
+obj-$(CONFIG_IOSF_MBI) += iosf_mbi.o
 
 ###
 # 64 bit specific files
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index d2b7f27781bc..e69182fd01cf 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -150,29 +150,6 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
 }
 EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
 
-/*
- * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
- * which can obviate IPI to trigger checking of need_resched.
- * We execute MONITOR against need_resched and enter optimized wait state
- * through MWAIT. Whenever someone changes need_resched, we would be woken
- * up from MWAIT (without an IPI).
- *
- * New with Core Duo processors, MWAIT can take some hints based on CPU
- * capability.
- */
-void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
-{
-	if (!need_resched()) {
-		if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
-			clflush((void *)&current_thread_info()->flags);
-
-		__monitor((void *)&current_thread_info()->flags, 0, 0);
-		smp_mb();
-		if (!need_resched())
-			__mwait(ax, cx);
-	}
-}
-
 void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
 {
 	unsigned int cpu = smp_processor_id();
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index d278736bf774..7f26c9a70a9e 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -75,6 +75,13 @@ unsigned int max_physical_apicid;
 physid_mask_t phys_cpu_present_map;
 
 /*
+ * Processor to be disabled specified by kernel parameter
+ * disable_cpu_apicid=<int>, mostly used for the kdump 2nd kernel to
+ * avoid undefined behaviour caused by sending INIT from AP to BSP.
+ */
+static unsigned int disabled_cpu_apicid __read_mostly = BAD_APICID;
+
+/*
  * Map cpu index to physical APIC ID
  */
 DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID);
@@ -1968,7 +1975,7 @@ __visible void smp_trace_spurious_interrupt(struct pt_regs *regs)
  */
 static inline void __smp_error_interrupt(struct pt_regs *regs)
 {
-	u32 v0, v1;
+	u32 v;
 	u32 i = 0;
 	static const char * const error_interrupt_reason[] = {
 		"Send CS error", /* APIC Error Bit 0 */
@@ -1982,21 +1989,20 @@ static inline void __smp_error_interrupt(struct pt_regs *regs)
 	};
 
 	/* First tickle the hardware, only then report what went on. -- REW */
-	v0 = apic_read(APIC_ESR);
 	apic_write(APIC_ESR, 0);
-	v1 = apic_read(APIC_ESR);
+	v = apic_read(APIC_ESR);
 	ack_APIC_irq();
 	atomic_inc(&irq_err_count);
 
-	apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x(%02x)",
-		    smp_processor_id(), v0 , v1);
+	apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x",
+		    smp_processor_id(), v);
 
-	v1 = v1 & 0xff;
-	while (v1) {
-		if (v1 & 0x1)
+	v &= 0xff;
+	while (v) {
+		if (v & 0x1)
 			apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]);
 		i++;
-		v1 >>= 1;
+		v >>= 1;
 	}
 
 	apic_printk(APIC_DEBUG, KERN_CONT "\n");
@@ -2115,6 +2121,39 @@ int generic_processor_info(int apicid, int version)
 		phys_cpu_present_map);
 
 	/*
+	 * boot_cpu_physical_apicid is designed to have the apicid
+	 * returned by read_apic_id(), i.e, the apicid of the
+	 * currently booting-up processor. However, on some platforms,
+	 * it is temporarily modified by the apicid reported as BSP
+	 * through MP table. Concretely:
+	 *
+	 * - arch/x86/kernel/mpparse.c: MP_processor_info()
+	 * - arch/x86/mm/amdtopology.c: amd_numa_init()
+	 * - arch/x86/platform/visws/visws_quirks.c: MP_processor_info()
+	 *
+	 * This function is executed with the modified
+	 * boot_cpu_physical_apicid. So, disabled_cpu_apicid kernel
+	 * parameter doesn't work to disable APs on kdump 2nd kernel.
+	 *
+	 * Since fixing handling of boot_cpu_physical_apicid requires
+	 * another discussion and tests on each platform, we leave it
+	 * for now and here we use read_apic_id() directly in this
+	 * function, generic_processor_info().
+	 */
+	if (disabled_cpu_apicid != BAD_APICID &&
+	    disabled_cpu_apicid != read_apic_id() &&
+	    disabled_cpu_apicid == apicid) {
+		int thiscpu = num_processors + disabled_cpus;
+
+		pr_warning("APIC: Disabling requested cpu."
+			   " Processor %d/0x%x ignored.\n",
+			   thiscpu, apicid);
+
+		disabled_cpus++;
+		return -ENODEV;
+	}
+
+	/*
 	 * If boot cpu has not been detected yet, then only allow upto
 	 * nr_cpu_ids - 1 processors and keep one slot free for boot cpu
 	 */
@@ -2592,3 +2631,12 @@ static int __init lapic_insert_resource(void)
  * that is using request_resource
  */
 late_initcall(lapic_insert_resource);
+
+static int __init apic_set_disabled_cpu_apicid(char *arg)
+{
+	if (!arg || !get_option(&arg, &disabled_cpu_apicid))
+		return -EINVAL;
+
+	return 0;
+}
+early_param("disable_cpu_apicid", apic_set_disabled_cpu_apicid);
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 00c77cf78e9e..5d5b9eb2b7a4 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -14,7 +14,6 @@
 #include <linux/string.h>
 #include <linux/kernel.h>
 #include <linux/ctype.h>
-#include <linux/init.h>
 #include <linux/hardirq.h>
 #include <linux/module.h>
 #include <asm/smp.h>
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index e145f28b4099..191ce75c0e54 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -15,7 +15,6 @@
 #include <linux/string.h>
 #include <linux/kernel.h>
 #include <linux/ctype.h>
-#include <linux/init.h>
 #include <linux/errno.h>
 #include <asm/fixmap.h>
 #include <asm/mpspec.h>
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e63a5bd2a78f..a43f068ebec1 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1142,9 +1142,10 @@ next:
 	if (test_bit(vector, used_vectors))
 		goto next;
 
-	for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
-		if (per_cpu(vector_irq, new_cpu)[vector] != -1)
+	for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) {
+		if (per_cpu(vector_irq, new_cpu)[vector] > VECTOR_UNDEFINED)
 			goto next;
+	}
 	/* Found one! */
 	current_vector = vector;
 	current_offset = offset;
@@ -1183,7 +1184,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
 
 	vector = cfg->vector;
 	for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
-		per_cpu(vector_irq, cpu)[vector] = -1;
+		per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
 
 	cfg->vector = 0;
 	cpumask_clear(cfg->domain);
@@ -1191,11 +1192,10 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
 	if (likely(!cfg->move_in_progress))
 		return;
 	for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
-		for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
-		     vector++) {
+		for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
 			if (per_cpu(vector_irq, cpu)[vector] != irq)
 				continue;
-			per_cpu(vector_irq, cpu)[vector] = -1;
+			per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
 			break;
 		}
 	}
@@ -1228,12 +1228,12 @@ void __setup_vector_irq(int cpu)
 	/* Mark the free vectors */
 	for (vector = 0; vector < NR_VECTORS; ++vector) {
 		irq = per_cpu(vector_irq, cpu)[vector];
-		if (irq < 0)
+		if (irq <= VECTOR_UNDEFINED)
 			continue;
 
 		cfg = irq_cfg(irq);
 		if (!cpumask_test_cpu(cpu, cfg->domain))
-			per_cpu(vector_irq, cpu)[vector] = -1;
+			per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
 	}
 	raw_spin_unlock(&vector_lock);
 }
@@ -2202,13 +2202,13 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
 
 	me = smp_processor_id();
 	for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
-		unsigned int irq;
+		int irq;
 		unsigned int irr;
 		struct irq_desc *desc;
 		struct irq_cfg *cfg;
 		irq = __this_cpu_read(vector_irq[vector]);
 
-		if (irq == -1)
+		if (irq <= VECTOR_UNDEFINED)
 			continue;
 
 		desc = irq_to_desc(irq);
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index 7434d8556d09..62071569bd50 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -1,6 +1,5 @@
 #include <linux/cpumask.h>
 #include <linux/interrupt.h>
-#include <linux/init.h>
 
 #include <linux/mm.h>
 #include <linux/delay.h>
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 77c95c0e1bf7..00146f9b0254 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -29,7 +29,6 @@
 #define pr_fmt(fmt) "summit: %s: " fmt, __func__
 
 #include <linux/mm.h>
-#include <linux/init.h>
 #include <asm/io.h>
 #include <asm/bios_ebda.h>
 
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 140e29db478d..cac85ee6913f 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -3,7 +3,6 @@
 #include <linux/string.h>
 #include <linux/kernel.h>
 #include <linux/ctype.h>
-#include <linux/init.h>
 #include <linux/dmar.h>
 #include <linux/cpu.h>
 
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 562a76d433c8..de231e328cae 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -3,7 +3,6 @@
 #include <linux/string.h>
 #include <linux/kernel.h>
 #include <linux/ctype.h>
-#include <linux/init.h>
 #include <linux/dmar.h>
 
 #include <asm/smp.h>
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index e2dbcb7dabdd..83a7995625a6 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -91,7 +91,7 @@ void __init setup_bios_corruption_check(void)
 
 	corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
 
-	for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) {
+	for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) {
 		start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE),
 				PAGE_SIZE, corruption_check_size);
 		end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE),
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 47b56a7e99cb..7fd54f09b011 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -36,12 +36,13 @@ obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_iommu.o
 endif
 obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o
 obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
-obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o
+obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o perf_event_intel_rapl.o
 endif
 
 
 obj-$(CONFIG_X86_MCE) += mcheck/
 obj-$(CONFIG_MTRR) += mtrr/
+obj-$(CONFIG_MICROCODE) += microcode/
 
 obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o perf_event_amd_ibs.o
 
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index bca023bdd6b2..d3153e281d72 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1,5 +1,4 @@
 #include <linux/export.h>
-#include <linux/init.h>
 #include <linux/bitops.h>
 #include <linux/elf.h>
 #include <linux/mm.h>
@@ -487,7 +486,7 @@ static void early_init_amd(struct cpuinfo_x86 *c)
 		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
 		set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
 		if (!check_tsc_unstable())
-			sched_clock_stable = 1;
+			set_sched_clock_stable();
 	}
 
 #ifdef CONFIG_X86_64
@@ -508,6 +507,16 @@ static void early_init_amd(struct cpuinfo_x86 *c)
 		set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
 	}
 #endif
+
+	/* F16h erratum 793, CVE-2013-6885 */
+	if (c->x86 == 0x16 && c->x86_model <= 0xf) {
+		u64 val;
+
+		rdmsrl(MSR_AMD64_LS_CFG, val);
+		if (!(val & BIT(15)))
+			wrmsrl(MSR_AMD64_LS_CFG, val | BIT(15));
+	}
+
 }
 
 static const int amd_erratum_383[];
@@ -790,14 +799,10 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
 	}
 
 	/* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
-	if (!((eax >> 16) & mask)) {
-		u32 a, b, c, d;
-
-		cpuid(0x80000005, &a, &b, &c, &d);
-		tlb_lld_2m[ENTRIES] = (a >> 16) & 0xff;
-	} else {
+	if (!((eax >> 16) & mask))
+		tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff;
+	else
 		tlb_lld_2m[ENTRIES] = (eax >> 16) & mask;
-	}
 
 	/* a 4M entry uses two 2M entries */
 	tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1;
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index 8d5652dc99dd..8779edab684e 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -1,6 +1,5 @@
 #include <linux/bitops.h>
 #include <linux/kernel.h>
-#include <linux/init.h>
 
 #include <asm/processor.h>
 #include <asm/e820.h>
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 6abc172b8258..24b6fd10625a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -472,6 +472,7 @@ u16 __read_mostly tlb_lli_4m[NR_INFO];
 u16 __read_mostly tlb_lld_4k[NR_INFO];
 u16 __read_mostly tlb_lld_2m[NR_INFO];
 u16 __read_mostly tlb_lld_4m[NR_INFO];
+u16 __read_mostly tlb_lld_1g[NR_INFO];
 
 /*
  * tlb_flushall_shift shows the balance point in replacing cr3 write
@@ -486,13 +487,13 @@ void cpu_detect_tlb(struct cpuinfo_x86 *c)
 	if (this_cpu->c_detect_tlb)
 		this_cpu->c_detect_tlb(c);
 
-	printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \
-		"Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \
+	printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n"
+		"Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n"
 		"tlb_flushall_shift: %d\n",
 		tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],
 		tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES],
 		tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES],
-		tlb_flushall_shift);
+		tlb_lld_1g[ENTRIES], tlb_flushall_shift);
 }
 
 void detect_ht(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index d0969c75ab54..aaf152e79637 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -1,4 +1,3 @@
-#include <linux/init.h>
 #include <linux/bitops.h>
 #include <linux/delay.h>
 #include <linux/pci.h>
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index dc1ec0dff939..3db61c644e44 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -1,4 +1,3 @@
-#include <linux/init.h>
 #include <linux/kernel.h>
 
 #include <linux/string.h>
@@ -93,7 +92,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
 		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
 		set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
 		if (!check_tsc_unstable())
-			sched_clock_stable = 1;
+			set_sched_clock_stable();
 	}
 
 	/* Penwell and Cloverview have the TSC which doesn't sleep on S3 */
@@ -387,7 +386,8 @@ static void init_intel(struct cpuinfo_x86 *c)
 		set_cpu_cap(c, X86_FEATURE_PEBS);
 	}
 
-	if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush)
+	if (c->x86 == 6 && cpu_has_clflush &&
+	    (c->x86_model == 29 || c->x86_model == 46 || c->x86_model == 47))
 		set_cpu_cap(c, X86_FEATURE_CLFLUSH_MONITOR);
 
 #ifdef CONFIG_X86_64
@@ -505,6 +505,7 @@ static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
 #define TLB_DATA0_2M_4M	0x23
 
 #define STLB_4K		0x41
+#define STLB_4K_2M	0x42
 
 static const struct _tlb_table intel_tlb_table[] = {
 	{ 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" },
@@ -525,13 +526,20 @@ static const struct _tlb_table intel_tlb_table[] = {
 	{ 0x5b, TLB_DATA_4K_4M, 64, " TLB_DATA 4 KByte and 4 MByte pages" },
 	{ 0x5c, TLB_DATA_4K_4M, 128, " TLB_DATA 4 KByte and 4 MByte pages" },
 	{ 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" },
+	{ 0x61, TLB_INST_4K, 48, " TLB_INST 4 KByte pages, full associative" },
+	{ 0x63, TLB_DATA_1G, 4, " TLB_DATA 1 GByte pages, 4-way set associative" },
+	{ 0x76, TLB_INST_2M_4M, 8, " TLB_INST 2-MByte or 4-MByte pages, fully associative" },
 	{ 0xb0, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 4-way set associative" },
 	{ 0xb1, TLB_INST_2M_4M, 4, " TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" },
 	{ 0xb2, TLB_INST_4K, 64, " TLB_INST 4KByte pages, 4-way set associative" },
 	{ 0xb3, TLB_DATA_4K, 128, " TLB_DATA 4 KByte pages, 4-way set associative" },
 	{ 0xb4, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 4-way associative" },
+	{ 0xb5, TLB_INST_4K, 64, " TLB_INST 4 KByte pages, 8-way set associative" },
+	{ 0xb6, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 8-way set associative" },
 	{ 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" },
 	{ 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" },
+	{ 0xc1, STLB_4K_2M, 1024, " STLB 4 KByte and 2 MByte pages, 8-way associative" },
+	{ 0xc2, TLB_DATA_2M_4M, 16, " DTLB 2 MByte/4MByte pages, 4-way associative" },
 	{ 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" },
 	{ 0x00, 0, 0 }
 };
@@ -557,6 +565,20 @@ static void intel_tlb_lookup(const unsigned char desc)
 		if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
 			tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
 		break;
+	case STLB_4K_2M:
+		if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
+		if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
+		if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
+		if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries;
+		if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
+		if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
+		break;
 	case TLB_INST_ALL:
 		if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
 			tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
@@ -602,6 +624,10 @@ static void intel_tlb_lookup(const unsigned char desc)
 		if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
 			tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
 		break;
+	case TLB_DATA_1G:
+		if (tlb_lld_1g[ENTRIES] < intel_tlb_table[k].entries)
+			tlb_lld_1g[ENTRIES] = intel_tlb_table[k].entries;
+		break;
 	}
 }
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
index de8b60a53f69..a1aef9533154 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -33,22 +33,28 @@
 #include <linux/acpi.h>
 #include <linux/cper.h>
 #include <acpi/apei.h>
+#include <acpi/ghes.h>
 #include <asm/mce.h>
 
 #include "mce-internal.h"
 
-void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err)
+void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
 {
 	struct mce m;
 
-	/* Only corrected MC is reported */
-	if (!corrected || !(mem_err->validation_bits & CPER_MEM_VALID_PA))
+	if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
 		return;
 
 	mce_setup(&m);
 	m.bank = 1;
-	/* Fake a memory read corrected error with unknown channel */
+	/* Fake a memory read error with unknown channel */
 	m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;
+
+	if (severity >= GHES_SEV_RECOVERABLE)
+		m.status |= MCI_STATUS_UC;
+	if (severity >= GHES_SEV_PANIC)
+		m.status |= MCI_STATUS_PCC;
+
 	m.addr = mem_err->physical_addr;
 	mce_log(&m);
 	mce_notify_irq();
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index b3218cdee95f..4d5419b249da 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1638,15 +1638,15 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
 
 static void mce_start_timer(unsigned int cpu, struct timer_list *t)
 {
-	unsigned long iv = mce_adjust_timer(check_interval * HZ);
-
-	__this_cpu_write(mce_next_interval, iv);
+	unsigned long iv = check_interval * HZ;
 
 	if (mca_cfg.ignore_ce || !iv)
 		return;
 
+	per_cpu(mce_next_interval, cpu) = iv;
+
 	t->expires = round_jiffies(jiffies + iv);
-	add_timer_on(t, smp_processor_id());
+	add_timer_on(t, cpu);
 }
 
 static void __mcheck_cpu_init_timer(void)
@@ -2272,8 +2272,10 @@ static int mce_device_create(unsigned int cpu)
 	dev->release = &mce_device_release;
 
 	err = device_register(dev);
-	if (err)
+	if (err) {
+		put_device(dev);
 		return err;
+	}
 
 	for (i = 0; mce_device_attrs[i]; i++) {
 		err = device_create_file(dev, mce_device_attrs[i]);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 4cfe0458ca66..fb6156fee6f7 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -6,7 +6,6 @@
  */
 
 #include <linux/gfp.h>
-#include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/percpu.h>
 #include <linux/sched.h>
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index 1c044b1ccc59..a3042989398c 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -5,7 +5,6 @@
 #include <linux/interrupt.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
-#include <linux/init.h>
 #include <linux/smp.h>
 
 #include <asm/processor.h>
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index e9a701aecaa1..7dc5564d0cdf 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -5,7 +5,6 @@
 #include <linux/interrupt.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
-#include <linux/init.h>
 
 #include <asm/processor.h>
 #include <asm/mce.h>
diff --git a/arch/x86/kernel/cpu/microcode/Makefile b/arch/x86/kernel/cpu/microcode/Makefile
new file mode 100644
index 000000000000..285c85427c32
--- /dev/null
+++ b/arch/x86/kernel/cpu/microcode/Makefile
@@ -0,0 +1,7 @@
+microcode-y := core.o
+obj-$(CONFIG_MICROCODE) += microcode.o
+microcode-$(CONFIG_MICROCODE_INTEL) += intel.o intel_lib.o
+microcode-$(CONFIG_MICROCODE_AMD) += amd.o
+obj-$(CONFIG_MICROCODE_EARLY) += core_early.o
+obj-$(CONFIG_MICROCODE_INTEL_EARLY) += intel_early.o
+obj-$(CONFIG_MICROCODE_AMD_EARLY) += amd_early.o
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index c3d4cc972eca..8fffd845e22b 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -182,10 +182,10 @@ int __apply_microcode_amd(struct microcode_amd *mc_amd)
 {
 	u32 rev, dummy;
 
-	wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
+	native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
 
 	/* verify patch application was successful */
-	rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
+	native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
 	if (rev != mc_amd->hdr.patch_id)
 		return -1;
 
@@ -332,6 +332,9 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover)
 	patch->patch_id = mc_hdr->patch_id;
 	patch->equiv_cpu = proc_id;
 
+	pr_debug("%s: Added patch_id: 0x%08x, proc_id: 0x%04x\n",
+		 __func__, patch->patch_id, proc_id);
+
 	/* ... and add to cache. */
 	update_cache(patch);
 
@@ -390,9 +393,9 @@ enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size)
 	if (cpu_data(smp_processor_id()).cpu_index == boot_cpu_data.cpu_index) {
 		struct ucode_patch *p = find_patch(smp_processor_id());
 		if (p) {
-			memset(amd_bsp_mpb, 0, MPB_MAX_SIZE);
-			memcpy(amd_bsp_mpb, p->data, min_t(u32, ksize(p->data),
-			       MPB_MAX_SIZE));
+			memset(amd_ucode_patch, 0, PATCH_MAX_SIZE);
+			memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data),
+			       PATCH_MAX_SIZE));
 		}
 	}
 #endif
@@ -430,7 +433,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device,
 	if (c->x86 >= 0x15)
 		snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86);
 
-	if (request_firmware(&fw, (const char *)fw_name, device)) {
+	if (request_firmware_direct(&fw, (const char *)fw_name, device)) {
 		pr_debug("failed to load file %s\n", fw_name);
 		goto out;
 	}
diff --git a/arch/x86/kernel/microcode_amd_early.c b/arch/x86/kernel/cpu/microcode/amd_early.c
index 6073104ccaa3..8384c0fa206f 100644
--- a/arch/x86/kernel/microcode_amd_early.c
+++ b/arch/x86/kernel/cpu/microcode/amd_early.c
@@ -2,6 +2,7 @@
  * Copyright (C) 2013 Advanced Micro Devices, Inc.
  *
  * Author: Jacob Shin <jacob.shin@amd.com>
+ * Fixes: Borislav Petkov <bp@suse.de>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -15,10 +16,18 @@
 #include <asm/setup.h>
 #include <asm/microcode_amd.h>
 
-static bool ucode_loaded;
+/*
+ * This points to the current valid container of microcode patches which we will
+ * save from the initrd before jettisoning its contents.
+ */
+static u8 *container;
+static size_t container_size;
+
 static u32 ucode_new_rev;
-static unsigned long ucode_offset;
-static size_t ucode_size;
+u8 amd_ucode_patch[PATCH_MAX_SIZE];
+static u16 this_equiv_id;
+
+struct cpio_data ucode_cpio;
 
 /*
  * Microcode patch container file is prepended to the initrd in cpio format.
@@ -32,9 +41,6 @@ static struct cpio_data __init find_ucode_in_initrd(void)
 	char *path;
 	void *start;
 	size_t size;
-	unsigned long *uoffset;
-	size_t *usize;
-	struct cpio_data cd;
 
 #ifdef CONFIG_X86_32
 	struct boot_params *p;
@@ -47,30 +53,50 @@ static struct cpio_data __init find_ucode_in_initrd(void)
 	path = (char *)__pa_nodebug(ucode_path);
 	start = (void *)p->hdr.ramdisk_image;
 	size = p->hdr.ramdisk_size;
-	uoffset = (unsigned long *)__pa_nodebug(&ucode_offset);
-	usize = (size_t *)__pa_nodebug(&ucode_size);
 #else
 	path = ucode_path;
 	start = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET);
 	size = boot_params.hdr.ramdisk_size;
-	uoffset = &ucode_offset;
-	usize = &ucode_size;
 #endif
 
-	cd = find_cpio_data(path, start, size, &offset);
-	if (!cd.data)
-		return cd;
+	return find_cpio_data(path, start, size, &offset);
+}
 
-	if (*(u32 *)cd.data != UCODE_MAGIC) {
-		cd.data = NULL;
-		cd.size = 0;
-		return cd;
-	}
+static size_t compute_container_size(u8 *data, u32 total_size)
+{
+	size_t size = 0;
+	u32 *header = (u32 *)data;
 
-	*uoffset = (u8 *)cd.data - (u8 *)start;
-	*usize = cd.size;
+	if (header[0] != UCODE_MAGIC ||
+	    header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */
+	    header[2] == 0) /* size */
+		return size;
 
-	return cd;
+	size = header[2] + CONTAINER_HDR_SZ;
+	total_size -= size;
+	data += size;
+
+	while (total_size) {
+		u16 patch_size;
+
+		header = (u32 *)data;
+
+		if (header[0] != UCODE_UCODE_TYPE)
+			break;
+
+		/*
+		 * Sanity-check patch size.
+		 */
+		patch_size = header[1];
+		if (patch_size > PATCH_MAX_SIZE)
+			break;
+
+		size += patch_size + SECTION_HDR_SIZE;
+		data += patch_size + SECTION_HDR_SIZE;
+		total_size -= patch_size + SECTION_HDR_SIZE;
+	}
+
+	return size;
 }
 
 /*
@@ -85,23 +111,22 @@ static struct cpio_data __init find_ucode_in_initrd(void)
 static void apply_ucode_in_initrd(void *ucode, size_t size)
 {
 	struct equiv_cpu_entry *eq;
+	size_t *cont_sz;
 	u32 *header;
-	u8 *data;
+	u8 *data, **cont;
 	u16 eq_id = 0;
 	int offset, left;
-	u32 rev, eax;
+	u32 rev, eax, ebx, ecx, edx;
 	u32 *new_rev;
-	unsigned long *uoffset;
-	size_t *usize;
 
 #ifdef CONFIG_X86_32
 	new_rev = (u32 *)__pa_nodebug(&ucode_new_rev);
-	uoffset = (unsigned long *)__pa_nodebug(&ucode_offset);
-	usize = (size_t *)__pa_nodebug(&ucode_size);
+	cont_sz = (size_t *)__pa_nodebug(&container_size);
+	cont = (u8 **)__pa_nodebug(&container);
 #else
 	new_rev = &ucode_new_rev;
-	uoffset = &ucode_offset;
-	usize = &ucode_size;
+	cont_sz = &container_size;
+	cont = &container;
 #endif
 
 	data = ucode;
@@ -109,23 +134,37 @@ static void apply_ucode_in_initrd(void *ucode, size_t size)
 	header = (u32 *)data;
 
 	/* find equiv cpu table */
-
-	if (header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */
+	if (header[0] != UCODE_MAGIC ||
+	    header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */
 	    header[2] == 0) /* size */
 		return;
 
-	eax = cpuid_eax(0x00000001);
+	eax = 0x00000001;
+	ecx = 0;
+	native_cpuid(&eax, &ebx, &ecx, &edx);
 
 	while (left > 0) {
 		eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ);
 
+		*cont = data;
+
+		/* Advance past the container header */
 		offset = header[2] + CONTAINER_HDR_SZ;
 		data += offset;
 		left -= offset;
 
 		eq_id = find_equiv_id(eq, eax);
-		if (eq_id)
+		if (eq_id) {
+			this_equiv_id = eq_id;
+			*cont_sz = compute_container_size(*cont, left + offset);
+
+			/*
+			 * truncate how much we need to iterate over in the
+			 * ucode update loop below
+			 */
+			left = *cont_sz - offset;
 			break;
+		}
 
 		/*
 		 * support multiple container files appended together. if this
@@ -145,19 +184,18 @@ static void apply_ucode_in_initrd(void *ucode, size_t size)
 
 		/* mark where the next microcode container file starts */
 		offset = data - (u8 *)ucode;
-		*uoffset += offset;
-		*usize -= offset;
 		ucode = data;
 	}
 
 	if (!eq_id) {
-		*usize = 0;
+		*cont = NULL;
+		*cont_sz = 0;
 		return;
 	}
 
 	/* find ucode and update if needed */
 
-	rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax);
+	native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax);
 
 	while (left > 0) {
 		struct microcode_amd *mc;
@@ -168,73 +206,83 @@ static void apply_ucode_in_initrd(void *ucode, size_t size)
 			break;
 
 		mc = (struct microcode_amd *)(data + SECTION_HDR_SIZE);
-		if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id)
-			if (__apply_microcode_amd(mc) == 0) {
+
+		if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id) {
+
+			if (!__apply_microcode_amd(mc)) {
 				rev = mc->hdr.patch_id;
 				*new_rev = rev;
+
+				/* save ucode patch */
+				memcpy(amd_ucode_patch, mc,
+				       min_t(u32, header[1], PATCH_MAX_SIZE));
 			}
+		}
 
 		offset = header[1] + SECTION_HDR_SIZE;
 		data += offset;
 		left -= offset;
 	}
-
-	/* mark where this microcode container file ends */
-	offset = *usize - (data - (u8 *)ucode);
-	*usize -= offset;
-
-	if (!(*new_rev))
-		*usize = 0;
 }
 
 void __init load_ucode_amd_bsp(void)
 {
-	struct cpio_data cd = find_ucode_in_initrd();
-	if (!cd.data)
+	struct cpio_data cp;
+	void **data;
+	size_t *size;
+
+#ifdef CONFIG_X86_32
+	data = (void **)__pa_nodebug(&ucode_cpio.data);
+	size = (size_t *)__pa_nodebug(&ucode_cpio.size);
+#else
+	data = &ucode_cpio.data;
+	size = &ucode_cpio.size;
+#endif
+
+	cp = find_ucode_in_initrd();
+	if (!cp.data)
 		return;
 
-	apply_ucode_in_initrd(cd.data, cd.size);
+	*data = cp.data;
+	*size = cp.size;
+
+	apply_ucode_in_initrd(cp.data, cp.size);
 }
 
 #ifdef CONFIG_X86_32
-u8 amd_bsp_mpb[MPB_MAX_SIZE];
-
 /*
  * On 32-bit, since AP's early load occurs before paging is turned on, we
  * cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during
  * cold boot, AP will apply_ucode_in_initrd() just like the BSP. During
- * save_microcode_in_initrd_amd() BSP's patch is copied to amd_bsp_mpb, which
- * is used upon resume from suspend.
+ * save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch,
+ * which is used upon resume from suspend.
  */
 void load_ucode_amd_ap(void)
 {
 	struct microcode_amd *mc;
-	unsigned long *initrd;
-	unsigned long *uoffset;
 	size_t *usize;
-	void *ucode;
+	void **ucode;
 
-	mc = (struct microcode_amd *)__pa(amd_bsp_mpb);
+	mc = (struct microcode_amd *)__pa(amd_ucode_patch);
 	if (mc->hdr.patch_id && mc->hdr.processor_rev_id) {
 		__apply_microcode_amd(mc);
 		return;
 	}
 
-	initrd = (unsigned long *)__pa(&initrd_start);
-	uoffset = (unsigned long *)__pa(&ucode_offset);
-	usize = (size_t *)__pa(&ucode_size);
+	ucode = (void *)__pa_nodebug(&container);
+	usize = (size_t *)__pa_nodebug(&container_size);
 
-	if (!*usize || !*initrd)
+	if (!*ucode || !*usize)
 		return;
 
-	ucode = (void *)((unsigned long)__pa(*initrd) + *uoffset);
-	apply_ucode_in_initrd(ucode, *usize);
+	apply_ucode_in_initrd(*ucode, *usize);
 }
 
 static void __init collect_cpu_sig_on_bsp(void *arg)
 {
 	unsigned int cpu = smp_processor_id();
 	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
 	uci->cpu_sig.sig = cpuid_eax(0x00000001);
 }
 #else
@@ -242,36 +290,54 @@ void load_ucode_amd_ap(void)
 {
 	unsigned int cpu = smp_processor_id();
 	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+	struct equiv_cpu_entry *eq;
+	struct microcode_amd *mc;
 	u32 rev, eax;
+	u16 eq_id;
+
+	/* Exit if called on the BSP. */
+	if (!cpu)
+		return;
+
+	if (!container)
+		return;
 
 	rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax);
-	eax = cpuid_eax(0x00000001);
 
 	uci->cpu_sig.rev = rev;
 	uci->cpu_sig.sig = eax;
 
-	if (cpu && !ucode_loaded) {
-		void *ucode;
+	eax = cpuid_eax(0x00000001);
+	eq = (struct equiv_cpu_entry *)(container + CONTAINER_HDR_SZ);
 
-		if (!ucode_size || !initrd_start)
-			return;
+	eq_id = find_equiv_id(eq, eax);
+	if (!eq_id)
+		return;
+
+	if (eq_id == this_equiv_id) {
+		mc = (struct microcode_amd *)amd_ucode_patch;
 
-		ucode = (void *)(initrd_start + ucode_offset);
-		eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
-		if (load_microcode_amd(eax, ucode, ucode_size) != UCODE_OK)
+		if (mc && rev < mc->hdr.patch_id) {
+			if (!__apply_microcode_amd(mc))
+				ucode_new_rev = mc->hdr.patch_id;
+		}
+
+	} else {
+		if (!ucode_cpio.data)
 			return;
 
-		ucode_loaded = true;
+		/*
+		 * AP has a different equivalence ID than BSP, looks like
+		 * mixed-steppings silicon so go through the ucode blob anew.
+		 */
+		apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size);
 	}
-
-	apply_microcode_amd(cpu);
 }
 #endif
 
 int __init save_microcode_in_initrd_amd(void)
 {
 	enum ucode_state ret;
-	void *ucode;
 	u32 eax;
 
 #ifdef CONFIG_X86_32
@@ -280,22 +346,35 @@ int __init save_microcode_in_initrd_amd(void)
 
 	if (!uci->cpu_sig.sig)
 		smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1);
+
+	/*
+	 * Take into account the fact that the ramdisk might get relocated
+	 * and therefore we need to recompute the container's position in
+	 * virtual memory space.
+	 */
+	container = (u8 *)(__va((u32)relocated_ramdisk) +
+			   ((u32)container - boot_params.hdr.ramdisk_image));
 #endif
 	if (ucode_new_rev)
 		pr_info("microcode: updated early to new patch_level=0x%08x\n",
 			ucode_new_rev);
 
-	if (ucode_loaded || !ucode_size || !initrd_start)
-		return 0;
+	if (!container)
+		return -EINVAL;
 
-	ucode = (void *)(initrd_start + ucode_offset);
 	eax = cpuid_eax(0x00000001);
 	eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
 
-	ret = load_microcode_amd(eax, ucode, ucode_size);
+	ret = load_microcode_amd(eax, container, container_size);
 	if (ret != UCODE_OK)
 		return -EINVAL;
 
-	ucode_loaded = true;
+	/*
+	 * This will be freed any msec now, stash patches for the current
+	 * family and switch to patch cache for cpu hotplug, etc later.
+	 */
+	container = NULL;
+	container_size = 0;
+
 	return 0;
 }
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/cpu/microcode/core.c
index 15c987698b0f..15c987698b0f 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
diff --git a/arch/x86/kernel/microcode_core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c
index be7f8514f577..be7f8514f577 100644
--- a/arch/x86/kernel/microcode_core_early.c
+++ b/arch/x86/kernel/cpu/microcode/core_early.c
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 5fb2cebf556b..a276fa75d9b5 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -278,7 +278,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device,
 	sprintf(name, "intel-ucode/%02x-%02x-%02x",
 		c->x86, c->x86_model, c->x86_mask);
 
-	if (request_firmware(&firmware, name, device)) {
+	if (request_firmware_direct(&firmware, name, device)) {
 		pr_debug("data file %s load failed\n", name);
 		return UCODE_NFOUND;
 	}
diff --git a/arch/x86/kernel/microcode_intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c
index 1575deb2e636..18f739129e72 100644
--- a/arch/x86/kernel/microcode_intel_early.c
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -365,16 +365,6 @@ out:
 	return state;
 }
 
-#define native_rdmsr(msr, val1, val2)		\
-do {						\
-	u64 __val = native_read_msr((msr));	\
-	(void)((val1) = (u32)__val);		\
-	(void)((val2) = (u32)(__val >> 32));	\
-} while (0)
-
-#define native_wrmsr(msr, low, high)		\
-	native_write_msr(msr, low, high);
-
 static int collect_cpu_info_early(struct ucode_cpu_info *uci)
 {
 	unsigned int val[2];
diff --git a/arch/x86/kernel/microcode_intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c
index ce69320d0179..ce69320d0179 100644
--- a/arch/x86/kernel/microcode_intel_lib.c
+++ b/arch/x86/kernel/cpu/microcode/intel_lib.c
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 8e132931614d..b88645191fe5 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1883,21 +1883,27 @@ static struct pmu pmu = {
1883 1883
1884void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) 1884void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
1885{ 1885{
1886 struct cyc2ns_data *data;
1887
1886 userpg->cap_user_time = 0; 1888 userpg->cap_user_time = 0;
1887 userpg->cap_user_time_zero = 0; 1889 userpg->cap_user_time_zero = 0;
1888 userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc; 1890 userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc;
1889 userpg->pmc_width = x86_pmu.cntval_bits; 1891 userpg->pmc_width = x86_pmu.cntval_bits;
1890 1892
1891 if (!sched_clock_stable) 1893 if (!sched_clock_stable())
1892 return; 1894 return;
1893 1895
1896 data = cyc2ns_read_begin();
1897
1894 userpg->cap_user_time = 1; 1898 userpg->cap_user_time = 1;
1895 userpg->time_mult = this_cpu_read(cyc2ns); 1899 userpg->time_mult = data->cyc2ns_mul;
1896 userpg->time_shift = CYC2NS_SCALE_FACTOR; 1900 userpg->time_shift = data->cyc2ns_shift;
1897 userpg->time_offset = this_cpu_read(cyc2ns_offset) - now; 1901 userpg->time_offset = data->cyc2ns_offset - now;
1898 1902
1899 userpg->cap_user_time_zero = 1; 1903 userpg->cap_user_time_zero = 1;
1900 userpg->time_zero = this_cpu_read(cyc2ns_offset); 1904 userpg->time_zero = data->cyc2ns_offset;
1905
1906 cyc2ns_read_end(data);
1901} 1907}
1902 1908
1903/* 1909/*
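The mult/shift/offset triple filled in above follows the perf mmap ABI: userspace recovers the time with the seqlock-style loop documented in include/uapi/linux/perf_event.h. A minimal userspace sketch (not part of this patch; pc is the mmap'ed perf_event_mmap_page and __rdtsc() stands in for reading the counter):

	#include <linux/perf_event.h>	/* struct perf_event_mmap_page */
	#include <x86intrin.h>		/* __rdtsc() */

	#define barrier() __asm__ __volatile__("" ::: "memory")

	static unsigned long long
	read_user_time(volatile struct perf_event_mmap_page *pc)
	{
		unsigned long long cyc, quot, rem, t;
		unsigned int seq;

		do {
			seq = pc->lock;	/* retry if the kernel updates underneath us */
			barrier();
			cyc  = __rdtsc();
			quot = cyc >> pc->time_shift;
			rem  = cyc & (((unsigned long long)1 << pc->time_shift) - 1);
			t    = pc->time_zero + quot * pc->time_mult +
			       ((rem * pc->time_mult) >> pc->time_shift);
			barrier();
		} while (pc->lock != seq);

		return t;	/* nanoseconds, valid while cap_user_time_zero is set */
	}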
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index fd00bb29425d..c1a861829d81 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -262,11 +262,20 @@ struct cpu_hw_events {
262 __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \ 262 __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \
263 HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW) 263 HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW)
264 264
265#define EVENT_CONSTRAINT_END \ 265/*
266 EVENT_CONSTRAINT(0, 0, 0) 266 * We define the end marker as having a weight of -1
267 * to enable blacklisting of events using a counter bitmask
268 * of zero and thus a weight of zero.
269 * The end marker has a weight that cannot possibly be
270 * obtained from counting the bits in the bitmask.
271 */
272#define EVENT_CONSTRAINT_END { .weight = -1 }
267 273
274/*
275 * Check for end marker with weight == -1
276 */
268#define for_each_event_constraint(e, c) \ 277#define for_each_event_constraint(e, c) \
269 for ((e) = (c); (e)->weight; (e)++) 278 for ((e) = (c); (e)->weight != -1; (e)++)
270 279
271/* 280/*
272 * Extra registers for specific events. 281 * Extra registers for specific events.
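With the -1 end marker above, an event can now be blacklisted by giving it a constraint with an empty counter bitmask: its weight is 0, yet for_each_event_constraint() keeps iterating. An illustrative entry (hypothetical event code, not from this patch):

	/* event 0xd0 is allowed no counters at all: cmask weight 0 */
	EVENT_CONSTRAINT(0xd0, 0, INTEL_ARCH_EVENT_MASK),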
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index e09f0bfb7b8f..4b8e4d3cd6ea 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -10,6 +10,7 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/pci.h> 11#include <linux/pci.h>
12#include <linux/ptrace.h> 12#include <linux/ptrace.h>
13#include <linux/syscore_ops.h>
13 14
14#include <asm/apic.h> 15#include <asm/apic.h>
15 16
@@ -816,6 +817,18 @@ out:
816 return ret; 817 return ret;
817} 818}
818 819
820static void ibs_eilvt_setup(void)
821{
822 /*
823 * Force LVT offset assignment for family 10h: The offsets are
824 * not assigned by the BIOS for this family, so the OS is
825 * responsible for doing it. If the OS assignment fails, fall
826 * back to BIOS settings and try to setup this.
827 */
828 if (boot_cpu_data.x86 == 0x10)
829 force_ibs_eilvt_setup();
830}
831
819static inline int get_ibs_lvt_offset(void) 832static inline int get_ibs_lvt_offset(void)
820{ 833{
821 u64 val; 834 u64 val;
@@ -851,6 +864,36 @@ static void clear_APIC_ibs(void *dummy)
851 setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1); 864 setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1);
852} 865}
853 866
867#ifdef CONFIG_PM
868
869static int perf_ibs_suspend(void)
870{
871 clear_APIC_ibs(NULL);
872 return 0;
873}
874
875static void perf_ibs_resume(void)
876{
877 ibs_eilvt_setup();
878 setup_APIC_ibs(NULL);
879}
880
881static struct syscore_ops perf_ibs_syscore_ops = {
882 .resume = perf_ibs_resume,
883 .suspend = perf_ibs_suspend,
884};
885
886static void perf_ibs_pm_init(void)
887{
888 register_syscore_ops(&perf_ibs_syscore_ops);
889}
890
891#else
892
893static inline void perf_ibs_pm_init(void) { }
894
895#endif
896
854static int 897static int
855perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) 898perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
856{ 899{
@@ -877,18 +920,12 @@ static __init int amd_ibs_init(void)
877 if (!caps) 920 if (!caps)
878 return -ENODEV; /* ibs not supported by the cpu */ 921 return -ENODEV; /* ibs not supported by the cpu */
879 922
880 /* 923 ibs_eilvt_setup();
881 * Force LVT offset assignment for family 10h: The offsets are
882 * not assigned by the BIOS for this family, so the OS is
883 * responsible for doing it. If the OS assignment fails, fall
884 * back to BIOS settings and try to setup this.
885 */
886 if (boot_cpu_data.x86 == 0x10)
887 force_ibs_eilvt_setup();
888 924
889 if (!ibs_eilvt_valid()) 925 if (!ibs_eilvt_valid())
890 goto out; 926 goto out;
891 927
928 perf_ibs_pm_init();
892 get_online_cpus(); 929 get_online_cpus();
893 ibs_caps = caps; 930 ibs_caps = caps;
894 /* make ibs_caps visible to other cpus: */ 931 /* make ibs_caps visible to other cpus: */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
new file mode 100644
index 000000000000..5ad35ad94d0f
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
@@ -0,0 +1,679 @@
1/*
2 * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters
3 * Copyright (C) 2013 Google, Inc., Stephane Eranian
4 *
5 * Intel RAPL interface is specified in the IA-32 Manual Vol3b
6 * section 14.7.1 (September 2013)
7 *
 8 * RAPL provides more controls than just reporting energy consumption;
 9 * however, here we only expose the 3 free-running energy consumption
 10 * counters (pp0, pkg, dram).
11 *
 12 * Each of those counters increments in an energy unit defined by the
 13 * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules,
14 * but it can vary.
15 *
 16 * Counter-to-rapl-event mappings:
17 *
18 * pp0 counter: consumption of all physical cores (power plane 0)
19 * event: rapl_energy_cores
20 * perf code: 0x1
21 *
22 * pkg counter: consumption of the whole processor package
23 * event: rapl_energy_pkg
24 * perf code: 0x2
25 *
26 * dram counter: consumption of the dram domain (servers only)
27 * event: rapl_energy_dram
28 * perf code: 0x3
29 *
 30 * gpu counter: consumption of the builtin-gpu domain (clients only)
31 * event: rapl_energy_gpu
32 * perf code: 0x4
33 *
34 * We manage those counters as free running (read-only). They may be
 35 * used simultaneously by other tools, such as turbostat.
36 *
37 * The events only support system-wide mode counting. There is no
38 * sampling support because it does not make sense and is not
39 * supported by the RAPL hardware.
40 *
41 * Because we want to avoid floating-point operations in the kernel,
42 * the events are all reported in fixed point arithmetic (32.32).
43 * Tools must adjust the counts to convert them to Watts using
44 * the duration of the measurement. Tools may use a function such as
45 * ldexp(raw_count, -32);
46 */
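To make the 32.32 fixed-point convention above concrete, a userspace tool might convert the raw counts like this (a minimal sketch, not part of this file; compile with -lm):

	#include <math.h>
	#include <stdio.h>

	/* raw is the 32.32 fixed-point energy count read via perf */
	static void report_energy(unsigned long long raw, double seconds)
	{
		double joules = ldexp((double)raw, -32);	/* raw * 2^-32 */

		printf("%.6f J over %.3f s = %.3f W\n",
		       joules, seconds, joules / seconds);
	}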
47#include <linux/module.h>
48#include <linux/slab.h>
49#include <linux/perf_event.h>
50#include <asm/cpu_device_id.h>
51#include "perf_event.h"
52
53/*
54 * RAPL energy status counters
55 */
56#define RAPL_IDX_PP0_NRG_STAT 0 /* all cores */
57#define INTEL_RAPL_PP0 0x1 /* pseudo-encoding */
58#define RAPL_IDX_PKG_NRG_STAT 1 /* entire package */
59#define INTEL_RAPL_PKG 0x2 /* pseudo-encoding */
60#define RAPL_IDX_RAM_NRG_STAT 2 /* DRAM */
61#define INTEL_RAPL_RAM 0x3 /* pseudo-encoding */
 62#define RAPL_IDX_PP1_NRG_STAT 3 /* gpu */
63#define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */
64
 65/* Clients have PP0, PKG, PP1 */
66#define RAPL_IDX_CLN (1<<RAPL_IDX_PP0_NRG_STAT|\
67 1<<RAPL_IDX_PKG_NRG_STAT|\
68 1<<RAPL_IDX_PP1_NRG_STAT)
69
70/* Servers have PP0, PKG, RAM */
71#define RAPL_IDX_SRV (1<<RAPL_IDX_PP0_NRG_STAT|\
72 1<<RAPL_IDX_PKG_NRG_STAT|\
73 1<<RAPL_IDX_RAM_NRG_STAT)
74
75/*
76 * event code: LSB 8 bits, passed in attr->config
77 * any other bit is reserved
78 */
79#define RAPL_EVENT_MASK 0xFFULL
80
81#define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format) \
82static ssize_t __rapl_##_var##_show(struct kobject *kobj, \
83 struct kobj_attribute *attr, \
84 char *page) \
85{ \
86 BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \
87 return sprintf(page, _format "\n"); \
88} \
89static struct kobj_attribute format_attr_##_var = \
90 __ATTR(_name, 0444, __rapl_##_var##_show, NULL)
91
92#define RAPL_EVENT_DESC(_name, _config) \
93{ \
94 .attr = __ATTR(_name, 0444, rapl_event_show, NULL), \
95 .config = _config, \
96}
97
98#define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */
99
100struct rapl_pmu {
101 spinlock_t lock;
102 int hw_unit; /* 1/2^hw_unit Joule */
103 int n_active; /* number of active events */
104 struct list_head active_list;
105 struct pmu *pmu; /* pointer to rapl_pmu_class */
106 ktime_t timer_interval; /* in ktime_t unit */
107 struct hrtimer hrtimer;
108};
109
110static struct pmu rapl_pmu_class;
111static cpumask_t rapl_cpu_mask;
112static int rapl_cntr_mask;
113
114static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu);
115static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free);
116
117static inline u64 rapl_read_counter(struct perf_event *event)
118{
119 u64 raw;
120 rdmsrl(event->hw.event_base, raw);
121 return raw;
122}
123
124static inline u64 rapl_scale(u64 v)
125{
126 /*
127 * scale delta to smallest unit (1/2^32)
 128 * users must then scale back: count * 2^-32 to get Joules,
129 * or use ldexp(count, -32).
130 * Watts = Joules/Time delta
131 */
132 return v << (32 - __get_cpu_var(rapl_pmu)->hw_unit);
133}
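Worked example: on SandyBridge hw_unit is 16, so rapl_scale() shifts left by 16 and one hardware increment of 2^-16 J becomes 2^16 units of 2^-32 J: the same energy, renormalized to the fixed API unit.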
134
135static u64 rapl_event_update(struct perf_event *event)
136{
137 struct hw_perf_event *hwc = &event->hw;
138 u64 prev_raw_count, new_raw_count;
139 s64 delta, sdelta;
140 int shift = RAPL_CNTR_WIDTH;
141
142again:
143 prev_raw_count = local64_read(&hwc->prev_count);
144 rdmsrl(event->hw.event_base, new_raw_count);
145
146 if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
147 new_raw_count) != prev_raw_count) {
148 cpu_relax();
149 goto again;
150 }
151
152 /*
153 * Now we have the new raw value and have updated the prev
154 * timestamp already. We can now calculate the elapsed delta
155 * (event-)time and add that to the generic event.
156 *
157 * Careful, not all hw sign-extends above the physical width
158 * of the count.
159 */
160 delta = (new_raw_count << shift) - (prev_raw_count << shift);
161 delta >>= shift;
162
163 sdelta = rapl_scale(delta);
164
165 local64_add(sdelta, &event->count);
166
167 return new_raw_count;
168}
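Worked example of the shift trick above: with shift = 32, a counter wrap from prev = 0xffffffff to new = 0 gives (0 << 32) - (0xffffffff << 32), which as a signed 64-bit value is +2^32; the arithmetic right shift by 32 then yields a delta of 1, as expected.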
169
170static void rapl_start_hrtimer(struct rapl_pmu *pmu)
171{
172 __hrtimer_start_range_ns(&pmu->hrtimer,
173 pmu->timer_interval, 0,
174 HRTIMER_MODE_REL_PINNED, 0);
175}
176
177static void rapl_stop_hrtimer(struct rapl_pmu *pmu)
178{
179 hrtimer_cancel(&pmu->hrtimer);
180}
181
182static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
183{
184 struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
185 struct perf_event *event;
186 unsigned long flags;
187
188 if (!pmu->n_active)
189 return HRTIMER_NORESTART;
190
191 spin_lock_irqsave(&pmu->lock, flags);
192
193 list_for_each_entry(event, &pmu->active_list, active_entry) {
194 rapl_event_update(event);
195 }
196
197 spin_unlock_irqrestore(&pmu->lock, flags);
198
199 hrtimer_forward_now(hrtimer, pmu->timer_interval);
200
201 return HRTIMER_RESTART;
202}
203
204static void rapl_hrtimer_init(struct rapl_pmu *pmu)
205{
206 struct hrtimer *hr = &pmu->hrtimer;
207
208 hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
209 hr->function = rapl_hrtimer_handle;
210}
211
212static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
213 struct perf_event *event)
214{
215 if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
216 return;
217
218 event->hw.state = 0;
219
220 list_add_tail(&event->active_entry, &pmu->active_list);
221
222 local64_set(&event->hw.prev_count, rapl_read_counter(event));
223
224 pmu->n_active++;
225 if (pmu->n_active == 1)
226 rapl_start_hrtimer(pmu);
227}
228
229static void rapl_pmu_event_start(struct perf_event *event, int mode)
230{
231 struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
232 unsigned long flags;
233
234 spin_lock_irqsave(&pmu->lock, flags);
235 __rapl_pmu_event_start(pmu, event);
236 spin_unlock_irqrestore(&pmu->lock, flags);
237}
238
239static void rapl_pmu_event_stop(struct perf_event *event, int mode)
240{
241 struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
242 struct hw_perf_event *hwc = &event->hw;
243 unsigned long flags;
244
245 spin_lock_irqsave(&pmu->lock, flags);
246
247 /* mark event as deactivated and stopped */
248 if (!(hwc->state & PERF_HES_STOPPED)) {
249 WARN_ON_ONCE(pmu->n_active <= 0);
250 pmu->n_active--;
251 if (pmu->n_active == 0)
252 rapl_stop_hrtimer(pmu);
253
254 list_del(&event->active_entry);
255
256 WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
257 hwc->state |= PERF_HES_STOPPED;
258 }
259
260 /* check if update of sw counter is necessary */
261 if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
262 /*
263 * Drain the remaining delta count out of a event
264 * that we are disabling:
265 */
266 rapl_event_update(event);
267 hwc->state |= PERF_HES_UPTODATE;
268 }
269
270 spin_unlock_irqrestore(&pmu->lock, flags);
271}
272
273static int rapl_pmu_event_add(struct perf_event *event, int mode)
274{
275 struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu);
276 struct hw_perf_event *hwc = &event->hw;
277 unsigned long flags;
278
279 spin_lock_irqsave(&pmu->lock, flags);
280
281 hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
282
283 if (mode & PERF_EF_START)
284 __rapl_pmu_event_start(pmu, event);
285
286 spin_unlock_irqrestore(&pmu->lock, flags);
287
288 return 0;
289}
290
291static void rapl_pmu_event_del(struct perf_event *event, int flags)
292{
293 rapl_pmu_event_stop(event, PERF_EF_UPDATE);
294}
295
296static int rapl_pmu_event_init(struct perf_event *event)
297{
298 u64 cfg = event->attr.config & RAPL_EVENT_MASK;
299 int bit, msr, ret = 0;
300
301 /* only look at RAPL events */
302 if (event->attr.type != rapl_pmu_class.type)
303 return -ENOENT;
304
305 /* check only supported bits are set */
306 if (event->attr.config & ~RAPL_EVENT_MASK)
307 return -EINVAL;
308
309 /*
310 * check event is known (determines counter)
311 */
312 switch (cfg) {
313 case INTEL_RAPL_PP0:
314 bit = RAPL_IDX_PP0_NRG_STAT;
315 msr = MSR_PP0_ENERGY_STATUS;
316 break;
317 case INTEL_RAPL_PKG:
318 bit = RAPL_IDX_PKG_NRG_STAT;
319 msr = MSR_PKG_ENERGY_STATUS;
320 break;
321 case INTEL_RAPL_RAM:
322 bit = RAPL_IDX_RAM_NRG_STAT;
323 msr = MSR_DRAM_ENERGY_STATUS;
324 break;
325 case INTEL_RAPL_PP1:
326 bit = RAPL_IDX_PP1_NRG_STAT;
327 msr = MSR_PP1_ENERGY_STATUS;
328 break;
329 default:
330 return -EINVAL;
331 }
332 /* check event supported */
333 if (!(rapl_cntr_mask & (1 << bit)))
334 return -EINVAL;
335
336 /* unsupported modes and filters */
337 if (event->attr.exclude_user ||
338 event->attr.exclude_kernel ||
339 event->attr.exclude_hv ||
340 event->attr.exclude_idle ||
341 event->attr.exclude_host ||
342 event->attr.exclude_guest ||
343 event->attr.sample_period) /* no sampling */
344 return -EINVAL;
345
346 /* must be done before validate_group */
347 event->hw.event_base = msr;
348 event->hw.config = cfg;
349 event->hw.idx = bit;
350
351 return ret;
352}
353
354static void rapl_pmu_event_read(struct perf_event *event)
355{
356 rapl_event_update(event);
357}
358
359static ssize_t rapl_get_attr_cpumask(struct device *dev,
360 struct device_attribute *attr, char *buf)
361{
362 int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &rapl_cpu_mask);
363
364 buf[n++] = '\n';
365 buf[n] = '\0';
366 return n;
367}
368
369static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);
370
371static struct attribute *rapl_pmu_attrs[] = {
372 &dev_attr_cpumask.attr,
373 NULL,
374};
375
376static struct attribute_group rapl_pmu_attr_group = {
377 .attrs = rapl_pmu_attrs,
378};
379
380EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
381EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02");
382EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03");
383EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04");
384
385EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
386EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules");
387EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules");
388EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules");
389
390/*
 391 * we report in 2^-32 Joule (~0.23 nJ) increments regardless of the MSR unit
392 */
393EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
394EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10");
395EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10");
396EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10");
397
398static struct attribute *rapl_events_srv_attr[] = {
399 EVENT_PTR(rapl_cores),
400 EVENT_PTR(rapl_pkg),
401 EVENT_PTR(rapl_ram),
402
403 EVENT_PTR(rapl_cores_unit),
404 EVENT_PTR(rapl_pkg_unit),
405 EVENT_PTR(rapl_ram_unit),
406
407 EVENT_PTR(rapl_cores_scale),
408 EVENT_PTR(rapl_pkg_scale),
409 EVENT_PTR(rapl_ram_scale),
410 NULL,
411};
412
413static struct attribute *rapl_events_cln_attr[] = {
414 EVENT_PTR(rapl_cores),
415 EVENT_PTR(rapl_pkg),
416 EVENT_PTR(rapl_gpu),
417
418 EVENT_PTR(rapl_cores_unit),
419 EVENT_PTR(rapl_pkg_unit),
420 EVENT_PTR(rapl_gpu_unit),
421
422 EVENT_PTR(rapl_cores_scale),
423 EVENT_PTR(rapl_pkg_scale),
424 EVENT_PTR(rapl_gpu_scale),
425 NULL,
426};
427
428static struct attribute_group rapl_pmu_events_group = {
429 .name = "events",
430 .attrs = NULL, /* patched at runtime */
431};
432
433DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7");
434static struct attribute *rapl_formats_attr[] = {
435 &format_attr_event.attr,
436 NULL,
437};
438
439static struct attribute_group rapl_pmu_format_group = {
440 .name = "format",
441 .attrs = rapl_formats_attr,
442};
443
444const struct attribute_group *rapl_attr_groups[] = {
445 &rapl_pmu_attr_group,
446 &rapl_pmu_format_group,
447 &rapl_pmu_events_group,
448 NULL,
449};
450
451static struct pmu rapl_pmu_class = {
452 .attr_groups = rapl_attr_groups,
453 .task_ctx_nr = perf_invalid_context, /* system-wide only */
454 .event_init = rapl_pmu_event_init,
455 .add = rapl_pmu_event_add, /* must have */
456 .del = rapl_pmu_event_del, /* must have */
457 .start = rapl_pmu_event_start,
458 .stop = rapl_pmu_event_stop,
459 .read = rapl_pmu_event_read,
460};
461
462static void rapl_cpu_exit(int cpu)
463{
464 struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
465 int i, phys_id = topology_physical_package_id(cpu);
466 int target = -1;
467
468 /* find a new cpu on same package */
469 for_each_online_cpu(i) {
470 if (i == cpu)
471 continue;
472 if (phys_id == topology_physical_package_id(i)) {
473 target = i;
474 break;
475 }
476 }
477 /*
 478 * clear the cpu from the cpumask; if it was set and another
 479 * cpu on the same package is still online, then move the
 480 * monitoring role over to that cpu
481 */
482 if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0)
483 cpumask_set_cpu(target, &rapl_cpu_mask);
484
485 WARN_ON(cpumask_empty(&rapl_cpu_mask));
486 /*
487 * migrate events and context to new cpu
488 */
489 if (target >= 0)
490 perf_pmu_migrate_context(pmu->pmu, cpu, target);
491
492 /* cancel overflow polling timer for CPU */
493 rapl_stop_hrtimer(pmu);
494}
495
496static void rapl_cpu_init(int cpu)
497{
498 int i, phys_id = topology_physical_package_id(cpu);
499
 500 /* check if phys_id is already covered */
501 for_each_cpu(i, &rapl_cpu_mask) {
502 if (phys_id == topology_physical_package_id(i))
503 return;
504 }
505 /* was not found, so add it */
506 cpumask_set_cpu(cpu, &rapl_cpu_mask);
507}
508
509static int rapl_cpu_prepare(int cpu)
510{
511 struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
512 int phys_id = topology_physical_package_id(cpu);
513 u64 ms;
514
515 if (pmu)
516 return 0;
517
518 if (phys_id < 0)
519 return -1;
520
521 pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
522 if (!pmu)
523 return -1;
524
525 spin_lock_init(&pmu->lock);
526
527 INIT_LIST_HEAD(&pmu->active_list);
528
529 /*
 530 * grab the power unit: counters increment in 1/2^unit Joules;
 531 *
 532 * we cache it in the local PMU instance
533 */
534 rdmsrl(MSR_RAPL_POWER_UNIT, pmu->hw_unit);
535 pmu->hw_unit = (pmu->hw_unit >> 8) & 0x1FULL;
536 pmu->pmu = &rapl_pmu_class;
537
538 /*
 539 * Use a reference power of 200 W to scale the timeout so
 540 * that counter overflows are not missed.
 541 * 200 W = 200 Joules/sec.
 542 * Divide the interval by 2 to avoid lockstep (2 * 100).
 543 * If the hw unit is 32, we use 2 ms: 1/(200*2).
544 */
545 if (pmu->hw_unit < 32)
546 ms = (1000 / (2 * 100)) * (1ULL << (32 - pmu->hw_unit - 1));
547 else
548 ms = 2;
549
550 pmu->timer_interval = ms_to_ktime(ms);
551
552 rapl_hrtimer_init(pmu);
553
554 /* set RAPL pmu for this cpu for now */
555 per_cpu(rapl_pmu, cpu) = pmu;
556 per_cpu(rapl_pmu_to_free, cpu) = NULL;
557
558 return 0;
559}
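Worked example of the interval above, assuming the common SandyBridge unit hw_unit = 16: the 32-bit counter wraps after 2^32 * 2^-16 J = 65536 J, which takes ~328 s at the 200 W reference; halving to avoid lockstep gives ms = 5 * 2^(32 - 16 - 1) = 163840 ms, roughly 164 s.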
560
561static void rapl_cpu_kfree(int cpu)
562{
563 struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu);
564
565 kfree(pmu);
566
567 per_cpu(rapl_pmu_to_free, cpu) = NULL;
568}
569
570static int rapl_cpu_dying(int cpu)
571{
572 struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
573
574 if (!pmu)
575 return 0;
576
577 per_cpu(rapl_pmu, cpu) = NULL;
578
579 per_cpu(rapl_pmu_to_free, cpu) = pmu;
580
581 return 0;
582}
583
584static int rapl_cpu_notifier(struct notifier_block *self,
585 unsigned long action, void *hcpu)
586{
587 unsigned int cpu = (long)hcpu;
588
589 switch (action & ~CPU_TASKS_FROZEN) {
590 case CPU_UP_PREPARE:
591 rapl_cpu_prepare(cpu);
592 break;
593 case CPU_STARTING:
594 rapl_cpu_init(cpu);
595 break;
596 case CPU_UP_CANCELED:
597 case CPU_DYING:
598 rapl_cpu_dying(cpu);
599 break;
600 case CPU_ONLINE:
601 case CPU_DEAD:
602 rapl_cpu_kfree(cpu);
603 break;
604 case CPU_DOWN_PREPARE:
605 rapl_cpu_exit(cpu);
606 break;
607 default:
608 break;
609 }
610
611 return NOTIFY_OK;
612}
613
614static const struct x86_cpu_id rapl_cpu_match[] = {
615 [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 },
616 [1] = {},
617};
618
619static int __init rapl_pmu_init(void)
620{
621 struct rapl_pmu *pmu;
622 int cpu, ret;
623
624 /*
625 * check for Intel processor family 6
626 */
627 if (!x86_match_cpu(rapl_cpu_match))
628 return 0;
629
630 /* check supported CPU */
631 switch (boot_cpu_data.x86_model) {
632 case 42: /* Sandy Bridge */
633 case 58: /* Ivy Bridge */
634 case 60: /* Haswell */
635 case 69: /* Haswell-Celeron */
636 rapl_cntr_mask = RAPL_IDX_CLN;
637 rapl_pmu_events_group.attrs = rapl_events_cln_attr;
638 break;
639 case 45: /* Sandy Bridge-EP */
640 case 62: /* IvyTown */
641 rapl_cntr_mask = RAPL_IDX_SRV;
642 rapl_pmu_events_group.attrs = rapl_events_srv_attr;
643 break;
644
645 default:
646 /* unsupported */
647 return 0;
648 }
649 get_online_cpus();
650
651 for_each_online_cpu(cpu) {
652 rapl_cpu_prepare(cpu);
653 rapl_cpu_init(cpu);
654 }
655
656 perf_cpu_notifier(rapl_cpu_notifier);
657
658 ret = perf_pmu_register(&rapl_pmu_class, "power", -1);
659 if (WARN_ON(ret)) {
660 pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret);
661 put_online_cpus();
662 return -1;
663 }
664
665 pmu = __get_cpu_var(rapl_pmu);
666
667 pr_info("RAPL PMU detected, hw unit 2^-%d Joules,"
668 " API unit is 2^-32 Joules,"
669 " %d fixed counters"
670 " %llu ms ovfl timer\n",
671 pmu->hw_unit,
672 hweight32(rapl_cntr_mask),
673 ktime_to_ms(pmu->timer_interval));
674
675 put_online_cpus();
676
677 return 0;
678}
679device_initcall(rapl_pmu_init);
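Once rapl_pmu_init() succeeds, the counters are visible under the "power" PMU with the event aliases defined above, so a perf tool that parses the sysfs aliases can read package energy system-wide with, e.g., `perf stat -a -e power/energy-pkg/ -- sleep 10` (illustrative invocation, not from this patch).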
diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c
index 88db010845cb..384df5105fbc 100644
--- a/arch/x86/kernel/cpu/rdrand.c
+++ b/arch/x86/kernel/cpu/rdrand.c
@@ -31,20 +31,6 @@ static int __init x86_rdrand_setup(char *s)
31} 31}
32__setup("nordrand", x86_rdrand_setup); 32__setup("nordrand", x86_rdrand_setup);
33 33
34/* We can't use arch_get_random_long() here since alternatives haven't run */
35static inline int rdrand_long(unsigned long *v)
36{
37 int ok;
38 asm volatile("1: " RDRAND_LONG "\n\t"
39 "jc 2f\n\t"
40 "decl %0\n\t"
41 "jnz 1b\n\t"
42 "2:"
43 : "=r" (ok), "=a" (*v)
44 : "0" (RDRAND_RETRY_LOOPS));
45 return ok;
46}
47
48/* 34/*
49 * Force a reseed cycle; we are architecturally guaranteed a reseed 35 * Force a reseed cycle; we are architecturally guaranteed a reseed
50 * after no more than 512 128-bit chunks of random data. This also 36 * after no more than 512 128-bit chunks of random data. This also
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index aa0430d69b90..3fa0e5ad86b4 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -1,6 +1,5 @@
1#include <linux/kernel.h> 1#include <linux/kernel.h>
2#include <linux/mm.h> 2#include <linux/mm.h>
3#include <linux/init.h>
4#include <asm/processor.h> 3#include <asm/processor.h>
5#include <asm/msr.h> 4#include <asm/msr.h>
6#include "cpu.h" 5#include "cpu.h"
diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c
index 75c5ad5d35cc..ef9c2a0078bd 100644
--- a/arch/x86/kernel/cpu/umc.c
+++ b/arch/x86/kernel/cpu/umc.c
@@ -1,5 +1,4 @@
1#include <linux/kernel.h> 1#include <linux/kernel.h>
2#include <linux/init.h>
3#include <asm/processor.h> 2#include <asm/processor.h>
4#include "cpu.h" 3#include "cpu.h"
5 4
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 18677a90d6a3..a57902efe2d5 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -7,7 +7,6 @@
7 * 7 *
8 */ 8 */
9 9
10#include <linux/init.h>
11#include <linux/types.h> 10#include <linux/types.h>
12#include <linux/kernel.h> 11#include <linux/kernel.h>
13#include <linux/smp.h> 12#include <linux/smp.h>
diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c
index 5d3fe8d36e4a..f6dfd9334b67 100644
--- a/arch/x86/kernel/doublefault.c
+++ b/arch/x86/kernel/doublefault.c
@@ -1,6 +1,5 @@
1#include <linux/mm.h> 1#include <linux/mm.h>
2#include <linux/sched.h> 2#include <linux/sched.h>
3#include <linux/init.h>
4#include <linux/init_task.h> 3#include <linux/init_task.h>
5#include <linux/fs.h> 4#include <linux/fs.h>
6 5
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 174da5fc5a7b..988c00a1f60d 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1120,7 +1120,7 @@ void __init memblock_find_dma_reserve(void)
1120 nr_pages += end_pfn - start_pfn; 1120 nr_pages += end_pfn - start_pfn;
1121 } 1121 }
1122 1122
1123 for_each_free_mem_range(u, MAX_NUMNODES, &start, &end, NULL) { 1123 for_each_free_mem_range(u, NUMA_NO_NODE, &start, &end, NULL) {
1124 start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN); 1124 start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN);
1125 end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN); 1125 end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN);
1126 if (start_pfn < end_pfn) 1126 if (start_pfn < end_pfn)
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 51e2988c5728..a2a4f4697889 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1082,7 +1082,7 @@ ENTRY(ftrace_caller)
1082 pushl $0 /* Pass NULL as regs pointer */ 1082 pushl $0 /* Pass NULL as regs pointer */
1083 movl 4*4(%esp), %eax 1083 movl 4*4(%esp), %eax
1084 movl 0x4(%ebp), %edx 1084 movl 0x4(%ebp), %edx
1085 leal function_trace_op, %ecx 1085 movl function_trace_op, %ecx
1086 subl $MCOUNT_INSN_SIZE, %eax 1086 subl $MCOUNT_INSN_SIZE, %eax
1087 1087
1088.globl ftrace_call 1088.globl ftrace_call
@@ -1140,7 +1140,7 @@ ENTRY(ftrace_regs_caller)
1140 movl 12*4(%esp), %eax /* Load ip (1st parameter) */ 1140 movl 12*4(%esp), %eax /* Load ip (1st parameter) */
1141 subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */ 1141 subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */
1142 movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */ 1142 movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */
1143 leal function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ 1143 movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */
1144 pushl %esp /* Save pt_regs as 4th parameter */ 1144 pushl %esp /* Save pt_regs as 4th parameter */
1145 1145
1146GLOBAL(ftrace_regs_call) 1146GLOBAL(ftrace_regs_call)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index e21b0785a85b..1e96c3628bf2 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -88,7 +88,7 @@ END(function_hook)
88 MCOUNT_SAVE_FRAME \skip 88 MCOUNT_SAVE_FRAME \skip
89 89
90 /* Load the ftrace_ops into the 3rd parameter */ 90 /* Load the ftrace_ops into the 3rd parameter */
91 leaq function_trace_op, %rdx 91 movq function_trace_op(%rip), %rdx
92 92
93 /* Load ip into the first parameter */ 93 /* Load ip into the first parameter */
94 movq RIP(%rsp), %rdi 94 movq RIP(%rsp), %rdi
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index f66ff162dce8..a67b47c31314 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -38,7 +38,6 @@
38#include <linux/kernel.h> 38#include <linux/kernel.h>
39#include <linux/module.h> 39#include <linux/module.h>
40#include <linux/sched.h> 40#include <linux/sched.h>
41#include <linux/init.h>
42#include <linux/smp.h> 41#include <linux/smp.h>
43 42
44#include <asm/hw_breakpoint.h> 43#include <asm/hw_breakpoint.h>
diff --git a/arch/x86/kernel/iosf_mbi.c b/arch/x86/kernel/iosf_mbi.c
new file mode 100644
index 000000000000..c3aae6672843
--- /dev/null
+++ b/arch/x86/kernel/iosf_mbi.c
@@ -0,0 +1,226 @@
1/*
2 * IOSF-SB MailBox Interface Driver
3 * Copyright (c) 2013, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 *
 15 * The IOSF-SB is a fabric bus available on Atom-based SoCs that uses a
 16 * mailbox interface (MBI) to communicate with multiple devices. This
17 * driver implements access to this interface for those platforms that can
18 * enumerate the device using PCI.
19 */
20
21#include <linux/module.h>
22#include <linux/init.h>
23#include <linux/spinlock.h>
24#include <linux/pci.h>
25
26#include <asm/iosf_mbi.h>
27
28static DEFINE_SPINLOCK(iosf_mbi_lock);
29
30static inline u32 iosf_mbi_form_mcr(u8 op, u8 port, u8 offset)
31{
32 return (op << 24) | (port << 16) | (offset << 8) | MBI_ENABLE;
33}
34
35static struct pci_dev *mbi_pdev; /* one mbi device */
36
37static int iosf_mbi_pci_read_mdr(u32 mcrx, u32 mcr, u32 *mdr)
38{
39 int result;
40
41 if (!mbi_pdev)
42 return -ENODEV;
43
44 if (mcrx) {
45 result = pci_write_config_dword(mbi_pdev, MBI_MCRX_OFFSET,
46 mcrx);
47 if (result < 0)
48 goto fail_read;
49 }
50
51 result = pci_write_config_dword(mbi_pdev, MBI_MCR_OFFSET, mcr);
52 if (result < 0)
53 goto fail_read;
54
55 result = pci_read_config_dword(mbi_pdev, MBI_MDR_OFFSET, mdr);
56 if (result < 0)
57 goto fail_read;
58
59 return 0;
60
61fail_read:
62 dev_err(&mbi_pdev->dev, "PCI config access failed with %d\n", result);
63 return result;
64}
65
66static int iosf_mbi_pci_write_mdr(u32 mcrx, u32 mcr, u32 mdr)
67{
68 int result;
69
70 if (!mbi_pdev)
71 return -ENODEV;
72
73 result = pci_write_config_dword(mbi_pdev, MBI_MDR_OFFSET, mdr);
74 if (result < 0)
75 goto fail_write;
76
77 if (mcrx) {
78 result = pci_write_config_dword(mbi_pdev, MBI_MCRX_OFFSET,
79 mcrx);
80 if (result < 0)
81 goto fail_write;
82 }
83
84 result = pci_write_config_dword(mbi_pdev, MBI_MCR_OFFSET, mcr);
85 if (result < 0)
86 goto fail_write;
87
88 return 0;
89
90fail_write:
91 dev_err(&mbi_pdev->dev, "PCI config access failed with %d\n", result);
92 return result;
93}
94
95int iosf_mbi_read(u8 port, u8 opcode, u32 offset, u32 *mdr)
96{
97 u32 mcr, mcrx;
98 unsigned long flags;
99 int ret;
100
 101 /* Access to the GFX unit is handled by GPU code */
102 if (port == BT_MBI_UNIT_GFX) {
103 WARN_ON(1);
104 return -EPERM;
105 }
106
107 mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO);
108 mcrx = offset & MBI_MASK_HI;
109
110 spin_lock_irqsave(&iosf_mbi_lock, flags);
111 ret = iosf_mbi_pci_read_mdr(mcrx, mcr, mdr);
112 spin_unlock_irqrestore(&iosf_mbi_lock, flags);
113
114 return ret;
115}
116EXPORT_SYMBOL(iosf_mbi_read);
117
118int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr)
119{
120 u32 mcr, mcrx;
121 unsigned long flags;
122 int ret;
123
 124 /* Access to the GFX unit is handled by GPU code */
125 if (port == BT_MBI_UNIT_GFX) {
126 WARN_ON(1);
127 return -EPERM;
128 }
129
130 mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO);
131 mcrx = offset & MBI_MASK_HI;
132
133 spin_lock_irqsave(&iosf_mbi_lock, flags);
134 ret = iosf_mbi_pci_write_mdr(mcrx, mcr, mdr);
135 spin_unlock_irqrestore(&iosf_mbi_lock, flags);
136
137 return ret;
138}
139EXPORT_SYMBOL(iosf_mbi_write);
140
141int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask)
142{
143 u32 mcr, mcrx;
144 u32 value;
145 unsigned long flags;
146 int ret;
147
 148 /* Access to the GFX unit is handled by GPU code */
149 if (port == BT_MBI_UNIT_GFX) {
150 WARN_ON(1);
151 return -EPERM;
152 }
153
154 mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO);
155 mcrx = offset & MBI_MASK_HI;
156
157 spin_lock_irqsave(&iosf_mbi_lock, flags);
158
159 /* Read current mdr value */
160 ret = iosf_mbi_pci_read_mdr(mcrx, mcr & MBI_RD_MASK, &value);
161 if (ret < 0) {
162 spin_unlock_irqrestore(&iosf_mbi_lock, flags);
163 return ret;
164 }
165
166 /* Apply mask */
167 value &= ~mask;
168 mdr &= mask;
169 value |= mdr;
170
171 /* Write back */
172 ret = iosf_mbi_pci_write_mdr(mcrx, mcr | MBI_WR_MASK, value);
173
174 spin_unlock_irqrestore(&iosf_mbi_lock, flags);
175
176 return ret;
177}
178EXPORT_SYMBOL(iosf_mbi_modify);
179
180static int iosf_mbi_probe(struct pci_dev *pdev,
181 const struct pci_device_id *unused)
182{
183 int ret;
184
185 ret = pci_enable_device(pdev);
186 if (ret < 0) {
187 dev_err(&pdev->dev, "error: could not enable device\n");
188 return ret;
189 }
190
191 mbi_pdev = pci_dev_get(pdev);
192 return 0;
193}
194
195static DEFINE_PCI_DEVICE_TABLE(iosf_mbi_pci_ids) = {
196 { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x0F00) },
197 { 0, },
198};
199MODULE_DEVICE_TABLE(pci, iosf_mbi_pci_ids);
200
201static struct pci_driver iosf_mbi_pci_driver = {
202 .name = "iosf_mbi_pci",
203 .probe = iosf_mbi_probe,
204 .id_table = iosf_mbi_pci_ids,
205};
206
207static int __init iosf_mbi_init(void)
208{
209 return pci_register_driver(&iosf_mbi_pci_driver);
210}
211
212static void __exit iosf_mbi_exit(void)
213{
214 pci_unregister_driver(&iosf_mbi_pci_driver);
215 if (mbi_pdev) {
216 pci_dev_put(mbi_pdev);
217 mbi_pdev = NULL;
218 }
219}
220
221module_init(iosf_mbi_init);
222module_exit(iosf_mbi_exit);
223
224MODULE_AUTHOR("David E. Box <david.e.box@linux.intel.com>");
225MODULE_DESCRIPTION("IOSF Mailbox Interface accessor");
226MODULE_LICENSE("GPL v2");
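A sketch of how a platform driver might use this accessor; the port, opcode, and offset values below are illustrative placeholders, not taken from this patch:

	#include <linux/printk.h>
	#include <asm/iosf_mbi.h>

	static int example_set_bit0(u8 port, u8 rd_op, u8 wr_op, u32 offset)
	{
		u32 val;
		int ret;

		ret = iosf_mbi_read(port, rd_op, offset, &val);
		if (ret)
			return ret;
		pr_info("reg %#x = %#x\n", offset, val);

		/* locked read-modify-write: set bit 0, leave the rest alone */
		return iosf_mbi_modify(port, wr_op, offset, 0x1, 0x1);
	}

Note that iosf_mbi_modify() holds iosf_mbi_lock across the read with MBI_RD_MASK and the write-back with MBI_WR_MASK, so the update is atomic with respect to other mailbox users.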
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 22d0687e7fda..dbb60878b744 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -193,9 +193,13 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
193 if (!handle_irq(irq, regs)) { 193 if (!handle_irq(irq, regs)) {
194 ack_APIC_irq(); 194 ack_APIC_irq();
195 195
196 if (printk_ratelimit()) 196 if (irq != VECTOR_RETRIGGERED) {
197 pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n", 197 pr_emerg_ratelimited("%s: %d.%d No irq handler for vector (irq %d)\n",
198 __func__, smp_processor_id(), vector, irq); 198 __func__, smp_processor_id(),
199 vector, irq);
200 } else {
201 __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);
202 }
199 } 203 }
200 204
201 irq_exit(); 205 irq_exit();
@@ -262,6 +266,76 @@ __visible void smp_trace_x86_platform_ipi(struct pt_regs *regs)
262EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); 266EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
263 267
264#ifdef CONFIG_HOTPLUG_CPU 268#ifdef CONFIG_HOTPLUG_CPU
269/*
270 * This cpu is going to be removed and its vectors migrated to the remaining
271 * online cpus. Check to see if there are enough vectors in the remaining cpus.
272 * This function is protected by stop_machine().
273 */
274int check_irq_vectors_for_cpu_disable(void)
275{
276 int irq, cpu;
277 unsigned int this_cpu, vector, this_count, count;
278 struct irq_desc *desc;
279 struct irq_data *data;
280 struct cpumask affinity_new, online_new;
281
282 this_cpu = smp_processor_id();
283 cpumask_copy(&online_new, cpu_online_mask);
284 cpu_clear(this_cpu, online_new);
285
286 this_count = 0;
287 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
288 irq = __this_cpu_read(vector_irq[vector]);
289 if (irq >= 0) {
290 desc = irq_to_desc(irq);
291 data = irq_desc_get_irq_data(desc);
292 cpumask_copy(&affinity_new, data->affinity);
293 cpu_clear(this_cpu, affinity_new);
294
295 /* Do not count inactive or per-cpu irqs. */
296 if (!irq_has_action(irq) || irqd_is_per_cpu(data))
297 continue;
298
299 /*
300 * A single irq may be mapped to multiple
301 * cpu's vector_irq[] (for example IOAPIC cluster
302 * mode). In this case we have two
303 * possibilities:
304 *
 305 * 1) the resulting affinity mask is empty; that is,
 306 * the down'd cpu is the last cpu in the irq's
307 * affinity mask, or
308 *
309 * 2) the resulting affinity mask is no longer
310 * a subset of the online cpus but the affinity
 311 * mask is not zero; that is, the down'd cpu is the
 312 * last online cpu in a user-set affinity mask.
313 */
314 if (cpumask_empty(&affinity_new) ||
315 !cpumask_subset(&affinity_new, &online_new))
316 this_count++;
317 }
318 }
319
320 count = 0;
321 for_each_online_cpu(cpu) {
322 if (cpu == this_cpu)
323 continue;
324 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
325 vector++) {
326 if (per_cpu(vector_irq, cpu)[vector] < 0)
327 count++;
328 }
329 }
330
331 if (count < this_count) {
332 pr_warn("CPU %d disable failed: CPU has %u vectors assigned and there are only %u available.\n",
333 this_cpu, this_count, count);
334 return -ERANGE;
335 }
336 return 0;
337}
338
265/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ 339/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
266void fixup_irqs(void) 340void fixup_irqs(void)
267{ 341{
@@ -344,7 +418,7 @@ void fixup_irqs(void)
344 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { 418 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
345 unsigned int irr; 419 unsigned int irr;
346 420
347 if (__this_cpu_read(vector_irq[vector]) < 0) 421 if (__this_cpu_read(vector_irq[vector]) <= VECTOR_UNDEFINED)
348 continue; 422 continue;
349 423
350 irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); 424 irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
@@ -355,11 +429,14 @@ void fixup_irqs(void)
355 data = irq_desc_get_irq_data(desc); 429 data = irq_desc_get_irq_data(desc);
356 chip = irq_data_get_irq_chip(data); 430 chip = irq_data_get_irq_chip(data);
357 raw_spin_lock(&desc->lock); 431 raw_spin_lock(&desc->lock);
358 if (chip->irq_retrigger) 432 if (chip->irq_retrigger) {
359 chip->irq_retrigger(data); 433 chip->irq_retrigger(data);
434 __this_cpu_write(vector_irq[vector], VECTOR_RETRIGGERED);
435 }
360 raw_spin_unlock(&desc->lock); 436 raw_spin_unlock(&desc->lock);
361 } 437 }
362 __this_cpu_write(vector_irq[vector], -1); 438 if (__this_cpu_read(vector_irq[vector]) != VECTOR_RETRIGGERED)
439 __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);
363 } 440 }
364} 441}
365#endif 442#endif
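Worked example of check_irq_vectors_for_cpu_disable(): if the CPU being taken down is the last cpu in the affinity mask of 3 active irqs (this_count = 3) while the remaining online CPUs have only 2 unassigned vector_irq[] slots between them (count = 2), native_cpu_disable() now fails with -ERANGE instead of stranding an interrupt.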
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index a2a1fbc594ff..7f50156542fb 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -52,7 +52,7 @@ static struct irqaction irq2 = {
52}; 52};
53 53
54DEFINE_PER_CPU(vector_irq_t, vector_irq) = { 54DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
55 [0 ... NR_VECTORS - 1] = -1, 55 [0 ... NR_VECTORS - 1] = VECTOR_UNDEFINED,
56}; 56};
57 57
58int vector_used_by_percpu_irq(unsigned int vector) 58int vector_used_by_percpu_irq(unsigned int vector)
@@ -60,7 +60,7 @@ int vector_used_by_percpu_irq(unsigned int vector)
60 int cpu; 60 int cpu;
61 61
62 for_each_online_cpu(cpu) { 62 for_each_online_cpu(cpu) {
63 if (per_cpu(vector_irq, cpu)[vector] != -1) 63 if (per_cpu(vector_irq, cpu)[vector] > VECTOR_UNDEFINED)
64 return 1; 64 return 1;
65 } 65 }
66 66
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 836f8322960e..7ec1d5f8d283 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -39,7 +39,6 @@
39#include <linux/sched.h> 39#include <linux/sched.h>
40#include <linux/delay.h> 40#include <linux/delay.h>
41#include <linux/kgdb.h> 41#include <linux/kgdb.h>
42#include <linux/init.h>
43#include <linux/smp.h> 42#include <linux/smp.h>
44#include <linux/nmi.h> 43#include <linux/nmi.h>
45#include <linux/hw_breakpoint.h> 44#include <linux/hw_breakpoint.h>
diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c
new file mode 100644
index 000000000000..c2bedaea11f7
--- /dev/null
+++ b/arch/x86/kernel/ksysfs.c
@@ -0,0 +1,340 @@
1/*
2 * Architecture specific sysfs attributes in /sys/kernel
3 *
4 * Copyright (C) 2007, Intel Corp.
5 * Huang Ying <ying.huang@intel.com>
 6 * Copyright (C) 2013 Red Hat, Inc.
7 * Dave Young <dyoung@redhat.com>
8 *
9 * This file is released under the GPLv2
10 */
11
12#include <linux/kobject.h>
13#include <linux/string.h>
14#include <linux/sysfs.h>
15#include <linux/init.h>
16#include <linux/stat.h>
17#include <linux/slab.h>
18#include <linux/mm.h>
19
20#include <asm/io.h>
21#include <asm/setup.h>
22
23static ssize_t version_show(struct kobject *kobj,
24 struct kobj_attribute *attr, char *buf)
25{
26 return sprintf(buf, "0x%04x\n", boot_params.hdr.version);
27}
28
29static struct kobj_attribute boot_params_version_attr = __ATTR_RO(version);
30
31static ssize_t boot_params_data_read(struct file *fp, struct kobject *kobj,
32 struct bin_attribute *bin_attr,
33 char *buf, loff_t off, size_t count)
34{
35 memcpy(buf, (void *)&boot_params + off, count);
36 return count;
37}
38
39static struct bin_attribute boot_params_data_attr = {
40 .attr = {
41 .name = "data",
42 .mode = S_IRUGO,
43 },
44 .read = boot_params_data_read,
45 .size = sizeof(boot_params),
46};
47
48static struct attribute *boot_params_version_attrs[] = {
49 &boot_params_version_attr.attr,
50 NULL,
51};
52
53static struct bin_attribute *boot_params_data_attrs[] = {
54 &boot_params_data_attr,
55 NULL,
56};
57
58static struct attribute_group boot_params_attr_group = {
59 .attrs = boot_params_version_attrs,
60 .bin_attrs = boot_params_data_attrs,
61};
62
63static int kobj_to_setup_data_nr(struct kobject *kobj, int *nr)
64{
65 const char *name;
66
67 name = kobject_name(kobj);
68 return kstrtoint(name, 10, nr);
69}
70
71static int get_setup_data_paddr(int nr, u64 *paddr)
72{
73 int i = 0;
74 struct setup_data *data;
75 u64 pa_data = boot_params.hdr.setup_data;
76
77 while (pa_data) {
78 if (nr == i) {
79 *paddr = pa_data;
80 return 0;
81 }
82 data = ioremap_cache(pa_data, sizeof(*data));
83 if (!data)
84 return -ENOMEM;
85
86 pa_data = data->next;
87 iounmap(data);
88 i++;
89 }
90 return -EINVAL;
91}
92
93static int __init get_setup_data_size(int nr, size_t *size)
94{
95 int i = 0;
96 struct setup_data *data;
97 u64 pa_data = boot_params.hdr.setup_data;
98
99 while (pa_data) {
100 data = ioremap_cache(pa_data, sizeof(*data));
101 if (!data)
102 return -ENOMEM;
103 if (nr == i) {
104 *size = data->len;
105 iounmap(data);
106 return 0;
107 }
108
109 pa_data = data->next;
110 iounmap(data);
111 i++;
112 }
113 return -EINVAL;
114}
115
116static ssize_t type_show(struct kobject *kobj,
117 struct kobj_attribute *attr, char *buf)
118{
119 int nr, ret;
120 u64 paddr;
121 struct setup_data *data;
122
123 ret = kobj_to_setup_data_nr(kobj, &nr);
124 if (ret)
125 return ret;
126
127 ret = get_setup_data_paddr(nr, &paddr);
128 if (ret)
129 return ret;
130 data = ioremap_cache(paddr, sizeof(*data));
131 if (!data)
132 return -ENOMEM;
133
134 ret = sprintf(buf, "0x%x\n", data->type);
135 iounmap(data);
136 return ret;
137}
138
139static ssize_t setup_data_data_read(struct file *fp,
140 struct kobject *kobj,
141 struct bin_attribute *bin_attr,
142 char *buf,
143 loff_t off, size_t count)
144{
145 int nr, ret = 0;
146 u64 paddr;
147 struct setup_data *data;
148 void *p;
149
150 ret = kobj_to_setup_data_nr(kobj, &nr);
151 if (ret)
152 return ret;
153
154 ret = get_setup_data_paddr(nr, &paddr);
155 if (ret)
156 return ret;
157 data = ioremap_cache(paddr, sizeof(*data));
158 if (!data)
159 return -ENOMEM;
160
161 if (off > data->len) {
162 ret = -EINVAL;
163 goto out;
164 }
165
166 if (count > data->len - off)
167 count = data->len - off;
168
169 if (!count)
170 goto out;
171
172 ret = count;
173 p = ioremap_cache(paddr + sizeof(*data), data->len);
174 if (!p) {
175 ret = -ENOMEM;
176 goto out;
177 }
178 memcpy(buf, p + off, count);
179 iounmap(p);
180out:
181 iounmap(data);
182 return ret;
183}
184
185static struct kobj_attribute type_attr = __ATTR_RO(type);
186
187static struct bin_attribute data_attr = {
188 .attr = {
189 .name = "data",
190 .mode = S_IRUGO,
191 },
192 .read = setup_data_data_read,
193};
194
195static struct attribute *setup_data_type_attrs[] = {
196 &type_attr.attr,
197 NULL,
198};
199
200static struct bin_attribute *setup_data_data_attrs[] = {
201 &data_attr,
202 NULL,
203};
204
205static struct attribute_group setup_data_attr_group = {
206 .attrs = setup_data_type_attrs,
207 .bin_attrs = setup_data_data_attrs,
208};
209
210static int __init create_setup_data_node(struct kobject *parent,
211 struct kobject **kobjp, int nr)
212{
213 int ret = 0;
214 size_t size;
215 struct kobject *kobj;
 216 char name[16]; /* should be enough for setup_data node numbers */
217 snprintf(name, 16, "%d", nr);
218
219 kobj = kobject_create_and_add(name, parent);
220 if (!kobj)
221 return -ENOMEM;
222
223 ret = get_setup_data_size(nr, &size);
224 if (ret)
225 goto out_kobj;
226
227 data_attr.size = size;
228 ret = sysfs_create_group(kobj, &setup_data_attr_group);
229 if (ret)
230 goto out_kobj;
231 *kobjp = kobj;
232
233 return 0;
234out_kobj:
235 kobject_put(kobj);
236 return ret;
237}
238
239static void __init cleanup_setup_data_node(struct kobject *kobj)
240{
241 sysfs_remove_group(kobj, &setup_data_attr_group);
242 kobject_put(kobj);
243}
244
245static int __init get_setup_data_total_num(u64 pa_data, int *nr)
246{
247 int ret = 0;
248 struct setup_data *data;
249
250 *nr = 0;
251 while (pa_data) {
252 *nr += 1;
253 data = ioremap_cache(pa_data, sizeof(*data));
254 if (!data) {
255 ret = -ENOMEM;
256 goto out;
257 }
258 pa_data = data->next;
259 iounmap(data);
260 }
261
262out:
263 return ret;
264}
265
266static int __init create_setup_data_nodes(struct kobject *parent)
267{
268 struct kobject *setup_data_kobj, **kobjp;
269 u64 pa_data;
270 int i, j, nr, ret = 0;
271
272 pa_data = boot_params.hdr.setup_data;
273 if (!pa_data)
274 return 0;
275
276 setup_data_kobj = kobject_create_and_add("setup_data", parent);
277 if (!setup_data_kobj) {
278 ret = -ENOMEM;
279 goto out;
280 }
281
282 ret = get_setup_data_total_num(pa_data, &nr);
283 if (ret)
284 goto out_setup_data_kobj;
285
286 kobjp = kmalloc(sizeof(*kobjp) * nr, GFP_KERNEL);
287 if (!kobjp) {
288 ret = -ENOMEM;
289 goto out_setup_data_kobj;
290 }
291
292 for (i = 0; i < nr; i++) {
293 ret = create_setup_data_node(setup_data_kobj, kobjp + i, i);
294 if (ret)
295 goto out_clean_nodes;
296 }
297
298 kfree(kobjp);
299 return 0;
300
301out_clean_nodes:
 302 for (j = i - 1; j >= 0; j--)
303 cleanup_setup_data_node(*(kobjp + j));
304 kfree(kobjp);
305out_setup_data_kobj:
306 kobject_put(setup_data_kobj);
307out:
308 return ret;
309}
310
311static int __init boot_params_ksysfs_init(void)
312{
313 int ret;
314 struct kobject *boot_params_kobj;
315
316 boot_params_kobj = kobject_create_and_add("boot_params",
317 kernel_kobj);
318 if (!boot_params_kobj) {
319 ret = -ENOMEM;
320 goto out;
321 }
322
323 ret = sysfs_create_group(boot_params_kobj, &boot_params_attr_group);
324 if (ret)
325 goto out_boot_params_kobj;
326
327 ret = create_setup_data_nodes(boot_params_kobj);
328 if (ret)
329 goto out_create_group;
330
331 return 0;
332out_create_group:
333 sysfs_remove_group(boot_params_kobj, &boot_params_attr_group);
334out_boot_params_kobj:
335 kobject_put(boot_params_kobj);
336out:
337 return ret;
338}
339
340arch_initcall(boot_params_ksysfs_init);
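The resulting sysfs layout, with paths derived from the kobject and attribute names above (the number of setup_data nodes depends on what the bootloader passed):

	/sys/kernel/boot_params/version             boot protocol version, e.g. "0x020c"
	/sys/kernel/boot_params/data                raw struct boot_params image
	/sys/kernel/boot_params/setup_data/0/type   type of the first setup_data node
	/sys/kernel/boot_params/setup_data/0/data   payload of the first setup_data node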
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 5b19e4d78b00..1667b1de8d5d 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -9,7 +9,6 @@
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/kexec.h> 10#include <linux/kexec.h>
11#include <linux/delay.h> 11#include <linux/delay.h>
12#include <linux/init.h>
13#include <linux/numa.h> 12#include <linux/numa.h>
14#include <linux/ftrace.h> 13#include <linux/ftrace.h>
15#include <linux/suspend.h> 14#include <linux/suspend.h>
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index 871be4a84c7d..da15918d1c81 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -3,7 +3,6 @@
3#include <linux/dma-mapping.h> 3#include <linux/dma-mapping.h>
4#include <linux/scatterlist.h> 4#include <linux/scatterlist.h>
5#include <linux/string.h> 5#include <linux/string.h>
6#include <linux/init.h>
7#include <linux/gfp.h> 6#include <linux/gfp.h>
8#include <linux/pci.h> 7#include <linux/pci.h>
9#include <linux/mm.h> 8#include <linux/mm.h>
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 6f1236c29c4b..0de43e98ce08 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -24,7 +24,6 @@
24#include <linux/interrupt.h> 24#include <linux/interrupt.h>
25#include <linux/delay.h> 25#include <linux/delay.h>
26#include <linux/reboot.h> 26#include <linux/reboot.h>
27#include <linux/init.h>
28#include <linux/mc146818rtc.h> 27#include <linux/mc146818rtc.h>
29#include <linux/module.h> 28#include <linux/module.h>
30#include <linux/kallsyms.h> 29#include <linux/kallsyms.h>
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index da3c599584a3..c752cb43e52f 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -558,6 +558,17 @@ void native_machine_shutdown(void)
558{ 558{
559 /* Stop the cpus and apics */ 559 /* Stop the cpus and apics */
560#ifdef CONFIG_X86_IO_APIC 560#ifdef CONFIG_X86_IO_APIC
561 /*
562 * Disabling IO APIC before local APIC is a workaround for
563 * erratum AVR31 in "Intel Atom Processor C2000 Product Family
564 * Specification Update". In this situation, interrupts that target
565 * a Logical Processor whose Local APIC is either in the process of
566 * being hardware disabled or software disabled are neither delivered
567 * nor discarded. When this erratum occurs, the processor may hang.
568 *
569 * Even without the erratum, it still makes sense to quiet IO APIC
570 * before disabling Local APIC.
571 */
561 disable_IO_APIC(); 572 disable_IO_APIC();
562#endif 573#endif
563 574
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index cb233bc9dee3..c9675594d7ca 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -295,6 +295,8 @@ static void __init reserve_brk(void)
295 _brk_start = 0; 295 _brk_start = 0;
296} 296}
297 297
298u64 relocated_ramdisk;
299
298#ifdef CONFIG_BLK_DEV_INITRD 300#ifdef CONFIG_BLK_DEV_INITRD
299 301
300static u64 __init get_ramdisk_image(void) 302static u64 __init get_ramdisk_image(void)
@@ -321,25 +323,24 @@ static void __init relocate_initrd(void)
321 u64 ramdisk_image = get_ramdisk_image(); 323 u64 ramdisk_image = get_ramdisk_image();
322 u64 ramdisk_size = get_ramdisk_size(); 324 u64 ramdisk_size = get_ramdisk_size();
323 u64 area_size = PAGE_ALIGN(ramdisk_size); 325 u64 area_size = PAGE_ALIGN(ramdisk_size);
324 u64 ramdisk_here;
325 unsigned long slop, clen, mapaddr; 326 unsigned long slop, clen, mapaddr;
326 char *p, *q; 327 char *p, *q;
327 328
328 /* We need to move the initrd down into directly mapped mem */ 329 /* We need to move the initrd down into directly mapped mem */
329 ramdisk_here = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), 330 relocated_ramdisk = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
330 area_size, PAGE_SIZE); 331 area_size, PAGE_SIZE);
331 332
332 if (!ramdisk_here) 333 if (!relocated_ramdisk)
333 panic("Cannot find place for new RAMDISK of size %lld\n", 334 panic("Cannot find place for new RAMDISK of size %lld\n",
334 ramdisk_size); 335 ramdisk_size);
335 336
336 /* Note: this includes all the mem currently occupied by 337 /* Note: this includes all the mem currently occupied by
337 the initrd, we rely on that fact to keep the data intact. */ 338 the initrd, we rely on that fact to keep the data intact. */
338 memblock_reserve(ramdisk_here, area_size); 339 memblock_reserve(relocated_ramdisk, area_size);
339 initrd_start = ramdisk_here + PAGE_OFFSET; 340 initrd_start = relocated_ramdisk + PAGE_OFFSET;
340 initrd_end = initrd_start + ramdisk_size; 341 initrd_end = initrd_start + ramdisk_size;
341 printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n", 342 printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n",
342 ramdisk_here, ramdisk_here + ramdisk_size - 1); 343 relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1);
343 344
344 q = (char *)initrd_start; 345 q = (char *)initrd_start;
345 346
@@ -363,7 +364,7 @@ static void __init relocate_initrd(void)
363 printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to" 364 printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to"
364 " [mem %#010llx-%#010llx]\n", 365 " [mem %#010llx-%#010llx]\n",
365 ramdisk_image, ramdisk_image + ramdisk_size - 1, 366 ramdisk_image, ramdisk_image + ramdisk_size - 1,
366 ramdisk_here, ramdisk_here + ramdisk_size - 1); 367 relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1);
367} 368}
368 369
369static void __init early_reserve_initrd(void) 370static void __init early_reserve_initrd(void)
@@ -447,6 +448,9 @@ static void __init parse_setup_data(void)
447 case SETUP_DTB: 448 case SETUP_DTB:
448 add_dtb(pa_data); 449 add_dtb(pa_data);
449 break; 450 break;
451 case SETUP_EFI:
452 parse_efi_setup(pa_data, data_len);
453 break;
450 default: 454 default:
451 break; 455 break;
452 } 456 }
@@ -824,6 +828,20 @@ static void __init trim_low_memory_range(void)
824} 828}
825 829
826/* 830/*
831 * Dump out kernel offset information on panic.
832 */
833static int
834dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
835{
836 pr_emerg("Kernel Offset: 0x%lx from 0x%lx "
837 "(relocation range: 0x%lx-0x%lx)\n",
838 (unsigned long)&_text - __START_KERNEL, __START_KERNEL,
839 __START_KERNEL_map, MODULES_VADDR-1);
840
841 return 0;
842}
843
844/*
827 * Determine if we were loaded by an EFI loader. If so, then we have also been 845 * Determine if we were loaded by an EFI loader. If so, then we have also been
828 * passed the efi memmap, systab, etc., so we should use these data structures 846 * passed the efi memmap, systab, etc., so we should use these data structures
829 * for initialization. Note, the efi init code path is determined by the 847 * for initialization. Note, the efi init code path is determined by the
@@ -924,8 +942,6 @@ void __init setup_arch(char **cmdline_p)
924 iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; 942 iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1;
925 setup_memory_map(); 943 setup_memory_map();
926 parse_setup_data(); 944 parse_setup_data();
927 /* update the e820_saved too */
928 e820_reserve_setup_data();
929 945
930 copy_edd(); 946 copy_edd();
931 947
@@ -987,6 +1003,8 @@ void __init setup_arch(char **cmdline_p)
987 early_dump_pci_devices(); 1003 early_dump_pci_devices();
988#endif 1004#endif
989 1005
1006 /* update the e820_saved too */
1007 e820_reserve_setup_data();
990 finish_e820_parsing(); 1008 finish_e820_parsing();
991 1009
992 if (efi_enabled(EFI_BOOT)) 1010 if (efi_enabled(EFI_BOOT))
@@ -1101,7 +1119,7 @@ void __init setup_arch(char **cmdline_p)
1101 1119
1102 setup_real_mode(); 1120 setup_real_mode();
1103 1121
1104 memblock_set_current_limit(get_max_mapped()); 1122 memblock_set_current_limit(get_max_low_mapped());
1105 dma_contiguous_reserve(0); 1123 dma_contiguous_reserve(0);
1106 1124
1107 /* 1125 /*
@@ -1248,3 +1266,15 @@ void __init i386_reserve_resources(void)
1248} 1266}
1249 1267
1250#endif /* CONFIG_X86_32 */ 1268#endif /* CONFIG_X86_32 */
1269
1270static struct notifier_block kernel_offset_notifier = {
1271 .notifier_call = dump_kernel_offset
1272};
1273
1274static int __init register_kernel_offset_dumper(void)
1275{
1276 atomic_notifier_chain_register(&panic_notifier_list,
1277 &kernel_offset_notifier);
1278 return 0;
1279}
1280__initcall(register_kernel_offset_dumper);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 85dc05a3aa02..a32da804252e 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1312,6 +1312,12 @@ void cpu_disable_common(void)
1312 1312
1313int native_cpu_disable(void) 1313int native_cpu_disable(void)
1314{ 1314{
1315 int ret;
1316
1317 ret = check_irq_vectors_for_cpu_disable();
1318 if (ret)
1319 return ret;
1320
1315 clear_local_APIC(); 1321 clear_local_APIC();
1316 1322
1317 cpu_disable_common(); 1323 cpu_disable_common();
@@ -1417,7 +1423,9 @@ static inline void mwait_play_dead(void)
1417 * The WBINVD is insufficient due to the spurious-wakeup 1423 * The WBINVD is insufficient due to the spurious-wakeup
1418 * case where we return around the loop. 1424 * case where we return around the loop.
1419 */ 1425 */
1426 mb();
1420 clflush(mwait_ptr); 1427 clflush(mwait_ptr);
1428 mb();
1421 __monitor(mwait_ptr, 0, 0); 1429 __monitor(mwait_ptr, 0, 0);
1422 mb(); 1430 mb();
1423 __mwait(eax, 0); 1431 __mwait(eax, 0);
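
The rationale for the added mb() pair: on some CPUs CLFLUSH is not ordered with respect to the surrounding operations unless it is fenced, so without the barriers the flush of the monitored line could be reordered around the MONITOR that re-arms it. A hedged sketch of the resulting sequence as a standalone helper; clflush_and_monitor() is a hypothetical name, not an existing kernel API.

/* Sketch only, assuming the same semantics as mwait_play_dead() above. */
static inline void clflush_and_monitor(void *line)
{
        mb();                   /* order earlier accesses before the flush */
        clflush(line);          /* CLFLUSH alone is weakly ordered */
        mb();                   /* flush completes before MONITOR is armed */
        __monitor(line, 0, 0);  /* arm the monitor on the flushed line */
        mb();
}
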
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index b857ed890b4c..57409f6b8c62 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -211,21 +211,17 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
211 exception_exit(prev_state); \ 211 exception_exit(prev_state); \
212} 212}
213 213
214DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV, 214DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip )
215 regs->ip) 215DO_ERROR (X86_TRAP_OF, SIGSEGV, "overflow", overflow )
216DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow) 216DO_ERROR (X86_TRAP_BR, SIGSEGV, "bounds", bounds )
217DO_ERROR(X86_TRAP_BR, SIGSEGV, "bounds", bounds) 217DO_ERROR_INFO(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip )
218DO_ERROR_INFO(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, 218DO_ERROR (X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun )
219 regs->ip) 219DO_ERROR (X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS )
220DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun", 220DO_ERROR (X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present )
221 coprocessor_segment_overrun)
222DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS)
223DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present)
224#ifdef CONFIG_X86_32 221#ifdef CONFIG_X86_32
225DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment) 222DO_ERROR (X86_TRAP_SS, SIGBUS, "stack segment", stack_segment )
226#endif 223#endif
227DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check, 224DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0 )
228 BUS_ADRALN, 0)
229 225
230#ifdef CONFIG_X86_64 226#ifdef CONFIG_X86_64
231/* Runs on IST stack */ 227/* Runs on IST stack */
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 930e5d48f560..a3acbac2ee72 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -11,6 +11,7 @@
11#include <linux/clocksource.h> 11#include <linux/clocksource.h>
12#include <linux/percpu.h> 12#include <linux/percpu.h>
13#include <linux/timex.h> 13#include <linux/timex.h>
14#include <linux/static_key.h>
14 15
15#include <asm/hpet.h> 16#include <asm/hpet.h>
16#include <asm/timer.h> 17#include <asm/timer.h>
@@ -37,13 +38,244 @@ static int __read_mostly tsc_unstable;
37 erroneous rdtsc usage on !cpu_has_tsc processors */ 38 erroneous rdtsc usage on !cpu_has_tsc processors */
38static int __read_mostly tsc_disabled = -1; 39static int __read_mostly tsc_disabled = -1;
39 40
41static struct static_key __use_tsc = STATIC_KEY_INIT;
42
40int tsc_clocksource_reliable; 43int tsc_clocksource_reliable;
44
45/*
 46 * Use a ring-buffer-like data structure, where a writer advances the head by
47 * writing a new data entry and a reader advances the tail when it observes a
48 * new entry.
49 *
50 * Writers are made to wait on readers until there's space to write a new
51 * entry.
52 *
53 * This means that we can always use an {offset, mul} pair to compute a ns
54 * value that is 'roughly' in the right direction, even if we're writing a new
55 * {offset, mul} pair during the clock read.
56 *
 57 * The down-side is that we can no longer guarantee strict monotonicity
 58 * (assuming the TSC was monotonic to begin with): although we compute the
 59 * intersection point of the two clock slopes and make sure time is
 60 * continuous at the point of switching, we can no longer guarantee a reader
 61 * is strictly before or after the switch point.
62 *
 63 * It does mean a reader no longer needs to disable IRQs in order to avoid
 64 * cpufreq updates messing with its readings, and similarly an NMI reader
 65 * no longer runs the risk of hitting half-written state.
66 */
67
68struct cyc2ns {
69 struct cyc2ns_data data[2]; /* 0 + 2*24 = 48 */
70 struct cyc2ns_data *head; /* 48 + 8 = 56 */
71 struct cyc2ns_data *tail; /* 56 + 8 = 64 */
72}; /* exactly fits one cacheline */
73
74static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
75
76struct cyc2ns_data *cyc2ns_read_begin(void)
77{
78 struct cyc2ns_data *head;
79
80 preempt_disable();
81
82 head = this_cpu_read(cyc2ns.head);
83 /*
84 * Ensure we observe the entry when we observe the pointer to it.
 85 * Matches the smp_wmb() in cyc2ns_write_end().
86 */
87 smp_read_barrier_depends();
88 head->__count++;
89 barrier();
90
91 return head;
92}
93
94void cyc2ns_read_end(struct cyc2ns_data *head)
95{
96 barrier();
97 /*
 98 * If we're the outermost nested read, update the tail pointer
99 * when we're done. This notifies possible pending writers
100 * that we've observed the head pointer and that the other
101 * entry is now free.
102 */
103 if (!--head->__count) {
104 /*
105 * x86-TSO does not reorder writes with older reads;
106 * therefore once this write becomes visible to another
 107 * CPU, we must be finished reading the cyc2ns_data.
 108 *
 109 * Matches the wait loop in cyc2ns_write_begin().
110 */
111 this_cpu_write(cyc2ns.tail, head);
112 }
113 preempt_enable();
114}
115
116/*
117 * Begin writing a new @data entry for @cpu.
118 *
119 * Assumes some sort of write side lock; currently 'provided' by the assumption
120 * that cpufreq will call its notifiers sequentially.
121 */
122static struct cyc2ns_data *cyc2ns_write_begin(int cpu)
123{
124 struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
125 struct cyc2ns_data *data = c2n->data;
126
127 if (data == c2n->head)
128 data++;
129
130 /* XXX send an IPI to @cpu in order to guarantee a read? */
131
132 /*
133 * When we observe the tail write from cyc2ns_read_end(),
 134 * the CPU must be done with that entry and it's safe
135 * to start writing to it.
136 */
137 while (c2n->tail == data)
138 cpu_relax();
139
140 return data;
141}
142
143static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data)
144{
145 struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
146
147 /*
148 * Ensure the @data writes are visible before we publish the
 149 * entry. Matches the data dependency in cyc2ns_read_begin().
150 */
151 smp_wmb();
152
153 ACCESS_ONCE(c2n->head) = data;
154}
155
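
Together, the pair above gives a reader a stable {mul, shift, offset} snapshot without disabling IRQs. A hedged sketch of the intended API usage follows; example_cycles_to_ns() is illustrative only, and cycles_2_ns() below open-codes the same steps to save a few instructions.

/* Sketch only: consume a cyc2ns snapshot via the read-side API above. */
static u64 example_cycles_to_ns(u64 cyc)
{
        struct cyc2ns_data *data = cyc2ns_read_begin(); /* pins an entry */
        u64 ns;

        ns = data->cyc2ns_offset +
             mul_u64_u32_shr(cyc, data->cyc2ns_mul, data->cyc2ns_shift);

        cyc2ns_read_end(data);  /* lets pending writers reuse the entry */
        return ns;
}
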
156/*
157 * Accelerators for sched_clock()
158 * convert from cycles(64bits) => nanoseconds (64bits)
159 * basic equation:
160 * ns = cycles / (freq / ns_per_sec)
161 * ns = cycles * (ns_per_sec / freq)
162 * ns = cycles * (10^9 / (cpu_khz * 10^3))
163 * ns = cycles * (10^6 / cpu_khz)
164 *
165 * Then we use scaling math (suggested by george@mvista.com) to get:
166 * ns = cycles * (10^6 * SC / cpu_khz) / SC
167 * ns = cycles * cyc2ns_scale / SC
168 *
169 * And since SC is a constant power of two, we can convert the div
170 * into a shift.
171 *
172 * We can use khz divisor instead of mhz to keep a better precision, since
173 * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
174 * (mathieu.desnoyers@polymtl.ca)
175 *
176 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
177 */
178
179#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
180
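
A worked instance of the scaling math, assuming a hypothetical 2 GHz part (cpu_khz = 2,000,000):

/*
 * cyc2ns_mul = DIV_ROUND(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, cpu_khz)
 *            = (1000000 << 10) / 2000000, rounded
 *            = 512
 *
 * ns = (cycles * 512) >> 10 = cycles / 2,
 * i.e. 0.5 ns per cycle, as expected at 2 GHz.
 */
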
181static void cyc2ns_data_init(struct cyc2ns_data *data)
182{
183 data->cyc2ns_mul = 1U << CYC2NS_SCALE_FACTOR;
184 data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
185 data->cyc2ns_offset = 0;
186 data->__count = 0;
187}
188
189static void cyc2ns_init(int cpu)
190{
191 struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu);
192
193 cyc2ns_data_init(&c2n->data[0]);
194 cyc2ns_data_init(&c2n->data[1]);
195
196 c2n->head = c2n->data;
197 c2n->tail = c2n->data;
198}
199
200static inline unsigned long long cycles_2_ns(unsigned long long cyc)
201{
202 struct cyc2ns_data *data, *tail;
203 unsigned long long ns;
204
205 /*
 206 * See cyc2ns_read_*() for details; replicated here to avoid the
 207 * few extra instructions that the abstraction would add.
 208 * Notably, it allows us to do the __count and tail update dance
 209 * only when it's actually needed.
210 */
211
212 preempt_disable();
213 data = this_cpu_read(cyc2ns.head);
214 tail = this_cpu_read(cyc2ns.tail);
215
216 if (likely(data == tail)) {
217 ns = data->cyc2ns_offset;
218 ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
219 } else {
220 data->__count++;
221
222 barrier();
223
224 ns = data->cyc2ns_offset;
225 ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
226
227 barrier();
228
229 if (!--data->__count)
230 this_cpu_write(cyc2ns.tail, data);
231 }
232 preempt_enable();
233
234 return ns;
235}
236
237/* XXX surely we already have this someplace in the kernel?! */
238#define DIV_ROUND(n, d) (((n) + ((d) / 2)) / (d))
239
240static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
241{
242 unsigned long long tsc_now, ns_now;
243 struct cyc2ns_data *data;
244 unsigned long flags;
245
246 local_irq_save(flags);
247 sched_clock_idle_sleep_event();
248
249 if (!cpu_khz)
250 goto done;
251
252 data = cyc2ns_write_begin(cpu);
253
254 rdtscll(tsc_now);
255 ns_now = cycles_2_ns(tsc_now);
256
257 /*
258 * Compute a new multiplier as per the above comment and ensure our
259 * time function is continuous; see the comment near struct
260 * cyc2ns_data.
261 */
262 data->cyc2ns_mul = DIV_ROUND(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, cpu_khz);
263 data->cyc2ns_shift = CYC2NS_SCALE_FACTOR;
264 data->cyc2ns_offset = ns_now -
265 mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR);
266
267 cyc2ns_write_end(cpu, data);
268
269done:
270 sched_clock_idle_wakeup_event(0);
271 local_irq_restore(flags);
272}
41/* 273/*
42 * Scheduler clock - returns current time in nanosec units. 274 * Scheduler clock - returns current time in nanosec units.
43 */ 275 */
44u64 native_sched_clock(void) 276u64 native_sched_clock(void)
45{ 277{
46 u64 this_offset; 278 u64 tsc_now;
47 279
48 /* 280 /*
49 * Fall back to jiffies if there's no TSC available: 281 * Fall back to jiffies if there's no TSC available:
@@ -53,16 +285,16 @@ u64 native_sched_clock(void)
53 * very important for it to be as fast as the platform 285 * very important for it to be as fast as the platform
54 * can achieve it. ) 286 * can achieve it. )
55 */ 287 */
56 if (unlikely(tsc_disabled)) { 288 if (!static_key_false(&__use_tsc)) {
57 /* No locking but a rare wrong value is not a big deal: */ 289 /* No locking but a rare wrong value is not a big deal: */
58 return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); 290 return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
59 } 291 }
60 292
61 /* read the Time Stamp Counter: */ 293 /* read the Time Stamp Counter: */
62 rdtscll(this_offset); 294 rdtscll(tsc_now);
63 295
64 /* return the value in ns */ 296 /* return the value in ns */
65 return __cycles_2_ns(this_offset); 297 return cycles_2_ns(tsc_now);
66} 298}
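
The __use_tsc conversion above is the jump-label idiom: static_key_false() compiles to a patchable branch that is a no-op on the fast path, so the jiffies fallback costs nothing once the key is enabled. A minimal hedged sketch of the same idiom with the 3.13-era API; feature_key and the stub functions are illustrative, not part of the patch.

/* Sketch only: default-off key, flipped once at init. */
static struct static_key feature_key = STATIC_KEY_INIT;

static u64 slow_fallback_clock(void);   /* illustrative stubs */
static u64 fast_clock(void);

static u64 read_clock(void)
{
        if (!static_key_false(&feature_key))    /* patched out once enabled */
                return slow_fallback_clock();
        return fast_clock();
}

static void __init enable_fast_clock(void)
{
        static_key_slow_inc(&feature_key);      /* rewrites every call site */
}
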
67 299
68/* We need to define a real function for sched_clock, to override the 300/* We need to define a real function for sched_clock, to override the
@@ -419,6 +651,16 @@ unsigned long native_calibrate_tsc(void)
419 unsigned long flags, latch, ms, fast_calibrate; 651 unsigned long flags, latch, ms, fast_calibrate;
420 int hpet = is_hpet_enabled(), i, loopmin; 652 int hpet = is_hpet_enabled(), i, loopmin;
421 653
654 /* Calibrate TSC using MSR for Intel Atom SoCs */
655 local_irq_save(flags);
656 i = try_msr_calibrate_tsc(&fast_calibrate);
657 local_irq_restore(flags);
658 if (i >= 0) {
659 if (i == 0)
660 pr_warn("Fast TSC calibration using MSR failed\n");
661 return fast_calibrate;
662 }
663
422 local_irq_save(flags); 664 local_irq_save(flags);
423 fast_calibrate = quick_pit_calibrate(); 665 fast_calibrate = quick_pit_calibrate();
424 local_irq_restore(flags); 666 local_irq_restore(flags);
@@ -589,61 +831,11 @@ int recalibrate_cpu_khz(void)
589EXPORT_SYMBOL(recalibrate_cpu_khz); 831EXPORT_SYMBOL(recalibrate_cpu_khz);
590 832
591 833
592/* Accelerators for sched_clock()
593 * convert from cycles(64bits) => nanoseconds (64bits)
594 * basic equation:
595 * ns = cycles / (freq / ns_per_sec)
596 * ns = cycles * (ns_per_sec / freq)
597 * ns = cycles * (10^9 / (cpu_khz * 10^3))
598 * ns = cycles * (10^6 / cpu_khz)
599 *
600 * Then we use scaling math (suggested by george@mvista.com) to get:
601 * ns = cycles * (10^6 * SC / cpu_khz) / SC
602 * ns = cycles * cyc2ns_scale / SC
603 *
604 * And since SC is a constant power of two, we can convert the div
605 * into a shift.
606 *
607 * We can use khz divisor instead of mhz to keep a better precision, since
608 * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
609 * (mathieu.desnoyers@polymtl.ca)
610 *
611 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
612 */
613
614DEFINE_PER_CPU(unsigned long, cyc2ns);
615DEFINE_PER_CPU(unsigned long long, cyc2ns_offset);
616
617static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
618{
619 unsigned long long tsc_now, ns_now, *offset;
620 unsigned long flags, *scale;
621
622 local_irq_save(flags);
623 sched_clock_idle_sleep_event();
624
625 scale = &per_cpu(cyc2ns, cpu);
626 offset = &per_cpu(cyc2ns_offset, cpu);
627
628 rdtscll(tsc_now);
629 ns_now = __cycles_2_ns(tsc_now);
630
631 if (cpu_khz) {
632 *scale = ((NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) +
633 cpu_khz / 2) / cpu_khz;
634 *offset = ns_now - mult_frac(tsc_now, *scale,
635 (1UL << CYC2NS_SCALE_FACTOR));
636 }
637
638 sched_clock_idle_wakeup_event(0);
639 local_irq_restore(flags);
640}
641
642static unsigned long long cyc2ns_suspend; 834static unsigned long long cyc2ns_suspend;
643 835
644void tsc_save_sched_clock_state(void) 836void tsc_save_sched_clock_state(void)
645{ 837{
646 if (!sched_clock_stable) 838 if (!sched_clock_stable())
647 return; 839 return;
648 840
649 cyc2ns_suspend = sched_clock(); 841 cyc2ns_suspend = sched_clock();
@@ -663,16 +855,26 @@ void tsc_restore_sched_clock_state(void)
663 unsigned long flags; 855 unsigned long flags;
664 int cpu; 856 int cpu;
665 857
666 if (!sched_clock_stable) 858 if (!sched_clock_stable())
667 return; 859 return;
668 860
669 local_irq_save(flags); 861 local_irq_save(flags);
670 862
671 __this_cpu_write(cyc2ns_offset, 0); 863 /*
 864 * We're coming out of suspend, there's no concurrency yet; don't
865 * bother being nice about the RCU stuff, just write to both
866 * data fields.
867 */
868
869 this_cpu_write(cyc2ns.data[0].cyc2ns_offset, 0);
870 this_cpu_write(cyc2ns.data[1].cyc2ns_offset, 0);
871
672 offset = cyc2ns_suspend - sched_clock(); 872 offset = cyc2ns_suspend - sched_clock();
673 873
674 for_each_possible_cpu(cpu) 874 for_each_possible_cpu(cpu) {
675 per_cpu(cyc2ns_offset, cpu) = offset; 875 per_cpu(cyc2ns.data[0].cyc2ns_offset, cpu) = offset;
876 per_cpu(cyc2ns.data[1].cyc2ns_offset, cpu) = offset;
877 }
676 878
677 local_irq_restore(flags); 879 local_irq_restore(flags);
678} 880}
@@ -795,7 +997,7 @@ void mark_tsc_unstable(char *reason)
795{ 997{
796 if (!tsc_unstable) { 998 if (!tsc_unstable) {
797 tsc_unstable = 1; 999 tsc_unstable = 1;
798 sched_clock_stable = 0; 1000 clear_sched_clock_stable();
799 disable_sched_clock_irqtime(); 1001 disable_sched_clock_irqtime();
800 pr_info("Marking TSC unstable due to %s\n", reason); 1002 pr_info("Marking TSC unstable due to %s\n", reason);
801 /* Change only the rating, when not registered */ 1003 /* Change only the rating, when not registered */
@@ -995,14 +1197,18 @@ void __init tsc_init(void)
995 * speed as the bootup CPU. (cpufreq notifiers will fix this 1197 * speed as the bootup CPU. (cpufreq notifiers will fix this
996 * up if their speed diverges) 1198 * up if their speed diverges)
997 */ 1199 */
998 for_each_possible_cpu(cpu) 1200 for_each_possible_cpu(cpu) {
1201 cyc2ns_init(cpu);
999 set_cyc2ns_scale(cpu_khz, cpu); 1202 set_cyc2ns_scale(cpu_khz, cpu);
1203 }
1000 1204
1001 if (tsc_disabled > 0) 1205 if (tsc_disabled > 0)
1002 return; 1206 return;
1003 1207
1004 /* now allow native_sched_clock() to use rdtsc */ 1208 /* now allow native_sched_clock() to use rdtsc */
1209
1005 tsc_disabled = 0; 1210 tsc_disabled = 0;
1211 static_key_slow_inc(&__use_tsc);
1006 1212
1007 if (!no_sched_irq_time) 1213 if (!no_sched_irq_time)
1008 enable_sched_clock_irqtime(); 1214 enable_sched_clock_irqtime();
diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c
new file mode 100644
index 000000000000..8b5434f4389f
--- /dev/null
+++ b/arch/x86/kernel/tsc_msr.c
@@ -0,0 +1,127 @@
1/*
 2 * tsc_msr.c - MSR-based TSC calibration on Intel Atom SoC platforms.
3 *
 4 * The TSC in Intel Atom SoCs runs at a constant rate, which can be computed
 5 * by this formula:
 6 * <maximum core-clock to bus-clock ratio> * <maximum resolved frequency>
 7 * See the Intel 64 and IA-32 System Programming Guide, sections 16.12 and
 8 * 30.11.5, for details.
 9 * In particular, some Intel Atom SoCs don't have a PIT (i8254) or HPET, so
 10 * MSR-based calibration is the only option.
11 *
12 *
13 * Copyright (C) 2013 Intel Corporation
14 * Author: Bin Gao <bin.gao@intel.com>
15 *
16 * This file is released under the GPLv2.
17 */
18
19#include <linux/kernel.h>
20#include <asm/processor.h>
21#include <asm/setup.h>
22#include <asm/apic.h>
23#include <asm/param.h>
24
25/* CPU reference clock frequency: in KHz */
26#define FREQ_83 83200
27#define FREQ_100 99840
28#define FREQ_133 133200
29#define FREQ_166 166400
30
31#define MAX_NUM_FREQS 8
32
33/*
34 * According to Intel 64 and IA-32 System Programming Guide,
35 * if MSR_PERF_STAT[31] is set, the maximum resolved bus ratio can be
36 * read in MSR_PLATFORM_ID[12:8], otherwise in MSR_PERF_STAT[44:40].
 37 * Unfortunately some Intel Atom SoCs aren't quite compliant with this,
 38 * so we need to manually differentiate SoC families. This is what the
39 * field msr_plat does.
40 */
41struct freq_desc {
42 u8 x86_family; /* CPU family */
43 u8 x86_model; /* model */
44 u8 msr_plat; /* 1: use MSR_PLATFORM_INFO, 0: MSR_IA32_PERF_STATUS */
45 u32 freqs[MAX_NUM_FREQS];
46};
47
48static struct freq_desc freq_desc_tables[] = {
49 /* PNW */
50 { 6, 0x27, 0, { 0, 0, 0, 0, 0, FREQ_100, 0, FREQ_83 } },
51 /* CLV+ */
52 { 6, 0x35, 0, { 0, FREQ_133, 0, 0, 0, FREQ_100, 0, FREQ_83 } },
53 /* TNG */
54 { 6, 0x4a, 1, { 0, FREQ_100, FREQ_133, 0, 0, 0, 0, 0 } },
55 /* VLV2 */
56 { 6, 0x37, 1, { 0, FREQ_100, FREQ_133, FREQ_166, 0, 0, 0, 0 } },
57 /* ANN */
58 { 6, 0x5a, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_100, 0, 0, 0, 0 } },
59};
60
61static int match_cpu(u8 family, u8 model)
62{
63 int i;
64
65 for (i = 0; i < ARRAY_SIZE(freq_desc_tables); i++) {
66 if ((family == freq_desc_tables[i].x86_family) &&
67 (model == freq_desc_tables[i].x86_model))
68 return i;
69 }
70
71 return -1;
72}
73
74/* Map CPU reference clock freq ID(0-7) to CPU reference clock freq(KHz) */
75#define id_to_freq(cpu_index, freq_id) \
76 (freq_desc_tables[cpu_index].freqs[freq_id])
77
78/*
79 * Do MSR calibration only for known/supported CPUs.
80 * Return values:
 81 * -1: CPU is unknown/unsupported for MSR-based calibration
82 * 0: CPU is known/supported, but calibration failed
83 * 1: CPU is known/supported, and calibration succeeded
84 */
85int try_msr_calibrate_tsc(unsigned long *fast_calibrate)
86{
87 int cpu_index;
88 u32 lo, hi, ratio, freq_id, freq;
89
90 cpu_index = match_cpu(boot_cpu_data.x86, boot_cpu_data.x86_model);
91 if (cpu_index < 0)
92 return -1;
93
94 *fast_calibrate = 0;
95
96 if (freq_desc_tables[cpu_index].msr_plat) {
97 rdmsr(MSR_PLATFORM_INFO, lo, hi);
98 ratio = (lo >> 8) & 0x1f;
99 } else {
100 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
101 ratio = (hi >> 8) & 0x1f;
102 }
103 pr_info("Maximum core-clock to bus-clock ratio: 0x%x\n", ratio);
104
105 if (!ratio)
106 return 0;
107
108 /* Get FSB FREQ ID */
109 rdmsr(MSR_FSB_FREQ, lo, hi);
110 freq_id = lo & 0x7;
111 freq = id_to_freq(cpu_index, freq_id);
112 pr_info("Resolved frequency ID: %u, frequency: %u KHz\n",
113 freq_id, freq);
114 if (!freq)
115 return 0;
116
117 /* TSC frequency = maximum resolved freq * maximum resolved bus ratio */
118 *fast_calibrate = freq * ratio;
119 pr_info("TSC runs at %lu KHz\n", *fast_calibrate);
120
121#ifdef CONFIG_X86_LOCAL_APIC
122 lapic_timer_frequency = (freq * 1000) / HZ;
123 pr_info("lapic_timer_frequency = %d\n", lapic_timer_frequency);
124#endif
125
126 return 1;
127}
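
A worked example of the calibration formula, using hypothetical register values for a VLV2 part (family 6, model 0x37): suppose MSR_PLATFORM_INFO[12:8] reads 0x10 (ratio 16) and MSR_FSB_FREQ[2:0] reads 1, so freq = FREQ_100 = 99840 KHz. Then:

/*
 * *fast_calibrate = freq * ratio = 99840 * 16 = 1597440 KHz  (~1.6 GHz TSC)
 * lapic_timer_frequency = (freq * 1000) / HZ
 *                       = 99840000 / 250 = 399360   (assuming HZ = 250)
 */
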
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index adfdf56a3714..26488487bc61 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -16,7 +16,6 @@
16 */ 16 */
17#include <linux/spinlock.h> 17#include <linux/spinlock.h>
18#include <linux/kernel.h> 18#include <linux/kernel.h>
19#include <linux/init.h>
20#include <linux/smp.h> 19#include <linux/smp.h>
21#include <linux/nmi.h> 20#include <linux/nmi.h>
22#include <asm/tsc.h> 21#include <asm/tsc.h>
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 422fd8223470..a4b451c6addf 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -562,6 +562,16 @@ static void __init xstate_enable_boot_cpu(void)
562 if (cpu_has_xsaveopt && eagerfpu != DISABLE) 562 if (cpu_has_xsaveopt && eagerfpu != DISABLE)
563 eagerfpu = ENABLE; 563 eagerfpu = ENABLE;
564 564
565 if (pcntxt_mask & XSTATE_EAGER) {
566 if (eagerfpu == DISABLE) {
567 pr_err("eagerfpu not present, disabling some xstate features: 0x%llx\n",
568 pcntxt_mask & XSTATE_EAGER);
569 pcntxt_mask &= ~XSTATE_EAGER;
570 } else {
571 eagerfpu = ENABLE;
572 }
573 }
574
565 pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n", 575 pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n",
566 pcntxt_mask, xstate_size); 576 pcntxt_mask, xstate_size);
567} 577}