diff options
Diffstat (limited to 'arch/x86/kernel')
59 files changed, 2189 insertions, 324 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 9b0a34e2cd79..cb648c84b327 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile | |||
@@ -29,10 +29,11 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o | |||
29 | obj-y += syscall_$(BITS).o | 29 | obj-y += syscall_$(BITS).o |
30 | obj-$(CONFIG_X86_64) += vsyscall_64.o | 30 | obj-$(CONFIG_X86_64) += vsyscall_64.o |
31 | obj-$(CONFIG_X86_64) += vsyscall_emu_64.o | 31 | obj-$(CONFIG_X86_64) += vsyscall_emu_64.o |
32 | obj-$(CONFIG_SYSFS) += ksysfs.o | ||
32 | obj-y += bootflag.o e820.o | 33 | obj-y += bootflag.o e820.o |
33 | obj-y += pci-dma.o quirks.o topology.o kdebugfs.o | 34 | obj-y += pci-dma.o quirks.o topology.o kdebugfs.o |
34 | obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o | 35 | obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o |
35 | obj-y += tsc.o io_delay.o rtc.o | 36 | obj-y += tsc.o tsc_msr.o io_delay.o rtc.o |
36 | obj-y += pci-iommu_table.o | 37 | obj-y += pci-iommu_table.o |
37 | obj-y += resource.o | 38 | obj-y += resource.o |
38 | 39 | ||
@@ -91,15 +92,6 @@ obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o | |||
91 | 92 | ||
92 | obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o | 93 | obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o |
93 | 94 | ||
94 | obj-$(CONFIG_MICROCODE_EARLY) += microcode_core_early.o | ||
95 | obj-$(CONFIG_MICROCODE_INTEL_EARLY) += microcode_intel_early.o | ||
96 | obj-$(CONFIG_MICROCODE_INTEL_LIB) += microcode_intel_lib.o | ||
97 | microcode-y := microcode_core.o | ||
98 | microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o | ||
99 | microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o | ||
100 | obj-$(CONFIG_MICROCODE_AMD_EARLY) += microcode_amd_early.o | ||
101 | obj-$(CONFIG_MICROCODE) += microcode.o | ||
102 | |||
103 | obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o | 95 | obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o |
104 | 96 | ||
105 | obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o | 97 | obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o |
@@ -111,6 +103,7 @@ obj-$(CONFIG_EFI) += sysfb_efi.o | |||
111 | 103 | ||
112 | obj-$(CONFIG_PERF_EVENTS) += perf_regs.o | 104 | obj-$(CONFIG_PERF_EVENTS) += perf_regs.o |
113 | obj-$(CONFIG_TRACING) += tracepoint.o | 105 | obj-$(CONFIG_TRACING) += tracepoint.o |
106 | obj-$(CONFIG_IOSF_MBI) += iosf_mbi.o | ||
114 | 107 | ||
115 | ### | 108 | ### |
116 | # 64 bit specific files | 109 | # 64 bit specific files |
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index d2b7f27781bc..e69182fd01cf 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c | |||
@@ -150,29 +150,6 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, | |||
150 | } | 150 | } |
151 | EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe); | 151 | EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe); |
152 | 152 | ||
153 | /* | ||
154 | * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, | ||
155 | * which can obviate IPI to trigger checking of need_resched. | ||
156 | * We execute MONITOR against need_resched and enter optimized wait state | ||
157 | * through MWAIT. Whenever someone changes need_resched, we would be woken | ||
158 | * up from MWAIT (without an IPI). | ||
159 | * | ||
160 | * New with Core Duo processors, MWAIT can take some hints based on CPU | ||
161 | * capability. | ||
162 | */ | ||
163 | void mwait_idle_with_hints(unsigned long ax, unsigned long cx) | ||
164 | { | ||
165 | if (!need_resched()) { | ||
166 | if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) | ||
167 | clflush((void *)&current_thread_info()->flags); | |||
168 | |||
169 | __monitor((void *)&current_thread_info()->flags, 0, 0); | |||
170 | smp_mb(); | ||
171 | if (!need_resched()) | ||
172 | __mwait(ax, cx); | ||
173 | } | ||
174 | } | ||
175 | |||
176 | void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx) | 153 | void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx) |
177 | { | 154 | { |
178 | unsigned int cpu = smp_processor_id(); | 155 | unsigned int cpu = smp_processor_id(); |
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index d278736bf774..7f26c9a70a9e 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c | |||
@@ -75,6 +75,13 @@ unsigned int max_physical_apicid; | |||
75 | physid_mask_t phys_cpu_present_map; | 75 | physid_mask_t phys_cpu_present_map; |
76 | 76 | ||
77 | /* | 77 | /* |
78 | * Processor to be disabled specified by kernel parameter | ||
79 | * disable_cpu_apicid=<int>, mostly used for the kdump 2nd kernel to | ||
80 | * avoid undefined behaviour caused by sending INIT from AP to BSP. | ||
81 | */ | ||
82 | static unsigned int disabled_cpu_apicid __read_mostly = BAD_APICID; | ||
83 | |||
84 | /* | ||
78 | * Map cpu index to physical APIC ID | 85 | * Map cpu index to physical APIC ID |
79 | */ | 86 | */ |
80 | DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID); | 87 | DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID); |
@@ -1968,7 +1975,7 @@ __visible void smp_trace_spurious_interrupt(struct pt_regs *regs) | |||
1968 | */ | 1975 | */ |
1969 | static inline void __smp_error_interrupt(struct pt_regs *regs) | 1976 | static inline void __smp_error_interrupt(struct pt_regs *regs) |
1970 | { | 1977 | { |
1971 | u32 v0, v1; | 1978 | u32 v; |
1972 | u32 i = 0; | 1979 | u32 i = 0; |
1973 | static const char * const error_interrupt_reason[] = { | 1980 | static const char * const error_interrupt_reason[] = { |
1974 | "Send CS error", /* APIC Error Bit 0 */ | 1981 | "Send CS error", /* APIC Error Bit 0 */ |
@@ -1982,21 +1989,20 @@ static inline void __smp_error_interrupt(struct pt_regs *regs) | |||
1982 | }; | 1989 | }; |
1983 | 1990 | ||
1984 | /* First tickle the hardware, only then report what went on. -- REW */ | 1991 | /* First tickle the hardware, only then report what went on. -- REW */ |
1985 | v0 = apic_read(APIC_ESR); | ||
1986 | apic_write(APIC_ESR, 0); | 1992 | apic_write(APIC_ESR, 0); |
1987 | v1 = apic_read(APIC_ESR); | 1993 | v = apic_read(APIC_ESR); |
1988 | ack_APIC_irq(); | 1994 | ack_APIC_irq(); |
1989 | atomic_inc(&irq_err_count); | 1995 | atomic_inc(&irq_err_count); |
1990 | 1996 | ||
1991 | apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x(%02x)", | 1997 | apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x", |
1992 | smp_processor_id(), v0 , v1); | 1998 | smp_processor_id(), v); |
1993 | 1999 | ||
1994 | v1 = v1 & 0xff; | 2000 | v &= 0xff; |
1995 | while (v1) { | 2001 | while (v) { |
1996 | if (v1 & 0x1) | 2002 | if (v & 0x1) |
1997 | apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]); | 2003 | apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]); |
1998 | i++; | 2004 | i++; |
1999 | v1 >>= 1; | 2005 | v >>= 1; |
2000 | } | 2006 | } |
2001 | 2007 | ||
2002 | apic_printk(APIC_DEBUG, KERN_CONT "\n"); | 2008 | apic_printk(APIC_DEBUG, KERN_CONT "\n"); |
@@ -2115,6 +2121,39 @@ int generic_processor_info(int apicid, int version) | |||
2115 | phys_cpu_present_map); | 2121 | phys_cpu_present_map); |
2116 | 2122 | ||
2117 | /* | 2123 | /* |
2124 | * boot_cpu_physical_apicid is designed to have the apicid | ||
2125 | * returned by read_apic_id(), i.e, the apicid of the | ||
2126 | * currently booting-up processor. However, on some platforms, | ||
2127 | * it is temporarily modified by the apicid reported as BSP | ||
2128 | * through MP table. Concretely: | ||
2129 | * | ||
2130 | * - arch/x86/kernel/mpparse.c: MP_processor_info() | ||
2131 | * - arch/x86/mm/amdtopology.c: amd_numa_init() | ||
2132 | * - arch/x86/platform/visws/visws_quirks.c: MP_processor_info() | ||
2133 | * | ||
2134 | * This function is executed with the modified | ||
2135 | * boot_cpu_physical_apicid. So, disabled_cpu_apicid kernel | ||
2136 | * parameter doesn't work to disable APs on kdump 2nd kernel. | ||
2137 | * | ||
2138 | * Since fixing handling of boot_cpu_physical_apicid requires | ||
2139 | * another discussion and tests on each platform, we leave it | ||
2140 | * for now and here we use read_apic_id() directly in this | ||
2141 | * function, generic_processor_info(). | ||
2142 | */ | ||
2143 | if (disabled_cpu_apicid != BAD_APICID && | ||
2144 | disabled_cpu_apicid != read_apic_id() && | ||
2145 | disabled_cpu_apicid == apicid) { | ||
2146 | int thiscpu = num_processors + disabled_cpus; | ||
2147 | |||
2148 | pr_warning("APIC: Disabling requested cpu." | ||
2149 | " Processor %d/0x%x ignored.\n", | ||
2150 | thiscpu, apicid); | ||
2151 | |||
2152 | disabled_cpus++; | ||
2153 | return -ENODEV; | ||
2154 | } | ||
2155 | |||
2156 | /* | ||
2118 | * If boot cpu has not been detected yet, then only allow upto | 2157 | * If boot cpu has not been detected yet, then only allow upto |
2119 | * nr_cpu_ids - 1 processors and keep one slot free for boot cpu | 2158 | * nr_cpu_ids - 1 processors and keep one slot free for boot cpu |
2120 | */ | 2159 | */ |
@@ -2592,3 +2631,12 @@ static int __init lapic_insert_resource(void) | |||
2592 | * that is using request_resource | 2631 | * that is using request_resource |
2593 | */ | 2632 | */ |
2594 | late_initcall(lapic_insert_resource); | 2633 | late_initcall(lapic_insert_resource); |
2634 | |||
2635 | static int __init apic_set_disabled_cpu_apicid(char *arg) | ||
2636 | { | ||
2637 | if (!arg || !get_option(&arg, &disabled_cpu_apicid)) | ||
2638 | return -EINVAL; | ||
2639 | |||
2640 | return 0; | ||
2641 | } | ||
2642 | early_param("disable_cpu_apicid", apic_set_disabled_cpu_apicid); | ||
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 00c77cf78e9e..5d5b9eb2b7a4 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c | |||
@@ -14,7 +14,6 @@ | |||
14 | #include <linux/string.h> | 14 | #include <linux/string.h> |
15 | #include <linux/kernel.h> | 15 | #include <linux/kernel.h> |
16 | #include <linux/ctype.h> | 16 | #include <linux/ctype.h> |
17 | #include <linux/init.h> | ||
18 | #include <linux/hardirq.h> | 17 | #include <linux/hardirq.h> |
19 | #include <linux/module.h> | 18 | #include <linux/module.h> |
20 | #include <asm/smp.h> | 19 | #include <asm/smp.h> |
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index e145f28b4099..191ce75c0e54 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c | |||
@@ -15,7 +15,6 @@ | |||
15 | #include <linux/string.h> | 15 | #include <linux/string.h> |
16 | #include <linux/kernel.h> | 16 | #include <linux/kernel.h> |
17 | #include <linux/ctype.h> | 17 | #include <linux/ctype.h> |
18 | #include <linux/init.h> | ||
19 | #include <linux/errno.h> | 18 | #include <linux/errno.h> |
20 | #include <asm/fixmap.h> | 19 | #include <asm/fixmap.h> |
21 | #include <asm/mpspec.h> | 20 | #include <asm/mpspec.h> |
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e63a5bd2a78f..a43f068ebec1 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c | |||
@@ -1142,9 +1142,10 @@ next: | |||
1142 | if (test_bit(vector, used_vectors)) | 1142 | if (test_bit(vector, used_vectors)) |
1143 | goto next; | 1143 | goto next; |
1144 | 1144 | ||
1145 | for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) | 1145 | for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) { |
1146 | if (per_cpu(vector_irq, new_cpu)[vector] != -1) | 1146 | if (per_cpu(vector_irq, new_cpu)[vector] > VECTOR_UNDEFINED) |
1147 | goto next; | 1147 | goto next; |
1148 | } | ||
1148 | /* Found one! */ | 1149 | /* Found one! */ |
1149 | current_vector = vector; | 1150 | current_vector = vector; |
1150 | current_offset = offset; | 1151 | current_offset = offset; |
@@ -1183,7 +1184,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg) | |||
1183 | 1184 | ||
1184 | vector = cfg->vector; | 1185 | vector = cfg->vector; |
1185 | for_each_cpu_and(cpu, cfg->domain, cpu_online_mask) | 1186 | for_each_cpu_and(cpu, cfg->domain, cpu_online_mask) |
1186 | per_cpu(vector_irq, cpu)[vector] = -1; | 1187 | per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; |
1187 | 1188 | ||
1188 | cfg->vector = 0; | 1189 | cfg->vector = 0; |
1189 | cpumask_clear(cfg->domain); | 1190 | cpumask_clear(cfg->domain); |
@@ -1191,11 +1192,10 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg) | |||
1191 | if (likely(!cfg->move_in_progress)) | 1192 | if (likely(!cfg->move_in_progress)) |
1192 | return; | 1193 | return; |
1193 | for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) { | 1194 | for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) { |
1194 | for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; | 1195 | for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { |
1195 | vector++) { | ||
1196 | if (per_cpu(vector_irq, cpu)[vector] != irq) | 1196 | if (per_cpu(vector_irq, cpu)[vector] != irq) |
1197 | continue; | 1197 | continue; |
1198 | per_cpu(vector_irq, cpu)[vector] = -1; | 1198 | per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; |
1199 | break; | 1199 | break; |
1200 | } | 1200 | } |
1201 | } | 1201 | } |
@@ -1228,12 +1228,12 @@ void __setup_vector_irq(int cpu) | |||
1228 | /* Mark the free vectors */ | 1228 | /* Mark the free vectors */ |
1229 | for (vector = 0; vector < NR_VECTORS; ++vector) { | 1229 | for (vector = 0; vector < NR_VECTORS; ++vector) { |
1230 | irq = per_cpu(vector_irq, cpu)[vector]; | 1230 | irq = per_cpu(vector_irq, cpu)[vector]; |
1231 | if (irq < 0) | 1231 | if (irq <= VECTOR_UNDEFINED) |
1232 | continue; | 1232 | continue; |
1233 | 1233 | ||
1234 | cfg = irq_cfg(irq); | 1234 | cfg = irq_cfg(irq); |
1235 | if (!cpumask_test_cpu(cpu, cfg->domain)) | 1235 | if (!cpumask_test_cpu(cpu, cfg->domain)) |
1236 | per_cpu(vector_irq, cpu)[vector] = -1; | 1236 | per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; |
1237 | } | 1237 | } |
1238 | raw_spin_unlock(&vector_lock); | 1238 | raw_spin_unlock(&vector_lock); |
1239 | } | 1239 | } |
@@ -2202,13 +2202,13 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) | |||
2202 | 2202 | ||
2203 | me = smp_processor_id(); | 2203 | me = smp_processor_id(); |
2204 | for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { | 2204 | for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { |
2205 | unsigned int irq; | 2205 | int irq; |
2206 | unsigned int irr; | 2206 | unsigned int irr; |
2207 | struct irq_desc *desc; | 2207 | struct irq_desc *desc; |
2208 | struct irq_cfg *cfg; | 2208 | struct irq_cfg *cfg; |
2209 | irq = __this_cpu_read(vector_irq[vector]); | 2209 | irq = __this_cpu_read(vector_irq[vector]); |
2210 | 2210 | ||
2211 | if (irq == -1) | 2211 | if (irq <= VECTOR_UNDEFINED) |
2212 | continue; | 2212 | continue; |
2213 | 2213 | ||
2214 | desc = irq_to_desc(irq); | 2214 | desc = irq_to_desc(irq); |
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 7434d8556d09..62071569bd50 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c | |||
@@ -1,6 +1,5 @@ | |||
1 | #include <linux/cpumask.h> | 1 | #include <linux/cpumask.h> |
2 | #include <linux/interrupt.h> | 2 | #include <linux/interrupt.h> |
3 | #include <linux/init.h> | ||
4 | 3 | ||
5 | #include <linux/mm.h> | 4 | #include <linux/mm.h> |
6 | #include <linux/delay.h> | 5 | #include <linux/delay.h> |
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 77c95c0e1bf7..00146f9b0254 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c | |||
@@ -29,7 +29,6 @@ | |||
29 | #define pr_fmt(fmt) "summit: %s: " fmt, __func__ | 29 | #define pr_fmt(fmt) "summit: %s: " fmt, __func__ |
30 | 30 | ||
31 | #include <linux/mm.h> | 31 | #include <linux/mm.h> |
32 | #include <linux/init.h> | ||
33 | #include <asm/io.h> | 32 | #include <asm/io.h> |
34 | #include <asm/bios_ebda.h> | 33 | #include <asm/bios_ebda.h> |
35 | 34 | ||
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 140e29db478d..cac85ee6913f 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c | |||
@@ -3,7 +3,6 @@ | |||
3 | #include <linux/string.h> | 3 | #include <linux/string.h> |
4 | #include <linux/kernel.h> | 4 | #include <linux/kernel.h> |
5 | #include <linux/ctype.h> | 5 | #include <linux/ctype.h> |
6 | #include <linux/init.h> | ||
7 | #include <linux/dmar.h> | 6 | #include <linux/dmar.h> |
8 | #include <linux/cpu.h> | 7 | #include <linux/cpu.h> |
9 | 8 | ||
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 562a76d433c8..de231e328cae 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c | |||
@@ -3,7 +3,6 @@ | |||
3 | #include <linux/string.h> | 3 | #include <linux/string.h> |
4 | #include <linux/kernel.h> | 4 | #include <linux/kernel.h> |
5 | #include <linux/ctype.h> | 5 | #include <linux/ctype.h> |
6 | #include <linux/init.h> | ||
7 | #include <linux/dmar.h> | 6 | #include <linux/dmar.h> |
8 | 7 | ||
9 | #include <asm/smp.h> | 8 | #include <asm/smp.h> |
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c index e2dbcb7dabdd..83a7995625a6 100644 --- a/arch/x86/kernel/check.c +++ b/arch/x86/kernel/check.c | |||
@@ -91,7 +91,7 @@ void __init setup_bios_corruption_check(void) | |||
91 | 91 | ||
92 | corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); | 92 | corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); |
93 | 93 | ||
94 | for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) { | 94 | for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) { |
95 | start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE), | 95 | start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE), |
96 | PAGE_SIZE, corruption_check_size); | 96 | PAGE_SIZE, corruption_check_size); |
97 | end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE), | 97 | end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE), |
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 47b56a7e99cb..7fd54f09b011 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile | |||
@@ -36,12 +36,13 @@ obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd_iommu.o | |||
36 | endif | 36 | endif |
37 | obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o | 37 | obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o |
38 | obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o | 38 | obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o |
39 | obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o | 39 | obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o perf_event_intel_rapl.o |
40 | endif | 40 | endif |
41 | 41 | ||
42 | 42 | ||
43 | obj-$(CONFIG_X86_MCE) += mcheck/ | 43 | obj-$(CONFIG_X86_MCE) += mcheck/ |
44 | obj-$(CONFIG_MTRR) += mtrr/ | 44 | obj-$(CONFIG_MTRR) += mtrr/ |
45 | obj-$(CONFIG_MICROCODE) += microcode/ | ||
45 | 46 | ||
46 | obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o perf_event_amd_ibs.o | 47 | obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o perf_event_amd_ibs.o |
47 | 48 | ||
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index bca023bdd6b2..d3153e281d72 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c | |||
@@ -1,5 +1,4 @@ | |||
1 | #include <linux/export.h> | 1 | #include <linux/export.h> |
2 | #include <linux/init.h> | ||
3 | #include <linux/bitops.h> | 2 | #include <linux/bitops.h> |
4 | #include <linux/elf.h> | 3 | #include <linux/elf.h> |
5 | #include <linux/mm.h> | 4 | #include <linux/mm.h> |
@@ -487,7 +486,7 @@ static void early_init_amd(struct cpuinfo_x86 *c) | |||
487 | set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); | 486 | set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); |
488 | set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); | 487 | set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); |
489 | if (!check_tsc_unstable()) | 488 | if (!check_tsc_unstable()) |
490 | sched_clock_stable = 1; | 489 | set_sched_clock_stable(); |
491 | } | 490 | } |
492 | 491 | ||
493 | #ifdef CONFIG_X86_64 | 492 | #ifdef CONFIG_X86_64 |
@@ -508,6 +507,16 @@ static void early_init_amd(struct cpuinfo_x86 *c) | |||
508 | set_cpu_cap(c, X86_FEATURE_EXTD_APICID); | 507 | set_cpu_cap(c, X86_FEATURE_EXTD_APICID); |
509 | } | 508 | } |
510 | #endif | 509 | #endif |
510 | |||
511 | /* F16h erratum 793, CVE-2013-6885 */ | ||
512 | if (c->x86 == 0x16 && c->x86_model <= 0xf) { | ||
513 | u64 val; | ||
514 | |||
515 | rdmsrl(MSR_AMD64_LS_CFG, val); | ||
516 | if (!(val & BIT(15))) | ||
517 | wrmsrl(MSR_AMD64_LS_CFG, val | BIT(15)); | ||
518 | } | ||
519 | |||
511 | } | 520 | } |
512 | 521 | ||
513 | static const int amd_erratum_383[]; | 522 | static const int amd_erratum_383[]; |
@@ -790,14 +799,10 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c) | |||
790 | } | 799 | } |
791 | 800 | ||
792 | /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ | 801 | /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ |
793 | if (!((eax >> 16) & mask)) { | 802 | if (!((eax >> 16) & mask)) |
794 | u32 a, b, c, d; | 803 | tlb_lld_2m[ENTRIES] = (cpuid_eax(0x80000005) >> 16) & 0xff; |
795 | 804 | else | |
796 | cpuid(0x80000005, &a, &b, &c, &d); | ||
797 | tlb_lld_2m[ENTRIES] = (a >> 16) & 0xff; | ||
798 | } else { | ||
799 | tlb_lld_2m[ENTRIES] = (eax >> 16) & mask; | 805 | tlb_lld_2m[ENTRIES] = (eax >> 16) & mask; |
800 | } | ||
801 | 806 | ||
802 | /* a 4M entry uses two 2M entries */ | 807 | /* a 4M entry uses two 2M entries */ |
803 | tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1; | 808 | tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1; |
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index 8d5652dc99dd..8779edab684e 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c | |||
@@ -1,6 +1,5 @@ | |||
1 | #include <linux/bitops.h> | 1 | #include <linux/bitops.h> |
2 | #include <linux/kernel.h> | 2 | #include <linux/kernel.h> |
3 | #include <linux/init.h> | ||
4 | 3 | ||
5 | #include <asm/processor.h> | 4 | #include <asm/processor.h> |
6 | #include <asm/e820.h> | 5 | #include <asm/e820.h> |
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 6abc172b8258..24b6fd10625a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c | |||
@@ -472,6 +472,7 @@ u16 __read_mostly tlb_lli_4m[NR_INFO]; | |||
472 | u16 __read_mostly tlb_lld_4k[NR_INFO]; | 472 | u16 __read_mostly tlb_lld_4k[NR_INFO]; |
473 | u16 __read_mostly tlb_lld_2m[NR_INFO]; | 473 | u16 __read_mostly tlb_lld_2m[NR_INFO]; |
474 | u16 __read_mostly tlb_lld_4m[NR_INFO]; | 474 | u16 __read_mostly tlb_lld_4m[NR_INFO]; |
475 | u16 __read_mostly tlb_lld_1g[NR_INFO]; | ||
475 | 476 | ||
476 | /* | 477 | /* |
477 | * tlb_flushall_shift shows the balance point in replacing cr3 write | 478 | * tlb_flushall_shift shows the balance point in replacing cr3 write |
@@ -486,13 +487,13 @@ void cpu_detect_tlb(struct cpuinfo_x86 *c) | |||
486 | if (this_cpu->c_detect_tlb) | 487 | if (this_cpu->c_detect_tlb) |
487 | this_cpu->c_detect_tlb(c); | 488 | this_cpu->c_detect_tlb(c); |
488 | 489 | ||
489 | printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \ | 490 | printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" |
490 | "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \ | 491 | "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d, 1GB %d\n" |
491 | "tlb_flushall_shift: %d\n", | 492 | "tlb_flushall_shift: %d\n", |
492 | tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES], | 493 | tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES], |
493 | tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES], | 494 | tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES], |
494 | tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES], | 495 | tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES], |
495 | tlb_flushall_shift); | 496 | tlb_lld_1g[ENTRIES], tlb_flushall_shift); |
496 | } | 497 | } |
497 | 498 | ||
498 | void detect_ht(struct cpuinfo_x86 *c) | 499 | void detect_ht(struct cpuinfo_x86 *c) |
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index d0969c75ab54..aaf152e79637 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c | |||
@@ -1,4 +1,3 @@ | |||
1 | #include <linux/init.h> | ||
2 | #include <linux/bitops.h> | 1 | #include <linux/bitops.h> |
3 | #include <linux/delay.h> | 2 | #include <linux/delay.h> |
4 | #include <linux/pci.h> | 3 | #include <linux/pci.h> |
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index dc1ec0dff939..3db61c644e44 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c | |||
@@ -1,4 +1,3 @@ | |||
1 | #include <linux/init.h> | ||
2 | #include <linux/kernel.h> | 1 | #include <linux/kernel.h> |
3 | 2 | ||
4 | #include <linux/string.h> | 3 | #include <linux/string.h> |
@@ -93,7 +92,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) | |||
93 | set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); | 92 | set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); |
94 | set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); | 93 | set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); |
95 | if (!check_tsc_unstable()) | 94 | if (!check_tsc_unstable()) |
96 | sched_clock_stable = 1; | 95 | set_sched_clock_stable(); |
97 | } | 96 | } |
98 | 97 | ||
99 | /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */ | 98 | /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */ |
@@ -387,7 +386,8 @@ static void init_intel(struct cpuinfo_x86 *c) | |||
387 | set_cpu_cap(c, X86_FEATURE_PEBS); | 386 | set_cpu_cap(c, X86_FEATURE_PEBS); |
388 | } | 387 | } |
389 | 388 | ||
390 | if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush) | 389 | if (c->x86 == 6 && cpu_has_clflush && |
390 | (c->x86_model == 29 || c->x86_model == 46 || c->x86_model == 47)) | ||
391 | set_cpu_cap(c, X86_FEATURE_CLFLUSH_MONITOR); | 391 | set_cpu_cap(c, X86_FEATURE_CLFLUSH_MONITOR); |
392 | 392 | ||
393 | #ifdef CONFIG_X86_64 | 393 | #ifdef CONFIG_X86_64 |
@@ -505,6 +505,7 @@ static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) | |||
505 | #define TLB_DATA0_2M_4M 0x23 | 505 | #define TLB_DATA0_2M_4M 0x23 |
506 | 506 | ||
507 | #define STLB_4K 0x41 | 507 | #define STLB_4K 0x41 |
508 | #define STLB_4K_2M 0x42 | ||
508 | 509 | ||
509 | static const struct _tlb_table intel_tlb_table[] = { | 510 | static const struct _tlb_table intel_tlb_table[] = { |
510 | { 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" }, | 511 | { 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" }, |
@@ -525,13 +526,20 @@ static const struct _tlb_table intel_tlb_table[] = { | |||
525 | { 0x5b, TLB_DATA_4K_4M, 64, " TLB_DATA 4 KByte and 4 MByte pages" }, | 526 | { 0x5b, TLB_DATA_4K_4M, 64, " TLB_DATA 4 KByte and 4 MByte pages" }, |
526 | { 0x5c, TLB_DATA_4K_4M, 128, " TLB_DATA 4 KByte and 4 MByte pages" }, | 527 | { 0x5c, TLB_DATA_4K_4M, 128, " TLB_DATA 4 KByte and 4 MByte pages" }, |
527 | { 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" }, | 528 | { 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" }, |
529 | { 0x61, TLB_INST_4K, 48, " TLB_INST 4 KByte pages, full associative" }, | ||
530 | { 0x63, TLB_DATA_1G, 4, " TLB_DATA 1 GByte pages, 4-way set associative" }, | ||
531 | { 0x76, TLB_INST_2M_4M, 8, " TLB_INST 2-MByte or 4-MByte pages, fully associative" }, | ||
528 | { 0xb0, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 4-way set associative" }, | 532 | { 0xb0, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 4-way set associative" }, |
529 | { 0xb1, TLB_INST_2M_4M, 4, " TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" }, | 533 | { 0xb1, TLB_INST_2M_4M, 4, " TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" }, |
530 | { 0xb2, TLB_INST_4K, 64, " TLB_INST 4KByte pages, 4-way set associative" }, | 534 | { 0xb2, TLB_INST_4K, 64, " TLB_INST 4KByte pages, 4-way set associative" }, |
531 | { 0xb3, TLB_DATA_4K, 128, " TLB_DATA 4 KByte pages, 4-way set associative" }, | 535 | { 0xb3, TLB_DATA_4K, 128, " TLB_DATA 4 KByte pages, 4-way set associative" }, |
532 | { 0xb4, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 4-way associative" }, | 536 | { 0xb4, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 4-way associative" }, |
537 | { 0xb5, TLB_INST_4K, 64, " TLB_INST 4 KByte pages, 8-way set ssociative" }, | ||
538 | { 0xb6, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 8-way set ssociative" }, | ||
533 | { 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" }, | 539 | { 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" }, |
534 | { 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" }, | 540 | { 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" }, |
541 | { 0xc1, STLB_4K_2M, 1024, " STLB 4 KByte and 2 MByte pages, 8-way associative" }, | ||
542 | { 0xc2, TLB_DATA_2M_4M, 16, " DTLB 2 MByte/4MByte pages, 4-way associative" }, | ||
535 | { 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" }, | 543 | { 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" }, |
536 | { 0x00, 0, 0 } | 544 | { 0x00, 0, 0 } |
537 | }; | 545 | }; |
@@ -557,6 +565,20 @@ static void intel_tlb_lookup(const unsigned char desc) | |||
557 | if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) | 565 | if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) |
558 | tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; | 566 | tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; |
559 | break; | 567 | break; |
568 | case STLB_4K_2M: | ||
569 | if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) | ||
570 | tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; | ||
571 | if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries) | ||
572 | tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries; | ||
573 | if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries) | ||
574 | tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries; | ||
575 | if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries) | ||
576 | tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries; | ||
577 | if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries) | ||
578 | tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries; | ||
579 | if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) | ||
580 | tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; | ||
581 | break; | ||
560 | case TLB_INST_ALL: | 582 | case TLB_INST_ALL: |
561 | if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) | 583 | if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries) |
562 | tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; | 584 | tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries; |
@@ -602,6 +624,10 @@ static void intel_tlb_lookup(const unsigned char desc) | |||
602 | if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) | 624 | if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries) |
603 | tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; | 625 | tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries; |
604 | break; | 626 | break; |
627 | case TLB_DATA_1G: | ||
628 | if (tlb_lld_1g[ENTRIES] < intel_tlb_table[k].entries) | ||
629 | tlb_lld_1g[ENTRIES] = intel_tlb_table[k].entries; | ||
630 | break; | ||
605 | } | 631 | } |
606 | } | 632 | } |
607 | 633 | ||
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c index de8b60a53f69..a1aef9533154 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-apei.c +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c | |||
@@ -33,22 +33,28 @@ | |||
33 | #include <linux/acpi.h> | 33 | #include <linux/acpi.h> |
34 | #include <linux/cper.h> | 34 | #include <linux/cper.h> |
35 | #include <acpi/apei.h> | 35 | #include <acpi/apei.h> |
36 | #include <acpi/ghes.h> | ||
36 | #include <asm/mce.h> | 37 | #include <asm/mce.h> |
37 | 38 | ||
38 | #include "mce-internal.h" | 39 | #include "mce-internal.h" |
39 | 40 | ||
40 | void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err) | 41 | void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) |
41 | { | 42 | { |
42 | struct mce m; | 43 | struct mce m; |
43 | 44 | ||
44 | /* Only corrected MC is reported */ | 45 | if (!(mem_err->validation_bits & CPER_MEM_VALID_PA)) |
45 | if (!corrected || !(mem_err->validation_bits & CPER_MEM_VALID_PA)) | ||
46 | return; | 46 | return; |
47 | 47 | ||
48 | mce_setup(&m); | 48 | mce_setup(&m); |
49 | m.bank = 1; | 49 | m.bank = 1; |
50 | /* Fake a memory read corrected error with unknown channel */ | 50 | /* Fake a memory read error with unknown channel */ |
51 | m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f; | 51 | m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f; |
52 | |||
53 | if (severity >= GHES_SEV_RECOVERABLE) | ||
54 | m.status |= MCI_STATUS_UC; | ||
55 | if (severity >= GHES_SEV_PANIC) | ||
56 | m.status |= MCI_STATUS_PCC; | ||
57 | |||
52 | m.addr = mem_err->physical_addr; | 58 | m.addr = mem_err->physical_addr; |
53 | mce_log(&m); | 59 | mce_log(&m); |
54 | mce_notify_irq(); | 60 | mce_notify_irq(); |
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index b3218cdee95f..4d5419b249da 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c | |||
@@ -1638,15 +1638,15 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) | |||
1638 | 1638 | ||
1639 | static void mce_start_timer(unsigned int cpu, struct timer_list *t) | 1639 | static void mce_start_timer(unsigned int cpu, struct timer_list *t) |
1640 | { | 1640 | { |
1641 | unsigned long iv = mce_adjust_timer(check_interval * HZ); | 1641 | unsigned long iv = check_interval * HZ; |
1642 | |||
1643 | __this_cpu_write(mce_next_interval, iv); | ||
1644 | 1642 | ||
1645 | if (mca_cfg.ignore_ce || !iv) | 1643 | if (mca_cfg.ignore_ce || !iv) |
1646 | return; | 1644 | return; |
1647 | 1645 | ||
1646 | per_cpu(mce_next_interval, cpu) = iv; | ||
1647 | |||
1648 | t->expires = round_jiffies(jiffies + iv); | 1648 | t->expires = round_jiffies(jiffies + iv); |
1649 | add_timer_on(t, smp_processor_id()); | 1649 | add_timer_on(t, cpu); |
1650 | } | 1650 | } |
1651 | 1651 | ||
1652 | static void __mcheck_cpu_init_timer(void) | 1652 | static void __mcheck_cpu_init_timer(void) |
@@ -2272,8 +2272,10 @@ static int mce_device_create(unsigned int cpu) | |||
2272 | dev->release = &mce_device_release; | 2272 | dev->release = &mce_device_release; |
2273 | 2273 | ||
2274 | err = device_register(dev); | 2274 | err = device_register(dev); |
2275 | if (err) | 2275 | if (err) { |
2276 | put_device(dev); | ||
2276 | return err; | 2277 | return err; |
2278 | } | ||
2277 | 2279 | ||
2278 | for (i = 0; mce_device_attrs[i]; i++) { | 2280 | for (i = 0; mce_device_attrs[i]; i++) { |
2279 | err = device_create_file(dev, mce_device_attrs[i]); | 2281 | err = device_create_file(dev, mce_device_attrs[i]); |
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 4cfe0458ca66..fb6156fee6f7 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c | |||
@@ -6,7 +6,6 @@ | |||
6 | */ | 6 | */ |
7 | 7 | ||
8 | #include <linux/gfp.h> | 8 | #include <linux/gfp.h> |
9 | #include <linux/init.h> | ||
10 | #include <linux/interrupt.h> | 9 | #include <linux/interrupt.h> |
11 | #include <linux/percpu.h> | 10 | #include <linux/percpu.h> |
12 | #include <linux/sched.h> | 11 | #include <linux/sched.h> |
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index 1c044b1ccc59..a3042989398c 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c | |||
@@ -5,7 +5,6 @@ | |||
5 | #include <linux/interrupt.h> | 5 | #include <linux/interrupt.h> |
6 | #include <linux/kernel.h> | 6 | #include <linux/kernel.h> |
7 | #include <linux/types.h> | 7 | #include <linux/types.h> |
8 | #include <linux/init.h> | ||
9 | #include <linux/smp.h> | 8 | #include <linux/smp.h> |
10 | 9 | ||
11 | #include <asm/processor.h> | 10 | #include <asm/processor.h> |
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index e9a701aecaa1..7dc5564d0cdf 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c | |||
@@ -5,7 +5,6 @@ | |||
5 | #include <linux/interrupt.h> | 5 | #include <linux/interrupt.h> |
6 | #include <linux/kernel.h> | 6 | #include <linux/kernel.h> |
7 | #include <linux/types.h> | 7 | #include <linux/types.h> |
8 | #include <linux/init.h> | ||
9 | 8 | ||
10 | #include <asm/processor.h> | 9 | #include <asm/processor.h> |
11 | #include <asm/mce.h> | 10 | #include <asm/mce.h> |
diff --git a/arch/x86/kernel/cpu/microcode/Makefile b/arch/x86/kernel/cpu/microcode/Makefile new file mode 100644 index 000000000000..285c85427c32 --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/Makefile | |||
@@ -0,0 +1,7 @@ | |||
1 | microcode-y := core.o | ||
2 | obj-$(CONFIG_MICROCODE) += microcode.o | ||
3 | microcode-$(CONFIG_MICROCODE_INTEL) += intel.o intel_lib.o | ||
4 | microcode-$(CONFIG_MICROCODE_AMD) += amd.o | ||
5 | obj-$(CONFIG_MICROCODE_EARLY) += core_early.o | ||
6 | obj-$(CONFIG_MICROCODE_INTEL_EARLY) += intel_early.o | ||
7 | obj-$(CONFIG_MICROCODE_AMD_EARLY) += amd_early.o | ||
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/cpu/microcode/amd.c index c3d4cc972eca..8fffd845e22b 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c | |||
@@ -182,10 +182,10 @@ int __apply_microcode_amd(struct microcode_amd *mc_amd) | |||
182 | { | 182 | { |
183 | u32 rev, dummy; | 183 | u32 rev, dummy; |
184 | 184 | ||
185 | wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); | 185 | native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); |
186 | 186 | ||
187 | /* verify patch application was successful */ | 187 | /* verify patch application was successful */ |
188 | rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); | 188 | native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); |
189 | if (rev != mc_amd->hdr.patch_id) | 189 | if (rev != mc_amd->hdr.patch_id) |
190 | return -1; | 190 | return -1; |
191 | 191 | ||
@@ -332,6 +332,9 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover) | |||
332 | patch->patch_id = mc_hdr->patch_id; | 332 | patch->patch_id = mc_hdr->patch_id; |
333 | patch->equiv_cpu = proc_id; | 333 | patch->equiv_cpu = proc_id; |
334 | 334 | ||
335 | pr_debug("%s: Added patch_id: 0x%08x, proc_id: 0x%04x\n", | ||
336 | __func__, patch->patch_id, proc_id); | ||
337 | |||
335 | /* ... and add to cache. */ | 338 | /* ... and add to cache. */ |
336 | update_cache(patch); | 339 | update_cache(patch); |
337 | 340 | ||
@@ -390,9 +393,9 @@ enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size) | |||
390 | if (cpu_data(smp_processor_id()).cpu_index == boot_cpu_data.cpu_index) { | 393 | if (cpu_data(smp_processor_id()).cpu_index == boot_cpu_data.cpu_index) { |
391 | struct ucode_patch *p = find_patch(smp_processor_id()); | 394 | struct ucode_patch *p = find_patch(smp_processor_id()); |
392 | if (p) { | 395 | if (p) { |
393 | memset(amd_bsp_mpb, 0, MPB_MAX_SIZE); | 396 | memset(amd_ucode_patch, 0, PATCH_MAX_SIZE); |
394 | memcpy(amd_bsp_mpb, p->data, min_t(u32, ksize(p->data), | 397 | memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data), |
395 | MPB_MAX_SIZE)); | 398 | PATCH_MAX_SIZE)); |
396 | } | 399 | } |
397 | } | 400 | } |
398 | #endif | 401 | #endif |
@@ -430,7 +433,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device, | |||
430 | if (c->x86 >= 0x15) | 433 | if (c->x86 >= 0x15) |
431 | snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86); | 434 | snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86); |
432 | 435 | ||
433 | if (request_firmware(&fw, (const char *)fw_name, device)) { | 436 | if (request_firmware_direct(&fw, (const char *)fw_name, device)) { |
434 | pr_debug("failed to load file %s\n", fw_name); | 437 | pr_debug("failed to load file %s\n", fw_name); |
435 | goto out; | 438 | goto out; |
436 | } | 439 | } |
diff --git a/arch/x86/kernel/microcode_amd_early.c b/arch/x86/kernel/cpu/microcode/amd_early.c index 6073104ccaa3..8384c0fa206f 100644 --- a/arch/x86/kernel/microcode_amd_early.c +++ b/arch/x86/kernel/cpu/microcode/amd_early.c | |||
@@ -2,6 +2,7 @@ | |||
2 | * Copyright (C) 2013 Advanced Micro Devices, Inc. | 2 | * Copyright (C) 2013 Advanced Micro Devices, Inc. |
3 | * | 3 | * |
4 | * Author: Jacob Shin <jacob.shin@amd.com> | 4 | * Author: Jacob Shin <jacob.shin@amd.com> |
5 | * Fixes: Borislav Petkov <bp@suse.de> | ||
5 | * | 6 | * |
6 | * This program is free software; you can redistribute it and/or modify | 7 | * This program is free software; you can redistribute it and/or modify |
7 | * it under the terms of the GNU General Public License version 2 as | 8 | * it under the terms of the GNU General Public License version 2 as |
@@ -15,10 +16,18 @@ | |||
15 | #include <asm/setup.h> | 16 | #include <asm/setup.h> |
16 | #include <asm/microcode_amd.h> | 17 | #include <asm/microcode_amd.h> |
17 | 18 | ||
18 | static bool ucode_loaded; | 19 | /* |
20 | * This points to the current valid container of microcode patches which we will | ||
21 | * save from the initrd before jettisoning its contents. | ||
22 | */ | ||
23 | static u8 *container; | ||
24 | static size_t container_size; | ||
25 | |||
19 | static u32 ucode_new_rev; | 26 | static u32 ucode_new_rev; |
20 | static unsigned long ucode_offset; | 27 | u8 amd_ucode_patch[PATCH_MAX_SIZE]; |
21 | static size_t ucode_size; | 28 | static u16 this_equiv_id; |
29 | |||
30 | struct cpio_data ucode_cpio; | ||
22 | 31 | ||
23 | /* | 32 | /* |
24 | * Microcode patch container file is prepended to the initrd in cpio format. | 33 | * Microcode patch container file is prepended to the initrd in cpio format. |
@@ -32,9 +41,6 @@ static struct cpio_data __init find_ucode_in_initrd(void) | |||
32 | char *path; | 41 | char *path; |
33 | void *start; | 42 | void *start; |
34 | size_t size; | 43 | size_t size; |
35 | unsigned long *uoffset; | ||
36 | size_t *usize; | ||
37 | struct cpio_data cd; | ||
38 | 44 | ||
39 | #ifdef CONFIG_X86_32 | 45 | #ifdef CONFIG_X86_32 |
40 | struct boot_params *p; | 46 | struct boot_params *p; |
@@ -47,30 +53,50 @@ static struct cpio_data __init find_ucode_in_initrd(void) | |||
47 | path = (char *)__pa_nodebug(ucode_path); | 53 | path = (char *)__pa_nodebug(ucode_path); |
48 | start = (void *)p->hdr.ramdisk_image; | 54 | start = (void *)p->hdr.ramdisk_image; |
49 | size = p->hdr.ramdisk_size; | 55 | size = p->hdr.ramdisk_size; |
50 | uoffset = (unsigned long *)__pa_nodebug(&ucode_offset); | ||
51 | usize = (size_t *)__pa_nodebug(&ucode_size); | ||
52 | #else | 56 | #else |
53 | path = ucode_path; | 57 | path = ucode_path; |
54 | start = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET); | 58 | start = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET); |
55 | size = boot_params.hdr.ramdisk_size; | 59 | size = boot_params.hdr.ramdisk_size; |
56 | uoffset = &ucode_offset; | ||
57 | usize = &ucode_size; | ||
58 | #endif | 60 | #endif |
59 | 61 | ||
60 | cd = find_cpio_data(path, start, size, &offset); | 62 | return find_cpio_data(path, start, size, &offset); |
61 | if (!cd.data) | 63 | } |
62 | return cd; | ||
63 | 64 | ||
64 | if (*(u32 *)cd.data != UCODE_MAGIC) { | 65 | static size_t compute_container_size(u8 *data, u32 total_size) |
65 | cd.data = NULL; | 66 | { |
66 | cd.size = 0; | 67 | size_t size = 0; |
67 | return cd; | 68 | u32 *header = (u32 *)data; |
68 | } | ||
69 | 69 | ||
70 | *uoffset = (u8 *)cd.data - (u8 *)start; | 70 | if (header[0] != UCODE_MAGIC || |
71 | *usize = cd.size; | 71 | header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ |
72 | header[2] == 0) /* size */ | ||
73 | return size; | ||
72 | 74 | ||
73 | return cd; | 75 | size = header[2] + CONTAINER_HDR_SZ; |
76 | total_size -= size; | ||
77 | data += size; | ||
78 | |||
79 | while (total_size) { | ||
80 | u16 patch_size; | ||
81 | |||
82 | header = (u32 *)data; | ||
83 | |||
84 | if (header[0] != UCODE_UCODE_TYPE) | ||
85 | break; | ||
86 | |||
87 | /* | ||
88 | * Sanity-check patch size. | ||
89 | */ | ||
90 | patch_size = header[1]; | ||
91 | if (patch_size > PATCH_MAX_SIZE) | ||
92 | break; | ||
93 | |||
94 | size += patch_size + SECTION_HDR_SIZE; | ||
95 | data += patch_size + SECTION_HDR_SIZE; | ||
96 | total_size -= patch_size + SECTION_HDR_SIZE; | ||
97 | } | ||
98 | |||
99 | return size; | ||
74 | } | 100 | } |
75 | 101 | ||
76 | /* | 102 | /* |
@@ -85,23 +111,22 @@ static struct cpio_data __init find_ucode_in_initrd(void) | |||
85 | static void apply_ucode_in_initrd(void *ucode, size_t size) | 111 | static void apply_ucode_in_initrd(void *ucode, size_t size) |
86 | { | 112 | { |
87 | struct equiv_cpu_entry *eq; | 113 | struct equiv_cpu_entry *eq; |
114 | size_t *cont_sz; | ||
88 | u32 *header; | 115 | u32 *header; |
89 | u8 *data; | 116 | u8 *data, **cont; |
90 | u16 eq_id = 0; | 117 | u16 eq_id = 0; |
91 | int offset, left; | 118 | int offset, left; |
92 | u32 rev, eax; | 119 | u32 rev, eax, ebx, ecx, edx; |
93 | u32 *new_rev; | 120 | u32 *new_rev; |
94 | unsigned long *uoffset; | ||
95 | size_t *usize; | ||
96 | 121 | ||
97 | #ifdef CONFIG_X86_32 | 122 | #ifdef CONFIG_X86_32 |
98 | new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); | 123 | new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); |
99 | uoffset = (unsigned long *)__pa_nodebug(&ucode_offset); | 124 | cont_sz = (size_t *)__pa_nodebug(&container_size); |
100 | usize = (size_t *)__pa_nodebug(&ucode_size); | 125 | cont = (u8 **)__pa_nodebug(&container); |
101 | #else | 126 | #else |
102 | new_rev = &ucode_new_rev; | 127 | new_rev = &ucode_new_rev; |
103 | uoffset = &ucode_offset; | 128 | cont_sz = &container_size; |
104 | usize = &ucode_size; | 129 | cont = &container; |
105 | #endif | 130 | #endif |
106 | 131 | ||
107 | data = ucode; | 132 | data = ucode; |
@@ -109,23 +134,37 @@ static void apply_ucode_in_initrd(void *ucode, size_t size) | |||
109 | header = (u32 *)data; | 134 | header = (u32 *)data; |
110 | 135 | ||
111 | /* find equiv cpu table */ | 136 | /* find equiv cpu table */ |
112 | 137 | if (header[0] != UCODE_MAGIC || | |
113 | if (header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ | 138 | header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ |
114 | header[2] == 0) /* size */ | 139 | header[2] == 0) /* size */ |
115 | return; | 140 | return; |
116 | 141 | ||
117 | eax = cpuid_eax(0x00000001); | 142 | eax = 0x00000001; |
143 | ecx = 0; | ||
144 | native_cpuid(&eax, &ebx, &ecx, &edx); | ||
118 | 145 | ||
119 | while (left > 0) { | 146 | while (left > 0) { |
120 | eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ); | 147 | eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ); |
121 | 148 | ||
149 | *cont = data; | ||
150 | |||
151 | /* Advance past the container header */ | ||
122 | offset = header[2] + CONTAINER_HDR_SZ; | 152 | offset = header[2] + CONTAINER_HDR_SZ; |
123 | data += offset; | 153 | data += offset; |
124 | left -= offset; | 154 | left -= offset; |
125 | 155 | ||
126 | eq_id = find_equiv_id(eq, eax); | 156 | eq_id = find_equiv_id(eq, eax); |
127 | if (eq_id) | 157 | if (eq_id) { |
158 | this_equiv_id = eq_id; | ||
159 | *cont_sz = compute_container_size(*cont, left + offset); | ||
160 | |||
161 | /* | ||
162 | * truncate how much we need to iterate over in the | ||
163 | * ucode update loop below | ||
164 | */ | ||
165 | left = *cont_sz - offset; | ||
128 | break; | 166 | break; |
167 | } | ||
129 | 168 | ||
130 | /* | 169 | /* |
131 | * support multiple container files appended together. if this | 170 | * support multiple container files appended together. if this |
@@ -145,19 +184,18 @@ static void apply_ucode_in_initrd(void *ucode, size_t size) | |||
145 | 184 | ||
146 | /* mark where the next microcode container file starts */ | 185 | /* mark where the next microcode container file starts */ |
147 | offset = data - (u8 *)ucode; | 186 | offset = data - (u8 *)ucode; |
148 | *uoffset += offset; | ||
149 | *usize -= offset; | ||
150 | ucode = data; | 187 | ucode = data; |
151 | } | 188 | } |
152 | 189 | ||
153 | if (!eq_id) { | 190 | if (!eq_id) { |
154 | *usize = 0; | 191 | *cont = NULL; |
192 | *cont_sz = 0; | ||
155 | return; | 193 | return; |
156 | } | 194 | } |
157 | 195 | ||
158 | /* find ucode and update if needed */ | 196 | /* find ucode and update if needed */ |
159 | 197 | ||
160 | rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax); | 198 | native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax); |
161 | 199 | ||
162 | while (left > 0) { | 200 | while (left > 0) { |
163 | struct microcode_amd *mc; | 201 | struct microcode_amd *mc; |
@@ -168,73 +206,83 @@ static void apply_ucode_in_initrd(void *ucode, size_t size) | |||
168 | break; | 206 | break; |
169 | 207 | ||
170 | mc = (struct microcode_amd *)(data + SECTION_HDR_SIZE); | 208 | mc = (struct microcode_amd *)(data + SECTION_HDR_SIZE); |
171 | if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id) | 209 | |
172 | if (__apply_microcode_amd(mc) == 0) { | 210 | if (eq_id == mc->hdr.processor_rev_id && rev < mc->hdr.patch_id) { |
211 | |||
212 | if (!__apply_microcode_amd(mc)) { | ||
173 | rev = mc->hdr.patch_id; | 213 | rev = mc->hdr.patch_id; |
174 | *new_rev = rev; | 214 | *new_rev = rev; |
215 | |||
216 | /* save ucode patch */ | ||
217 | memcpy(amd_ucode_patch, mc, | ||
218 | min_t(u32, header[1], PATCH_MAX_SIZE)); | ||
175 | } | 219 | } |
220 | } | ||
176 | 221 | ||
177 | offset = header[1] + SECTION_HDR_SIZE; | 222 | offset = header[1] + SECTION_HDR_SIZE; |
178 | data += offset; | 223 | data += offset; |
179 | left -= offset; | 224 | left -= offset; |
180 | } | 225 | } |
181 | |||
182 | /* mark where this microcode container file ends */ | ||
183 | offset = *usize - (data - (u8 *)ucode); | ||
184 | *usize -= offset; | ||
185 | |||
186 | if (!(*new_rev)) | ||
187 | *usize = 0; | ||
188 | } | 226 | } |
189 | 227 | ||
190 | void __init load_ucode_amd_bsp(void) | 228 | void __init load_ucode_amd_bsp(void) |
191 | { | 229 | { |
192 | struct cpio_data cd = find_ucode_in_initrd(); | 230 | struct cpio_data cp; |
193 | if (!cd.data) | 231 | void **data; |
232 | size_t *size; | ||
233 | |||
234 | #ifdef CONFIG_X86_32 | ||
235 | data = (void **)__pa_nodebug(&ucode_cpio.data); | ||
236 | size = (size_t *)__pa_nodebug(&ucode_cpio.size); | ||
237 | #else | ||
238 | data = &ucode_cpio.data; | ||
239 | size = &ucode_cpio.size; | ||
240 | #endif | ||
241 | |||
242 | cp = find_ucode_in_initrd(); | ||
243 | if (!cp.data) | ||
194 | return; | 244 | return; |
195 | 245 | ||
196 | apply_ucode_in_initrd(cd.data, cd.size); | 246 | *data = cp.data; |
247 | *size = cp.size; | ||
248 | |||
249 | apply_ucode_in_initrd(cp.data, cp.size); | ||
197 | } | 250 | } |
198 | 251 | ||
199 | #ifdef CONFIG_X86_32 | 252 | #ifdef CONFIG_X86_32 |
200 | u8 amd_bsp_mpb[MPB_MAX_SIZE]; | ||
201 | |||
202 | /* | 253 | /* |
203 | * On 32-bit, since AP's early load occurs before paging is turned on, we | 254 | * On 32-bit, since AP's early load occurs before paging is turned on, we |
204 | * cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during | 255 | * cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during |
205 | * cold boot, AP will apply_ucode_in_initrd() just like the BSP. During | 256 | * cold boot, AP will apply_ucode_in_initrd() just like the BSP. During |
206 | * save_microcode_in_initrd_amd() BSP's patch is copied to amd_bsp_mpb, which | 257 | * save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch, |
207 | * is used upon resume from suspend. | 258 | * which is used upon resume from suspend. |
208 | */ | 259 | */ |
209 | void load_ucode_amd_ap(void) | 260 | void load_ucode_amd_ap(void) |
210 | { | 261 | { |
211 | struct microcode_amd *mc; | 262 | struct microcode_amd *mc; |
212 | unsigned long *initrd; | ||
213 | unsigned long *uoffset; | ||
214 | size_t *usize; | 263 | size_t *usize; |
215 | void *ucode; | 264 | void **ucode; |
216 | 265 | ||
217 | mc = (struct microcode_amd *)__pa(amd_bsp_mpb); | 266 | mc = (struct microcode_amd *)__pa(amd_ucode_patch); |
218 | if (mc->hdr.patch_id && mc->hdr.processor_rev_id) { | 267 | if (mc->hdr.patch_id && mc->hdr.processor_rev_id) { |
219 | __apply_microcode_amd(mc); | 268 | __apply_microcode_amd(mc); |
220 | return; | 269 | return; |
221 | } | 270 | } |
222 | 271 | ||
223 | initrd = (unsigned long *)__pa(&initrd_start); | 272 | ucode = (void *)__pa_nodebug(&container); |
224 | uoffset = (unsigned long *)__pa(&ucode_offset); | 273 | usize = (size_t *)__pa_nodebug(&container_size); |
225 | usize = (size_t *)__pa(&ucode_size); | ||
226 | 274 | ||
227 | if (!*usize || !*initrd) | 275 | if (!*ucode || !*usize) |
228 | return; | 276 | return; |
229 | 277 | ||
230 | ucode = (void *)((unsigned long)__pa(*initrd) + *uoffset); | 278 | apply_ucode_in_initrd(*ucode, *usize); |
231 | apply_ucode_in_initrd(ucode, *usize); | ||
232 | } | 279 | } |
233 | 280 | ||
234 | static void __init collect_cpu_sig_on_bsp(void *arg) | 281 | static void __init collect_cpu_sig_on_bsp(void *arg) |
235 | { | 282 | { |
236 | unsigned int cpu = smp_processor_id(); | 283 | unsigned int cpu = smp_processor_id(); |
237 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; | 284 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; |
285 | |||
238 | uci->cpu_sig.sig = cpuid_eax(0x00000001); | 286 | uci->cpu_sig.sig = cpuid_eax(0x00000001); |
239 | } | 287 | } |
240 | #else | 288 | #else |
@@ -242,36 +290,54 @@ void load_ucode_amd_ap(void) | |||
242 | { | 290 | { |
243 | unsigned int cpu = smp_processor_id(); | 291 | unsigned int cpu = smp_processor_id(); |
244 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; | 292 | struct ucode_cpu_info *uci = ucode_cpu_info + cpu; |
293 | struct equiv_cpu_entry *eq; | ||
294 | struct microcode_amd *mc; | ||
245 | u32 rev, eax; | 295 | u32 rev, eax; |
296 | u16 eq_id; | ||
297 | |||
298 | /* Exit if called on the BSP. */ | ||
299 | if (!cpu) | ||
300 | return; | ||
301 | |||
302 | if (!container) | ||
303 | return; | ||
246 | 304 | ||
247 | rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax); | 305 | rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax); |
248 | eax = cpuid_eax(0x00000001); | ||
249 | 306 | ||
250 | uci->cpu_sig.rev = rev; | 307 | uci->cpu_sig.rev = rev; |
251 | uci->cpu_sig.sig = eax; | 308 | uci->cpu_sig.sig = eax; |
252 | 309 | ||
253 | if (cpu && !ucode_loaded) { | 310 | eax = cpuid_eax(0x00000001); |
254 | void *ucode; | 311 | eq = (struct equiv_cpu_entry *)(container + CONTAINER_HDR_SZ); |
255 | 312 | ||
256 | if (!ucode_size || !initrd_start) | 313 | eq_id = find_equiv_id(eq, eax); |
257 | return; | 314 | if (!eq_id) |
315 | return; | ||
316 | |||
317 | if (eq_id == this_equiv_id) { | ||
318 | mc = (struct microcode_amd *)amd_ucode_patch; | ||
258 | 319 | ||
259 | ucode = (void *)(initrd_start + ucode_offset); | 320 | if (mc && rev < mc->hdr.patch_id) { |
260 | eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); | 321 | if (!__apply_microcode_amd(mc)) |
261 | if (load_microcode_amd(eax, ucode, ucode_size) != UCODE_OK) | 322 | ucode_new_rev = mc->hdr.patch_id; |
323 | } | ||
324 | |||
325 | } else { | ||
326 | if (!ucode_cpio.data) | ||
262 | return; | 327 | return; |
263 | 328 | ||
264 | ucode_loaded = true; | 329 | /* |
330 | * AP has a different equivalence ID than BSP, looks like | ||
331 | * mixed-steppings silicon so go through the ucode blob anew. | ||
332 | */ | ||
333 | apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size); | ||
265 | } | 334 | } |
266 | |||
267 | apply_microcode_amd(cpu); | ||
268 | } | 335 | } |
269 | #endif | 336 | #endif |
270 | 337 | ||
271 | int __init save_microcode_in_initrd_amd(void) | 338 | int __init save_microcode_in_initrd_amd(void) |
272 | { | 339 | { |
273 | enum ucode_state ret; | 340 | enum ucode_state ret; |
274 | void *ucode; | ||
275 | u32 eax; | 341 | u32 eax; |
276 | 342 | ||
277 | #ifdef CONFIG_X86_32 | 343 | #ifdef CONFIG_X86_32 |
@@ -280,22 +346,35 @@ int __init save_microcode_in_initrd_amd(void) | |||
280 | 346 | ||
281 | if (!uci->cpu_sig.sig) | 347 | if (!uci->cpu_sig.sig) |
282 | smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1); | 348 | smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1); |
349 | |||
350 | /* | ||
351 | * Take into account the fact that the ramdisk might get relocated | ||
352 | * and therefore we need to recompute the container's position in | ||
353 | * virtual memory space. | ||
354 | */ | ||
355 | container = (u8 *)(__va((u32)relocated_ramdisk) + | ||
356 | ((u32)container - boot_params.hdr.ramdisk_image)); | ||
283 | #endif | 357 | #endif |
284 | if (ucode_new_rev) | 358 | if (ucode_new_rev) |
285 | pr_info("microcode: updated early to new patch_level=0x%08x\n", | 359 | pr_info("microcode: updated early to new patch_level=0x%08x\n", |
286 | ucode_new_rev); | 360 | ucode_new_rev); |
287 | 361 | ||
288 | if (ucode_loaded || !ucode_size || !initrd_start) | 362 | if (!container) |
289 | return 0; | 363 | return -EINVAL; |
290 | 364 | ||
291 | ucode = (void *)(initrd_start + ucode_offset); | ||
292 | eax = cpuid_eax(0x00000001); | 365 | eax = cpuid_eax(0x00000001); |
293 | eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); | 366 | eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); |
294 | 367 | ||
295 | ret = load_microcode_amd(eax, ucode, ucode_size); | 368 | ret = load_microcode_amd(eax, container, container_size); |
296 | if (ret != UCODE_OK) | 369 | if (ret != UCODE_OK) |
297 | return -EINVAL; | 370 | return -EINVAL; |
298 | 371 | ||
299 | ucode_loaded = true; | 372 | /* |
373 | * This will be freed any msec now, stash patches for the current | ||
374 | * family and switch to patch cache for cpu hotplug, etc later. | ||
375 | */ | ||
376 | container = NULL; | ||
377 | container_size = 0; | ||
378 | |||
300 | return 0; | 379 | return 0; |
301 | } | 380 | } |
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/cpu/microcode/core.c index 15c987698b0f..15c987698b0f 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/cpu/microcode/core.c | |||
diff --git a/arch/x86/kernel/microcode_core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c index be7f8514f577..be7f8514f577 100644 --- a/arch/x86/kernel/microcode_core_early.c +++ b/arch/x86/kernel/cpu/microcode/core_early.c | |||
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 5fb2cebf556b..a276fa75d9b5 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c | |||
@@ -278,7 +278,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device, | |||
278 | sprintf(name, "intel-ucode/%02x-%02x-%02x", | 278 | sprintf(name, "intel-ucode/%02x-%02x-%02x", |
279 | c->x86, c->x86_model, c->x86_mask); | 279 | c->x86, c->x86_model, c->x86_mask); |
280 | 280 | ||
281 | if (request_firmware(&firmware, name, device)) { | 281 | if (request_firmware_direct(&firmware, name, device)) { |
282 | pr_debug("data file %s load failed\n", name); | 282 | pr_debug("data file %s load failed\n", name); |
283 | return UCODE_NFOUND; | 283 | return UCODE_NFOUND; |
284 | } | 284 | } |
diff --git a/arch/x86/kernel/microcode_intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c index 1575deb2e636..18f739129e72 100644 --- a/arch/x86/kernel/microcode_intel_early.c +++ b/arch/x86/kernel/cpu/microcode/intel_early.c | |||
@@ -365,16 +365,6 @@ out: | |||
365 | return state; | 365 | return state; |
366 | } | 366 | } |
367 | 367 | ||
368 | #define native_rdmsr(msr, val1, val2) \ | ||
369 | do { \ | ||
370 | u64 __val = native_read_msr((msr)); \ | ||
371 | (void)((val1) = (u32)__val); \ | ||
372 | (void)((val2) = (u32)(__val >> 32)); \ | ||
373 | } while (0) | ||
374 | |||
375 | #define native_wrmsr(msr, low, high) \ | ||
376 | native_write_msr(msr, low, high); | ||
377 | |||
378 | static int collect_cpu_info_early(struct ucode_cpu_info *uci) | 368 | static int collect_cpu_info_early(struct ucode_cpu_info *uci) |
379 | { | 369 | { |
380 | unsigned int val[2]; | 370 | unsigned int val[2]; |
diff --git a/arch/x86/kernel/microcode_intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c index ce69320d0179..ce69320d0179 100644 --- a/arch/x86/kernel/microcode_intel_lib.c +++ b/arch/x86/kernel/cpu/microcode/intel_lib.c | |||
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 8e132931614d..b88645191fe5 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c | |||
@@ -1883,21 +1883,27 @@ static struct pmu pmu = { | |||
1883 | 1883 | ||
1884 | void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) | 1884 | void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) |
1885 | { | 1885 | { |
1886 | struct cyc2ns_data *data; | ||
1887 | |||
1886 | userpg->cap_user_time = 0; | 1888 | userpg->cap_user_time = 0; |
1887 | userpg->cap_user_time_zero = 0; | 1889 | userpg->cap_user_time_zero = 0; |
1888 | userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc; | 1890 | userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc; |
1889 | userpg->pmc_width = x86_pmu.cntval_bits; | 1891 | userpg->pmc_width = x86_pmu.cntval_bits; |
1890 | 1892 | ||
1891 | if (!sched_clock_stable) | 1893 | if (!sched_clock_stable()) |
1892 | return; | 1894 | return; |
1893 | 1895 | ||
1896 | data = cyc2ns_read_begin(); | ||
1897 | |||
1894 | userpg->cap_user_time = 1; | 1898 | userpg->cap_user_time = 1; |
1895 | userpg->time_mult = this_cpu_read(cyc2ns); | 1899 | userpg->time_mult = data->cyc2ns_mul; |
1896 | userpg->time_shift = CYC2NS_SCALE_FACTOR; | 1900 | userpg->time_shift = data->cyc2ns_shift; |
1897 | userpg->time_offset = this_cpu_read(cyc2ns_offset) - now; | 1901 | userpg->time_offset = data->cyc2ns_offset - now; |
1898 | 1902 | ||
1899 | userpg->cap_user_time_zero = 1; | 1903 | userpg->cap_user_time_zero = 1; |
1900 | userpg->time_zero = this_cpu_read(cyc2ns_offset); | 1904 | userpg->time_zero = data->cyc2ns_offset; |
1905 | |||
1906 | cyc2ns_read_end(data); | ||
1901 | } | 1907 | } |
1902 | 1908 | ||
1903 | /* | 1909 | /* |
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index fd00bb29425d..c1a861829d81 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h | |||
@@ -262,11 +262,20 @@ struct cpu_hw_events { | |||
262 | __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \ | 262 | __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \ |
263 | HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW) | 263 | HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW) |
264 | 264 | ||
265 | #define EVENT_CONSTRAINT_END \ | 265 | /* |
266 | EVENT_CONSTRAINT(0, 0, 0) | 266 | * We define the end marker as having a weight of -1 |
267 | * to enable blacklisting of events using a counter bitmask | ||
268 | * of zero and thus a weight of zero. | ||
269 | * The end marker has a weight that cannot possibly be | ||
270 | * obtained from counting the bits in the bitmask. | ||
271 | */ | ||
272 | #define EVENT_CONSTRAINT_END { .weight = -1 } | ||
267 | 273 | ||
274 | /* | ||
275 | * Check for end marker with weight == -1 | ||
276 | */ | ||
268 | #define for_each_event_constraint(e, c) \ | 277 | #define for_each_event_constraint(e, c) \ |
269 | for ((e) = (c); (e)->weight; (e)++) | 278 | for ((e) = (c); (e)->weight != -1; (e)++) |
270 | 279 | ||
271 | /* | 280 | /* |
272 | * Extra registers for specific events. | 281 | * Extra registers for specific events. |
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index e09f0bfb7b8f..4b8e4d3cd6ea 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c | |||
@@ -10,6 +10,7 @@ | |||
10 | #include <linux/module.h> | 10 | #include <linux/module.h> |
11 | #include <linux/pci.h> | 11 | #include <linux/pci.h> |
12 | #include <linux/ptrace.h> | 12 | #include <linux/ptrace.h> |
13 | #include <linux/syscore_ops.h> | ||
13 | 14 | ||
14 | #include <asm/apic.h> | 15 | #include <asm/apic.h> |
15 | 16 | ||
@@ -816,6 +817,18 @@ out: | |||
816 | return ret; | 817 | return ret; |
817 | } | 818 | } |
818 | 819 | ||
820 | static void ibs_eilvt_setup(void) | ||
821 | { | ||
822 | /* | ||
823 | * Force LVT offset assignment for family 10h: The offsets are | ||
824 | * not assigned by the BIOS for this family, so the OS is | ||
825 | * responsible for doing it. If the OS assignment fails, fall | ||
826 | * back to BIOS settings and try to setup this. | ||
827 | */ | ||
828 | if (boot_cpu_data.x86 == 0x10) | ||
829 | force_ibs_eilvt_setup(); | ||
830 | } | ||
831 | |||
819 | static inline int get_ibs_lvt_offset(void) | 832 | static inline int get_ibs_lvt_offset(void) |
820 | { | 833 | { |
821 | u64 val; | 834 | u64 val; |
@@ -851,6 +864,36 @@ static void clear_APIC_ibs(void *dummy) | |||
851 | setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1); | 864 | setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1); |
852 | } | 865 | } |
853 | 866 | ||
867 | #ifdef CONFIG_PM | ||
868 | |||
869 | static int perf_ibs_suspend(void) | ||
870 | { | ||
871 | clear_APIC_ibs(NULL); | ||
872 | return 0; | ||
873 | } | ||
874 | |||
875 | static void perf_ibs_resume(void) | ||
876 | { | ||
877 | ibs_eilvt_setup(); | ||
878 | setup_APIC_ibs(NULL); | ||
879 | } | ||
880 | |||
881 | static struct syscore_ops perf_ibs_syscore_ops = { | ||
882 | .resume = perf_ibs_resume, | ||
883 | .suspend = perf_ibs_suspend, | ||
884 | }; | ||
885 | |||
886 | static void perf_ibs_pm_init(void) | ||
887 | { | ||
888 | register_syscore_ops(&perf_ibs_syscore_ops); | ||
889 | } | ||
890 | |||
891 | #else | ||
892 | |||
893 | static inline void perf_ibs_pm_init(void) { } | ||
894 | |||
895 | #endif | ||
896 | |||
854 | static int | 897 | static int |
855 | perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) | 898 | perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) |
856 | { | 899 | { |
@@ -877,18 +920,12 @@ static __init int amd_ibs_init(void) | |||
877 | if (!caps) | 920 | if (!caps) |
878 | return -ENODEV; /* ibs not supported by the cpu */ | 921 | return -ENODEV; /* ibs not supported by the cpu */ |
879 | 922 | ||
880 | /* | 923 | ibs_eilvt_setup(); |
881 | * Force LVT offset assignment for family 10h: The offsets are | ||
882 | * not assigned by the BIOS for this family, so the OS is | ||
883 | * responsible for doing it. If the OS assignment fails, fall | ||
884 | * back to BIOS settings and try to setup this. | ||
885 | */ | ||
886 | if (boot_cpu_data.x86 == 0x10) | ||
887 | force_ibs_eilvt_setup(); | ||
888 | 924 | ||
889 | if (!ibs_eilvt_valid()) | 925 | if (!ibs_eilvt_valid()) |
890 | goto out; | 926 | goto out; |
891 | 927 | ||
928 | perf_ibs_pm_init(); | ||
892 | get_online_cpus(); | 929 | get_online_cpus(); |
893 | ibs_caps = caps; | 930 | ibs_caps = caps; |
894 | /* make ibs_caps visible to other cpus: */ | 931 | /* make ibs_caps visible to other cpus: */ |
diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c new file mode 100644 index 000000000000..5ad35ad94d0f --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c | |||
@@ -0,0 +1,679 @@ | |||
1 | /* | ||
2 | * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters | ||
3 | * Copyright (C) 2013 Google, Inc., Stephane Eranian | ||
4 | * | ||
5 | * Intel RAPL interface is specified in the IA-32 Manual Vol3b | ||
6 | * section 14.7.1 (September 2013) | ||
7 | * | ||
8 | * RAPL provides more controls than just reporting energy consumption | ||
9 | * however here we only expose the 3 energy consumption free running | ||
10 | * counters (pp0, pkg, plus dram on servers or pp1/gpu on clients). | |
11 | * | ||
12 | * Each of those counters increments in a power unit defined by the | ||
13 | * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules | ||
14 | * but it can vary. | ||
15 | * | ||
16 | * Counter to rapl events mappings: | ||
17 | * | ||
18 | * pp0 counter: consumption of all physical cores (power plane 0) | ||
19 | * event: rapl_energy_cores | ||
20 | * perf code: 0x1 | ||
21 | * | ||
22 | * pkg counter: consumption of the whole processor package | ||
23 | * event: rapl_energy_pkg | ||
24 | * perf code: 0x2 | ||
25 | * | ||
26 | * dram counter: consumption of the dram domain (servers only) | ||
27 | * event: rapl_energy_dram | ||
28 | * perf code: 0x3 | ||
29 | * | ||
30 | * pp1 counter: consumption of the builtin-gpu domain (client only) | |
31 | * event: rapl_energy_gpu | ||
32 | * perf code: 0x4 | ||
33 | * | ||
34 | * We manage those counters as free running (read-only). They may be | ||
35 | * used simultaneously by other tools, such as turbostat. | |
36 | * | ||
37 | * The events only support system-wide mode counting. There is no | ||
38 | * sampling support because it does not make sense and is not | ||
39 | * supported by the RAPL hardware. | ||
40 | * | ||
41 | * Because we want to avoid floating-point operations in the kernel, | ||
42 | * the events are all reported in fixed point arithmetic (32.32). | ||
43 | * Tools must adjust the counts to convert them to Watts using | ||
44 | * the duration of the measurement. Tools may use a function such as | ||
45 | * ldexp(raw_count, -32); | ||
46 | */ | ||
47 | #include <linux/module.h> | ||
48 | #include <linux/slab.h> | ||
49 | #include <linux/perf_event.h> | ||
50 | #include <asm/cpu_device_id.h> | ||
51 | #include "perf_event.h" | ||
52 | |||
53 | /* | ||
54 | * RAPL energy status counters | ||
55 | */ | ||
56 | #define RAPL_IDX_PP0_NRG_STAT 0 /* all cores */ | ||
57 | #define INTEL_RAPL_PP0 0x1 /* pseudo-encoding */ | ||
58 | #define RAPL_IDX_PKG_NRG_STAT 1 /* entire package */ | ||
59 | #define INTEL_RAPL_PKG 0x2 /* pseudo-encoding */ | ||
60 | #define RAPL_IDX_RAM_NRG_STAT 2 /* DRAM */ | ||
61 | #define INTEL_RAPL_RAM 0x3 /* pseudo-encoding */ | ||
62 | #define RAPL_IDX_PP1_NRG_STAT 3 /* gpu */ | |
63 | #define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */ | ||
64 | |||
65 | /* Clients have PP0, PKG, PP1 (builtin gpu) */ | |
66 | #define RAPL_IDX_CLN (1<<RAPL_IDX_PP0_NRG_STAT|\ | ||
67 | 1<<RAPL_IDX_PKG_NRG_STAT|\ | ||
68 | 1<<RAPL_IDX_PP1_NRG_STAT) | ||
69 | |||
70 | /* Servers have PP0, PKG, RAM */ | ||
71 | #define RAPL_IDX_SRV (1<<RAPL_IDX_PP0_NRG_STAT|\ | ||
72 | 1<<RAPL_IDX_PKG_NRG_STAT|\ | ||
73 | 1<<RAPL_IDX_RAM_NRG_STAT) | ||
74 | |||
75 | /* | ||
76 | * event code: LSB 8 bits, passed in attr->config | ||
77 | * any other bit is reserved | ||
78 | */ | ||
79 | #define RAPL_EVENT_MASK 0xFFULL | ||
80 | |||
81 | #define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format) \ | ||
82 | static ssize_t __rapl_##_var##_show(struct kobject *kobj, \ | ||
83 | struct kobj_attribute *attr, \ | ||
84 | char *page) \ | ||
85 | { \ | ||
86 | BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ | ||
87 | return sprintf(page, _format "\n"); \ | ||
88 | } \ | ||
89 | static struct kobj_attribute format_attr_##_var = \ | ||
90 | __ATTR(_name, 0444, __rapl_##_var##_show, NULL) | ||
91 | |||
92 | #define RAPL_EVENT_DESC(_name, _config) \ | ||
93 | { \ | ||
94 | .attr = __ATTR(_name, 0444, rapl_event_show, NULL), \ | ||
95 | .config = _config, \ | ||
96 | } | ||
97 | |||
98 | #define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */ | ||
99 | |||
100 | struct rapl_pmu { | ||
101 | spinlock_t lock; | ||
102 | int hw_unit; /* 1/2^hw_unit Joule */ | ||
103 | int n_active; /* number of active events */ | ||
104 | struct list_head active_list; | ||
105 | struct pmu *pmu; /* pointer to rapl_pmu_class */ | ||
106 | ktime_t timer_interval; /* in ktime_t unit */ | ||
107 | struct hrtimer hrtimer; | ||
108 | }; | ||
109 | |||
110 | static struct pmu rapl_pmu_class; | ||
111 | static cpumask_t rapl_cpu_mask; | ||
112 | static int rapl_cntr_mask; | ||
113 | |||
114 | static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu); | ||
115 | static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free); | ||
116 | |||
117 | static inline u64 rapl_read_counter(struct perf_event *event) | ||
118 | { | ||
119 | u64 raw; | ||
120 | rdmsrl(event->hw.event_base, raw); | ||
121 | return raw; | ||
122 | } | ||
123 | |||
124 | static inline u64 rapl_scale(u64 v) | ||
125 | { | ||
126 | /* | ||
127 | * scale delta to smallest unit (1/2^32) | ||
128 | * users must then scale back: count * 1/(2^32) to get Joules | |
129 | * or use ldexp(count, -32). | ||
130 | * Watts = Joules/Time delta | ||
131 | */ | ||
132 | return v << (32 - __get_cpu_var(rapl_pmu)->hw_unit); | ||
133 | } | ||
134 | |||
135 | static u64 rapl_event_update(struct perf_event *event) | ||
136 | { | ||
137 | struct hw_perf_event *hwc = &event->hw; | ||
138 | u64 prev_raw_count, new_raw_count; | ||
139 | s64 delta, sdelta; | ||
140 | int shift = RAPL_CNTR_WIDTH; | ||
141 | |||
142 | again: | ||
143 | prev_raw_count = local64_read(&hwc->prev_count); | ||
144 | rdmsrl(event->hw.event_base, new_raw_count); | ||
145 | |||
146 | if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, | ||
147 | new_raw_count) != prev_raw_count) { | ||
148 | cpu_relax(); | ||
149 | goto again; | ||
150 | } | ||
151 | |||
152 | /* | ||
153 | * Now we have the new raw value and have updated the prev | ||
154 | * timestamp already. We can now calculate the elapsed delta | ||
155 | * (event-)time and add that to the generic event. | ||
156 | * | ||
157 | * Careful, not all hw sign-extends above the physical width | ||
158 | * of the count. | ||
159 | */ | ||
160 | delta = (new_raw_count << shift) - (prev_raw_count << shift); | ||
161 | delta >>= shift; | ||
162 | |||
163 | sdelta = rapl_scale(delta); | ||
164 | |||
165 | local64_add(sdelta, &event->count); | ||
166 | |||
167 | return new_raw_count; | ||
168 | } | ||
169 | |||
170 | static void rapl_start_hrtimer(struct rapl_pmu *pmu) | ||
171 | { | ||
172 | __hrtimer_start_range_ns(&pmu->hrtimer, | ||
173 | pmu->timer_interval, 0, | ||
174 | HRTIMER_MODE_REL_PINNED, 0); | ||
175 | } | ||
176 | |||
177 | static void rapl_stop_hrtimer(struct rapl_pmu *pmu) | ||
178 | { | ||
179 | hrtimer_cancel(&pmu->hrtimer); | ||
180 | } | ||
181 | |||
182 | static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) | ||
183 | { | ||
184 | struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu); | ||
185 | struct perf_event *event; | ||
186 | unsigned long flags; | ||
187 | |||
188 | if (!pmu->n_active) | ||
189 | return HRTIMER_NORESTART; | ||
190 | |||
191 | spin_lock_irqsave(&pmu->lock, flags); | ||
192 | |||
193 | list_for_each_entry(event, &pmu->active_list, active_entry) { | ||
194 | rapl_event_update(event); | ||
195 | } | ||
196 | |||
197 | spin_unlock_irqrestore(&pmu->lock, flags); | ||
198 | |||
199 | hrtimer_forward_now(hrtimer, pmu->timer_interval); | ||
200 | |||
201 | return HRTIMER_RESTART; | ||
202 | } | ||
203 | |||
204 | static void rapl_hrtimer_init(struct rapl_pmu *pmu) | ||
205 | { | ||
206 | struct hrtimer *hr = &pmu->hrtimer; | ||
207 | |||
208 | hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
209 | hr->function = rapl_hrtimer_handle; | ||
210 | } | ||
211 | |||
212 | static void __rapl_pmu_event_start(struct rapl_pmu *pmu, | ||
213 | struct perf_event *event) | ||
214 | { | ||
215 | if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) | ||
216 | return; | ||
217 | |||
218 | event->hw.state = 0; | ||
219 | |||
220 | list_add_tail(&event->active_entry, &pmu->active_list); | ||
221 | |||
222 | local64_set(&event->hw.prev_count, rapl_read_counter(event)); | ||
223 | |||
224 | pmu->n_active++; | ||
225 | if (pmu->n_active == 1) | ||
226 | rapl_start_hrtimer(pmu); | ||
227 | } | ||
228 | |||
229 | static void rapl_pmu_event_start(struct perf_event *event, int mode) | ||
230 | { | ||
231 | struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu); | ||
232 | unsigned long flags; | ||
233 | |||
234 | spin_lock_irqsave(&pmu->lock, flags); | ||
235 | __rapl_pmu_event_start(pmu, event); | ||
236 | spin_unlock_irqrestore(&pmu->lock, flags); | ||
237 | } | ||
238 | |||
239 | static void rapl_pmu_event_stop(struct perf_event *event, int mode) | ||
240 | { | ||
241 | struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu); | ||
242 | struct hw_perf_event *hwc = &event->hw; | ||
243 | unsigned long flags; | ||
244 | |||
245 | spin_lock_irqsave(&pmu->lock, flags); | ||
246 | |||
247 | /* mark event as deactivated and stopped */ | ||
248 | if (!(hwc->state & PERF_HES_STOPPED)) { | ||
249 | WARN_ON_ONCE(pmu->n_active <= 0); | ||
250 | pmu->n_active--; | ||
251 | if (pmu->n_active == 0) | ||
252 | rapl_stop_hrtimer(pmu); | ||
253 | |||
254 | list_del(&event->active_entry); | ||
255 | |||
256 | WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); | ||
257 | hwc->state |= PERF_HES_STOPPED; | ||
258 | } | ||
259 | |||
260 | /* check if update of sw counter is necessary */ | ||
261 | if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { | ||
262 | /* | ||
263 | * Drain the remaining delta count out of an event | |
264 | * that we are disabling: | ||
265 | */ | ||
266 | rapl_event_update(event); | ||
267 | hwc->state |= PERF_HES_UPTODATE; | ||
268 | } | ||
269 | |||
270 | spin_unlock_irqrestore(&pmu->lock, flags); | ||
271 | } | ||
272 | |||
273 | static int rapl_pmu_event_add(struct perf_event *event, int mode) | ||
274 | { | ||
275 | struct rapl_pmu *pmu = __get_cpu_var(rapl_pmu); | ||
276 | struct hw_perf_event *hwc = &event->hw; | ||
277 | unsigned long flags; | ||
278 | |||
279 | spin_lock_irqsave(&pmu->lock, flags); | ||
280 | |||
281 | hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; | ||
282 | |||
283 | if (mode & PERF_EF_START) | ||
284 | __rapl_pmu_event_start(pmu, event); | ||
285 | |||
286 | spin_unlock_irqrestore(&pmu->lock, flags); | ||
287 | |||
288 | return 0; | ||
289 | } | ||
290 | |||
291 | static void rapl_pmu_event_del(struct perf_event *event, int flags) | ||
292 | { | ||
293 | rapl_pmu_event_stop(event, PERF_EF_UPDATE); | ||
294 | } | ||
295 | |||
296 | static int rapl_pmu_event_init(struct perf_event *event) | ||
297 | { | ||
298 | u64 cfg = event->attr.config & RAPL_EVENT_MASK; | ||
299 | int bit, msr, ret = 0; | ||
300 | |||
301 | /* only look at RAPL events */ | ||
302 | if (event->attr.type != rapl_pmu_class.type) | ||
303 | return -ENOENT; | ||
304 | |||
305 | /* check only supported bits are set */ | ||
306 | if (event->attr.config & ~RAPL_EVENT_MASK) | ||
307 | return -EINVAL; | ||
308 | |||
309 | /* | ||
310 | * check event is known (determines counter) | ||
311 | */ | ||
312 | switch (cfg) { | ||
313 | case INTEL_RAPL_PP0: | ||
314 | bit = RAPL_IDX_PP0_NRG_STAT; | ||
315 | msr = MSR_PP0_ENERGY_STATUS; | ||
316 | break; | ||
317 | case INTEL_RAPL_PKG: | ||
318 | bit = RAPL_IDX_PKG_NRG_STAT; | ||
319 | msr = MSR_PKG_ENERGY_STATUS; | ||
320 | break; | ||
321 | case INTEL_RAPL_RAM: | ||
322 | bit = RAPL_IDX_RAM_NRG_STAT; | ||
323 | msr = MSR_DRAM_ENERGY_STATUS; | ||
324 | break; | ||
325 | case INTEL_RAPL_PP1: | ||
326 | bit = RAPL_IDX_PP1_NRG_STAT; | ||
327 | msr = MSR_PP1_ENERGY_STATUS; | ||
328 | break; | ||
329 | default: | ||
330 | return -EINVAL; | ||
331 | } | ||
332 | /* check event supported */ | ||
333 | if (!(rapl_cntr_mask & (1 << bit))) | ||
334 | return -EINVAL; | ||
335 | |||
336 | /* unsupported modes and filters */ | ||
337 | if (event->attr.exclude_user || | ||
338 | event->attr.exclude_kernel || | ||
339 | event->attr.exclude_hv || | ||
340 | event->attr.exclude_idle || | ||
341 | event->attr.exclude_host || | ||
342 | event->attr.exclude_guest || | ||
343 | event->attr.sample_period) /* no sampling */ | ||
344 | return -EINVAL; | ||
345 | |||
346 | /* must be done before validate_group */ | ||
347 | event->hw.event_base = msr; | ||
348 | event->hw.config = cfg; | ||
349 | event->hw.idx = bit; | ||
350 | |||
351 | return ret; | ||
352 | } | ||
353 | |||
354 | static void rapl_pmu_event_read(struct perf_event *event) | ||
355 | { | ||
356 | rapl_event_update(event); | ||
357 | } | ||
358 | |||
359 | static ssize_t rapl_get_attr_cpumask(struct device *dev, | ||
360 | struct device_attribute *attr, char *buf) | ||
361 | { | ||
362 | int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &rapl_cpu_mask); | ||
363 | |||
364 | buf[n++] = '\n'; | ||
365 | buf[n] = '\0'; | ||
366 | return n; | ||
367 | } | ||
368 | |||
369 | static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL); | ||
370 | |||
371 | static struct attribute *rapl_pmu_attrs[] = { | ||
372 | &dev_attr_cpumask.attr, | ||
373 | NULL, | ||
374 | }; | ||
375 | |||
376 | static struct attribute_group rapl_pmu_attr_group = { | ||
377 | .attrs = rapl_pmu_attrs, | ||
378 | }; | ||
379 | |||
380 | EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01"); | ||
381 | EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02"); | ||
382 | EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03"); | ||
383 | EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04"); | ||
384 | |||
385 | EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules"); | ||
386 | EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules"); | ||
387 | EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules"); | ||
388 | EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules"); | ||
389 | |||
390 | /* | ||
391 | * we compute in 0.23 nJ increments regardless of MSR | ||
392 | */ | ||
393 | EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10"); | ||
394 | EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10"); | ||
395 | EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10"); | ||
396 | EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10"); | ||
397 | |||
398 | static struct attribute *rapl_events_srv_attr[] = { | ||
399 | EVENT_PTR(rapl_cores), | ||
400 | EVENT_PTR(rapl_pkg), | ||
401 | EVENT_PTR(rapl_ram), | ||
402 | |||
403 | EVENT_PTR(rapl_cores_unit), | ||
404 | EVENT_PTR(rapl_pkg_unit), | ||
405 | EVENT_PTR(rapl_ram_unit), | ||
406 | |||
407 | EVENT_PTR(rapl_cores_scale), | ||
408 | EVENT_PTR(rapl_pkg_scale), | ||
409 | EVENT_PTR(rapl_ram_scale), | ||
410 | NULL, | ||
411 | }; | ||
412 | |||
413 | static struct attribute *rapl_events_cln_attr[] = { | ||
414 | EVENT_PTR(rapl_cores), | ||
415 | EVENT_PTR(rapl_pkg), | ||
416 | EVENT_PTR(rapl_gpu), | ||
417 | |||
418 | EVENT_PTR(rapl_cores_unit), | ||
419 | EVENT_PTR(rapl_pkg_unit), | ||
420 | EVENT_PTR(rapl_gpu_unit), | ||
421 | |||
422 | EVENT_PTR(rapl_cores_scale), | ||
423 | EVENT_PTR(rapl_pkg_scale), | ||
424 | EVENT_PTR(rapl_gpu_scale), | ||
425 | NULL, | ||
426 | }; | ||
427 | |||
428 | static struct attribute_group rapl_pmu_events_group = { | ||
429 | .name = "events", | ||
430 | .attrs = NULL, /* patched at runtime */ | ||
431 | }; | ||
432 | |||
433 | DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7"); | ||
434 | static struct attribute *rapl_formats_attr[] = { | ||
435 | &format_attr_event.attr, | ||
436 | NULL, | ||
437 | }; | ||
438 | |||
439 | static struct attribute_group rapl_pmu_format_group = { | ||
440 | .name = "format", | ||
441 | .attrs = rapl_formats_attr, | ||
442 | }; | ||
443 | |||
444 | const struct attribute_group *rapl_attr_groups[] = { | ||
445 | &rapl_pmu_attr_group, | ||
446 | &rapl_pmu_format_group, | ||
447 | &rapl_pmu_events_group, | ||
448 | NULL, | ||
449 | }; | ||
450 | |||
451 | static struct pmu rapl_pmu_class = { | ||
452 | .attr_groups = rapl_attr_groups, | ||
453 | .task_ctx_nr = perf_invalid_context, /* system-wide only */ | ||
454 | .event_init = rapl_pmu_event_init, | ||
455 | .add = rapl_pmu_event_add, /* must have */ | ||
456 | .del = rapl_pmu_event_del, /* must have */ | ||
457 | .start = rapl_pmu_event_start, | ||
458 | .stop = rapl_pmu_event_stop, | ||
459 | .read = rapl_pmu_event_read, | ||
460 | }; | ||
461 | |||
462 | static void rapl_cpu_exit(int cpu) | ||
463 | { | ||
464 | struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); | ||
465 | int i, phys_id = topology_physical_package_id(cpu); | ||
466 | int target = -1; | ||
467 | |||
468 | /* find a new cpu on same package */ | ||
469 | for_each_online_cpu(i) { | ||
470 | if (i == cpu) | ||
471 | continue; | ||
472 | if (phys_id == topology_physical_package_id(i)) { | ||
473 | target = i; | ||
474 | break; | ||
475 | } | ||
476 | } | ||
477 | /* | ||
478 | * clear cpu from cpumask | ||
479 | * if was set in cpumask and still some cpu on package, | ||
480 | * then move to new cpu | ||
481 | */ | ||
482 | if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0) | ||
483 | cpumask_set_cpu(target, &rapl_cpu_mask); | ||
484 | |||
485 | WARN_ON(cpumask_empty(&rapl_cpu_mask)); | ||
486 | /* | ||
487 | * migrate events and context to new cpu | ||
488 | */ | ||
489 | if (target >= 0) | ||
490 | perf_pmu_migrate_context(pmu->pmu, cpu, target); | ||
491 | |||
492 | /* cancel overflow polling timer for CPU */ | ||
493 | rapl_stop_hrtimer(pmu); | ||
494 | } | ||
495 | |||
496 | static void rapl_cpu_init(int cpu) | ||
497 | { | ||
498 | int i, phys_id = topology_physical_package_id(cpu); | ||
499 | |||
500 | /* check if phys_id is already covered */ | |
501 | for_each_cpu(i, &rapl_cpu_mask) { | ||
502 | if (phys_id == topology_physical_package_id(i)) | ||
503 | return; | ||
504 | } | ||
505 | /* was not found, so add it */ | ||
506 | cpumask_set_cpu(cpu, &rapl_cpu_mask); | ||
507 | } | ||
508 | |||
509 | static int rapl_cpu_prepare(int cpu) | ||
510 | { | ||
511 | struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); | ||
512 | int phys_id = topology_physical_package_id(cpu); | ||
513 | u64 ms; | ||
514 | |||
515 | if (pmu) | ||
516 | return 0; | ||
517 | |||
518 | if (phys_id < 0) | ||
519 | return -1; | ||
520 | |||
521 | pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); | ||
522 | if (!pmu) | ||
523 | return -1; | ||
524 | |||
525 | spin_lock_init(&pmu->lock); | ||
526 | |||
527 | INIT_LIST_HEAD(&pmu->active_list); | ||
528 | |||
529 | /* | ||
530 | * grab power unit as: 1/2^unit Joules | ||
531 | * | ||
532 | * we cache in local PMU instance | ||
533 | */ | ||
534 | rdmsrl(MSR_RAPL_POWER_UNIT, pmu->hw_unit); | ||
535 | pmu->hw_unit = (pmu->hw_unit >> 8) & 0x1FULL; | ||
536 | pmu->pmu = &rapl_pmu_class; | ||
537 | |||
538 | /* | ||
539 | * use reference of 200W for scaling the timeout | ||
540 | * to avoid missing counter overflows. | ||
541 | * 200W = 200 Joules/sec | ||
542 | * divide interval by 2 to avoid lockstep (2 * 100) | ||
543 | * if hw unit is 32, then we use 2 ms 1/200/2 | ||
544 | */ | ||
545 | if (pmu->hw_unit < 32) | ||
546 | ms = (1000 / (2 * 100)) * (1ULL << (32 - pmu->hw_unit - 1)); | ||
547 | else | ||
548 | ms = 2; | ||
549 | |||
550 | pmu->timer_interval = ms_to_ktime(ms); | ||
551 | |||
552 | rapl_hrtimer_init(pmu); | ||
553 | |||
554 | /* set RAPL pmu for this cpu for now */ | ||
555 | per_cpu(rapl_pmu, cpu) = pmu; | ||
556 | per_cpu(rapl_pmu_to_free, cpu) = NULL; | ||
557 | |||
558 | return 0; | ||
559 | } | ||
560 | |||
561 | static void rapl_cpu_kfree(int cpu) | ||
562 | { | ||
563 | struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu); | ||
564 | |||
565 | kfree(pmu); | ||
566 | |||
567 | per_cpu(rapl_pmu_to_free, cpu) = NULL; | ||
568 | } | ||
569 | |||
570 | static int rapl_cpu_dying(int cpu) | ||
571 | { | ||
572 | struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); | ||
573 | |||
574 | if (!pmu) | ||
575 | return 0; | ||
576 | |||
577 | per_cpu(rapl_pmu, cpu) = NULL; | ||
578 | |||
579 | per_cpu(rapl_pmu_to_free, cpu) = pmu; | ||
580 | |||
581 | return 0; | ||
582 | } | ||
583 | |||
584 | static int rapl_cpu_notifier(struct notifier_block *self, | ||
585 | unsigned long action, void *hcpu) | ||
586 | { | ||
587 | unsigned int cpu = (long)hcpu; | ||
588 | |||
589 | switch (action & ~CPU_TASKS_FROZEN) { | ||
590 | case CPU_UP_PREPARE: | ||
591 | rapl_cpu_prepare(cpu); | ||
592 | break; | ||
593 | case CPU_STARTING: | ||
594 | rapl_cpu_init(cpu); | ||
595 | break; | ||
596 | case CPU_UP_CANCELED: | ||
597 | case CPU_DYING: | ||
598 | rapl_cpu_dying(cpu); | ||
599 | break; | ||
600 | case CPU_ONLINE: | ||
601 | case CPU_DEAD: | ||
602 | rapl_cpu_kfree(cpu); | ||
603 | break; | ||
604 | case CPU_DOWN_PREPARE: | ||
605 | rapl_cpu_exit(cpu); | ||
606 | break; | ||
607 | default: | ||
608 | break; | ||
609 | } | ||
610 | |||
611 | return NOTIFY_OK; | ||
612 | } | ||
613 | |||
614 | static const struct x86_cpu_id rapl_cpu_match[] = { | ||
615 | [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 }, | ||
616 | [1] = {}, | ||
617 | }; | ||
618 | |||
619 | static int __init rapl_pmu_init(void) | ||
620 | { | ||
621 | struct rapl_pmu *pmu; | ||
622 | int cpu, ret; | ||
623 | |||
624 | /* | ||
625 | * check for Intel processor family 6 | ||
626 | */ | ||
627 | if (!x86_match_cpu(rapl_cpu_match)) | ||
628 | return 0; | ||
629 | |||
630 | /* check supported CPU */ | ||
631 | switch (boot_cpu_data.x86_model) { | ||
632 | case 42: /* Sandy Bridge */ | ||
633 | case 58: /* Ivy Bridge */ | ||
634 | case 60: /* Haswell */ | ||
635 | case 69: /* Haswell-Celeron */ | ||
636 | rapl_cntr_mask = RAPL_IDX_CLN; | ||
637 | rapl_pmu_events_group.attrs = rapl_events_cln_attr; | ||
638 | break; | ||
639 | case 45: /* Sandy Bridge-EP */ | ||
640 | case 62: /* IvyTown */ | ||
641 | rapl_cntr_mask = RAPL_IDX_SRV; | ||
642 | rapl_pmu_events_group.attrs = rapl_events_srv_attr; | ||
643 | break; | ||
644 | |||
645 | default: | ||
646 | /* unsupported */ | ||
647 | return 0; | ||
648 | } | ||
649 | get_online_cpus(); | ||
650 | |||
651 | for_each_online_cpu(cpu) { | ||
652 | rapl_cpu_prepare(cpu); | ||
653 | rapl_cpu_init(cpu); | ||
654 | } | ||
655 | |||
656 | perf_cpu_notifier(rapl_cpu_notifier); | ||
657 | |||
658 | ret = perf_pmu_register(&rapl_pmu_class, "power", -1); | ||
659 | if (WARN_ON(ret)) { | ||
660 | pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret); | ||
661 | put_online_cpus(); | ||
662 | return -1; | ||
663 | } | ||
664 | |||
665 | pmu = __get_cpu_var(rapl_pmu); | ||
666 | |||
667 | pr_info("RAPL PMU detected, hw unit 2^-%d Joules," | ||
668 | " API unit is 2^-32 Joules," | ||
669 | " %d fixed counters" | ||
670 | " %llu ms ovfl timer\n", | ||
671 | pmu->hw_unit, | ||
672 | hweight32(rapl_cntr_mask), | ||
673 | ktime_to_ms(pmu->timer_interval)); | ||
674 | |||
675 | put_online_cpus(); | ||
676 | |||
677 | return 0; | ||
678 | } | ||
679 | device_initcall(rapl_pmu_init); | ||
diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c index 88db010845cb..384df5105fbc 100644 --- a/arch/x86/kernel/cpu/rdrand.c +++ b/arch/x86/kernel/cpu/rdrand.c | |||
@@ -31,20 +31,6 @@ static int __init x86_rdrand_setup(char *s) | |||
31 | } | 31 | } |
32 | __setup("nordrand", x86_rdrand_setup); | 32 | __setup("nordrand", x86_rdrand_setup); |
33 | 33 | ||
34 | /* We can't use arch_get_random_long() here since alternatives haven't run */ | ||
35 | static inline int rdrand_long(unsigned long *v) | ||
36 | { | ||
37 | int ok; | ||
38 | asm volatile("1: " RDRAND_LONG "\n\t" | ||
39 | "jc 2f\n\t" | ||
40 | "decl %0\n\t" | ||
41 | "jnz 1b\n\t" | ||
42 | "2:" | ||
43 | : "=r" (ok), "=a" (*v) | ||
44 | : "0" (RDRAND_RETRY_LOOPS)); | ||
45 | return ok; | ||
46 | } | ||
47 | |||
48 | /* | 34 | /* |
49 | * Force a reseed cycle; we are architecturally guaranteed a reseed | 35 | * Force a reseed cycle; we are architecturally guaranteed a reseed |
50 | * after no more than 512 128-bit chunks of random data. This also | 36 | * after no more than 512 128-bit chunks of random data. This also |
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index aa0430d69b90..3fa0e5ad86b4 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c | |||
@@ -1,6 +1,5 @@ | |||
1 | #include <linux/kernel.h> | 1 | #include <linux/kernel.h> |
2 | #include <linux/mm.h> | 2 | #include <linux/mm.h> |
3 | #include <linux/init.h> | ||
4 | #include <asm/processor.h> | 3 | #include <asm/processor.h> |
5 | #include <asm/msr.h> | 4 | #include <asm/msr.h> |
6 | #include "cpu.h" | 5 | #include "cpu.h" |
diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c index 75c5ad5d35cc..ef9c2a0078bd 100644 --- a/arch/x86/kernel/cpu/umc.c +++ b/arch/x86/kernel/cpu/umc.c | |||
@@ -1,5 +1,4 @@ | |||
1 | #include <linux/kernel.h> | 1 | #include <linux/kernel.h> |
2 | #include <linux/init.h> | ||
3 | #include <asm/processor.h> | 2 | #include <asm/processor.h> |
4 | #include "cpu.h" | 3 | #include "cpu.h" |
5 | 4 | ||
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 18677a90d6a3..a57902efe2d5 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c | |||
@@ -7,7 +7,6 @@ | |||
7 | * | 7 | * |
8 | */ | 8 | */ |
9 | 9 | ||
10 | #include <linux/init.h> | ||
11 | #include <linux/types.h> | 10 | #include <linux/types.h> |
12 | #include <linux/kernel.h> | 11 | #include <linux/kernel.h> |
13 | #include <linux/smp.h> | 12 | #include <linux/smp.h> |
diff --git a/arch/x86/kernel/doublefault.c b/arch/x86/kernel/doublefault.c index 5d3fe8d36e4a..f6dfd9334b67 100644 --- a/arch/x86/kernel/doublefault.c +++ b/arch/x86/kernel/doublefault.c | |||
@@ -1,6 +1,5 @@ | |||
1 | #include <linux/mm.h> | 1 | #include <linux/mm.h> |
2 | #include <linux/sched.h> | 2 | #include <linux/sched.h> |
3 | #include <linux/init.h> | ||
4 | #include <linux/init_task.h> | 3 | #include <linux/init_task.h> |
5 | #include <linux/fs.h> | 4 | #include <linux/fs.h> |
6 | 5 | ||
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 174da5fc5a7b..988c00a1f60d 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c | |||
@@ -1120,7 +1120,7 @@ void __init memblock_find_dma_reserve(void) | |||
1120 | nr_pages += end_pfn - start_pfn; | 1120 | nr_pages += end_pfn - start_pfn; |
1121 | } | 1121 | } |
1122 | 1122 | ||
1123 | for_each_free_mem_range(u, MAX_NUMNODES, &start, &end, NULL) { | 1123 | for_each_free_mem_range(u, NUMA_NO_NODE, &start, &end, NULL) { |
1124 | start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN); | 1124 | start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN); |
1125 | end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN); | 1125 | end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN); |
1126 | if (start_pfn < end_pfn) | 1126 | if (start_pfn < end_pfn) |
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 51e2988c5728..a2a4f4697889 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S | |||
@@ -1082,7 +1082,7 @@ ENTRY(ftrace_caller) | |||
1082 | pushl $0 /* Pass NULL as regs pointer */ | 1082 | pushl $0 /* Pass NULL as regs pointer */ |
1083 | movl 4*4(%esp), %eax | 1083 | movl 4*4(%esp), %eax |
1084 | movl 0x4(%ebp), %edx | 1084 | movl 0x4(%ebp), %edx |
1085 | leal function_trace_op, %ecx | 1085 | movl function_trace_op, %ecx |
1086 | subl $MCOUNT_INSN_SIZE, %eax | 1086 | subl $MCOUNT_INSN_SIZE, %eax |
1087 | 1087 | ||
1088 | .globl ftrace_call | 1088 | .globl ftrace_call |
@@ -1140,7 +1140,7 @@ ENTRY(ftrace_regs_caller) | |||
1140 | movl 12*4(%esp), %eax /* Load ip (1st parameter) */ | 1140 | movl 12*4(%esp), %eax /* Load ip (1st parameter) */ |
1141 | subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */ | 1141 | subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */ |
1142 | movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */ | 1142 | movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */ |
1143 | leal function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ | 1143 | movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ |
1144 | pushl %esp /* Save pt_regs as 4th parameter */ | 1144 | pushl %esp /* Save pt_regs as 4th parameter */ |
1145 | 1145 | ||
1146 | GLOBAL(ftrace_regs_call) | 1146 | GLOBAL(ftrace_regs_call) |
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index e21b0785a85b..1e96c3628bf2 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S | |||
@@ -88,7 +88,7 @@ END(function_hook) | |||
88 | MCOUNT_SAVE_FRAME \skip | 88 | MCOUNT_SAVE_FRAME \skip |
89 | 89 | ||
90 | /* Load the ftrace_ops into the 3rd parameter */ | 90 | /* Load the ftrace_ops into the 3rd parameter */ |
91 | leaq function_trace_op, %rdx | 91 | movq function_trace_op(%rip), %rdx |
92 | 92 | ||
93 | /* Load ip into the first parameter */ | 93 | /* Load ip into the first parameter */ |
94 | movq RIP(%rsp), %rdi | 94 | movq RIP(%rsp), %rdi |
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index f66ff162dce8..a67b47c31314 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c | |||
@@ -38,7 +38,6 @@ | |||
38 | #include <linux/kernel.h> | 38 | #include <linux/kernel.h> |
39 | #include <linux/module.h> | 39 | #include <linux/module.h> |
40 | #include <linux/sched.h> | 40 | #include <linux/sched.h> |
41 | #include <linux/init.h> | ||
42 | #include <linux/smp.h> | 41 | #include <linux/smp.h> |
43 | 42 | ||
44 | #include <asm/hw_breakpoint.h> | 43 | #include <asm/hw_breakpoint.h> |
diff --git a/arch/x86/kernel/iosf_mbi.c b/arch/x86/kernel/iosf_mbi.c new file mode 100644 index 000000000000..c3aae6672843 --- /dev/null +++ b/arch/x86/kernel/iosf_mbi.c | |||
@@ -0,0 +1,226 @@ | |||
1 | /* | ||
2 | * IOSF-SB MailBox Interface Driver | ||
3 | * Copyright (c) 2013, Intel Corporation. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify it | ||
6 | * under the terms and conditions of the GNU General Public License, | ||
7 | * version 2, as published by the Free Software Foundation. | ||
8 | * | ||
9 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
12 | * more details. | ||
13 | * | ||
14 | * | ||
15 | * The IOSF-SB is a fabric bus available on Atom based SOC's that uses a | ||
16 | * mailbox interface (MBI) to communicate with multiple devices. This | ||
17 | * driver implements access to this interface for those platforms that can | ||
18 | * enumerate the device using PCI. | ||
19 | */ | ||
20 | |||
21 | #include <linux/module.h> | ||
22 | #include <linux/init.h> | ||
23 | #include <linux/spinlock.h> | ||
24 | #include <linux/pci.h> | ||
25 | |||
26 | #include <asm/iosf_mbi.h> | ||
27 | |||
28 | static DEFINE_SPINLOCK(iosf_mbi_lock); | ||
29 | |||
30 | static inline u32 iosf_mbi_form_mcr(u8 op, u8 port, u8 offset) | ||
31 | { | ||
32 | return (op << 24) | (port << 16) | (offset << 8) | MBI_ENABLE; | ||
33 | } | ||
34 | |||
35 | static struct pci_dev *mbi_pdev; /* one mbi device */ | ||
36 | |||
37 | static int iosf_mbi_pci_read_mdr(u32 mcrx, u32 mcr, u32 *mdr) | ||
38 | { | ||
39 | int result; | ||
40 | |||
41 | if (!mbi_pdev) | ||
42 | return -ENODEV; | ||
43 | |||
44 | if (mcrx) { | ||
45 | result = pci_write_config_dword(mbi_pdev, MBI_MCRX_OFFSET, | ||
46 | mcrx); | ||
47 | if (result < 0) | ||
48 | goto fail_read; | ||
49 | } | ||
50 | |||
51 | result = pci_write_config_dword(mbi_pdev, MBI_MCR_OFFSET, mcr); | ||
52 | if (result < 0) | ||
53 | goto fail_read; | ||
54 | |||
55 | result = pci_read_config_dword(mbi_pdev, MBI_MDR_OFFSET, mdr); | ||
56 | if (result < 0) | ||
57 | goto fail_read; | ||
58 | |||
59 | return 0; | ||
60 | |||
61 | fail_read: | ||
62 | dev_err(&mbi_pdev->dev, "PCI config access failed with %d\n", result); | ||
63 | return result; | ||
64 | } | ||
65 | |||
66 | static int iosf_mbi_pci_write_mdr(u32 mcrx, u32 mcr, u32 mdr) | ||
67 | { | ||
68 | int result; | ||
69 | |||
70 | if (!mbi_pdev) | ||
71 | return -ENODEV; | ||
72 | |||
73 | result = pci_write_config_dword(mbi_pdev, MBI_MDR_OFFSET, mdr); | ||
74 | if (result < 0) | ||
75 | goto fail_write; | ||
76 | |||
77 | if (mcrx) { | ||
78 | result = pci_write_config_dword(mbi_pdev, MBI_MCRX_OFFSET, | ||
79 | mcrx); | ||
80 | if (result < 0) | ||
81 | goto fail_write; | ||
82 | } | ||
83 | |||
84 | result = pci_write_config_dword(mbi_pdev, MBI_MCR_OFFSET, mcr); | ||
85 | if (result < 0) | ||
86 | goto fail_write; | ||
87 | |||
88 | return 0; | ||
89 | |||
90 | fail_write: | ||
91 | dev_err(&mbi_pdev->dev, "PCI config access failed with %d\n", result); | ||
92 | return result; | ||
93 | } | ||
94 | |||
95 | int iosf_mbi_read(u8 port, u8 opcode, u32 offset, u32 *mdr) | ||
96 | { | ||
97 | u32 mcr, mcrx; | ||
98 | unsigned long flags; | ||
99 | int ret; | ||
100 | |||
101 | /*Access to the GFX unit is handled by GPU code */ | ||
102 | if (port == BT_MBI_UNIT_GFX) { | ||
103 | WARN_ON(1); | ||
104 | return -EPERM; | ||
105 | } | ||
106 | |||
107 | mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO); | ||
108 | mcrx = offset & MBI_MASK_HI; | ||
109 | |||
110 | spin_lock_irqsave(&iosf_mbi_lock, flags); | ||
111 | ret = iosf_mbi_pci_read_mdr(mcrx, mcr, mdr); | ||
112 | spin_unlock_irqrestore(&iosf_mbi_lock, flags); | ||
113 | |||
114 | return ret; | ||
115 | } | ||
116 | EXPORT_SYMBOL(iosf_mbi_read); | ||
117 | |||
118 | int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr) | ||
119 | { | ||
120 | u32 mcr, mcrx; | ||
121 | unsigned long flags; | ||
122 | int ret; | ||
123 | |||
124 | /*Access to the GFX unit is handled by GPU code */ | ||
125 | if (port == BT_MBI_UNIT_GFX) { | ||
126 | WARN_ON(1); | ||
127 | return -EPERM; | ||
128 | } | ||
129 | |||
130 | mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO); | ||
131 | mcrx = offset & MBI_MASK_HI; | ||
132 | |||
133 | spin_lock_irqsave(&iosf_mbi_lock, flags); | ||
134 | ret = iosf_mbi_pci_write_mdr(mcrx, mcr, mdr); | ||
135 | spin_unlock_irqrestore(&iosf_mbi_lock, flags); | ||
136 | |||
137 | return ret; | ||
138 | } | ||
139 | EXPORT_SYMBOL(iosf_mbi_write); | ||
140 | |||
141 | int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask) | ||
142 | { | ||
143 | u32 mcr, mcrx; | ||
144 | u32 value; | ||
145 | unsigned long flags; | ||
146 | int ret; | ||
147 | |||
148 | /*Access to the GFX unit is handled by GPU code */ | ||
149 | if (port == BT_MBI_UNIT_GFX) { | ||
150 | WARN_ON(1); | ||
151 | return -EPERM; | ||
152 | } | ||
153 | |||
154 | mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO); | ||
155 | mcrx = offset & MBI_MASK_HI; | ||
156 | |||
157 | spin_lock_irqsave(&iosf_mbi_lock, flags); | ||
158 | |||
159 | /* Read current mdr value */ | ||
160 | ret = iosf_mbi_pci_read_mdr(mcrx, mcr & MBI_RD_MASK, &value); | ||
161 | if (ret < 0) { | ||
162 | spin_unlock_irqrestore(&iosf_mbi_lock, flags); | ||
163 | return ret; | ||
164 | } | ||
165 | |||
166 | /* Apply mask */ | ||
167 | value &= ~mask; | ||
168 | mdr &= mask; | ||
169 | value |= mdr; | ||
170 | |||
171 | /* Write back */ | ||
172 | ret = iosf_mbi_pci_write_mdr(mcrx, mcr | MBI_WR_MASK, value); | ||
173 | |||
174 | spin_unlock_irqrestore(&iosf_mbi_lock, flags); | ||
175 | |||
176 | return ret; | ||
177 | } | ||
178 | EXPORT_SYMBOL(iosf_mbi_modify); | ||
179 | |||
180 | static int iosf_mbi_probe(struct pci_dev *pdev, | ||
181 | const struct pci_device_id *unused) | ||
182 | { | ||
183 | int ret; | ||
184 | |||
185 | ret = pci_enable_device(pdev); | ||
186 | if (ret < 0) { | ||
187 | dev_err(&pdev->dev, "error: could not enable device\n"); | ||
188 | return ret; | ||
189 | } | ||
190 | |||
191 | mbi_pdev = pci_dev_get(pdev); | ||
192 | return 0; | ||
193 | } | ||
194 | |||
195 | static DEFINE_PCI_DEVICE_TABLE(iosf_mbi_pci_ids) = { | ||
196 | { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x0F00) }, | ||
197 | { 0, }, | ||
198 | }; | ||
199 | MODULE_DEVICE_TABLE(pci, iosf_mbi_pci_ids); | ||
200 | |||
201 | static struct pci_driver iosf_mbi_pci_driver = { | ||
202 | .name = "iosf_mbi_pci", | ||
203 | .probe = iosf_mbi_probe, | ||
204 | .id_table = iosf_mbi_pci_ids, | ||
205 | }; | ||
206 | |||
207 | static int __init iosf_mbi_init(void) | ||
208 | { | ||
209 | return pci_register_driver(&iosf_mbi_pci_driver); | ||
210 | } | ||
211 | |||
212 | static void __exit iosf_mbi_exit(void) | ||
213 | { | ||
214 | pci_unregister_driver(&iosf_mbi_pci_driver); | ||
215 | if (mbi_pdev) { | ||
216 | pci_dev_put(mbi_pdev); | ||
217 | mbi_pdev = NULL; | ||
218 | } | ||
219 | } | ||
220 | |||
221 | module_init(iosf_mbi_init); | ||
222 | module_exit(iosf_mbi_exit); | ||
223 | |||
224 | MODULE_AUTHOR("David E. Box <david.e.box@linux.intel.com>"); | ||
225 | MODULE_DESCRIPTION("IOSF Mailbox Interface accessor"); | ||
226 | MODULE_LICENSE("GPL v2"); | ||
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 22d0687e7fda..dbb60878b744 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c | |||
@@ -193,9 +193,13 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) | |||
193 | if (!handle_irq(irq, regs)) { | 193 | if (!handle_irq(irq, regs)) { |
194 | ack_APIC_irq(); | 194 | ack_APIC_irq(); |
195 | 195 | ||
196 | if (printk_ratelimit()) | 196 | if (irq != VECTOR_RETRIGGERED) { |
197 | pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n", | 197 | pr_emerg_ratelimited("%s: %d.%d No irq handler for vector (irq %d)\n", |
198 | __func__, smp_processor_id(), vector, irq); | 198 | __func__, smp_processor_id(), |
199 | vector, irq); | ||
200 | } else { | ||
201 | __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED); | ||
202 | } | ||
199 | } | 203 | } |
200 | 204 | ||
201 | irq_exit(); | 205 | irq_exit(); |
@@ -262,6 +266,76 @@ __visible void smp_trace_x86_platform_ipi(struct pt_regs *regs) | |||
262 | EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); | 266 | EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); |
263 | 267 | ||
264 | #ifdef CONFIG_HOTPLUG_CPU | 268 | #ifdef CONFIG_HOTPLUG_CPU |
269 | /* | ||
270 | * This cpu is going to be removed and its vectors migrated to the remaining | ||
271 | * online cpus. Check to see if there are enough vectors in the remaining cpus. | ||
272 | * This function is protected by stop_machine(). | ||
273 | */ | ||
274 | int check_irq_vectors_for_cpu_disable(void) | ||
275 | { | ||
276 | int irq, cpu; | ||
277 | unsigned int this_cpu, vector, this_count, count; | ||
278 | struct irq_desc *desc; | ||
279 | struct irq_data *data; | ||
280 | struct cpumask affinity_new, online_new; | ||
281 | |||
282 | this_cpu = smp_processor_id(); | ||
283 | cpumask_copy(&online_new, cpu_online_mask); | ||
284 | cpu_clear(this_cpu, online_new); | ||
285 | |||
286 | this_count = 0; | ||
287 | for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { | ||
288 | irq = __this_cpu_read(vector_irq[vector]); | ||
289 | if (irq >= 0) { | ||
290 | desc = irq_to_desc(irq); | ||
291 | data = irq_desc_get_irq_data(desc); | ||
292 | cpumask_copy(&affinity_new, data->affinity); | ||
293 | cpu_clear(this_cpu, affinity_new); | ||
294 | |||
295 | /* Do not count inactive or per-cpu irqs. */ | ||
296 | if (!irq_has_action(irq) || irqd_is_per_cpu(data)) | ||
297 | continue; | ||
298 | |||
299 | /* | ||
300 | * A single irq may be mapped to multiple | ||
301 | * cpu's vector_irq[] (for example IOAPIC cluster | ||
302 | * mode). In this case we have two | ||
303 | * possibilities: | ||
304 | * | ||
305 | * 1) the resulting affinity mask is empty; that is | ||
306 | * this the down'd cpu is the last cpu in the irq's | ||
307 | * affinity mask, or | ||
308 | * | ||
309 | * 2) the resulting affinity mask is no longer | ||
310 | * a subset of the online cpus but the affinity | ||
311 | * mask is not zero; that is the down'd cpu is the | ||
312 | * last online cpu in a user set affinity mask. | ||
313 | */ | ||
314 | if (cpumask_empty(&affinity_new) || | ||
315 | !cpumask_subset(&affinity_new, &online_new)) | ||
316 | this_count++; | ||
317 | } | ||
318 | } | ||
319 | |||
320 | count = 0; | ||
321 | for_each_online_cpu(cpu) { | ||
322 | if (cpu == this_cpu) | ||
323 | continue; | ||
324 | for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; | ||
325 | vector++) { | ||
326 | if (per_cpu(vector_irq, cpu)[vector] < 0) | ||
327 | count++; | ||
328 | } | ||
329 | } | ||
330 | |||
331 | if (count < this_count) { | ||
332 | pr_warn("CPU %d disable failed: CPU has %u vectors assigned and there are only %u available.\n", | ||
333 | this_cpu, this_count, count); | ||
334 | return -ERANGE; | ||
335 | } | ||
336 | return 0; | ||
337 | } | ||
338 | |||
265 | /* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ | 339 | /* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ |
266 | void fixup_irqs(void) | 340 | void fixup_irqs(void) |
267 | { | 341 | { |
@@ -344,7 +418,7 @@ void fixup_irqs(void) | |||
344 | for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { | 418 | for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { |
345 | unsigned int irr; | 419 | unsigned int irr; |
346 | 420 | ||
347 | if (__this_cpu_read(vector_irq[vector]) < 0) | 421 | if (__this_cpu_read(vector_irq[vector]) <= VECTOR_UNDEFINED) |
348 | continue; | 422 | continue; |
349 | 423 | ||
350 | irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); | 424 | irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); |
@@ -355,11 +429,14 @@ void fixup_irqs(void) | |||
355 | data = irq_desc_get_irq_data(desc); | 429 | data = irq_desc_get_irq_data(desc); |
356 | chip = irq_data_get_irq_chip(data); | 430 | chip = irq_data_get_irq_chip(data); |
357 | raw_spin_lock(&desc->lock); | 431 | raw_spin_lock(&desc->lock); |
358 | if (chip->irq_retrigger) | 432 | if (chip->irq_retrigger) { |
359 | chip->irq_retrigger(data); | 433 | chip->irq_retrigger(data); |
434 | __this_cpu_write(vector_irq[vector], VECTOR_RETRIGGERED); | ||
435 | } | ||
360 | raw_spin_unlock(&desc->lock); | 436 | raw_spin_unlock(&desc->lock); |
361 | } | 437 | } |
362 | __this_cpu_write(vector_irq[vector], -1); | 438 | if (__this_cpu_read(vector_irq[vector]) != VECTOR_RETRIGGERED) |
439 | __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED); | ||
363 | } | 440 | } |
364 | } | 441 | } |
365 | #endif | 442 | #endif |
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index a2a1fbc594ff..7f50156542fb 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c | |||
@@ -52,7 +52,7 @@ static struct irqaction irq2 = { | |||
52 | }; | 52 | }; |
53 | 53 | ||
54 | DEFINE_PER_CPU(vector_irq_t, vector_irq) = { | 54 | DEFINE_PER_CPU(vector_irq_t, vector_irq) = { |
55 | [0 ... NR_VECTORS - 1] = -1, | 55 | [0 ... NR_VECTORS - 1] = VECTOR_UNDEFINED, |
56 | }; | 56 | }; |
57 | 57 | ||
58 | int vector_used_by_percpu_irq(unsigned int vector) | 58 | int vector_used_by_percpu_irq(unsigned int vector) |
@@ -60,7 +60,7 @@ int vector_used_by_percpu_irq(unsigned int vector) | |||
60 | int cpu; | 60 | int cpu; |
61 | 61 | ||
62 | for_each_online_cpu(cpu) { | 62 | for_each_online_cpu(cpu) { |
63 | if (per_cpu(vector_irq, cpu)[vector] != -1) | 63 | if (per_cpu(vector_irq, cpu)[vector] > VECTOR_UNDEFINED) |
64 | return 1; | 64 | return 1; |
65 | } | 65 | } |
66 | 66 | ||
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 836f8322960e..7ec1d5f8d283 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c | |||
@@ -39,7 +39,6 @@ | |||
39 | #include <linux/sched.h> | 39 | #include <linux/sched.h> |
40 | #include <linux/delay.h> | 40 | #include <linux/delay.h> |
41 | #include <linux/kgdb.h> | 41 | #include <linux/kgdb.h> |
42 | #include <linux/init.h> | ||
43 | #include <linux/smp.h> | 42 | #include <linux/smp.h> |
44 | #include <linux/nmi.h> | 43 | #include <linux/nmi.h> |
45 | #include <linux/hw_breakpoint.h> | 44 | #include <linux/hw_breakpoint.h> |
diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c new file mode 100644 index 000000000000..c2bedaea11f7 --- /dev/null +++ b/arch/x86/kernel/ksysfs.c | |||
@@ -0,0 +1,340 @@ | |||
1 | /* | ||
2 | * Architecture specific sysfs attributes in /sys/kernel | ||
3 | * | ||
4 | * Copyright (C) 2007, Intel Corp. | ||
5 | * Huang Ying <ying.huang@intel.com> | ||
6 | * Copyright (C) 2013, 2013 Red Hat, Inc. | ||
7 | * Dave Young <dyoung@redhat.com> | ||
8 | * | ||
9 | * This file is released under the GPLv2 | ||
10 | */ | ||
11 | |||
12 | #include <linux/kobject.h> | ||
13 | #include <linux/string.h> | ||
14 | #include <linux/sysfs.h> | ||
15 | #include <linux/init.h> | ||
16 | #include <linux/stat.h> | ||
17 | #include <linux/slab.h> | ||
18 | #include <linux/mm.h> | ||
19 | |||
20 | #include <asm/io.h> | ||
21 | #include <asm/setup.h> | ||
22 | |||
23 | static ssize_t version_show(struct kobject *kobj, | ||
24 | struct kobj_attribute *attr, char *buf) | ||
25 | { | ||
26 | return sprintf(buf, "0x%04x\n", boot_params.hdr.version); | ||
27 | } | ||
28 | |||
29 | static struct kobj_attribute boot_params_version_attr = __ATTR_RO(version); | ||
30 | |||
31 | static ssize_t boot_params_data_read(struct file *fp, struct kobject *kobj, | ||
32 | struct bin_attribute *bin_attr, | ||
33 | char *buf, loff_t off, size_t count) | ||
34 | { | ||
35 | memcpy(buf, (void *)&boot_params + off, count); | ||
36 | return count; | ||
37 | } | ||
38 | |||
39 | static struct bin_attribute boot_params_data_attr = { | ||
40 | .attr = { | ||
41 | .name = "data", | ||
42 | .mode = S_IRUGO, | ||
43 | }, | ||
44 | .read = boot_params_data_read, | ||
45 | .size = sizeof(boot_params), | ||
46 | }; | ||
47 | |||
48 | static struct attribute *boot_params_version_attrs[] = { | ||
49 | &boot_params_version_attr.attr, | ||
50 | NULL, | ||
51 | }; | ||
52 | |||
53 | static struct bin_attribute *boot_params_data_attrs[] = { | ||
54 | &boot_params_data_attr, | ||
55 | NULL, | ||
56 | }; | ||
57 | |||
58 | static struct attribute_group boot_params_attr_group = { | ||
59 | .attrs = boot_params_version_attrs, | ||
60 | .bin_attrs = boot_params_data_attrs, | ||
61 | }; | ||
62 | |||
/*
 * Parse the kobject's name as a base-10 integer into *@nr.
 * Returns whatever kstrtoint() returns.
 */
static int kobj_to_setup_data_nr(struct kobject *kobj, int *nr)
{
	return kstrtoint(kobject_name(kobj), 10, nr);
}
70 | |||
71 | static int get_setup_data_paddr(int nr, u64 *paddr) | ||
72 | { | ||
73 | int i = 0; | ||
74 | struct setup_data *data; | ||
75 | u64 pa_data = boot_params.hdr.setup_data; | ||
76 | |||
77 | while (pa_data) { | ||
78 | if (nr == i) { | ||
79 | *paddr = pa_data; | ||
80 | return 0; | ||
81 | } | ||
82 | data = ioremap_cache(pa_data, sizeof(*data)); | ||
83 | if (!data) | ||
84 | return -ENOMEM; | ||
85 | |||
86 | pa_data = data->next; | ||
87 | iounmap(data); | ||
88 | i++; | ||
89 | } | ||
90 | return -EINVAL; | ||
91 | } | ||
92 | |||
93 | static int __init get_setup_data_size(int nr, size_t *size) | ||
94 | { | ||
95 | int i = 0; | ||
96 | struct setup_data *data; | ||
97 | u64 pa_data = boot_params.hdr.setup_data; | ||
98 | |||
99 | while (pa_data) { | ||
100 | data = ioremap_cache(pa_data, sizeof(*data)); | ||
101 | if (!data) | ||
102 | return -ENOMEM; | ||
103 | if (nr == i) { | ||
104 | *size = data->len; | ||
105 | iounmap(data); | ||
106 | return 0; | ||
107 | } | ||
108 | |||
109 | pa_data = data->next; | ||
110 | iounmap(data); | ||
111 | i++; | ||
112 | } | ||
113 | return -EINVAL; | ||
114 | } | ||
115 | |||
116 | static ssize_t type_show(struct kobject *kobj, | ||
117 | struct kobj_attribute *attr, char *buf) | ||
118 | { | ||
119 | int nr, ret; | ||
120 | u64 paddr; | ||
121 | struct setup_data *data; | ||
122 | |||
123 | ret = kobj_to_setup_data_nr(kobj, &nr); | ||
124 | if (ret) | ||
125 | return ret; | ||
126 | |||
127 | ret = get_setup_data_paddr(nr, &paddr); | ||
128 | if (ret) | ||
129 | return ret; | ||
130 | data = ioremap_cache(paddr, sizeof(*data)); | ||
131 | if (!data) | ||
132 | return -ENOMEM; | ||
133 | |||
134 | ret = sprintf(buf, "0x%x\n", data->type); | ||
135 | iounmap(data); | ||
136 | return ret; | ||
137 | } | ||
138 | |||
139 | static ssize_t setup_data_data_read(struct file *fp, | ||
140 | struct kobject *kobj, | ||
141 | struct bin_attribute *bin_attr, | ||
142 | char *buf, | ||
143 | loff_t off, size_t count) | ||
144 | { | ||
145 | int nr, ret = 0; | ||
146 | u64 paddr; | ||
147 | struct setup_data *data; | ||
148 | void *p; | ||
149 | |||
150 | ret = kobj_to_setup_data_nr(kobj, &nr); | ||
151 | if (ret) | ||
152 | return ret; | ||
153 | |||
154 | ret = get_setup_data_paddr(nr, &paddr); | ||
155 | if (ret) | ||
156 | return ret; | ||
157 | data = ioremap_cache(paddr, sizeof(*data)); | ||
158 | if (!data) | ||
159 | return -ENOMEM; | ||
160 | |||
161 | if (off > data->len) { | ||
162 | ret = -EINVAL; | ||
163 | goto out; | ||
164 | } | ||
165 | |||
166 | if (count > data->len - off) | ||
167 | count = data->len - off; | ||
168 | |||
169 | if (!count) | ||
170 | goto out; | ||
171 | |||
172 | ret = count; | ||
173 | p = ioremap_cache(paddr + sizeof(*data), data->len); | ||
174 | if (!p) { | ||
175 | ret = -ENOMEM; | ||
176 | goto out; | ||
177 | } | ||
178 | memcpy(buf, p + off, count); | ||
179 | iounmap(p); | ||
180 | out: | ||
181 | iounmap(data); | ||
182 | return ret; | ||
183 | } | ||
184 | |||
185 | static struct kobj_attribute type_attr = __ATTR_RO(type); | ||
186 | |||
187 | static struct bin_attribute data_attr = { | ||
188 | .attr = { | ||
189 | .name = "data", | ||
190 | .mode = S_IRUGO, | ||
191 | }, | ||
192 | .read = setup_data_data_read, | ||
193 | }; | ||
194 | |||
195 | static struct attribute *setup_data_type_attrs[] = { | ||
196 | &type_attr.attr, | ||
197 | NULL, | ||
198 | }; | ||
199 | |||
200 | static struct bin_attribute *setup_data_data_attrs[] = { | ||
201 | &data_attr, | ||
202 | NULL, | ||
203 | }; | ||
204 | |||
205 | static struct attribute_group setup_data_attr_group = { | ||
206 | .attrs = setup_data_type_attrs, | ||
207 | .bin_attrs = setup_data_data_attrs, | ||
208 | }; | ||
209 | |||
210 | static int __init create_setup_data_node(struct kobject *parent, | ||
211 | struct kobject **kobjp, int nr) | ||
212 | { | ||
213 | int ret = 0; | ||
214 | size_t size; | ||
215 | struct kobject *kobj; | ||
216 | char name[16]; /* should be enough for setup_data nodes numbers */ | ||
217 | snprintf(name, 16, "%d", nr); | ||
218 | |||
219 | kobj = kobject_create_and_add(name, parent); | ||
220 | if (!kobj) | ||
221 | return -ENOMEM; | ||
222 | |||
223 | ret = get_setup_data_size(nr, &size); | ||
224 | if (ret) | ||
225 | goto out_kobj; | ||
226 | |||
227 | data_attr.size = size; | ||
228 | ret = sysfs_create_group(kobj, &setup_data_attr_group); | ||
229 | if (ret) | ||
230 | goto out_kobj; | ||
231 | *kobjp = kobj; | ||
232 | |||
233 | return 0; | ||
234 | out_kobj: | ||
235 | kobject_put(kobj); | ||
236 | return ret; | ||
237 | } | ||
238 | |||
239 | static void __init cleanup_setup_data_node(struct kobject *kobj) | ||
240 | { | ||
241 | sysfs_remove_group(kobj, &setup_data_attr_group); | ||
242 | kobject_put(kobj); | ||
243 | } | ||
244 | |||
245 | static int __init get_setup_data_total_num(u64 pa_data, int *nr) | ||
246 | { | ||
247 | int ret = 0; | ||
248 | struct setup_data *data; | ||
249 | |||
250 | *nr = 0; | ||
251 | while (pa_data) { | ||
252 | *nr += 1; | ||
253 | data = ioremap_cache(pa_data, sizeof(*data)); | ||
254 | if (!data) { | ||
255 | ret = -ENOMEM; | ||
256 | goto out; | ||
257 | } | ||
258 | pa_data = data->next; | ||
259 | iounmap(data); | ||
260 | } | ||
261 | |||
262 | out: | ||
263 | return ret; | ||
264 | } | ||
265 | |||
266 | static int __init create_setup_data_nodes(struct kobject *parent) | ||
267 | { | ||
268 | struct kobject *setup_data_kobj, **kobjp; | ||
269 | u64 pa_data; | ||
270 | int i, j, nr, ret = 0; | ||
271 | |||
272 | pa_data = boot_params.hdr.setup_data; | ||
273 | if (!pa_data) | ||
274 | return 0; | ||
275 | |||
276 | setup_data_kobj = kobject_create_and_add("setup_data", parent); | ||
277 | if (!setup_data_kobj) { | ||
278 | ret = -ENOMEM; | ||
279 | goto out; | ||
280 | } | ||
281 | |||
282 | ret = get_setup_data_total_num(pa_data, &nr); | ||
283 | if (ret) | ||
284 | goto out_setup_data_kobj; | ||
285 | |||
286 | kobjp = kmalloc(sizeof(*kobjp) * nr, GFP_KERNEL); | ||
287 | if (!kobjp) { | ||
288 | ret = -ENOMEM; | ||
289 | goto out_setup_data_kobj; | ||
290 | } | ||
291 | |||
292 | for (i = 0; i < nr; i++) { | ||
293 | ret = create_setup_data_node(setup_data_kobj, kobjp + i, i); | ||
294 | if (ret) | ||
295 | goto out_clean_nodes; | ||
296 | } | ||
297 | |||
298 | kfree(kobjp); | ||
299 | return 0; | ||
300 | |||
301 | out_clean_nodes: | ||
302 | for (j = i - 1; j > 0; j--) | ||
303 | cleanup_setup_data_node(*(kobjp + j)); | ||
304 | kfree(kobjp); | ||
305 | out_setup_data_kobj: | ||
306 | kobject_put(setup_data_kobj); | ||
307 | out: | ||
308 | return ret; | ||
309 | } | ||
310 | |||
311 | static int __init boot_params_ksysfs_init(void) | ||
312 | { | ||
313 | int ret; | ||
314 | struct kobject *boot_params_kobj; | ||
315 | |||
316 | boot_params_kobj = kobject_create_and_add("boot_params", | ||
317 | kernel_kobj); | ||
318 | if (!boot_params_kobj) { | ||
319 | ret = -ENOMEM; | ||
320 | goto out; | ||
321 | } | ||
322 | |||
323 | ret = sysfs_create_group(boot_params_kobj, &boot_params_attr_group); | ||
324 | if (ret) | ||
325 | goto out_boot_params_kobj; | ||
326 | |||
327 | ret = create_setup_data_nodes(boot_params_kobj); | ||
328 | if (ret) | ||
329 | goto out_create_group; | ||
330 | |||
331 | return 0; | ||
332 | out_create_group: | ||
333 | sysfs_remove_group(boot_params_kobj, &boot_params_attr_group); | ||
334 | out_boot_params_kobj: | ||
335 | kobject_put(boot_params_kobj); | ||
336 | out: | ||
337 | return ret; | ||
338 | } | ||
339 | |||
340 | arch_initcall(boot_params_ksysfs_init); | ||
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 5b19e4d78b00..1667b1de8d5d 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c | |||
@@ -9,7 +9,6 @@ | |||
9 | #include <linux/mm.h> | 9 | #include <linux/mm.h> |
10 | #include <linux/kexec.h> | 10 | #include <linux/kexec.h> |
11 | #include <linux/delay.h> | 11 | #include <linux/delay.h> |
12 | #include <linux/init.h> | ||
13 | #include <linux/numa.h> | 12 | #include <linux/numa.h> |
14 | #include <linux/ftrace.h> | 13 | #include <linux/ftrace.h> |
15 | #include <linux/suspend.h> | 14 | #include <linux/suspend.h> |
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index 871be4a84c7d..da15918d1c81 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c | |||
@@ -3,7 +3,6 @@ | |||
3 | #include <linux/dma-mapping.h> | 3 | #include <linux/dma-mapping.h> |
4 | #include <linux/scatterlist.h> | 4 | #include <linux/scatterlist.h> |
5 | #include <linux/string.h> | 5 | #include <linux/string.h> |
6 | #include <linux/init.h> | ||
7 | #include <linux/gfp.h> | 6 | #include <linux/gfp.h> |
8 | #include <linux/pci.h> | 7 | #include <linux/pci.h> |
9 | #include <linux/mm.h> | 8 | #include <linux/mm.h> |
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 6f1236c29c4b..0de43e98ce08 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c | |||
@@ -24,7 +24,6 @@ | |||
24 | #include <linux/interrupt.h> | 24 | #include <linux/interrupt.h> |
25 | #include <linux/delay.h> | 25 | #include <linux/delay.h> |
26 | #include <linux/reboot.h> | 26 | #include <linux/reboot.h> |
27 | #include <linux/init.h> | ||
28 | #include <linux/mc146818rtc.h> | 27 | #include <linux/mc146818rtc.h> |
29 | #include <linux/module.h> | 28 | #include <linux/module.h> |
30 | #include <linux/kallsyms.h> | 29 | #include <linux/kallsyms.h> |
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index da3c599584a3..c752cb43e52f 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c | |||
@@ -558,6 +558,17 @@ void native_machine_shutdown(void) | |||
558 | { | 558 | { |
559 | /* Stop the cpus and apics */ | 559 | /* Stop the cpus and apics */ |
560 | #ifdef CONFIG_X86_IO_APIC | 560 | #ifdef CONFIG_X86_IO_APIC |
561 | /* | ||
562 | * Disabling IO APIC before local APIC is a workaround for | ||
563 | * erratum AVR31 in "Intel Atom Processor C2000 Product Family | ||
564 | * Specification Update". In this situation, interrupts that target | ||
565 | * a Logical Processor whose Local APIC is either in the process of | ||
566 | * being hardware disabled or software disabled are neither delivered | ||
567 | * nor discarded. When this erratum occurs, the processor may hang. | ||
568 | * | ||
569 | * Even without the erratum, it still makes sense to quiet IO APIC | ||
570 | * before disabling Local APIC. | ||
571 | */ | ||
561 | disable_IO_APIC(); | 572 | disable_IO_APIC(); |
562 | #endif | 573 | #endif |
563 | 574 | ||
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index cb233bc9dee3..c9675594d7ca 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c | |||
@@ -295,6 +295,8 @@ static void __init reserve_brk(void) | |||
295 | _brk_start = 0; | 295 | _brk_start = 0; |
296 | } | 296 | } |
297 | 297 | ||
298 | u64 relocated_ramdisk; | ||
299 | |||
298 | #ifdef CONFIG_BLK_DEV_INITRD | 300 | #ifdef CONFIG_BLK_DEV_INITRD |
299 | 301 | ||
300 | static u64 __init get_ramdisk_image(void) | 302 | static u64 __init get_ramdisk_image(void) |
@@ -321,25 +323,24 @@ static void __init relocate_initrd(void) | |||
321 | u64 ramdisk_image = get_ramdisk_image(); | 323 | u64 ramdisk_image = get_ramdisk_image(); |
322 | u64 ramdisk_size = get_ramdisk_size(); | 324 | u64 ramdisk_size = get_ramdisk_size(); |
323 | u64 area_size = PAGE_ALIGN(ramdisk_size); | 325 | u64 area_size = PAGE_ALIGN(ramdisk_size); |
324 | u64 ramdisk_here; | ||
325 | unsigned long slop, clen, mapaddr; | 326 | unsigned long slop, clen, mapaddr; |
326 | char *p, *q; | 327 | char *p, *q; |
327 | 328 | ||
328 | /* We need to move the initrd down into directly mapped mem */ | 329 | /* We need to move the initrd down into directly mapped mem */ |
329 | ramdisk_here = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), | 330 | relocated_ramdisk = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), |
330 | area_size, PAGE_SIZE); | 331 | area_size, PAGE_SIZE); |
331 | 332 | ||
332 | if (!ramdisk_here) | 333 | if (!relocated_ramdisk) |
333 | panic("Cannot find place for new RAMDISK of size %lld\n", | 334 | panic("Cannot find place for new RAMDISK of size %lld\n", |
334 | ramdisk_size); | 335 | ramdisk_size); |
335 | 336 | ||
336 | /* Note: this includes all the mem currently occupied by | 337 | /* Note: this includes all the mem currently occupied by |
337 | the initrd, we rely on that fact to keep the data intact. */ | 338 | the initrd, we rely on that fact to keep the data intact. */ |
338 | memblock_reserve(ramdisk_here, area_size); | 339 | memblock_reserve(relocated_ramdisk, area_size); |
339 | initrd_start = ramdisk_here + PAGE_OFFSET; | 340 | initrd_start = relocated_ramdisk + PAGE_OFFSET; |
340 | initrd_end = initrd_start + ramdisk_size; | 341 | initrd_end = initrd_start + ramdisk_size; |
341 | printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n", | 342 | printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n", |
342 | ramdisk_here, ramdisk_here + ramdisk_size - 1); | 343 | relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1); |
343 | 344 | ||
344 | q = (char *)initrd_start; | 345 | q = (char *)initrd_start; |
345 | 346 | ||
@@ -363,7 +364,7 @@ static void __init relocate_initrd(void) | |||
363 | printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to" | 364 | printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to" |
364 | " [mem %#010llx-%#010llx]\n", | 365 | " [mem %#010llx-%#010llx]\n", |
365 | ramdisk_image, ramdisk_image + ramdisk_size - 1, | 366 | ramdisk_image, ramdisk_image + ramdisk_size - 1, |
366 | ramdisk_here, ramdisk_here + ramdisk_size - 1); | 367 | relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1); |
367 | } | 368 | } |
368 | 369 | ||
369 | static void __init early_reserve_initrd(void) | 370 | static void __init early_reserve_initrd(void) |
@@ -447,6 +448,9 @@ static void __init parse_setup_data(void) | |||
447 | case SETUP_DTB: | 448 | case SETUP_DTB: |
448 | add_dtb(pa_data); | 449 | add_dtb(pa_data); |
449 | break; | 450 | break; |
451 | case SETUP_EFI: | ||
452 | parse_efi_setup(pa_data, data_len); | ||
453 | break; | ||
450 | default: | 454 | default: |
451 | break; | 455 | break; |
452 | } | 456 | } |
@@ -824,6 +828,20 @@ static void __init trim_low_memory_range(void) | |||
824 | } | 828 | } |
825 | 829 | ||
826 | /* | 830 | /* |
831 | * Dump out kernel offset information on panic. | ||
832 | */ | ||
833 | static int | ||
834 | dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) | ||
835 | { | ||
836 | pr_emerg("Kernel Offset: 0x%lx from 0x%lx " | ||
837 | "(relocation range: 0x%lx-0x%lx)\n", | ||
838 | (unsigned long)&_text - __START_KERNEL, __START_KERNEL, | ||
839 | __START_KERNEL_map, MODULES_VADDR-1); | ||
840 | |||
841 | return 0; | ||
842 | } | ||
843 | |||
844 | /* | ||
827 | * Determine if we were loaded by an EFI loader. If so, then we have also been | 845 | * Determine if we were loaded by an EFI loader. If so, then we have also been |
828 | * passed the efi memmap, systab, etc., so we should use these data structures | 846 | * passed the efi memmap, systab, etc., so we should use these data structures |
829 | * for initialization. Note, the efi init code path is determined by the | 847 | * for initialization. Note, the efi init code path is determined by the |
@@ -924,8 +942,6 @@ void __init setup_arch(char **cmdline_p) | |||
924 | iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; | 942 | iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; |
925 | setup_memory_map(); | 943 | setup_memory_map(); |
926 | parse_setup_data(); | 944 | parse_setup_data(); |
927 | /* update the e820_saved too */ | ||
928 | e820_reserve_setup_data(); | ||
929 | 945 | ||
930 | copy_edd(); | 946 | copy_edd(); |
931 | 947 | ||
@@ -987,6 +1003,8 @@ void __init setup_arch(char **cmdline_p) | |||
987 | early_dump_pci_devices(); | 1003 | early_dump_pci_devices(); |
988 | #endif | 1004 | #endif |
989 | 1005 | ||
1006 | /* update the e820_saved too */ | ||
1007 | e820_reserve_setup_data(); | ||
990 | finish_e820_parsing(); | 1008 | finish_e820_parsing(); |
991 | 1009 | ||
992 | if (efi_enabled(EFI_BOOT)) | 1010 | if (efi_enabled(EFI_BOOT)) |
@@ -1101,7 +1119,7 @@ void __init setup_arch(char **cmdline_p) | |||
1101 | 1119 | ||
1102 | setup_real_mode(); | 1120 | setup_real_mode(); |
1103 | 1121 | ||
1104 | memblock_set_current_limit(get_max_mapped()); | 1122 | memblock_set_current_limit(get_max_low_mapped()); |
1105 | dma_contiguous_reserve(0); | 1123 | dma_contiguous_reserve(0); |
1106 | 1124 | ||
1107 | /* | 1125 | /* |
@@ -1248,3 +1266,15 @@ void __init i386_reserve_resources(void) | |||
1248 | } | 1266 | } |
1249 | 1267 | ||
1250 | #endif /* CONFIG_X86_32 */ | 1268 | #endif /* CONFIG_X86_32 */ |
1269 | |||
1270 | static struct notifier_block kernel_offset_notifier = { | ||
1271 | .notifier_call = dump_kernel_offset | ||
1272 | }; | ||
1273 | |||
1274 | static int __init register_kernel_offset_dumper(void) | ||
1275 | { | ||
1276 | atomic_notifier_chain_register(&panic_notifier_list, | ||
1277 | &kernel_offset_notifier); | ||
1278 | return 0; | ||
1279 | } | ||
1280 | __initcall(register_kernel_offset_dumper); | ||
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 85dc05a3aa02..a32da804252e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c | |||
@@ -1312,6 +1312,12 @@ void cpu_disable_common(void) | |||
1312 | 1312 | ||
1313 | int native_cpu_disable(void) | 1313 | int native_cpu_disable(void) |
1314 | { | 1314 | { |
1315 | int ret; | ||
1316 | |||
1317 | ret = check_irq_vectors_for_cpu_disable(); | ||
1318 | if (ret) | ||
1319 | return ret; | ||
1320 | |||
1315 | clear_local_APIC(); | 1321 | clear_local_APIC(); |
1316 | 1322 | ||
1317 | cpu_disable_common(); | 1323 | cpu_disable_common(); |
@@ -1417,7 +1423,9 @@ static inline void mwait_play_dead(void) | |||
1417 | * The WBINVD is insufficient due to the spurious-wakeup | 1423 | * The WBINVD is insufficient due to the spurious-wakeup |
1418 | * case where we return around the loop. | 1424 | * case where we return around the loop. |
1419 | */ | 1425 | */ |
1426 | mb(); | ||
1420 | clflush(mwait_ptr); | 1427 | clflush(mwait_ptr); |
1428 | mb(); | ||
1421 | __monitor(mwait_ptr, 0, 0); | 1429 | __monitor(mwait_ptr, 0, 0); |
1422 | mb(); | 1430 | mb(); |
1423 | __mwait(eax, 0); | 1431 | __mwait(eax, 0); |
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b857ed890b4c..57409f6b8c62 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c | |||
@@ -211,21 +211,17 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ | |||
211 | exception_exit(prev_state); \ | 211 | exception_exit(prev_state); \ |
212 | } | 212 | } |
213 | 213 | ||
214 | DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV, | 214 | DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip ) |
215 | regs->ip) | 215 | DO_ERROR (X86_TRAP_OF, SIGSEGV, "overflow", overflow ) |
216 | DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow) | 216 | DO_ERROR (X86_TRAP_BR, SIGSEGV, "bounds", bounds ) |
217 | DO_ERROR(X86_TRAP_BR, SIGSEGV, "bounds", bounds) | 217 | DO_ERROR_INFO(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip ) |
218 | DO_ERROR_INFO(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, | 218 | DO_ERROR (X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun ) |
219 | regs->ip) | 219 | DO_ERROR (X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS ) |
220 | DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun", | 220 | DO_ERROR (X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present ) |
221 | coprocessor_segment_overrun) | ||
222 | DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS) | ||
223 | DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present) | ||
224 | #ifdef CONFIG_X86_32 | 221 | #ifdef CONFIG_X86_32 |
225 | DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment) | 222 | DO_ERROR (X86_TRAP_SS, SIGBUS, "stack segment", stack_segment ) |
226 | #endif | 223 | #endif |
227 | DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check, | 224 | DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0 ) |
228 | BUS_ADRALN, 0) | ||
229 | 225 | ||
230 | #ifdef CONFIG_X86_64 | 226 | #ifdef CONFIG_X86_64 |
231 | /* Runs on IST stack */ | 227 | /* Runs on IST stack */ |
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 930e5d48f560..a3acbac2ee72 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c | |||
@@ -11,6 +11,7 @@ | |||
11 | #include <linux/clocksource.h> | 11 | #include <linux/clocksource.h> |
12 | #include <linux/percpu.h> | 12 | #include <linux/percpu.h> |
13 | #include <linux/timex.h> | 13 | #include <linux/timex.h> |
14 | #include <linux/static_key.h> | ||
14 | 15 | ||
15 | #include <asm/hpet.h> | 16 | #include <asm/hpet.h> |
16 | #include <asm/timer.h> | 17 | #include <asm/timer.h> |
@@ -37,13 +38,244 @@ static int __read_mostly tsc_unstable; | |||
37 | erroneous rdtsc usage on !cpu_has_tsc processors */ | 38 | erroneous rdtsc usage on !cpu_has_tsc processors */ |
38 | static int __read_mostly tsc_disabled = -1; | 39 | static int __read_mostly tsc_disabled = -1; |
39 | 40 | ||
41 | static struct static_key __use_tsc = STATIC_KEY_INIT; | ||
42 | |||
40 | int tsc_clocksource_reliable; | 43 | int tsc_clocksource_reliable; |
44 | |||
45 | /* | ||
46 | * Use a ring-buffer like data structure, where a writer advances the head by | ||
47 | * writing a new data entry and a reader advances the tail when it observes a | ||
48 | * new entry. | ||
49 | * | ||
50 | * Writers are made to wait on readers until there's space to write a new | ||
51 | * entry. | ||
52 | * | ||
53 | * This means that we can always use an {offset, mul} pair to compute a ns | ||
54 | * value that is 'roughly' in the right direction, even if we're writing a new | ||
55 | * {offset, mul} pair during the clock read. | ||
56 | * | ||
57 | * The down-side is that we can no longer guarantee strict monotonicity | ||
58 | * (assuming the TSC was that to begin with), because while we compute the | ||
59 | * intersection point of the two clock slopes and make sure the time is | ||
60 | * continuous at the point of switching; we can no longer guarantee a reader is | ||
61 | * strictly before or after the switch point. | ||
62 | * | ||
63 | * It does mean a reader no longer needs to disable IRQs in order to avoid | ||
64 | * CPU-Freq updates messing with his times, and similarly an NMI reader will | ||
65 | * no longer run the risk of hitting half-written state. | ||
66 | */ | ||
67 | |||
68 | struct cyc2ns { | ||
69 | struct cyc2ns_data data[2]; /* 0 + 2*24 = 48 */ | ||
70 | struct cyc2ns_data *head; /* 48 + 8 = 56 */ | ||
71 | struct cyc2ns_data *tail; /* 56 + 8 = 64 */ | ||
72 | }; /* exactly fits one cacheline */ | ||
73 | |||
74 | static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns); | ||
75 | |||
76 | struct cyc2ns_data *cyc2ns_read_begin(void) | ||
77 | { | ||
78 | struct cyc2ns_data *head; | ||
79 | |||
80 | preempt_disable(); | ||
81 | |||
82 | head = this_cpu_read(cyc2ns.head); | ||
83 | /* | ||
84 | * Ensure we observe the entry when we observe the pointer to it. | ||
85 | * matches the wmb from cyc2ns_write_end(). | ||
86 | */ | ||
87 | smp_read_barrier_depends(); | ||
88 | head->__count++; | ||
89 | barrier(); | ||
90 | |||
91 | return head; | ||
92 | } | ||
93 | |||
94 | void cyc2ns_read_end(struct cyc2ns_data *head) | ||
95 | { | ||
96 | barrier(); | ||
97 | /* | ||
98 | * If we're the outer most nested read; update the tail pointer | ||
99 | * when we're done. This notifies possible pending writers | ||
100 | * that we've observed the head pointer and that the other | ||
101 | * entry is now free. | ||
102 | */ | ||
103 | if (!--head->__count) { | ||
104 | /* | ||
105 | * x86-TSO does not reorder writes with older reads; | ||
106 | * therefore once this write becomes visible to another | ||
107 | * cpu, we must be finished reading the cyc2ns_data. | ||
108 | * | ||
109 | * matches with cyc2ns_write_begin(). | ||
110 | */ | ||
111 | this_cpu_write(cyc2ns.tail, head); | ||
112 | } | ||
113 | preempt_enable(); | ||
114 | } | ||
115 | |||
116 | /* | ||
117 | * Begin writing a new @data entry for @cpu. | ||
118 | * | ||
119 | * Assumes some sort of write side lock; currently 'provided' by the assumption | ||
120 | * that cpufreq will call its notifiers sequentially. | ||
121 | */ | ||
122 | static struct cyc2ns_data *cyc2ns_write_begin(int cpu) | ||
123 | { | ||
124 | struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); | ||
125 | struct cyc2ns_data *data = c2n->data; | ||
126 | |||
127 | if (data == c2n->head) | ||
128 | data++; | ||
129 | |||
130 | /* XXX send an IPI to @cpu in order to guarantee a read? */ | ||
131 | |||
132 | /* | ||
133 | * When we observe the tail write from cyc2ns_read_end(), | ||
134 | * the cpu must be done with that entry and it's safe | ||
135 | * to start writing to it. | ||
136 | */ | ||
137 | while (c2n->tail == data) | ||
138 | cpu_relax(); | ||
139 | |||
140 | return data; | ||
141 | } | ||
142 | |||
143 | static void cyc2ns_write_end(int cpu, struct cyc2ns_data *data) | ||
144 | { | ||
145 | struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); | ||
146 | |||
147 | /* | ||
148 | * Ensure the @data writes are visible before we publish the | ||
149 | * entry. Matches the data-dependency in cyc2ns_read_begin(). | ||
150 | */ | ||
151 | smp_wmb(); | ||
152 | |||
153 | ACCESS_ONCE(c2n->head) = data; | ||
154 | } | ||
155 | |||
156 | /* | ||
157 | * Accelerators for sched_clock() | ||
158 | * convert from cycles(64bits) => nanoseconds (64bits) | ||
159 | * basic equation: | ||
160 | * ns = cycles / (freq / ns_per_sec) | ||
161 | * ns = cycles * (ns_per_sec / freq) | ||
162 | * ns = cycles * (10^9 / (cpu_khz * 10^3)) | ||
163 | * ns = cycles * (10^6 / cpu_khz) | ||
164 | * | ||
165 | * Then we use scaling math (suggested by george@mvista.com) to get: | ||
166 | * ns = cycles * (10^6 * SC / cpu_khz) / SC | ||
167 | * ns = cycles * cyc2ns_scale / SC | ||
168 | * | ||
169 | * And since SC is a constant power of two, we can convert the div | ||
170 | * into a shift. | ||
171 | * | ||
172 | * We can use khz divisor instead of mhz to keep a better precision, since | ||
173 | * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. | ||
174 | * (mathieu.desnoyers@polymtl.ca) | ||
175 | * | ||
176 | * -johnstul@us.ibm.com "math is hard, lets go shopping!" | ||
177 | */ | ||
178 | |||
179 | #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ | ||
180 | |||
181 | static void cyc2ns_data_init(struct cyc2ns_data *data) | ||
182 | { | ||
183 | data->cyc2ns_mul = 1U << CYC2NS_SCALE_FACTOR; | ||
184 | data->cyc2ns_shift = CYC2NS_SCALE_FACTOR; | ||
185 | data->cyc2ns_offset = 0; | ||
186 | data->__count = 0; | ||
187 | } | ||
188 | |||
189 | static void cyc2ns_init(int cpu) | ||
190 | { | ||
191 | struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); | ||
192 | |||
193 | cyc2ns_data_init(&c2n->data[0]); | ||
194 | cyc2ns_data_init(&c2n->data[1]); | ||
195 | |||
196 | c2n->head = c2n->data; | ||
197 | c2n->tail = c2n->data; | ||
198 | } | ||
199 | |||
200 | static inline unsigned long long cycles_2_ns(unsigned long long cyc) | ||
201 | { | ||
202 | struct cyc2ns_data *data, *tail; | ||
203 | unsigned long long ns; | ||
204 | |||
205 | /* | ||
206 | * See cyc2ns_read_*() for details; replicated in order to avoid | ||
207 | * an extra few instructions that came with the abstraction. | ||
208 | * Notably, it allows us to only do the __count and tail update | ||
209 | * dance when it's actually needed. | ||
210 | */ | ||
211 | |||
212 | preempt_disable(); | ||
213 | data = this_cpu_read(cyc2ns.head); | ||
214 | tail = this_cpu_read(cyc2ns.tail); | ||
215 | |||
216 | if (likely(data == tail)) { | ||
217 | ns = data->cyc2ns_offset; | ||
218 | ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); | ||
219 | } else { | ||
220 | data->__count++; | ||
221 | |||
222 | barrier(); | ||
223 | |||
224 | ns = data->cyc2ns_offset; | ||
225 | ns += mul_u64_u32_shr(cyc, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); | ||
226 | |||
227 | barrier(); | ||
228 | |||
229 | if (!--data->__count) | ||
230 | this_cpu_write(cyc2ns.tail, data); | ||
231 | } | ||
232 | preempt_enable(); | ||
233 | |||
234 | return ns; | ||
235 | } | ||
236 | |||
237 | /* XXX surely we already have this someplace in the kernel?! */ | ||
238 | #define DIV_ROUND(n, d) (((n) + ((d) / 2)) / (d)) | ||
239 | |||
240 | static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) | ||
241 | { | ||
242 | unsigned long long tsc_now, ns_now; | ||
243 | struct cyc2ns_data *data; | ||
244 | unsigned long flags; | ||
245 | |||
246 | local_irq_save(flags); | ||
247 | sched_clock_idle_sleep_event(); | ||
248 | |||
249 | if (!cpu_khz) | ||
250 | goto done; | ||
251 | |||
252 | data = cyc2ns_write_begin(cpu); | ||
253 | |||
254 | rdtscll(tsc_now); | ||
255 | ns_now = cycles_2_ns(tsc_now); | ||
256 | |||
257 | /* | ||
258 | * Compute a new multiplier as per the above comment and ensure our | ||
259 | * time function is continuous; see the comment near struct | ||
260 | * cyc2ns_data. | ||
261 | */ | ||
262 | data->cyc2ns_mul = DIV_ROUND(NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR, cpu_khz); | ||
263 | data->cyc2ns_shift = CYC2NS_SCALE_FACTOR; | ||
264 | data->cyc2ns_offset = ns_now - | ||
265 | mul_u64_u32_shr(tsc_now, data->cyc2ns_mul, CYC2NS_SCALE_FACTOR); | ||
266 | |||
267 | cyc2ns_write_end(cpu, data); | ||
268 | |||
269 | done: | ||
270 | sched_clock_idle_wakeup_event(0); | ||
271 | local_irq_restore(flags); | ||
272 | } | ||
41 | /* | 273 | /* |
42 | * Scheduler clock - returns current time in nanosec units. | 274 | * Scheduler clock - returns current time in nanosec units. |
43 | */ | 275 | */ |
44 | u64 native_sched_clock(void) | 276 | u64 native_sched_clock(void) |
45 | { | 277 | { |
46 | u64 this_offset; | 278 | u64 tsc_now; |
47 | 279 | ||
48 | /* | 280 | /* |
49 | * Fall back to jiffies if there's no TSC available: | 281 | * Fall back to jiffies if there's no TSC available: |
@@ -53,16 +285,16 @@ u64 native_sched_clock(void) | |||
53 | * very important for it to be as fast as the platform | 285 | * very important for it to be as fast as the platform |
54 | * can achieve it. ) | 286 | * can achieve it. ) |
55 | */ | 287 | */ |
56 | if (unlikely(tsc_disabled)) { | 288 | if (!static_key_false(&__use_tsc)) { |
57 | /* No locking but a rare wrong value is not a big deal: */ | 289 | /* No locking but a rare wrong value is not a big deal: */ |
58 | return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); | 290 | return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); |
59 | } | 291 | } |
60 | 292 | ||
61 | /* read the Time Stamp Counter: */ | 293 | /* read the Time Stamp Counter: */ |
62 | rdtscll(this_offset); | 294 | rdtscll(tsc_now); |
63 | 295 | ||
64 | /* return the value in ns */ | 296 | /* return the value in ns */ |
65 | return __cycles_2_ns(this_offset); | 297 | return cycles_2_ns(tsc_now); |
66 | } | 298 | } |
67 | 299 | ||
68 | /* We need to define a real function for sched_clock, to override the | 300 | /* We need to define a real function for sched_clock, to override the |
@@ -419,6 +651,16 @@ unsigned long native_calibrate_tsc(void) | |||
419 | unsigned long flags, latch, ms, fast_calibrate; | 651 | unsigned long flags, latch, ms, fast_calibrate; |
420 | int hpet = is_hpet_enabled(), i, loopmin; | 652 | int hpet = is_hpet_enabled(), i, loopmin; |
421 | 653 | ||
654 | /* Calibrate TSC using MSR for Intel Atom SoCs */ | ||
655 | local_irq_save(flags); | ||
656 | i = try_msr_calibrate_tsc(&fast_calibrate); | ||
657 | local_irq_restore(flags); | ||
658 | if (i >= 0) { | ||
659 | if (i == 0) | ||
660 | pr_warn("Fast TSC calibration using MSR failed\n"); | ||
661 | return fast_calibrate; | ||
662 | } | ||
663 | |||
422 | local_irq_save(flags); | 664 | local_irq_save(flags); |
423 | fast_calibrate = quick_pit_calibrate(); | 665 | fast_calibrate = quick_pit_calibrate(); |
424 | local_irq_restore(flags); | 666 | local_irq_restore(flags); |
@@ -589,61 +831,11 @@ int recalibrate_cpu_khz(void) | |||
589 | EXPORT_SYMBOL(recalibrate_cpu_khz); | 831 | EXPORT_SYMBOL(recalibrate_cpu_khz); |
590 | 832 | ||
591 | 833 | ||
592 | /* Accelerators for sched_clock() | ||
593 | * convert from cycles(64bits) => nanoseconds (64bits) | ||
594 | * basic equation: | ||
595 | * ns = cycles / (freq / ns_per_sec) | ||
596 | * ns = cycles * (ns_per_sec / freq) | ||
597 | * ns = cycles * (10^9 / (cpu_khz * 10^3)) | ||
598 | * ns = cycles * (10^6 / cpu_khz) | ||
599 | * | ||
600 | * Then we use scaling math (suggested by george@mvista.com) to get: | ||
601 | * ns = cycles * (10^6 * SC / cpu_khz) / SC | ||
602 | * ns = cycles * cyc2ns_scale / SC | ||
603 | * | ||
604 | * And since SC is a constant power of two, we can convert the div | ||
605 | * into a shift. | ||
606 | * | ||
607 | * We can use khz divisor instead of mhz to keep a better precision, since | ||
608 | * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits. | ||
609 | * (mathieu.desnoyers@polymtl.ca) | ||
610 | * | ||
611 | * -johnstul@us.ibm.com "math is hard, lets go shopping!" | ||
612 | */ | ||
613 | |||
614 | DEFINE_PER_CPU(unsigned long, cyc2ns); | ||
615 | DEFINE_PER_CPU(unsigned long long, cyc2ns_offset); | ||
616 | |||
617 | static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) | ||
618 | { | ||
619 | unsigned long long tsc_now, ns_now, *offset; | ||
620 | unsigned long flags, *scale; | ||
621 | |||
622 | local_irq_save(flags); | ||
623 | sched_clock_idle_sleep_event(); | ||
624 | |||
625 | scale = &per_cpu(cyc2ns, cpu); | ||
626 | offset = &per_cpu(cyc2ns_offset, cpu); | ||
627 | |||
628 | rdtscll(tsc_now); | ||
629 | ns_now = __cycles_2_ns(tsc_now); | ||
630 | |||
631 | if (cpu_khz) { | ||
632 | *scale = ((NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) + | ||
633 | cpu_khz / 2) / cpu_khz; | ||
634 | *offset = ns_now - mult_frac(tsc_now, *scale, | ||
635 | (1UL << CYC2NS_SCALE_FACTOR)); | ||
636 | } | ||
637 | |||
638 | sched_clock_idle_wakeup_event(0); | ||
639 | local_irq_restore(flags); | ||
640 | } | ||
641 | |||
642 | static unsigned long long cyc2ns_suspend; | 834 | static unsigned long long cyc2ns_suspend; |
643 | 835 | ||
644 | void tsc_save_sched_clock_state(void) | 836 | void tsc_save_sched_clock_state(void) |
645 | { | 837 | { |
646 | if (!sched_clock_stable) | 838 | if (!sched_clock_stable()) |
647 | return; | 839 | return; |
648 | 840 | ||
649 | cyc2ns_suspend = sched_clock(); | 841 | cyc2ns_suspend = sched_clock(); |
@@ -663,16 +855,26 @@ void tsc_restore_sched_clock_state(void) | |||
663 | unsigned long flags; | 855 | unsigned long flags; |
664 | int cpu; | 856 | int cpu; |
665 | 857 | ||
666 | if (!sched_clock_stable) | 858 | if (!sched_clock_stable()) |
667 | return; | 859 | return; |
668 | 860 | ||
669 | local_irq_save(flags); | 861 | local_irq_save(flags); |
670 | 862 | ||
671 | __this_cpu_write(cyc2ns_offset, 0); | 863 | /* |
864 | * We're coming out of suspend, there's no concurrency yet; don't | ||
865 | * bother being nice about the RCU stuff, just write to both | ||
866 | * data fields. | ||
867 | */ | ||
868 | |||
869 | this_cpu_write(cyc2ns.data[0].cyc2ns_offset, 0); | ||
870 | this_cpu_write(cyc2ns.data[1].cyc2ns_offset, 0); | ||
871 | |||
672 | offset = cyc2ns_suspend - sched_clock(); | 872 | offset = cyc2ns_suspend - sched_clock(); |
673 | 873 | ||
674 | for_each_possible_cpu(cpu) | 874 | for_each_possible_cpu(cpu) { |
675 | per_cpu(cyc2ns_offset, cpu) = offset; | 875 | per_cpu(cyc2ns.data[0].cyc2ns_offset, cpu) = offset; |
876 | per_cpu(cyc2ns.data[1].cyc2ns_offset, cpu) = offset; | ||
877 | } | ||
676 | 878 | ||
677 | local_irq_restore(flags); | 879 | local_irq_restore(flags); |
678 | } | 880 | } |
@@ -795,7 +997,7 @@ void mark_tsc_unstable(char *reason) | |||
795 | { | 997 | { |
796 | if (!tsc_unstable) { | 998 | if (!tsc_unstable) { |
797 | tsc_unstable = 1; | 999 | tsc_unstable = 1; |
798 | sched_clock_stable = 0; | 1000 | clear_sched_clock_stable(); |
799 | disable_sched_clock_irqtime(); | 1001 | disable_sched_clock_irqtime(); |
800 | pr_info("Marking TSC unstable due to %s\n", reason); | 1002 | pr_info("Marking TSC unstable due to %s\n", reason); |
801 | /* Change only the rating, when not registered */ | 1003 | /* Change only the rating, when not registered */ |
@@ -995,14 +1197,18 @@ void __init tsc_init(void) | |||
995 | * speed as the bootup CPU. (cpufreq notifiers will fix this | 1197 | * speed as the bootup CPU. (cpufreq notifiers will fix this |
996 | * up if their speed diverges) | 1198 | * up if their speed diverges) |
997 | */ | 1199 | */ |
998 | for_each_possible_cpu(cpu) | 1200 | for_each_possible_cpu(cpu) { |
1201 | cyc2ns_init(cpu); | ||
999 | set_cyc2ns_scale(cpu_khz, cpu); | 1202 | set_cyc2ns_scale(cpu_khz, cpu); |
1203 | } | ||
1000 | 1204 | ||
1001 | if (tsc_disabled > 0) | 1205 | if (tsc_disabled > 0) |
1002 | return; | 1206 | return; |
1003 | 1207 | ||
1004 | /* now allow native_sched_clock() to use rdtsc */ | 1208 | /* now allow native_sched_clock() to use rdtsc */ |
1209 | |||
1005 | tsc_disabled = 0; | 1210 | tsc_disabled = 0; |
1211 | static_key_slow_inc(&__use_tsc); | ||
1006 | 1212 | ||
1007 | if (!no_sched_irq_time) | 1213 | if (!no_sched_irq_time) |
1008 | enable_sched_clock_irqtime(); | 1214 | enable_sched_clock_irqtime(); |
diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c new file mode 100644 index 000000000000..8b5434f4389f --- /dev/null +++ b/arch/x86/kernel/tsc_msr.c | |||
@@ -0,0 +1,127 @@ | |||
1 | /* | ||
2 | * tsc_msr.c - MSR based TSC calibration on Intel Atom SoC platforms. | ||
3 | * | ||
4 | * TSC in Intel Atom SoC runs at a constant rate which can be figured | ||
5 | * by this formula: | ||
6 | * <maximum core-clock to bus-clock ratio> * <maximum resolved frequency> | ||
7 | * See Intel 64 and IA-32 System Programming Guide section 16.12 and 30.11.5 | ||
8 | * for details. | ||
9 | * Especially some Intel Atom SoCs don't have PIT(i8254) or HPET, so MSR | ||
10 | * based calibration is the only option. | ||
11 | * | ||
12 | * | ||
13 | * Copyright (C) 2013 Intel Corporation | ||
14 | * Author: Bin Gao <bin.gao@intel.com> | ||
15 | * | ||
16 | * This file is released under the GPLv2. | ||
17 | */ | ||
18 | |||
19 | #include <linux/kernel.h> | ||
20 | #include <asm/processor.h> | ||
21 | #include <asm/setup.h> | ||
22 | #include <asm/apic.h> | ||
23 | #include <asm/param.h> | ||
24 | |||
25 | /* CPU reference clock frequency: in KHz */ | ||
26 | #define FREQ_83 83200 | ||
27 | #define FREQ_100 99840 | ||
28 | #define FREQ_133 133200 | ||
29 | #define FREQ_166 166400 | ||
30 | |||
31 | #define MAX_NUM_FREQS 8 | ||
32 | |||
33 | /* | ||
34 | * According to Intel 64 and IA-32 System Programming Guide, | ||
35 | * if MSR_PERF_STAT[31] is set, the maximum resolved bus ratio can be | ||
36 | * read in MSR_PLATFORM_ID[12:8], otherwise in MSR_PERF_STAT[44:40]. | ||
37 | * Unfortunately some Intel Atom SoCs aren't quite compliant to this, | ||
38 | * so we need to manually differentiate SoC families. This is what the | ||
39 | * field msr_plat does. | ||
40 | */ | ||
41 | struct freq_desc { | ||
42 | u8 x86_family; /* CPU family */ | ||
43 | u8 x86_model; /* model */ | ||
44 | u8 msr_plat; /* 1: use MSR_PLATFORM_INFO, 0: MSR_IA32_PERF_STATUS */ | ||
45 | u32 freqs[MAX_NUM_FREQS]; | ||
46 | }; | ||
47 | |||
48 | static struct freq_desc freq_desc_tables[] = { | ||
49 | /* PNW */ | ||
50 | { 6, 0x27, 0, { 0, 0, 0, 0, 0, FREQ_100, 0, FREQ_83 } }, | ||
51 | /* CLV+ */ | ||
52 | { 6, 0x35, 0, { 0, FREQ_133, 0, 0, 0, FREQ_100, 0, FREQ_83 } }, | ||
53 | /* TNG */ | ||
54 | { 6, 0x4a, 1, { 0, FREQ_100, FREQ_133, 0, 0, 0, 0, 0 } }, | ||
55 | /* VLV2 */ | ||
56 | { 6, 0x37, 1, { 0, FREQ_100, FREQ_133, FREQ_166, 0, 0, 0, 0 } }, | ||
57 | /* ANN */ | ||
58 | { 6, 0x5a, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_100, 0, 0, 0, 0 } }, | ||
59 | }; | ||
60 | |||
61 | static int match_cpu(u8 family, u8 model) | ||
62 | { | ||
63 | int i; | ||
64 | |||
65 | for (i = 0; i < ARRAY_SIZE(freq_desc_tables); i++) { | ||
66 | if ((family == freq_desc_tables[i].x86_family) && | ||
67 | (model == freq_desc_tables[i].x86_model)) | ||
68 | return i; | ||
69 | } | ||
70 | |||
71 | return -1; | ||
72 | } | ||
73 | |||
74 | /* Map CPU reference clock freq ID(0-7) to CPU reference clock freq(KHz) */ | ||
75 | #define id_to_freq(cpu_index, freq_id) \ | ||
76 | (freq_desc_tables[cpu_index].freqs[freq_id]) | ||
77 | |||
78 | /* | ||
79 | * Do MSR calibration only for known/supported CPUs. | ||
80 | * Return values: | ||
81 | * -1: CPU is unknown/unsupported for MSR based calibration | ||
82 | * 0: CPU is known/supported, but calibration failed | ||
83 | * 1: CPU is known/supported, and calibration succeeded | ||
84 | */ | ||
85 | int try_msr_calibrate_tsc(unsigned long *fast_calibrate) | ||
86 | { | ||
87 | int cpu_index; | ||
88 | u32 lo, hi, ratio, freq_id, freq; | ||
89 | |||
90 | cpu_index = match_cpu(boot_cpu_data.x86, boot_cpu_data.x86_model); | ||
91 | if (cpu_index < 0) | ||
92 | return -1; | ||
93 | |||
94 | *fast_calibrate = 0; | ||
95 | |||
96 | if (freq_desc_tables[cpu_index].msr_plat) { | ||
97 | rdmsr(MSR_PLATFORM_INFO, lo, hi); | ||
98 | ratio = (lo >> 8) & 0x1f; | ||
99 | } else { | ||
100 | rdmsr(MSR_IA32_PERF_STATUS, lo, hi); | ||
101 | ratio = (hi >> 8) & 0x1f; | ||
102 | } | ||
103 | pr_info("Maximum core-clock to bus-clock ratio: 0x%x\n", ratio); | ||
104 | |||
105 | if (!ratio) | ||
106 | return 0; | ||
107 | |||
108 | /* Get FSB FREQ ID */ | ||
109 | rdmsr(MSR_FSB_FREQ, lo, hi); | ||
110 | freq_id = lo & 0x7; | ||
111 | freq = id_to_freq(cpu_index, freq_id); | ||
112 | pr_info("Resolved frequency ID: %u, frequency: %u KHz\n", | ||
113 | freq_id, freq); | ||
114 | if (!freq) | ||
115 | return 0; | ||
116 | |||
117 | /* TSC frequency = maximum resolved freq * maximum resolved bus ratio */ | ||
118 | *fast_calibrate = freq * ratio; | ||
119 | pr_info("TSC runs at %lu KHz\n", *fast_calibrate); | ||
120 | |||
121 | #ifdef CONFIG_X86_LOCAL_APIC | ||
122 | lapic_timer_frequency = (freq * 1000) / HZ; | ||
123 | pr_info("lapic_timer_frequency = %d\n", lapic_timer_frequency); | ||
124 | #endif | ||
125 | |||
126 | return 1; | ||
127 | } | ||
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index adfdf56a3714..26488487bc61 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c | |||
@@ -16,7 +16,6 @@ | |||
16 | */ | 16 | */ |
17 | #include <linux/spinlock.h> | 17 | #include <linux/spinlock.h> |
18 | #include <linux/kernel.h> | 18 | #include <linux/kernel.h> |
19 | #include <linux/init.h> | ||
20 | #include <linux/smp.h> | 19 | #include <linux/smp.h> |
21 | #include <linux/nmi.h> | 20 | #include <linux/nmi.h> |
22 | #include <asm/tsc.h> | 21 | #include <asm/tsc.h> |
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 422fd8223470..a4b451c6addf 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c | |||
@@ -562,6 +562,16 @@ static void __init xstate_enable_boot_cpu(void) | |||
562 | if (cpu_has_xsaveopt && eagerfpu != DISABLE) | 562 | if (cpu_has_xsaveopt && eagerfpu != DISABLE) |
563 | eagerfpu = ENABLE; | 563 | eagerfpu = ENABLE; |
564 | 564 | ||
565 | if (pcntxt_mask & XSTATE_EAGER) { | ||
566 | if (eagerfpu == DISABLE) { | ||
567 | pr_err("eagerfpu not present, disabling some xstate features: 0x%llx\n", | ||
568 | pcntxt_mask & XSTATE_EAGER); | ||
569 | pcntxt_mask &= ~XSTATE_EAGER; | ||
570 | } else { | ||
571 | eagerfpu = ENABLE; | ||
572 | } | ||
573 | } | ||
574 | |||
565 | pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n", | 575 | pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n", |
566 | pcntxt_mask, xstate_size); | 576 | pcntxt_mask, xstate_size); |
567 | } | 577 | } |