author    Jonathan Herman <hermanjl@cs.unc.edu>  2013-01-17 16:15:55 -0500
committer Jonathan Herman <hermanjl@cs.unc.edu>  2013-01-17 16:15:55 -0500
commit    8dea78da5cee153b8af9c07a2745f6c55057fe12 (patch)
tree      a8f4d49d63b1ecc92f2fddceba0655b2472c5bd9 /arch/x86/kernel
parent    406089d01562f1e2bf9f089fd7637009ebaad589 (diff)
Patched in Tegra support.
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r-- arch/x86/kernel/Makefile | 22
-rw-r--r-- arch/x86/kernel/acpi/Makefile | 9
-rw-r--r-- arch/x86/kernel/acpi/boot.c | 58
-rw-r--r-- arch/x86/kernel/acpi/cstate.c | 1
-rw-r--r-- arch/x86/kernel/acpi/sleep.c | 48
-rw-r--r-- arch/x86/kernel/acpi/sleep.h | 4
-rw-r--r-- arch/x86/kernel/alternative.c | 132
-rw-r--r-- arch/x86/kernel/amd_gart_64.c | 13
-rw-r--r-- arch/x86/kernel/amd_nb.c | 19
-rw-r--r-- arch/x86/kernel/aperture_64.c | 10
-rw-r--r-- arch/x86/kernel/apic/Makefile | 1
-rw-r--r-- arch/x86/kernel/apic/apic.c | 350
-rw-r--r-- arch/x86/kernel/apic/apic_flat_64.c | 89
-rw-r--r-- arch/x86/kernel/apic/apic_noop.c | 11
-rw-r--r-- arch/x86/kernel/apic/apic_numachip.c | 263
-rw-r--r-- arch/x86/kernel/apic/bigsmp_32.c | 50
-rw-r--r-- arch/x86/kernel/apic/es7000_32.c | 55
-rw-r--r-- arch/x86/kernel/apic/hw_nmi.c | 27
-rw-r--r-- arch/x86/kernel/apic/io_apic.c | 1320
-rw-r--r-- arch/x86/kernel/apic/numaq_32.c | 32
-rw-r--r-- arch/x86/kernel/apic/probe_32.c | 25
-rw-r--r-- arch/x86/kernel/apic/probe_64.c | 11
-rw-r--r-- arch/x86/kernel/apic/summit_32.c | 70
-rw-r--r-- arch/x86/kernel/apic/x2apic_cluster.c | 84
-rw-r--r-- arch/x86/kernel/apic/x2apic_phys.c | 47
-rw-r--r-- arch/x86/kernel/apic/x2apic_uv_x.c | 79
-rw-r--r-- arch/x86/kernel/apm_32.c | 61
-rw-r--r-- arch/x86/kernel/asm-offsets.c | 5
-rw-r--r-- arch/x86/kernel/asm-offsets_32.c | 8
-rw-r--r-- arch/x86/kernel/asm-offsets_64.c | 25
-rw-r--r-- arch/x86/kernel/check.c | 54
-rw-r--r-- arch/x86/kernel/cpu/Makefile | 13
-rw-r--r-- arch/x86/kernel/cpu/amd.c | 209
-rw-r--r-- arch/x86/kernel/cpu/bugs.c | 62
-rw-r--r-- arch/x86/kernel/cpu/centaur.c | 2
-rw-r--r-- arch/x86/kernel/cpu/common.c | 163
-rw-r--r-- arch/x86/kernel/cpu/cpu.h | 15
-rw-r--r-- arch/x86/kernel/cpu/hypervisor.c | 3
-rw-r--r-- arch/x86/kernel/cpu/intel.c | 202
-rw-r--r-- arch/x86/kernel/cpu/intel_cacheinfo.c | 259
-rw-r--r-- arch/x86/kernel/cpu/match.c | 91
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce-apei.c | 4
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce-inject.c | 62
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce-internal.h | 18
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce-severity.c | 51
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce.c | 891
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce_amd.c | 352
-rw-r--r-- arch/x86/kernel/cpu/mcheck/mce_intel.c | 209
-rw-r--r-- arch/x86/kernel/cpu/mcheck/p5.c | 1
-rw-r--r-- arch/x86/kernel/cpu/mcheck/therm_throt.c | 96
-rw-r--r-- arch/x86/kernel/cpu/mcheck/threshold.c | 2
-rw-r--r-- arch/x86/kernel/cpu/mcheck/winchip.c | 1
-rw-r--r-- arch/x86/kernel/cpu/mkcapflags.pl | 30
-rw-r--r-- arch/x86/kernel/cpu/mshyperv.c | 23
-rw-r--r-- arch/x86/kernel/cpu/mtrr/cleanup.c | 8
-rw-r--r-- arch/x86/kernel/cpu/mtrr/generic.c | 9
-rw-r--r-- arch/x86/kernel/cpu/mtrr/if.c | 10
-rw-r--r-- arch/x86/kernel/cpu/mtrr/main.c | 11
-rw-r--r-- arch/x86/kernel/cpu/perf_event.c | 1211
-rw-r--r-- arch/x86/kernel/cpu/perf_event.h | 656
-rw-r--r-- arch/x86/kernel/cpu/perf_event_amd.c | 202
-rw-r--r-- arch/x86/kernel/cpu/perf_event_amd_ibs.c | 908
-rw-r--r-- arch/x86/kernel/cpu/perf_event_intel.c | 826
-rw-r--r-- arch/x86/kernel/cpu/perf_event_intel_ds.c | 150
-rw-r--r-- arch/x86/kernel/cpu/perf_event_intel_lbr.c | 557
-rw-r--r-- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 2957
-rw-r--r-- arch/x86/kernel/cpu/perf_event_intel_uncore.h | 621
-rw-r--r-- arch/x86/kernel/cpu/perf_event_knc.c | 319
-rw-r--r-- arch/x86/kernel/cpu/perf_event_p4.c | 47
-rw-r--r-- arch/x86/kernel/cpu/perf_event_p6.c | 161
-rw-r--r-- arch/x86/kernel/cpu/perfctr-watchdog.c | 4
-rw-r--r-- arch/x86/kernel/cpu/powerflags.c | 3
-rw-r--r-- arch/x86/kernel/cpu/proc.c | 18
-rw-r--r-- arch/x86/kernel/cpu/rdrand.c | 73
-rw-r--r-- arch/x86/kernel/cpu/scattered.c | 3
-rw-r--r-- arch/x86/kernel/cpuid.c | 8
-rw-r--r-- arch/x86/kernel/crash.c | 37
-rw-r--r-- arch/x86/kernel/crash_dump_32.c | 6
-rw-r--r-- arch/x86/kernel/devicetree.c | 141
-rw-r--r-- arch/x86/kernel/dumpstack.c | 40
-rw-r--r-- arch/x86/kernel/dumpstack_32.c | 29
-rw-r--r-- arch/x86/kernel/dumpstack_64.c | 31
-rw-r--r-- arch/x86/kernel/e820.c | 178
-rw-r--r-- arch/x86/kernel/early_printk.c | 4
-rw-r--r-- arch/x86/kernel/entry_32.S | 279
-rw-r--r-- arch/x86/kernel/entry_64.S | 679
-rw-r--r-- arch/x86/kernel/ftrace.c | 659
-rw-r--r-- arch/x86/kernel/head.c | 2
-rw-r--r-- arch/x86/kernel/head32.c | 8
-rw-r--r-- arch/x86/kernel/head64.c | 8
-rw-r--r-- arch/x86/kernel/head_32.S | 277
-rw-r--r-- arch/x86/kernel/head_64.S | 104
-rw-r--r-- arch/x86/kernel/hpet.c | 79
-rw-r--r-- arch/x86/kernel/i387.c | 356
-rw-r--r-- arch/x86/kernel/i8259.c | 3
-rw-r--r-- arch/x86/kernel/irq.c | 30
-rw-r--r-- arch/x86/kernel/irq_32.c | 24
-rw-r--r-- arch/x86/kernel/irq_64.c | 38
-rw-r--r-- arch/x86/kernel/irqinit.c | 122
-rw-r--r-- arch/x86/kernel/jump_label.c | 20
-rw-r--r-- arch/x86/kernel/kdebugfs.c | 15
-rw-r--r-- arch/x86/kernel/kgdb.c | 137
-rw-r--r-- arch/x86/kernel/kprobes-common.h | 102
-rw-r--r-- arch/x86/kernel/kprobes-opt.c | 512
-rw-r--r-- arch/x86/kernel/kprobes.c | 708
-rw-r--r-- arch/x86/kernel/kvm.c | 292
-rw-r--r-- arch/x86/kernel/kvmclock.c | 114
-rw-r--r-- arch/x86/kernel/ldt.c | 1
-rw-r--r-- arch/x86/kernel/machine_kexec_32.c | 1
-rw-r--r-- arch/x86/kernel/microcode_amd.c | 407
-rw-r--r-- arch/x86/kernel/microcode_core.c | 249
-rw-r--r-- arch/x86/kernel/microcode_intel.c | 31
-rw-r--r-- arch/x86/kernel/module.c | 35
-rw-r--r-- arch/x86/kernel/mpparse.c | 34
-rw-r--r-- arch/x86/kernel/msr.c | 8
-rw-r--r-- arch/x86/kernel/nmi.c | 511
-rw-r--r-- arch/x86/kernel/nmi_selftest.c | 183
-rw-r--r-- arch/x86/kernel/paravirt.c | 20
-rw-r--r-- arch/x86/kernel/pci-calgary_64.c | 52
-rw-r--r-- arch/x86/kernel/pci-dma.c | 43
-rw-r--r-- arch/x86/kernel/pci-nommu.c | 10
-rw-r--r-- arch/x86/kernel/pci-swiotlb.c | 17
-rw-r--r-- arch/x86/kernel/perf_regs.c | 105
-rw-r--r-- arch/x86/kernel/probe_roms.c | 7
-rw-r--r-- arch/x86/kernel/process.c | 363
-rw-r--r-- arch/x86/kernel/process_32.c | 141
-rw-r--r-- arch/x86/kernel/process_64.c | 226
-rw-r--r-- arch/x86/kernel/ptrace.c | 179
-rw-r--r-- arch/x86/kernel/pvclock.c | 143
-rw-r--r-- arch/x86/kernel/quirks.c | 17
-rw-r--r-- arch/x86/kernel/reboot.c | 347
-rw-r--r-- arch/x86/kernel/rtc.c | 14
-rw-r--r-- arch/x86/kernel/setup.c | 135
-rw-r--r-- arch/x86/kernel/setup_percpu.c | 16
-rw-r--r-- arch/x86/kernel/signal.c | 305
-rw-r--r-- arch/x86/kernel/smp.c | 71
-rw-r--r-- arch/x86/kernel/smpboot.c | 550
-rw-r--r-- arch/x86/kernel/step.c | 53
-rw-r--r-- arch/x86/kernel/sys_x86_64.c | 161
-rw-r--r-- arch/x86/kernel/syscall_32.c | 25
-rw-r--r-- arch/x86/kernel/syscall_64.c | 26
-rw-r--r-- arch/x86/kernel/tboot.c | 17
-rw-r--r-- arch/x86/kernel/tce_64.c | 1
-rw-r--r-- arch/x86/kernel/test_rodata.c | 10
-rw-r--r-- arch/x86/kernel/time.c | 10
-rw-r--r-- arch/x86/kernel/tls.c | 5
-rw-r--r-- arch/x86/kernel/topology.c | 102
-rw-r--r-- arch/x86/kernel/trace_clock.c | 21
-rw-r--r-- arch/x86/kernel/traps.c | 545
-rw-r--r-- arch/x86/kernel/tsc.c | 113
-rw-r--r-- arch/x86/kernel/tsc_sync.c | 33
-rw-r--r-- arch/x86/kernel/uprobes.c | 697
-rw-r--r-- arch/x86/kernel/vm86_32.c | 24
-rw-r--r-- arch/x86/kernel/vmlinux.lds.S | 12
-rw-r--r-- arch/x86/kernel/vsmp_64.c | 84
-rw-r--r-- arch/x86/kernel/vsyscall_64.c | 202
-rw-r--r-- arch/x86/kernel/x8664_ksyms_64.c | 7
-rw-r--r-- arch/x86/kernel/x86_init.c | 20
-rw-r--r-- arch/x86/kernel/xsave.c | 532
159 files changed, 7614 insertions, 18845 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 34e923a5376..82f2912155a 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -2,13 +2,14 @@
2# Makefile for the linux kernel. 2# Makefile for the linux kernel.
3# 3#
4 4
5extra-y := head_$(BITS).o head$(BITS).o head.o vmlinux.lds 5extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinux.lds
6 6
7CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) 7CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
8 8
9ifdef CONFIG_FUNCTION_TRACER 9ifdef CONFIG_FUNCTION_TRACER
10# Do not profile debug and lowlevel utilities 10# Do not profile debug and lowlevel utilities
11CFLAGS_REMOVE_tsc.o = -pg 11CFLAGS_REMOVE_tsc.o = -pg
12CFLAGS_REMOVE_rtc.o = -pg
12CFLAGS_REMOVE_paravirt-spinlocks.o = -pg 13CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
13CFLAGS_REMOVE_pvclock.o = -pg 14CFLAGS_REMOVE_pvclock.o = -pg
14CFLAGS_REMOVE_kvmclock.o = -pg 15CFLAGS_REMOVE_kvmclock.o = -pg
@@ -18,14 +19,13 @@ endif
18 19
19obj-y := process_$(BITS).o signal.o entry_$(BITS).o 20obj-y := process_$(BITS).o signal.o entry_$(BITS).o
20obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o 21obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
21obj-y += time.o ioport.o ldt.o dumpstack.o nmi.o 22obj-y += time.o ioport.o ldt.o dumpstack.o
22obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o 23obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o
23obj-$(CONFIG_IRQ_WORK) += irq_work.o 24obj-$(CONFIG_IRQ_WORK) += irq_work.o
24obj-y += probe_roms.o 25obj-y += probe_roms.o
25obj-$(CONFIG_X86_32) += i386_ksyms_32.o 26obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
26obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o 27obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
27obj-y += syscall_$(BITS).o 28obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o
28obj-$(CONFIG_X86_64) += vsyscall_64.o
29obj-$(CONFIG_X86_64) += vsyscall_emu_64.o 29obj-$(CONFIG_X86_64) += vsyscall_emu_64.o
30obj-y += bootflag.o e820.o 30obj-y += bootflag.o e820.o
31obj-y += pci-dma.o quirks.o topology.o kdebugfs.o 31obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
@@ -34,6 +34,7 @@ obj-y += tsc.o io_delay.o rtc.o
34obj-y += pci-iommu_table.o 34obj-y += pci-iommu_table.o
35obj-y += resource.o 35obj-y += resource.o
36 36
37obj-y += trampoline.o trampoline_$(BITS).o
37obj-y += process.o 38obj-y += process.o
38obj-y += i387.o xsave.o 39obj-y += i387.o xsave.o
39obj-y += ptrace.o 40obj-y += ptrace.o
@@ -46,6 +47,8 @@ obj-$(CONFIG_STACKTRACE) += stacktrace.o
46obj-y += cpu/ 47obj-y += cpu/
47obj-y += acpi/ 48obj-y += acpi/
48obj-y += reboot.o 49obj-y += reboot.o
50obj-$(CONFIG_X86_32) += reboot_32.o
51obj-$(CONFIG_MCA) += mca_32.o
49obj-$(CONFIG_X86_MSR) += msr.o 52obj-$(CONFIG_X86_MSR) += msr.o
50obj-$(CONFIG_X86_CPUID) += cpuid.o 53obj-$(CONFIG_X86_CPUID) += cpuid.o
51obj-$(CONFIG_PCI) += early-quirks.o 54obj-$(CONFIG_PCI) += early-quirks.o
@@ -61,12 +64,10 @@ obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
61obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o 64obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
62obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o 65obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
63obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o 66obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
64obj-$(CONFIG_X86_TSC) += trace_clock.o
65obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o 67obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
66obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o 68obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
67obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o 69obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
68obj-$(CONFIG_KPROBES) += kprobes.o 70obj-$(CONFIG_KPROBES) += kprobes.o
69obj-$(CONFIG_OPTPROBES) += kprobes-opt.o
70obj-$(CONFIG_MODULES) += module.o 71obj-$(CONFIG_MODULES) += module.o
71obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o 72obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
72obj-$(CONFIG_KGDB) += kgdb.o 73obj-$(CONFIG_KGDB) += kgdb.o
@@ -79,9 +80,9 @@ obj-$(CONFIG_APB_TIMER) += apb_timer.o
79obj-$(CONFIG_AMD_NB) += amd_nb.o 80obj-$(CONFIG_AMD_NB) += amd_nb.o
80obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o 81obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
81obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o 82obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
82obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o
83 83
84obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o 84obj-$(CONFIG_KVM_GUEST) += kvm.o
85obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
85obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o 86obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
86obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o 87obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
87obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o 88obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
@@ -97,9 +98,6 @@ obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
97 98
98obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o 99obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
99obj-$(CONFIG_OF) += devicetree.o 100obj-$(CONFIG_OF) += devicetree.o
100obj-$(CONFIG_UPROBES) += uprobes.o
101
102obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
103 101
104### 102###
105# 64 bit specific files 103# 64 bit specific files
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index 163b2258147..6f35260bb3e 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -1,7 +1,14 @@
1subdir- := realmode
2
1obj-$(CONFIG_ACPI) += boot.o 3obj-$(CONFIG_ACPI) += boot.o
2obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o 4obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_rm.o wakeup_$(BITS).o
3 5
4ifneq ($(CONFIG_ACPI_PROCESSOR),) 6ifneq ($(CONFIG_ACPI_PROCESSOR),)
5obj-y += cstate.o 7obj-y += cstate.o
6endif 8endif
7 9
10$(obj)/wakeup_rm.o: $(obj)/realmode/wakeup.bin
11
12$(obj)/realmode/wakeup.bin: FORCE
13 $(Q)$(MAKE) $(build)=$(obj)/realmode
14
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index bacf4b0d91f..4558f0d0822 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -219,8 +219,6 @@ static int __init
219acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) 219acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
220{ 220{
221 struct acpi_madt_local_x2apic *processor = NULL; 221 struct acpi_madt_local_x2apic *processor = NULL;
222 int apic_id;
223 u8 enabled;
224 222
225 processor = (struct acpi_madt_local_x2apic *)header; 223 processor = (struct acpi_madt_local_x2apic *)header;
226 224
@@ -229,8 +227,6 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
229 227
230 acpi_table_print_madt_entry(header); 228 acpi_table_print_madt_entry(header);
231 229
232 apic_id = processor->local_apic_id;
233 enabled = processor->lapic_flags & ACPI_MADT_ENABLED;
234#ifdef CONFIG_X86_X2APIC 230#ifdef CONFIG_X86_X2APIC
235 /* 231 /*
236 * We need to register disabled CPU as well to permit 232 * We need to register disabled CPU as well to permit
@@ -239,10 +235,8 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end)
239 * to not preallocating memory for all NR_CPUS 235 * to not preallocating memory for all NR_CPUS
240 * when we use CPU hotplug. 236 * when we use CPU hotplug.
241 */ 237 */
242 if (!apic->apic_id_valid(apic_id) && enabled) 238 acpi_register_lapic(processor->local_apic_id, /* APIC ID */
243 printk(KERN_WARNING PREFIX "x2apic entry ignored\n"); 239 processor->lapic_flags & ACPI_MADT_ENABLED);
244 else
245 acpi_register_lapic(apic_id, enabled);
246#else 240#else
247 printk(KERN_WARNING PREFIX "x2apic entry ignored\n"); 241 printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
248#endif 242#endif
@@ -422,14 +416,12 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
422 return 0; 416 return 0;
423 } 417 }
424 418
425 if (intsrc->source_irq == 0) { 419 if (intsrc->source_irq == 0 && intsrc->global_irq == 2) {
426 if (acpi_skip_timer_override) { 420 if (acpi_skip_timer_override) {
427 printk(PREFIX "BIOS IRQ0 override ignored.\n"); 421 printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n");
428 return 0; 422 return 0;
429 } 423 }
430 424 if (acpi_fix_pin2_polarity && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) {
431 if ((intsrc->global_irq == 2) && acpi_fix_pin2_polarity
432 && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) {
433 intsrc->inti_flags &= ~ACPI_MADT_POLARITY_MASK; 425 intsrc->inti_flags &= ~ACPI_MADT_POLARITY_MASK;
434 printk(PREFIX "BIOS IRQ0 pin2 override: forcing polarity to high active.\n"); 426 printk(PREFIX "BIOS IRQ0 pin2 override: forcing polarity to high active.\n");
435 } 427 }
@@ -574,12 +566,6 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
574 566
575 return irq; 567 return irq;
576} 568}
577EXPORT_SYMBOL_GPL(acpi_register_gsi);
578
579void acpi_unregister_gsi(u32 gsi)
580{
581}
582EXPORT_SYMBOL_GPL(acpi_unregister_gsi);
583 569
584void __init acpi_set_irq_model_pic(void) 570void __init acpi_set_irq_model_pic(void)
585{ 571{
@@ -601,7 +587,7 @@ void __init acpi_set_irq_model_ioapic(void)
601#ifdef CONFIG_ACPI_HOTPLUG_CPU 587#ifdef CONFIG_ACPI_HOTPLUG_CPU
602#include <acpi/processor.h> 588#include <acpi/processor.h>
603 589
604static void __cpuinit acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) 590static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
605{ 591{
606#ifdef CONFIG_ACPI_NUMA 592#ifdef CONFIG_ACPI_NUMA
607 int nid; 593 int nid;
@@ -650,7 +636,6 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
650 kfree(buffer.pointer); 636 kfree(buffer.pointer);
651 buffer.length = ACPI_ALLOCATE_BUFFER; 637 buffer.length = ACPI_ALLOCATE_BUFFER;
652 buffer.pointer = NULL; 638 buffer.pointer = NULL;
653 lapic = NULL;
654 639
655 if (!alloc_cpumask_var(&tmp_map, GFP_KERNEL)) 640 if (!alloc_cpumask_var(&tmp_map, GFP_KERNEL))
656 goto out; 641 goto out;
@@ -659,10 +644,10 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
659 goto free_tmp_map; 644 goto free_tmp_map;
660 645
661 cpumask_copy(tmp_map, cpu_present_mask); 646 cpumask_copy(tmp_map, cpu_present_mask);
662 acpi_register_lapic(physid, ACPI_MADT_ENABLED); 647 acpi_register_lapic(physid, lapic->lapic_flags & ACPI_MADT_ENABLED);
663 648
664 /* 649 /*
665 * If acpi_register_lapic successfully generates a new logical cpu 650 * If mp_register_lapic successfully generates a new logical cpu
666 * number, then the following will get us exactly what was mapped 651 * number, then the following will get us exactly what was mapped
667 */ 652 */
668 cpumask_andnot(new_map, cpu_present_mask, tmp_map); 653 cpumask_andnot(new_map, cpu_present_mask, tmp_map);
@@ -998,7 +983,7 @@ void __init mp_config_acpi_legacy_irqs(void)
998 int i; 983 int i;
999 struct mpc_intsrc mp_irq; 984 struct mpc_intsrc mp_irq;
1000 985
1001#ifdef CONFIG_EISA 986#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
1002 /* 987 /*
1003 * Fabricate the legacy ISA bus (bus #31). 988 * Fabricate the legacy ISA bus (bus #31).
1004 */ 989 */
@@ -1342,12 +1327,17 @@ static int __init dmi_disable_acpi(const struct dmi_system_id *d)
1342} 1327}
1343 1328
1344/* 1329/*
1345 * Force ignoring BIOS IRQ0 override 1330 * Force ignoring BIOS IRQ0 pin2 override
1346 */ 1331 */
1347static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) 1332static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
1348{ 1333{
1334 /*
1335 * The ati_ixp4x0_rev() early PCI quirk should have set
1336 * the acpi_skip_timer_override flag already:
1337 */
1349 if (!acpi_skip_timer_override) { 1338 if (!acpi_skip_timer_override) {
1350 pr_notice("%s detected: Ignoring BIOS IRQ0 override\n", 1339 WARN(1, KERN_ERR "ati_ixp4x0 quirk not complete.\n");
1340 pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n",
1351 d->ident); 1341 d->ident);
1352 acpi_skip_timer_override = 1; 1342 acpi_skip_timer_override = 1;
1353 } 1343 }
@@ -1441,7 +1431,7 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
1441 * is enabled. This input is incorrectly designated the 1431 * is enabled. This input is incorrectly designated the
1442 * ISA IRQ 0 via an interrupt source override even though 1432 * ISA IRQ 0 via an interrupt source override even though
1443 * it is wired to the output of the master 8259A and INTIN0 1433 * it is wired to the output of the master 8259A and INTIN0
1444 * is not connected at all. Force ignoring BIOS IRQ0 1434 * is not connected at all. Force ignoring BIOS IRQ0 pin2
1445 * override in that cases. 1435 * override in that cases.
1446 */ 1436 */
1447 { 1437 {
@@ -1476,14 +1466,6 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
1476 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"), 1466 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"),
1477 }, 1467 },
1478 }, 1468 },
1479 {
1480 .callback = dmi_ignore_irq0_timer_override,
1481 .ident = "FUJITSU SIEMENS",
1482 .matches = {
1483 DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"),
1484 DMI_MATCH(DMI_PRODUCT_NAME, "AMILO PRO V2030"),
1485 },
1486 },
1487 {} 1469 {}
1488}; 1470};
1489 1471
@@ -1706,9 +1688,3 @@ int __acpi_release_global_lock(unsigned int *lock)
1706 } while (unlikely (val != old)); 1688 } while (unlikely (val != old));
1707 return old & 0x1; 1689 return old & 0x1;
1708} 1690}
1709
1710void __init arch_reserve_mem_area(acpi_physical_address addr, size_t size)
1711{
1712 e820_add_region(addr, size, E820_ACPI);
1713 update_e820();
1714}
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index d2b7f27781b..f50e7fb2a20 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -14,7 +14,6 @@
14#include <acpi/processor.h> 14#include <acpi/processor.h>
15#include <asm/acpi.h> 15#include <asm/acpi.h>
16#include <asm/mwait.h> 16#include <asm/mwait.h>
17#include <asm/special_insns.h>
18 17
19/* 18/*
20 * Initialize bm_flags based on the CPU cache properties 19 * Initialize bm_flags based on the CPU cache properties
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index d5e0d717005..103b6ab368d 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -14,9 +14,8 @@
14#include <asm/desc.h> 14#include <asm/desc.h>
15#include <asm/pgtable.h> 15#include <asm/pgtable.h>
16#include <asm/cacheflush.h> 16#include <asm/cacheflush.h>
17#include <asm/realmode.h>
18 17
19#include "../../realmode/rm/wakeup.h" 18#include "realmode/wakeup.h"
20#include "sleep.h" 19#include "sleep.h"
21 20
22unsigned long acpi_realmode_flags; 21unsigned long acpi_realmode_flags;
@@ -33,9 +32,13 @@ static char temp_stack[4096];
33 */ 32 */
34int acpi_suspend_lowlevel(void) 33int acpi_suspend_lowlevel(void)
35{ 34{
36 struct wakeup_header *header = 35 struct wakeup_header *header;
37 (struct wakeup_header *) __va(real_mode_header->wakeup_header); 36 /* address in low memory of the wakeup routine. */
37 char *acpi_realmode;
38 38
39 acpi_realmode = TRAMPOLINE_SYM(acpi_wakeup_code);
40
41 header = (struct wakeup_header *)(acpi_realmode + WAKEUP_HEADER_OFFSET);
39 if (header->signature != WAKEUP_HEADER_SIGNATURE) { 42 if (header->signature != WAKEUP_HEADER_SIGNATURE) {
40 printk(KERN_ERR "wakeup header does not match\n"); 43 printk(KERN_ERR "wakeup header does not match\n");
41 return -EINVAL; 44 return -EINVAL;
@@ -43,22 +46,38 @@ int acpi_suspend_lowlevel(void)
43 46
44 header->video_mode = saved_video_mode; 47 header->video_mode = saved_video_mode;
45 48
46 header->pmode_behavior = 0; 49 header->wakeup_jmp_seg = acpi_wakeup_address >> 4;
50
51 /*
52 * Set up the wakeup GDT. We set these up as Big Real Mode,
53 * that is, with limits set to 4 GB. At least the Lenovo
54 * Thinkpad X61 is known to need this for the video BIOS
55 * initialization quirk to work; this is likely to also
56 * be the case for other laptops or integrated video devices.
57 */
58
59 /* GDT[0]: GDT self-pointer */
60 header->wakeup_gdt[0] =
61 (u64)(sizeof(header->wakeup_gdt) - 1) +
62 ((u64)__pa(&header->wakeup_gdt) << 16);
63 /* GDT[1]: big real mode-like code segment */
64 header->wakeup_gdt[1] =
65 GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff);
66 /* GDT[2]: big real mode-like data segment */
67 header->wakeup_gdt[2] =
68 GDT_ENTRY(0x8093, acpi_wakeup_address, 0xfffff);
47 69
48#ifndef CONFIG_64BIT 70#ifndef CONFIG_64BIT
49 store_gdt((struct desc_ptr *)&header->pmode_gdt); 71 store_gdt((struct desc_ptr *)&header->pmode_gdt);
50 72
51 if (!rdmsr_safe(MSR_EFER, 73 if (rdmsr_safe(MSR_EFER, &header->pmode_efer_low,
52 &header->pmode_efer_low, 74 &header->pmode_efer_high))
53 &header->pmode_efer_high)) 75 header->pmode_efer_low = header->pmode_efer_high = 0;
54 header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_EFER);
55#endif /* !CONFIG_64BIT */ 76#endif /* !CONFIG_64BIT */
56 77
57 header->pmode_cr0 = read_cr0(); 78 header->pmode_cr0 = read_cr0();
58 if (__this_cpu_read(cpu_info.cpuid_level) >= 0) { 79 header->pmode_cr4 = read_cr4_safe();
59 header->pmode_cr4 = read_cr4(); 80 header->pmode_behavior = 0;
60 header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_CR4);
61 }
62 if (!rdmsr_safe(MSR_IA32_MISC_ENABLE, 81 if (!rdmsr_safe(MSR_IA32_MISC_ENABLE,
63 &header->pmode_misc_en_low, 82 &header->pmode_misc_en_low,
64 &header->pmode_misc_en_high)) 83 &header->pmode_misc_en_high))
@@ -72,6 +91,7 @@ int acpi_suspend_lowlevel(void)
72 header->pmode_cr3 = (u32)__pa(&initial_page_table); 91 header->pmode_cr3 = (u32)__pa(&initial_page_table);
73 saved_magic = 0x12345678; 92 saved_magic = 0x12345678;
74#else /* CONFIG_64BIT */ 93#else /* CONFIG_64BIT */
94 header->trampoline_segment = trampoline_address() >> 4;
75#ifdef CONFIG_SMP 95#ifdef CONFIG_SMP
76 stack_start = (unsigned long)temp_stack + sizeof(temp_stack); 96 stack_start = (unsigned long)temp_stack + sizeof(temp_stack);
77 early_gdt_descr.address = 97 early_gdt_descr.address =
@@ -101,8 +121,6 @@ static int __init acpi_sleep_setup(char *str)
101#endif 121#endif
102 if (strncmp(str, "nonvs", 5) == 0) 122 if (strncmp(str, "nonvs", 5) == 0)
103 acpi_nvs_nosave(); 123 acpi_nvs_nosave();
104 if (strncmp(str, "nonvs_s3", 8) == 0)
105 acpi_nvs_nosave_s3();
106 if (strncmp(str, "old_ordering", 12) == 0) 124 if (strncmp(str, "old_ordering", 12) == 0)
107 acpi_old_suspend_ordering(); 125 acpi_old_suspend_ordering();
108 str = strchr(str, ','); 126 str = strchr(str, ',');
diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h
index 67f59f8c695..416d4be13fe 100644
--- a/arch/x86/kernel/acpi/sleep.h
+++ b/arch/x86/kernel/acpi/sleep.h
@@ -2,15 +2,13 @@
2 * Variables and functions used by the code in sleep.c 2 * Variables and functions used by the code in sleep.c
3 */ 3 */
4 4
5#include <asm/realmode.h> 5#include <asm/trampoline.h>
6 6
7extern unsigned long saved_video_mode; 7extern unsigned long saved_video_mode;
8extern long saved_magic; 8extern long saved_magic;
9 9
10extern int wakeup_pmode_return; 10extern int wakeup_pmode_return;
11 11
12extern u8 wake_sleep_flags;
13
14extern unsigned long acpi_copy_wakeup_routine(unsigned long); 12extern unsigned long acpi_copy_wakeup_routine(unsigned long);
15extern void wakeup_long64(void); 13extern void wakeup_long64(void);
16 14
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index ef5ccca79a6..c6382281624 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -1,5 +1,3 @@
1#define pr_fmt(fmt) "SMP alternatives: " fmt
2
3#include <linux/module.h> 1#include <linux/module.h>
4#include <linux/sched.h> 2#include <linux/sched.h>
5#include <linux/mutex.h> 3#include <linux/mutex.h>
@@ -23,6 +21,19 @@
23 21
24#define MAX_PATCH_LEN (255-1) 22#define MAX_PATCH_LEN (255-1)
25 23
24#ifdef CONFIG_HOTPLUG_CPU
25static int smp_alt_once;
26
27static int __init bootonly(char *str)
28{
29 smp_alt_once = 1;
30 return 1;
31}
32__setup("smp-alt-boot", bootonly);
33#else
34#define smp_alt_once 1
35#endif
36
26static int __initdata_or_module debug_alternative; 37static int __initdata_or_module debug_alternative;
27 38
28static int __init debug_alt(char *str) 39static int __init debug_alt(char *str)
@@ -52,11 +63,8 @@ static int __init setup_noreplace_paravirt(char *str)
52__setup("noreplace-paravirt", setup_noreplace_paravirt); 63__setup("noreplace-paravirt", setup_noreplace_paravirt);
53#endif 64#endif
54 65
55#define DPRINTK(fmt, ...) \ 66#define DPRINTK(fmt, args...) if (debug_alternative) \
56do { \ 67 printk(KERN_DEBUG fmt, args)
57 if (debug_alternative) \
58 printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
59} while (0)
60 68
61/* 69/*
62 * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes 70 * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes
@@ -152,7 +160,7 @@ static const unsigned char * const k7_nops[ASM_NOP_MAX+2] =
152#endif 160#endif
153 161
154#ifdef P6_NOP1 162#ifdef P6_NOP1
155static const unsigned char p6nops[] = 163static const unsigned char __initconst_or_module p6nops[] =
156{ 164{
157 P6_NOP1, 165 P6_NOP1,
158 P6_NOP2, 166 P6_NOP2,
@@ -211,7 +219,7 @@ void __init arch_init_ideal_nops(void)
211 ideal_nops = intel_nops; 219 ideal_nops = intel_nops;
212#endif 220#endif
213 } 221 }
214 break; 222
215 default: 223 default:
216#ifdef CONFIG_X86_64 224#ifdef CONFIG_X86_64
217 ideal_nops = k8_nops; 225 ideal_nops = k8_nops;
@@ -304,7 +312,7 @@ static void alternatives_smp_lock(const s32 *start, const s32 *end,
304 /* turn DS segment override prefix into lock prefix */ 312 /* turn DS segment override prefix into lock prefix */
305 if (*ptr == 0x3e) 313 if (*ptr == 0x3e)
306 text_poke(ptr, ((unsigned char []){0xf0}), 1); 314 text_poke(ptr, ((unsigned char []){0xf0}), 1);
307 } 315 };
308 mutex_unlock(&text_mutex); 316 mutex_unlock(&text_mutex);
309} 317}
310 318
@@ -313,6 +321,9 @@ static void alternatives_smp_unlock(const s32 *start, const s32 *end,
313{ 321{
314 const s32 *poff; 322 const s32 *poff;
315 323
324 if (noreplace_smp)
325 return;
326
316 mutex_lock(&text_mutex); 327 mutex_lock(&text_mutex);
317 for (poff = start; poff < end; poff++) { 328 for (poff = start; poff < end; poff++) {
318 u8 *ptr = (u8 *)poff + *poff; 329 u8 *ptr = (u8 *)poff + *poff;
@@ -322,7 +333,7 @@ static void alternatives_smp_unlock(const s32 *start, const s32 *end,
322 /* turn lock prefix into DS segment override prefix */ 333 /* turn lock prefix into DS segment override prefix */
323 if (*ptr == 0xf0) 334 if (*ptr == 0xf0)
324 text_poke(ptr, ((unsigned char []){0x3E}), 1); 335 text_poke(ptr, ((unsigned char []){0x3E}), 1);
325 } 336 };
326 mutex_unlock(&text_mutex); 337 mutex_unlock(&text_mutex);
327} 338}
328 339
@@ -343,7 +354,7 @@ struct smp_alt_module {
343}; 354};
344static LIST_HEAD(smp_alt_modules); 355static LIST_HEAD(smp_alt_modules);
345static DEFINE_MUTEX(smp_alt); 356static DEFINE_MUTEX(smp_alt);
346static bool uniproc_patched = false; /* protected by smp_alt */ 357static int smp_mode = 1; /* protected by smp_alt */
347 358
348void __init_or_module alternatives_smp_module_add(struct module *mod, 359void __init_or_module alternatives_smp_module_add(struct module *mod,
349 char *name, 360 char *name,
@@ -352,18 +363,19 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,
352{ 363{
353 struct smp_alt_module *smp; 364 struct smp_alt_module *smp;
354 365
355 mutex_lock(&smp_alt); 366 if (noreplace_smp)
356 if (!uniproc_patched) 367 return;
357 goto unlock;
358 368
359 if (num_possible_cpus() == 1) 369 if (smp_alt_once) {
360 /* Don't bother remembering, we'll never have to undo it. */ 370 if (boot_cpu_has(X86_FEATURE_UP))
361 goto smp_unlock; 371 alternatives_smp_unlock(locks, locks_end,
372 text, text_end);
373 return;
374 }
362 375
363 smp = kzalloc(sizeof(*smp), GFP_KERNEL); 376 smp = kzalloc(sizeof(*smp), GFP_KERNEL);
364 if (NULL == smp) 377 if (NULL == smp)
365 /* we'll run the (safe but slow) SMP code then ... */ 378 return; /* we'll run the (safe but slow) SMP code then ... */
366 goto unlock;
367 379
368 smp->mod = mod; 380 smp->mod = mod;
369 smp->name = name; 381 smp->name = name;
@@ -375,10 +387,11 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,
375 __func__, smp->locks, smp->locks_end, 387 __func__, smp->locks, smp->locks_end,
376 smp->text, smp->text_end, smp->name); 388 smp->text, smp->text_end, smp->name);
377 389
390 mutex_lock(&smp_alt);
378 list_add_tail(&smp->next, &smp_alt_modules); 391 list_add_tail(&smp->next, &smp_alt_modules);
379smp_unlock: 392 if (boot_cpu_has(X86_FEATURE_UP))
380 alternatives_smp_unlock(locks, locks_end, text, text_end); 393 alternatives_smp_unlock(smp->locks, smp->locks_end,
381unlock: 394 smp->text, smp->text_end);
382 mutex_unlock(&smp_alt); 395 mutex_unlock(&smp_alt);
383} 396}
384 397
@@ -386,18 +399,24 @@ void __init_or_module alternatives_smp_module_del(struct module *mod)
386{ 399{
387 struct smp_alt_module *item; 400 struct smp_alt_module *item;
388 401
402 if (smp_alt_once || noreplace_smp)
403 return;
404
389 mutex_lock(&smp_alt); 405 mutex_lock(&smp_alt);
390 list_for_each_entry(item, &smp_alt_modules, next) { 406 list_for_each_entry(item, &smp_alt_modules, next) {
391 if (mod != item->mod) 407 if (mod != item->mod)
392 continue; 408 continue;
393 list_del(&item->next); 409 list_del(&item->next);
410 mutex_unlock(&smp_alt);
411 DPRINTK("%s: %s\n", __func__, item->name);
394 kfree(item); 412 kfree(item);
395 break; 413 return;
396 } 414 }
397 mutex_unlock(&smp_alt); 415 mutex_unlock(&smp_alt);
398} 416}
399 417
400void alternatives_enable_smp(void) 418bool skip_smp_alternatives;
419void alternatives_smp_switch(int smp)
401{ 420{
402 struct smp_alt_module *mod; 421 struct smp_alt_module *mod;
403 422
@@ -409,24 +428,37 @@ void alternatives_enable_smp(void)
409 * If this still occurs then you should see a hang 428 * If this still occurs then you should see a hang
410 * or crash shortly after this line: 429 * or crash shortly after this line:
411 */ 430 */
412 pr_info("lockdep: fixing up alternatives\n"); 431 printk("lockdep: fixing up alternatives.\n");
413#endif 432#endif
414 433
415 /* Why bother if there are no other CPUs? */ 434 if (noreplace_smp || smp_alt_once || skip_smp_alternatives)
416 BUG_ON(num_possible_cpus() == 1); 435 return;
436 BUG_ON(!smp && (num_online_cpus() > 1));
417 437
418 mutex_lock(&smp_alt); 438 mutex_lock(&smp_alt);
419 439
420 if (uniproc_patched) { 440 /*
421 pr_info("switching to SMP code\n"); 441 * Avoid unnecessary switches because it forces JIT based VMs to
422 BUG_ON(num_online_cpus() != 1); 442 * throw away all cached translations, which can be quite costly.
443 */
444 if (smp == smp_mode) {
445 /* nothing */
446 } else if (smp) {
447 printk(KERN_INFO "SMP alternatives: switching to SMP code\n");
423 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); 448 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
424 clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP); 449 clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
425 list_for_each_entry(mod, &smp_alt_modules, next) 450 list_for_each_entry(mod, &smp_alt_modules, next)
426 alternatives_smp_lock(mod->locks, mod->locks_end, 451 alternatives_smp_lock(mod->locks, mod->locks_end,
427 mod->text, mod->text_end); 452 mod->text, mod->text_end);
428 uniproc_patched = false; 453 } else {
454 printk(KERN_INFO "SMP alternatives: switching to UP code\n");
455 set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
456 set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
457 list_for_each_entry(mod, &smp_alt_modules, next)
458 alternatives_smp_unlock(mod->locks, mod->locks_end,
459 mod->text, mod->text_end);
429 } 460 }
461 smp_mode = smp;
430 mutex_unlock(&smp_alt); 462 mutex_unlock(&smp_alt);
431} 463}
432 464
@@ -503,22 +535,40 @@ void __init alternative_instructions(void)
503 535
504 apply_alternatives(__alt_instructions, __alt_instructions_end); 536 apply_alternatives(__alt_instructions, __alt_instructions_end);
505 537
538 /* switch to patch-once-at-boottime-only mode and free the
539 * tables in case we know the number of CPUs will never ever
540 * change */
541#ifdef CONFIG_HOTPLUG_CPU
542 if (num_possible_cpus() < 2)
543 smp_alt_once = 1;
544#endif
545
506#ifdef CONFIG_SMP 546#ifdef CONFIG_SMP
507 /* Patch to UP if other cpus not imminent. */ 547 if (smp_alt_once) {
508 if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) { 548 if (1 == num_possible_cpus()) {
509 uniproc_patched = true; 549 printk(KERN_INFO "SMP alternatives: switching to UP code\n");
550 set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
551 set_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
552
553 alternatives_smp_unlock(__smp_locks, __smp_locks_end,
554 _text, _etext);
555 }
556 } else {
510 alternatives_smp_module_add(NULL, "core kernel", 557 alternatives_smp_module_add(NULL, "core kernel",
511 __smp_locks, __smp_locks_end, 558 __smp_locks, __smp_locks_end,
512 _text, _etext); 559 _text, _etext);
560
561 /* Only switch to UP mode if we don't immediately boot others */
562 if (num_present_cpus() == 1 || setup_max_cpus <= 1)
563 alternatives_smp_switch(0);
513 } 564 }
565#endif
566 apply_paravirt(__parainstructions, __parainstructions_end);
514 567
515 if (!uniproc_patched || num_possible_cpus() == 1) 568 if (smp_alt_once)
516 free_init_pages("SMP alternatives", 569 free_init_pages("SMP alternatives",
517 (unsigned long)__smp_locks, 570 (unsigned long)__smp_locks,
518 (unsigned long)__smp_locks_end); 571 (unsigned long)__smp_locks_end);
519#endif
520
521 apply_paravirt(__parainstructions, __parainstructions_end);
522 572
523 restart_nmi(); 573 restart_nmi();
524} 574}
@@ -614,7 +664,7 @@ static int __kprobes stop_machine_text_poke(void *data)
614 struct text_poke_param *p; 664 struct text_poke_param *p;
615 int i; 665 int i;
616 666
617 if (atomic_xchg(&stop_machine_first, 0)) { 667 if (atomic_dec_and_test(&stop_machine_first)) {
618 for (i = 0; i < tpp->nparams; i++) { 668 for (i = 0; i < tpp->nparams; i++) {
619 p = &tpp->params[i]; 669 p = &tpp->params[i];
620 text_poke(p->addr, p->opcode, p->len); 670 text_poke(p->addr, p->opcode, p->len);
@@ -688,5 +738,5 @@ void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n)
688 738
689 atomic_set(&stop_machine_first, 1); 739 atomic_set(&stop_machine_first, 1);
690 wrote_text = 0; 740 wrote_text = 0;
691 __stop_machine(stop_machine_text_poke, (void *)&tpp, cpu_online_mask); 741 __stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
692} 742}
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index e66311200cb..8a439d364b9 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -5,7 +5,7 @@
5 * This allows to use PCI devices that only support 32bit addresses on systems 5 * This allows to use PCI devices that only support 32bit addresses on systems
6 * with more than 4GB. 6 * with more than 4GB.
7 * 7 *
8 * See Documentation/DMA-API-HOWTO.txt for the interface specification. 8 * See Documentation/PCI/PCI-DMA-mapping.txt for the interface specification.
9 * 9 *
10 * Copyright 2002 Andi Kleen, SuSE Labs. 10 * Copyright 2002 Andi Kleen, SuSE Labs.
11 * Subject to the GNU General Public License v2 only. 11 * Subject to the GNU General Public License v2 only.
@@ -477,7 +477,7 @@ error:
477/* allocate and map a coherent mapping */ 477/* allocate and map a coherent mapping */
478static void * 478static void *
479gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, 479gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
480 gfp_t flag, struct dma_attrs *attrs) 480 gfp_t flag)
481{ 481{
482 dma_addr_t paddr; 482 dma_addr_t paddr;
483 unsigned long align_mask; 483 unsigned long align_mask;
@@ -500,8 +500,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
500 } 500 }
501 __free_pages(page, get_order(size)); 501 __free_pages(page, get_order(size));
502 } else 502 } else
503 return dma_generic_alloc_coherent(dev, size, dma_addr, flag, 503 return dma_generic_alloc_coherent(dev, size, dma_addr, flag);
504 attrs);
505 504
506 return NULL; 505 return NULL;
507} 506}
@@ -509,7 +508,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
509/* free a coherent mapping */ 508/* free a coherent mapping */
510static void 509static void
511gart_free_coherent(struct device *dev, size_t size, void *vaddr, 510gart_free_coherent(struct device *dev, size_t size, void *vaddr,
512 dma_addr_t dma_addr, struct dma_attrs *attrs) 511 dma_addr_t dma_addr)
513{ 512{
514 gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, NULL); 513 gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, NULL);
515 free_pages((unsigned long)vaddr, get_order(size)); 514 free_pages((unsigned long)vaddr, get_order(size));
@@ -701,8 +700,8 @@ static struct dma_map_ops gart_dma_ops = {
701 .unmap_sg = gart_unmap_sg, 700 .unmap_sg = gart_unmap_sg,
702 .map_page = gart_map_page, 701 .map_page = gart_map_page,
703 .unmap_page = gart_unmap_page, 702 .unmap_page = gart_unmap_page,
704 .alloc = gart_alloc_coherent, 703 .alloc_coherent = gart_alloc_coherent,
705 .free = gart_free_coherent, 704 .free_coherent = gart_free_coherent,
706 .mapping_error = gart_mapping_error, 705 .mapping_error = gart_mapping_error,
707}; 706};
708 707
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index aadf3359e2a..bae1efe6d51 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -2,9 +2,6 @@
2 * Shared support code for AMD K8 northbridges and derivates. 2 * Shared support code for AMD K8 northbridges and derivates.
3 * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2. 3 * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2.
4 */ 4 */
5
6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7
8#include <linux/types.h> 5#include <linux/types.h>
9#include <linux/slab.h> 6#include <linux/slab.h>
10#include <linux/init.h> 7#include <linux/init.h>
@@ -19,7 +16,6 @@ const struct pci_device_id amd_nb_misc_ids[] = {
19 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, 16 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
20 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, 17 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
21 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) }, 18 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) },
22 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) },
23 {} 19 {}
24}; 20};
25EXPORT_SYMBOL(amd_nb_misc_ids); 21EXPORT_SYMBOL(amd_nb_misc_ids);
@@ -158,14 +154,16 @@ int amd_get_subcaches(int cpu)
158{ 154{
159 struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link; 155 struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link;
160 unsigned int mask; 156 unsigned int mask;
161 int cuid; 157 int cuid = 0;
162 158
163 if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) 159 if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
164 return 0; 160 return 0;
165 161
166 pci_read_config_dword(link, 0x1d4, &mask); 162 pci_read_config_dword(link, 0x1d4, &mask);
167 163
164#ifdef CONFIG_SMP
168 cuid = cpu_data(cpu).compute_unit_id; 165 cuid = cpu_data(cpu).compute_unit_id;
166#endif
169 return (mask >> (4 * cuid)) & 0xf; 167 return (mask >> (4 * cuid)) & 0xf;
170} 168}
171 169
@@ -174,7 +172,7 @@ int amd_set_subcaches(int cpu, int mask)
174 static unsigned int reset, ban; 172 static unsigned int reset, ban;
175 struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu)); 173 struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu));
176 unsigned int reg; 174 unsigned int reg;
177 int cuid; 175 int cuid = 0;
178 176
179 if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING) || mask > 0xf) 177 if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING) || mask > 0xf)
180 return -EINVAL; 178 return -EINVAL;
@@ -192,7 +190,9 @@ int amd_set_subcaches(int cpu, int mask)
192 pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000); 190 pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000);
193 } 191 }
194 192
193#ifdef CONFIG_SMP
195 cuid = cpu_data(cpu).compute_unit_id; 194 cuid = cpu_data(cpu).compute_unit_id;
195#endif
196 mask <<= 4 * cuid; 196 mask <<= 4 * cuid;
197 mask |= (0xf ^ (1 << cuid)) << 26; 197 mask |= (0xf ^ (1 << cuid)) << 26;
198 198
@@ -262,7 +262,7 @@ void amd_flush_garts(void)
262 } 262 }
263 spin_unlock_irqrestore(&gart_lock, flags); 263 spin_unlock_irqrestore(&gart_lock, flags);
264 if (!flushed) 264 if (!flushed)
265 pr_notice("nothing to flush?\n"); 265 printk("nothing to flush?\n");
266} 266}
267EXPORT_SYMBOL_GPL(amd_flush_garts); 267EXPORT_SYMBOL_GPL(amd_flush_garts);
268 268
@@ -273,10 +273,11 @@ static __init int init_amd_nbs(void)
273 err = amd_cache_northbridges(); 273 err = amd_cache_northbridges();
274 274
275 if (err < 0) 275 if (err < 0)
276 pr_notice("Cannot enumerate AMD northbridges\n"); 276 printk(KERN_NOTICE "AMD NB: Cannot enumerate AMD northbridges.\n");
277 277
278 if (amd_cache_gart() < 0) 278 if (amd_cache_gart() < 0)
279 pr_notice("Cannot initialize GART flush words, GART support disabled\n"); 279 printk(KERN_NOTICE "AMD NB: Cannot initialize GART flush words, "
280 "GART support disabled.\n");
280 281
281 return err; 282 return err;
282} 283}
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index d5fd66f0d4c..3d2661ca654 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -20,6 +20,7 @@
20#include <linux/bitops.h> 20#include <linux/bitops.h>
21#include <linux/ioport.h> 21#include <linux/ioport.h>
22#include <linux/suspend.h> 22#include <linux/suspend.h>
23#include <linux/kmemleak.h>
23#include <asm/e820.h> 24#include <asm/e820.h>
24#include <asm/io.h> 25#include <asm/io.h>
25#include <asm/iommu.h> 26#include <asm/iommu.h>
@@ -87,13 +88,18 @@ static u32 __init allocate_aperture(void)
87 */ 88 */
88 addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR, 89 addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR,
89 aper_size, aper_size); 90 aper_size, aper_size);
90 if (!addr || addr + aper_size > GART_MAX_ADDR) { 91 if (addr == MEMBLOCK_ERROR || addr + aper_size > GART_MAX_ADDR) {
91 printk(KERN_ERR 92 printk(KERN_ERR
92 "Cannot allocate aperture memory hole (%lx,%uK)\n", 93 "Cannot allocate aperture memory hole (%lx,%uK)\n",
93 addr, aper_size>>10); 94 addr, aper_size>>10);
94 return 0; 95 return 0;
95 } 96 }
96 memblock_reserve(addr, aper_size); 97 memblock_x86_reserve_range(addr, addr + aper_size, "aperture64");
98 /*
99 * Kmemleak should not scan this block as it may not be mapped via the
100 * kernel direct mapping.
101 */
102 kmemleak_ignore(phys_to_virt(addr));
97 printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n", 103 printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n",
98 aper_size >> 10, addr); 104 aper_size >> 10, addr);
99 insert_aperture_resource((u32)addr, aper_size); 105 insert_aperture_resource((u32)addr, aper_size);
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 0ae0323b1f9..767fd04f284 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -10,7 +10,6 @@ obj-$(CONFIG_SMP) += ipi.o
10 10
11ifeq ($(CONFIG_X86_64),y) 11ifeq ($(CONFIG_X86_64),y)
12# APIC probe will depend on the listing order here 12# APIC probe will depend on the listing order here
13obj-$(CONFIG_X86_NUMACHIP) += apic_numachip.o
14obj-$(CONFIG_X86_UV) += x2apic_uv_x.o 13obj-$(CONFIG_X86_UV) += x2apic_uv_x.o
15obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o 14obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o
16obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o 15obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index b994cc84aa7..a2fd72e0ab3 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -35,7 +35,6 @@
35#include <linux/smp.h> 35#include <linux/smp.h>
36#include <linux/mm.h> 36#include <linux/mm.h>
37 37
38#include <asm/irq_remapping.h>
39#include <asm/perf_event.h> 38#include <asm/perf_event.h>
40#include <asm/x86_init.h> 39#include <asm/x86_init.h>
41#include <asm/pgalloc.h> 40#include <asm/pgalloc.h>
@@ -75,8 +74,8 @@ physid_mask_t phys_cpu_present_map;
75/* 74/*
76 * Map cpu index to physical APIC ID 75 * Map cpu index to physical APIC ID
77 */ 76 */
78DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID); 77DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
79DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid, BAD_APICID); 78DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
80EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); 79EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
81EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); 80EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
82 81
@@ -88,8 +87,23 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
88 * used for the mapping. This is where the behaviors of x86_64 and 32 87 * used for the mapping. This is where the behaviors of x86_64 and 32
89 * actually diverge. Let's keep it ugly for now. 88 * actually diverge. Let's keep it ugly for now.
90 */ 89 */
91DEFINE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid, BAD_APICID); 90DEFINE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid, BAD_APICID);
92 91
92/*
93 * Knob to control our willingness to enable the local APIC.
94 *
95 * +1=force-enable
96 */
97static int force_enable_local_apic __initdata;
98/*
99 * APIC command line parameters
100 */
101static int __init parse_lapic(char *arg)
102{
103 force_enable_local_apic = 1;
104 return 0;
105}
106early_param("lapic", parse_lapic);
93/* Local APIC was disabled by the BIOS and enabled by the kernel */ 107/* Local APIC was disabled by the BIOS and enabled by the kernel */
94static int enabled_via_apicbase; 108static int enabled_via_apicbase;
95 109
@@ -118,25 +132,6 @@ static inline void imcr_apic_to_pic(void)
118} 132}
119#endif 133#endif
120 134
121/*
122 * Knob to control our willingness to enable the local APIC.
123 *
124 * +1=force-enable
125 */
126static int force_enable_local_apic __initdata;
127/*
128 * APIC command line parameters
129 */
130static int __init parse_lapic(char *arg)
131{
132 if (config_enabled(CONFIG_X86_32) && !arg)
133 force_enable_local_apic = 1;
134 else if (!strncmp(arg, "notscdeadline", 13))
135 setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
136 return 0;
137}
138early_param("lapic", parse_lapic);
139
140#ifdef CONFIG_X86_64 135#ifdef CONFIG_X86_64
141static int apic_calibrate_pmtmr __initdata; 136static int apic_calibrate_pmtmr __initdata;
142static __init int setup_apicpmtimer(char *s) 137static __init int setup_apicpmtimer(char *s)
@@ -151,26 +146,16 @@ __setup("apicpmtimer", setup_apicpmtimer);
151int x2apic_mode; 146int x2apic_mode;
152#ifdef CONFIG_X86_X2APIC 147#ifdef CONFIG_X86_X2APIC
153/* x2apic enabled before OS handover */ 148/* x2apic enabled before OS handover */
154int x2apic_preenabled; 149static int x2apic_preenabled;
155static int x2apic_disabled;
156static int nox2apic;
157static __init int setup_nox2apic(char *str) 150static __init int setup_nox2apic(char *str)
158{ 151{
159 if (x2apic_enabled()) { 152 if (x2apic_enabled()) {
160 int apicid = native_apic_msr_read(APIC_ID); 153 pr_warning("Bios already enabled x2apic, "
161 154 "can't enforce nox2apic");
162 if (apicid >= 255) { 155 return 0;
163 pr_warning("Apicid: %08x, cannot enforce nox2apic\n", 156 }
164 apicid);
165 return 0;
166 }
167
168 pr_warning("x2apic already enabled. will disable it\n");
169 } else
170 setup_clear_cpu_cap(X86_FEATURE_X2APIC);
171
172 nox2apic = 1;
173 157
158 setup_clear_cpu_cap(X86_FEATURE_X2APIC);
174 return 0; 159 return 0;
175} 160}
176early_param("nox2apic", setup_nox2apic); 161early_param("nox2apic", setup_nox2apic);
@@ -201,7 +186,7 @@ static struct resource lapic_resource = {
201 .flags = IORESOURCE_MEM | IORESOURCE_BUSY, 186 .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
202}; 187};
203 188
204unsigned int lapic_timer_frequency = 0; 189static unsigned int calibration_result;
205 190
206static void apic_pm_activate(void); 191static void apic_pm_activate(void);
207 192
@@ -265,7 +250,6 @@ u32 native_safe_apic_wait_icr_idle(void)
265 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; 250 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
266 if (!send_status) 251 if (!send_status)
267 break; 252 break;
268 inc_irq_stat(icr_read_retry_count);
269 udelay(100); 253 udelay(100);
270 } while (timeout++ < 1000); 254 } while (timeout++ < 1000);
271 255
@@ -319,7 +303,6 @@ int lapic_get_maxlvt(void)
319 303
320/* Clock divisor */ 304/* Clock divisor */
321#define APIC_DIVISOR 16 305#define APIC_DIVISOR 16
322#define TSC_DIVISOR 32
323 306
324/* 307/*
325 * This function sets up the local APIC timer, with a timeout of 308 * This function sets up the local APIC timer, with a timeout of
@@ -338,9 +321,6 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
338 lvtt_value = LOCAL_TIMER_VECTOR; 321 lvtt_value = LOCAL_TIMER_VECTOR;
339 if (!oneshot) 322 if (!oneshot)
340 lvtt_value |= APIC_LVT_TIMER_PERIODIC; 323 lvtt_value |= APIC_LVT_TIMER_PERIODIC;
341 else if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
342 lvtt_value |= APIC_LVT_TIMER_TSCDEADLINE;
343
344 if (!lapic_is_integrated()) 324 if (!lapic_is_integrated())
345 lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); 325 lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
346 326
@@ -349,11 +329,6 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
349 329
350 apic_write(APIC_LVTT, lvtt_value); 330 apic_write(APIC_LVTT, lvtt_value);
351 331
352 if (lvtt_value & APIC_LVT_TIMER_TSCDEADLINE) {
353 printk_once(KERN_DEBUG "TSC deadline timer enabled\n");
354 return;
355 }
356
357 /* 332 /*
358 * Divide PICLK by 16 333 * Divide PICLK by 16
359 */ 334 */
@@ -397,25 +372,20 @@ static inline int eilvt_entry_is_changeable(unsigned int old, unsigned int new)
397 372
398static unsigned int reserve_eilvt_offset(int offset, unsigned int new) 373static unsigned int reserve_eilvt_offset(int offset, unsigned int new)
399{ 374{
400 unsigned int rsvd, vector; 375 unsigned int rsvd; /* 0: uninitialized */
401 376
402 if (offset >= APIC_EILVT_NR_MAX) 377 if (offset >= APIC_EILVT_NR_MAX)
403 return ~0; 378 return ~0;
404 379
405 rsvd = atomic_read(&eilvt_offsets[offset]); 380 rsvd = atomic_read(&eilvt_offsets[offset]) & ~APIC_EILVT_MASKED;
406 do { 381 do {
407 vector = rsvd & ~APIC_EILVT_MASKED; /* 0: unassigned */ 382 if (rsvd &&
408 if (vector && !eilvt_entry_is_changeable(vector, new)) 383 !eilvt_entry_is_changeable(rsvd, new))
409 /* may not change if vectors are different */ 384 /* may not change if vectors are different */
410 return rsvd; 385 return rsvd;
411 rsvd = atomic_cmpxchg(&eilvt_offsets[offset], rsvd, new); 386 rsvd = atomic_cmpxchg(&eilvt_offsets[offset], rsvd, new);
412 } while (rsvd != new); 387 } while (rsvd != new);
413 388
414 rsvd &= ~APIC_EILVT_MASKED;
415 if (rsvd && rsvd != vector)
416 pr_info("LVT offset %d assigned for vector 0x%02x\n",
417 offset, rsvd);
418
419 return new; 389 return new;
420} 390}
421 391
@@ -466,16 +436,6 @@ static int lapic_next_event(unsigned long delta,
466 return 0; 436 return 0;
467} 437}
468 438
469static int lapic_next_deadline(unsigned long delta,
470 struct clock_event_device *evt)
471{
472 u64 tsc;
473
474 rdtscll(tsc);
475 wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR));
476 return 0;
477}
478
479/* 439/*
480 * Setup the lapic timer in periodic or oneshot mode 440 * Setup the lapic timer in periodic or oneshot mode
481 */ 441 */
@@ -494,7 +454,7 @@ static void lapic_timer_setup(enum clock_event_mode mode,
494 switch (mode) { 454 switch (mode) {
495 case CLOCK_EVT_MODE_PERIODIC: 455 case CLOCK_EVT_MODE_PERIODIC:
496 case CLOCK_EVT_MODE_ONESHOT: 456 case CLOCK_EVT_MODE_ONESHOT:
497 __setup_APIC_LVTT(lapic_timer_frequency, 457 __setup_APIC_LVTT(calibration_result,
498 mode != CLOCK_EVT_MODE_PERIODIC, 1); 458 mode != CLOCK_EVT_MODE_PERIODIC, 1);
499 break; 459 break;
500 case CLOCK_EVT_MODE_UNUSED: 460 case CLOCK_EVT_MODE_UNUSED:
@@ -556,15 +516,7 @@ static void __cpuinit setup_APIC_timer(void)
556 memcpy(levt, &lapic_clockevent, sizeof(*levt)); 516 memcpy(levt, &lapic_clockevent, sizeof(*levt));
557 levt->cpumask = cpumask_of(smp_processor_id()); 517 levt->cpumask = cpumask_of(smp_processor_id());
558 518
559 if (this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) { 519 clockevents_register_device(levt);
560 levt->features &= ~(CLOCK_EVT_FEAT_PERIODIC |
561 CLOCK_EVT_FEAT_DUMMY);
562 levt->set_next_event = lapic_next_deadline;
563 clockevents_config_and_register(levt,
564 (tsc_khz / TSC_DIVISOR) * 1000,
565 0xF, ~0UL);
566 } else
567 clockevents_register_device(levt);
568} 520}
569 521
570/* 522/*
@@ -686,30 +638,6 @@ static int __init calibrate_APIC_clock(void)
686 long delta, deltatsc; 638 long delta, deltatsc;
687 int pm_referenced = 0; 639 int pm_referenced = 0;
688 640
689 /**
690 * check if lapic timer has already been calibrated by platform
691 * specific routine, such as tsc calibration code. if so, we just fill
692 * in the clockevent structure and return.
693 */
694
695 if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) {
696 return 0;
697 } else if (lapic_timer_frequency) {
698 apic_printk(APIC_VERBOSE, "lapic timer already calibrated %d\n",
699 lapic_timer_frequency);
700 lapic_clockevent.mult = div_sc(lapic_timer_frequency/APIC_DIVISOR,
701 TICK_NSEC, lapic_clockevent.shift);
702 lapic_clockevent.max_delta_ns =
703 clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
704 lapic_clockevent.min_delta_ns =
705 clockevent_delta2ns(0xF, &lapic_clockevent);
706 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
707 return 0;
708 }
709
710 apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
711 "calibrating APIC timer ...\n");
712
713 local_irq_disable(); 641 local_irq_disable();
714 642
715 /* Replace the global interrupt handler */ 643 /* Replace the global interrupt handler */
@@ -751,12 +679,12 @@ static int __init calibrate_APIC_clock(void)
751 lapic_clockevent.min_delta_ns = 679 lapic_clockevent.min_delta_ns =
752 clockevent_delta2ns(0xF, &lapic_clockevent); 680 clockevent_delta2ns(0xF, &lapic_clockevent);
753 681
754 lapic_timer_frequency = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; 682 calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;
755 683
756 apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); 684 apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta);
757 apic_printk(APIC_VERBOSE, "..... mult: %u\n", lapic_clockevent.mult); 685 apic_printk(APIC_VERBOSE, "..... mult: %u\n", lapic_clockevent.mult);
758 apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", 686 apic_printk(APIC_VERBOSE, "..... calibration result: %u\n",
759 lapic_timer_frequency); 687 calibration_result);
760 688
761 if (cpu_has_tsc) { 689 if (cpu_has_tsc) {
762 apic_printk(APIC_VERBOSE, "..... CPU clock speed is " 690 apic_printk(APIC_VERBOSE, "..... CPU clock speed is "
@@ -767,13 +695,13 @@ static int __init calibrate_APIC_clock(void)
767 695
768 apic_printk(APIC_VERBOSE, "..... host bus clock speed is " 696 apic_printk(APIC_VERBOSE, "..... host bus clock speed is "
769 "%u.%04u MHz.\n", 697 "%u.%04u MHz.\n",
770 lapic_timer_frequency / (1000000 / HZ), 698 calibration_result / (1000000 / HZ),
771 lapic_timer_frequency % (1000000 / HZ)); 699 calibration_result % (1000000 / HZ));
772 700
773 /* 701 /*
774 * Do a sanity check on the APIC calibration result 702 * Do a sanity check on the APIC calibration result
775 */ 703 */
776 if (lapic_timer_frequency < (1000000 / HZ)) { 704 if (calibration_result < (1000000 / HZ)) {
777 local_irq_enable(); 705 local_irq_enable();
778 pr_warning("APIC frequency too slow, disabling apic timer\n"); 706 pr_warning("APIC frequency too slow, disabling apic timer\n");
779 return -1; 707 return -1;
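The calibration path above turns the measured ticks-per-jiffy value into the clockevent's mult/shift pair and min/max deltas. A hedged sketch of that conversion using the helpers visible in this diff (div_sc() and clockevent_delta2ns()); it assumes evt->shift has already been chosen, as the LAPIC clockevent does with a fixed shift:

    /* Fill a clock_event_device from a calibrated ticks-per-jiffy value. */
    static void setup_lapic_clockevent(struct clock_event_device *evt,
                                       unsigned int ticks_per_jiffy)
    {
            /* mult/shift so that ns -> timer ticks is a single multiply */
            evt->mult         = div_sc(ticks_per_jiffy, TICK_NSEC, evt->shift);
            evt->max_delta_ns = clockevent_delta2ns(0x7FFFFF, evt);
            evt->min_delta_ns = clockevent_delta2ns(0xF, evt);
    }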
@@ -847,6 +775,9 @@ void __init setup_boot_APIC_clock(void)
847 return; 775 return;
848 } 776 }
849 777
778 apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
779 "calibrating APIC timer ...\n");
780
850 if (calibrate_APIC_clock()) { 781 if (calibrate_APIC_clock()) {
851 /* No broadcast on UP ! */ 782 /* No broadcast on UP ! */
852 if (num_possible_cpus() > 1) 783 if (num_possible_cpus() > 1)
@@ -926,8 +857,8 @@ void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
926 * Besides, if we don't, timer interrupts ignore the global 857
927 * interrupt lock, which is the WrongThing (tm) to do. 858 * interrupt lock, which is the WrongThing (tm) to do.
928 */ 859 */
929 irq_enter();
930 exit_idle(); 860 exit_idle();
861 irq_enter();
931 local_apic_timer_interrupt(); 862 local_apic_timer_interrupt();
932 irq_exit(); 863 irq_exit();
933 864
@@ -1359,13 +1290,11 @@ void __cpuinit setup_local_APIC(void)
1359 acked); 1290 acked);
1360 break; 1291 break;
1361 } 1292 }
1362 if (queued) { 1293 if (cpu_has_tsc) {
1363 if (cpu_has_tsc) { 1294 rdtscll(ntsc);
1364 rdtscll(ntsc); 1295 max_loops = (cpu_khz << 10) - (ntsc - tsc);
1365 max_loops = (cpu_khz << 10) - (ntsc - tsc); 1296 } else
1366 } else 1297 max_loops--;
1367 max_loops--;
1368 }
1369 } while (queued && max_loops > 0); 1298 } while (queued && max_loops > 0);
1370 WARN_ON(max_loops <= 0); 1299 WARN_ON(max_loops <= 0);
1371 1300
@@ -1477,51 +1406,12 @@ void __init bsp_end_local_APIC_setup(void)
1477 * Now that local APIC setup is completed for BP, configure the fault 1406 * Now that local APIC setup is completed for BP, configure the fault
1478 * handling for interrupt remapping. 1407 * handling for interrupt remapping.
1479 */ 1408 */
1480 if (irq_remapping_enabled) 1409 if (intr_remapping_enabled)
1481 irq_remap_enable_fault_handling(); 1410 enable_drhd_fault_handling();
1482 1411
1483} 1412}
1484 1413
1485#ifdef CONFIG_X86_X2APIC 1414#ifdef CONFIG_X86_X2APIC
1486/*
1487 * Need to disable xapic and x2apic at the same time and then enable xapic mode
1488 */
1489static inline void __disable_x2apic(u64 msr)
1490{
1491 wrmsrl(MSR_IA32_APICBASE,
1492 msr & ~(X2APIC_ENABLE | XAPIC_ENABLE));
1493 wrmsrl(MSR_IA32_APICBASE, msr & ~X2APIC_ENABLE);
1494}
1495
1496static __init void disable_x2apic(void)
1497{
1498 u64 msr;
1499
1500 if (!cpu_has_x2apic)
1501 return;
1502
1503 rdmsrl(MSR_IA32_APICBASE, msr);
1504 if (msr & X2APIC_ENABLE) {
1505 u32 x2apic_id = read_apic_id();
1506
1507 if (x2apic_id >= 255)
1508 panic("Cannot disable x2apic, id: %08x\n", x2apic_id);
1509
1510 pr_info("Disabling x2apic\n");
1511 __disable_x2apic(msr);
1512
1513 if (nox2apic) {
1514 clear_cpu_cap(&cpu_data(0), X86_FEATURE_X2APIC);
1515 setup_clear_cpu_cap(X86_FEATURE_X2APIC);
1516 }
1517
1518 x2apic_disabled = 1;
1519 x2apic_mode = 0;
1520
1521 register_lapic_address(mp_lapic_addr);
1522 }
1523}
1524
1525void check_x2apic(void) 1415void check_x2apic(void)
1526{ 1416{
1527 if (x2apic_enabled()) { 1417 if (x2apic_enabled()) {
@@ -1532,20 +1422,15 @@ void check_x2apic(void)
1532 1422
1533void enable_x2apic(void) 1423void enable_x2apic(void)
1534{ 1424{
1535 u64 msr; 1425 int msr, msr2;
1536
1537 rdmsrl(MSR_IA32_APICBASE, msr);
1538 if (x2apic_disabled) {
1539 __disable_x2apic(msr);
1540 return;
1541 }
1542 1426
1543 if (!x2apic_mode) 1427 if (!x2apic_mode)
1544 return; 1428 return;
1545 1429
1430 rdmsr(MSR_IA32_APICBASE, msr, msr2);
1546 if (!(msr & X2APIC_ENABLE)) { 1431 if (!(msr & X2APIC_ENABLE)) {
1547 printk_once(KERN_INFO "Enabling x2apic\n"); 1432 printk_once(KERN_INFO "Enabling x2apic\n");
1548 wrmsrl(MSR_IA32_APICBASE, msr | X2APIC_ENABLE); 1433 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, msr2);
1549 } 1434 }
1550} 1435}
1551#endif /* CONFIG_X86_X2APIC */ 1436#endif /* CONFIG_X86_X2APIC */
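Both sides of the enable_x2apic() hunk toggle the X2APIC_ENABLE bit in IA32_APICBASE; one uses the 64-bit rdmsrl()/wrmsrl() accessors, the other the split rdmsr()/wrmsr() pair. A sketch of the 64-bit variant (function name illustrative):

    /* Set the x2APIC enable bit if it is not already set. */
    static void x2apic_set_enable_bit(void)
    {
            u64 msr;

            rdmsrl(MSR_IA32_APICBASE, msr);
            if (!(msr & X2APIC_ENABLE))
                    wrmsrl(MSR_IA32_APICBASE, msr | X2APIC_ENABLE);
    }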
@@ -1553,7 +1438,7 @@ void enable_x2apic(void)
1553int __init enable_IR(void) 1438int __init enable_IR(void)
1554{ 1439{
1555#ifdef CONFIG_IRQ_REMAP 1440#ifdef CONFIG_IRQ_REMAP
1556 if (!irq_remapping_supported()) { 1441 if (!intr_remapping_supported()) {
1557 pr_debug("intr-remapping not supported\n"); 1442 pr_debug("intr-remapping not supported\n");
1558 return -1; 1443 return -1;
1559 } 1444 }
@@ -1564,7 +1449,7 @@ int __init enable_IR(void)
1564 return -1; 1449 return -1;
1565 } 1450 }
1566 1451
1567 return irq_remapping_enable(); 1452 return enable_intr_remapping();
1568#endif 1453#endif
1569 return -1; 1454 return -1;
1570} 1455}
@@ -1573,46 +1458,34 @@ void __init enable_IR_x2apic(void)
1573{ 1458{
1574 unsigned long flags; 1459 unsigned long flags;
1575 int ret, x2apic_enabled = 0; 1460 int ret, x2apic_enabled = 0;
1576 int hardware_init_ret; 1461 int dmar_table_init_ret;
1577 1462
1578 /* Make sure irq_remap_ops are initialized */ 1463 dmar_table_init_ret = dmar_table_init();
1579 setup_irq_remapping_ops(); 1464 if (dmar_table_init_ret && !x2apic_supported())
1580
1581 hardware_init_ret = irq_remapping_prepare();
1582 if (hardware_init_ret && !x2apic_supported())
1583 return; 1465 return;
1584 1466
1585 ret = save_ioapic_entries(); 1467 ret = save_ioapic_entries();
1586 if (ret) { 1468 if (ret) {
1587 pr_info("Saving IO-APIC state failed: %d\n", ret); 1469 pr_info("Saving IO-APIC state failed: %d\n", ret);
1588 return; 1470 goto out;
1589 } 1471 }
1590 1472
1591 local_irq_save(flags); 1473 local_irq_save(flags);
1592 legacy_pic->mask_all(); 1474 legacy_pic->mask_all();
1593 mask_ioapic_entries(); 1475 mask_ioapic_entries();
1594 1476
1595 if (x2apic_preenabled && nox2apic) 1477 if (dmar_table_init_ret)
1596 disable_x2apic();
1597
1598 if (hardware_init_ret)
1599 ret = -1; 1478 ret = -1;
1600 else 1479 else
1601 ret = enable_IR(); 1480 ret = enable_IR();
1602 1481
1603 if (!x2apic_supported())
1604 goto skip_x2apic;
1605
1606 if (ret < 0) { 1482 if (ret < 0) {
1607 /* IR is required if there is APIC ID > 255 even when running 1483 /* IR is required if there is APIC ID > 255 even when running
1608 * under KVM 1484 * under KVM
1609 */ 1485 */
1610 if (max_physical_apicid > 255 || 1486 if (max_physical_apicid > 255 ||
1611 !hypervisor_x2apic_available()) { 1487 !hypervisor_x2apic_available())
1612 if (x2apic_preenabled) 1488 goto nox2apic;
1613 disable_x2apic();
1614 goto skip_x2apic;
1615 }
1616 /* 1489 /*
1617 * without IR all CPUs can be addressed by IOAPIC/MSI 1490 * without IR all CPUs can be addressed by IOAPIC/MSI
1618 * only in physical mode 1491 * only in physical mode
@@ -1620,10 +1493,8 @@ void __init enable_IR_x2apic(void)
1620 x2apic_force_phys(); 1493 x2apic_force_phys();
1621 } 1494 }
1622 1495
1623 if (ret == IRQ_REMAP_XAPIC_MODE) { 1496 if (ret == IRQ_REMAP_XAPIC_MODE)
1624 pr_info("x2apic not enabled, IRQ remapping is in xapic mode\n"); 1497 goto nox2apic;
1625 goto skip_x2apic;
1626 }
1627 1498
1628 x2apic_enabled = 1; 1499 x2apic_enabled = 1;
1629 1500
@@ -1633,11 +1504,22 @@ void __init enable_IR_x2apic(void)
1633 pr_info("Enabled x2apic\n"); 1504 pr_info("Enabled x2apic\n");
1634 } 1505 }
1635 1506
1636skip_x2apic: 1507nox2apic:
1637 if (ret < 0) /* IR enabling failed */ 1508 if (ret < 0) /* IR enabling failed */
1638 restore_ioapic_entries(); 1509 restore_ioapic_entries();
1639 legacy_pic->restore_mask(); 1510 legacy_pic->restore_mask();
1640 local_irq_restore(flags); 1511 local_irq_restore(flags);
1512
1513out:
1514 if (x2apic_enabled || !x2apic_supported())
1515 return;
1516
1517 if (x2apic_preenabled)
1518 panic("x2apic: enabled by BIOS but kernel init failed.");
1519 else if (ret == IRQ_REMAP_XAPIC_MODE)
1520 pr_info("x2apic not enabled, IRQ remapping is in xapic mode\n");
1521 else if (ret < 0)
1522 pr_info("x2apic not enabled, IRQ remapping init failed\n");
1641} 1523}
1642 1524
1643#ifdef CONFIG_X86_64 1525#ifdef CONFIG_X86_64
@@ -1676,11 +1558,9 @@ static int __init apic_verify(void)
1676 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; 1558 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
1677 1559
1678 /* The BIOS may have set up the APIC at some other address */ 1560 /* The BIOS may have set up the APIC at some other address */
1679 if (boot_cpu_data.x86 >= 6) { 1561 rdmsr(MSR_IA32_APICBASE, l, h);
1680 rdmsr(MSR_IA32_APICBASE, l, h); 1562 if (l & MSR_IA32_APICBASE_ENABLE)
1681 if (l & MSR_IA32_APICBASE_ENABLE) 1563 mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
1682 mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
1683 }
1684 1564
1685 pr_info("Found and enabled local APIC!\n"); 1565 pr_info("Found and enabled local APIC!\n");
1686 return 0; 1566 return 0;
@@ -1698,15 +1578,13 @@ int __init apic_force_enable(unsigned long addr)
1698 * MSR. This can only be done in software for Intel P6 or later 1578 * MSR. This can only be done in software for Intel P6 or later
1699 * and AMD K7 (Model > 1) or later. 1579 * and AMD K7 (Model > 1) or later.
1700 */ 1580 */
1701 if (boot_cpu_data.x86 >= 6) { 1581 rdmsr(MSR_IA32_APICBASE, l, h);
1702 rdmsr(MSR_IA32_APICBASE, l, h); 1582 if (!(l & MSR_IA32_APICBASE_ENABLE)) {
1703 if (!(l & MSR_IA32_APICBASE_ENABLE)) { 1583 pr_info("Local APIC disabled by BIOS -- reenabling.\n");
1704 pr_info("Local APIC disabled by BIOS -- reenabling.\n"); 1584 l &= ~MSR_IA32_APICBASE_BASE;
1705 l &= ~MSR_IA32_APICBASE_BASE; 1585 l |= MSR_IA32_APICBASE_ENABLE | addr;
1706 l |= MSR_IA32_APICBASE_ENABLE | addr; 1586 wrmsr(MSR_IA32_APICBASE, l, h);
1707 wrmsr(MSR_IA32_APICBASE, l, h); 1587 enabled_via_apicbase = 1;
1708 enabled_via_apicbase = 1;
1709 }
1710 } 1588 }
1711 return apic_verify(); 1589 return apic_verify();
1712} 1590}
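The apic_force_enable() hunk keeps the rdmsr()/wrmsr() sequence that re-enables a BIOS-disabled local APIC and points it at a given physical base. A condensed sketch of that sequence, using the same MSR bits as the hunk (helper name illustrative):

    /* Force-enable the local APIC at 'addr' if the BIOS left it disabled. */
    static void force_enable_lapic(unsigned long addr)
    {
            unsigned int l, h;

            rdmsr(MSR_IA32_APICBASE, l, h);
            if (!(l & MSR_IA32_APICBASE_ENABLE)) {
                    l &= ~MSR_IA32_APICBASE_BASE;           /* clear old base */
                    l |= MSR_IA32_APICBASE_ENABLE | addr;   /* enable + new base */
                    wrmsr(MSR_IA32_APICBASE, l, h);
            }
    }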
@@ -1912,8 +1790,8 @@ void smp_spurious_interrupt(struct pt_regs *regs)
1912{ 1790{
1913 u32 v; 1791 u32 v;
1914 1792
1915 irq_enter();
1916 exit_idle(); 1793 exit_idle();
1794 irq_enter();
1917 /* 1795 /*
1918 * Check if this really is a spurious interrupt and ACK it 1796 * Check if this really is a spurious interrupt and ACK it
1919 * if it is a vectored one. Just in case... 1797 * if it is a vectored one. Just in case...
@@ -1949,8 +1827,8 @@ void smp_error_interrupt(struct pt_regs *regs)
1949 "Illegal register address", /* APIC Error Bit 7 */ 1827 "Illegal register address", /* APIC Error Bit 7 */
1950 }; 1828 };
1951 1829
1952 irq_enter();
1953 exit_idle(); 1830 exit_idle();
1831 irq_enter();
1954 /* First tickle the hardware, only then report what went on. -- REW */ 1832 /* First tickle the hardware, only then report what went on. -- REW */
1955 v0 = apic_read(APIC_ESR); 1833 v0 = apic_read(APIC_ESR);
1956 apic_write(APIC_ESR, 0); 1834 apic_write(APIC_ESR, 0);
@@ -1967,7 +1845,7 @@ void smp_error_interrupt(struct pt_regs *regs)
1967 apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]); 1845 apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]);
1968 i++; 1846 i++;
1969 v1 >>= 1; 1847 v1 >>= 1;
1970 } 1848 };
1971 1849
1972 apic_printk(APIC_DEBUG, KERN_CONT "\n"); 1850 apic_printk(APIC_DEBUG, KERN_CONT "\n");
1973 1851
@@ -2156,42 +2034,6 @@ void default_init_apic_ldr(void)
2156 apic_write(APIC_LDR, val); 2034 apic_write(APIC_LDR, val);
2157} 2035}
2158 2036
2159int default_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
2160 const struct cpumask *andmask,
2161 unsigned int *apicid)
2162{
2163 unsigned int cpu;
2164
2165 for_each_cpu_and(cpu, cpumask, andmask) {
2166 if (cpumask_test_cpu(cpu, cpu_online_mask))
2167 break;
2168 }
2169
2170 if (likely(cpu < nr_cpu_ids)) {
2171 *apicid = per_cpu(x86_cpu_to_apicid, cpu);
2172 return 0;
2173 }
2174
2175 return -EINVAL;
2176}
2177
2178/*
2179 * Override the generic EOI implementation with an optimized version.
2180 * Only called during early boot when only one CPU is active and with
2181 * interrupts disabled, so we know this does not race with actual APIC driver
2182 * use.
2183 */
2184void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v))
2185{
2186 struct apic **drv;
2187
2188 for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
2189 /* Should happen once for each apic */
2190 WARN_ON((*drv)->eoi_write == eoi_write);
2191 (*drv)->eoi_write = eoi_write;
2192 }
2193}
2194
2195/* 2037/*
2196 * Power management 2038 * Power management
2197 */ 2039 */
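The removed default_cpu_mask_to_apicid_and() walks the intersection of two cpumasks, picks the first online CPU and returns its APIC ID. A sketch of that lookup (helper name illustrative, body modelled on the removed lines):

    /* Return the APIC ID of the first online CPU in (cpumask & andmask),
     * or -EINVAL if none of them is online. */
    static int first_online_apicid(const struct cpumask *cpumask,
                                   const struct cpumask *andmask,
                                   unsigned int *apicid)
    {
            unsigned int cpu;

            for_each_cpu_and(cpu, cpumask, andmask)
                    if (cpumask_test_cpu(cpu, cpu_online_mask))
                            break;

            if (cpu < nr_cpu_ids) {
                    *apicid = per_cpu(x86_cpu_to_apicid, cpu);
                    return 0;
            }
            return -EINVAL;
    }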
@@ -2251,8 +2093,8 @@ static int lapic_suspend(void)
2251 local_irq_save(flags); 2093 local_irq_save(flags);
2252 disable_local_APIC(); 2094 disable_local_APIC();
2253 2095
2254 if (irq_remapping_enabled) 2096 if (intr_remapping_enabled)
2255 irq_remapping_disable(); 2097 disable_intr_remapping();
2256 2098
2257 local_irq_restore(flags); 2099 local_irq_restore(flags);
2258 return 0; 2100 return 0;
@@ -2268,7 +2110,7 @@ static void lapic_resume(void)
2268 return; 2110 return;
2269 2111
2270 local_irq_save(flags); 2112 local_irq_save(flags);
2271 if (irq_remapping_enabled) { 2113 if (intr_remapping_enabled) {
2272 /* 2114 /*
2273 * IO-APIC and PIC have their own resume routines. 2115 * IO-APIC and PIC have their own resume routines.
2274 * We just mask them here to make sure the interrupt 2116 * We just mask them here to make sure the interrupt
@@ -2288,12 +2130,10 @@ static void lapic_resume(void)
2288 * FIXME! This will be wrong if we ever support suspend on 2130 * FIXME! This will be wrong if we ever support suspend on
2289 * SMP! We'll need to do this as part of the CPU restore! 2131 * SMP! We'll need to do this as part of the CPU restore!
2290 */ 2132 */
2291 if (boot_cpu_data.x86 >= 6) { 2133 rdmsr(MSR_IA32_APICBASE, l, h);
2292 rdmsr(MSR_IA32_APICBASE, l, h); 2134 l &= ~MSR_IA32_APICBASE_BASE;
2293 l &= ~MSR_IA32_APICBASE_BASE; 2135 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
2294 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; 2136 wrmsr(MSR_IA32_APICBASE, l, h);
2295 wrmsr(MSR_IA32_APICBASE, l, h);
2296 }
2297 } 2137 }
2298 2138
2299 maxlvt = lapic_get_maxlvt(); 2139 maxlvt = lapic_get_maxlvt();
@@ -2320,8 +2160,8 @@ static void lapic_resume(void)
2320 apic_write(APIC_ESR, 0); 2160 apic_write(APIC_ESR, 0);
2321 apic_read(APIC_ESR); 2161 apic_read(APIC_ESR);
2322 2162
2323 if (irq_remapping_enabled) 2163 if (intr_remapping_enabled)
2324 irq_remapping_reenable(x2apic_mode); 2164 reenable_intr_remapping(x2apic_mode);
2325 2165
2326 local_irq_restore(flags); 2166 local_irq_restore(flags);
2327} 2167}
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 00c77cf78e9..f7a41e4cae4 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -36,6 +36,25 @@ static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
36 return 1; 36 return 1;
37} 37}
38 38
39static const struct cpumask *flat_target_cpus(void)
40{
41 return cpu_online_mask;
42}
43
44static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask)
45{
46 /* Careful. Some cpus do not strictly honor the set of cpus
47 * specified in the interrupt destination when using lowest
48 * priority interrupt delivery mode.
49 *
50 * In particular there was a hyperthreading cpu observed to
51 * deliver interrupts to the wrong hyperthread when only one
52 * hyperthread was specified in the interrupt desitination.
53 */
54 cpumask_clear(retmask);
55 cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
56}
57
39/* 58/*
40 * Set up the logical destination ID. 59 * Set up the logical destination ID.
41 * 60 *
@@ -43,7 +62,7 @@ static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
43 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel 62 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
44 * document number 292116). So here it goes... 63 * document number 292116). So here it goes...
45 */ 64 */
46void flat_init_apic_ldr(void) 65static void flat_init_apic_ldr(void)
47{ 66{
48 unsigned long val; 67 unsigned long val;
49 unsigned long num, id; 68 unsigned long num, id;
@@ -73,7 +92,7 @@ static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector)
73} 92}
74 93
75static void 94static void
76flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) 95 flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector)
77{ 96{
78 unsigned long mask = cpumask_bits(cpumask)[0]; 97 unsigned long mask = cpumask_bits(cpumask)[0];
79 int cpu = smp_processor_id(); 98 int cpu = smp_processor_id();
@@ -152,22 +171,16 @@ static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
152 return initial_apic_id >> index_msb; 171 return initial_apic_id >> index_msb;
153} 172}
154 173
155static int flat_probe(void)
156{
157 return 1;
158}
159
160static struct apic apic_flat = { 174static struct apic apic_flat = {
161 .name = "flat", 175 .name = "flat",
162 .probe = flat_probe, 176 .probe = NULL,
163 .acpi_madt_oem_check = flat_acpi_madt_oem_check, 177 .acpi_madt_oem_check = flat_acpi_madt_oem_check,
164 .apic_id_valid = default_apic_id_valid,
165 .apic_id_registered = flat_apic_id_registered, 178 .apic_id_registered = flat_apic_id_registered,
166 179
167 .irq_delivery_mode = dest_LowestPrio, 180 .irq_delivery_mode = dest_LowestPrio,
168 .irq_dest_mode = 1, /* logical */ 181 .irq_dest_mode = 1, /* logical */
169 182
170 .target_cpus = online_target_cpus, 183 .target_cpus = flat_target_cpus,
171 .disable_esr = 0, 184 .disable_esr = 0,
172 .dest_logical = APIC_DEST_LOGICAL, 185 .dest_logical = APIC_DEST_LOGICAL,
173 .check_apicid_used = NULL, 186 .check_apicid_used = NULL,
@@ -191,7 +204,8 @@ static struct apic apic_flat = {
191 .set_apic_id = set_apic_id, 204 .set_apic_id = set_apic_id,
192 .apic_id_mask = 0xFFu << 24, 205 .apic_id_mask = 0xFFu << 24,
193 206
194 .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, 207 .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
208 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
195 209
196 .send_IPI_mask = flat_send_IPI_mask, 210 .send_IPI_mask = flat_send_IPI_mask,
197 .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself, 211 .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself,
@@ -207,7 +221,6 @@ static struct apic apic_flat = {
207 221
208 .read = native_apic_mem_read, 222 .read = native_apic_mem_read,
209 .write = native_apic_mem_write, 223 .write = native_apic_mem_write,
210 .eoi_write = native_apic_mem_write,
211 .icr_read = native_apic_icr_read, 224 .icr_read = native_apic_icr_read,
212 .icr_write = native_apic_icr_write, 225 .icr_write = native_apic_icr_write,
213 .wait_icr_idle = native_apic_wait_icr_idle, 226 .wait_icr_idle = native_apic_wait_icr_idle,
@@ -242,6 +255,17 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
242 return 0; 255 return 0;
243} 256}
244 257
258static const struct cpumask *physflat_target_cpus(void)
259{
260 return cpu_online_mask;
261}
262
263static void physflat_vector_allocation_domain(int cpu, struct cpumask *retmask)
264{
265 cpumask_clear(retmask);
266 cpumask_set_cpu(cpu, retmask);
267}
268
245static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector) 269static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector)
246{ 270{
247 default_send_IPI_mask_sequence_phys(cpumask, vector); 271 default_send_IPI_mask_sequence_phys(cpumask, vector);
@@ -263,6 +287,38 @@ static void physflat_send_IPI_all(int vector)
263 physflat_send_IPI_mask(cpu_online_mask, vector); 287 physflat_send_IPI_mask(cpu_online_mask, vector);
264} 288}
265 289
290static unsigned int physflat_cpu_mask_to_apicid(const struct cpumask *cpumask)
291{
292 int cpu;
293
294 /*
295 * We're using fixed IRQ delivery, can only return one phys APIC ID.
296 * May as well be the first.
297 */
298 cpu = cpumask_first(cpumask);
299 if ((unsigned)cpu < nr_cpu_ids)
300 return per_cpu(x86_cpu_to_apicid, cpu);
301 else
302 return BAD_APICID;
303}
304
305static unsigned int
306physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
307 const struct cpumask *andmask)
308{
309 int cpu;
310
311 /*
312 * We're using fixed IRQ delivery, can only return one phys APIC ID.
313 * May as well be the first.
314 */
315 for_each_cpu_and(cpu, cpumask, andmask) {
316 if (cpumask_test_cpu(cpu, cpu_online_mask))
317 break;
318 }
319 return per_cpu(x86_cpu_to_apicid, cpu);
320}
321
266static int physflat_probe(void) 322static int physflat_probe(void)
267{ 323{
268 if (apic == &apic_physflat || num_possible_cpus() > 8) 324 if (apic == &apic_physflat || num_possible_cpus() > 8)
@@ -276,19 +332,18 @@ static struct apic apic_physflat = {
276 .name = "physical flat", 332 .name = "physical flat",
277 .probe = physflat_probe, 333 .probe = physflat_probe,
278 .acpi_madt_oem_check = physflat_acpi_madt_oem_check, 334 .acpi_madt_oem_check = physflat_acpi_madt_oem_check,
279 .apic_id_valid = default_apic_id_valid,
280 .apic_id_registered = flat_apic_id_registered, 335 .apic_id_registered = flat_apic_id_registered,
281 336
282 .irq_delivery_mode = dest_Fixed, 337 .irq_delivery_mode = dest_Fixed,
283 .irq_dest_mode = 0, /* physical */ 338 .irq_dest_mode = 0, /* physical */
284 339
285 .target_cpus = online_target_cpus, 340 .target_cpus = physflat_target_cpus,
286 .disable_esr = 0, 341 .disable_esr = 0,
287 .dest_logical = 0, 342 .dest_logical = 0,
288 .check_apicid_used = NULL, 343 .check_apicid_used = NULL,
289 .check_apicid_present = NULL, 344 .check_apicid_present = NULL,
290 345
291 .vector_allocation_domain = default_vector_allocation_domain, 346 .vector_allocation_domain = physflat_vector_allocation_domain,
292 /* not needed, but shouldn't hurt: */ 347 /* not needed, but shouldn't hurt: */
293 .init_apic_ldr = flat_init_apic_ldr, 348 .init_apic_ldr = flat_init_apic_ldr,
294 349
@@ -307,7 +362,8 @@ static struct apic apic_physflat = {
307 .set_apic_id = set_apic_id, 362 .set_apic_id = set_apic_id,
308 .apic_id_mask = 0xFFu << 24, 363 .apic_id_mask = 0xFFu << 24,
309 364
310 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, 365 .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid,
366 .cpu_mask_to_apicid_and = physflat_cpu_mask_to_apicid_and,
311 367
312 .send_IPI_mask = physflat_send_IPI_mask, 368 .send_IPI_mask = physflat_send_IPI_mask,
313 .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself, 369 .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself,
@@ -323,7 +379,6 @@ static struct apic apic_physflat = {
323 379
324 .read = native_apic_mem_read, 380 .read = native_apic_mem_read,
325 .write = native_apic_mem_write, 381 .write = native_apic_mem_write,
326 .eoi_write = native_apic_mem_write,
327 .icr_read = native_apic_icr_read, 382 .icr_read = native_apic_icr_read,
328 .icr_write = native_apic_icr_write, 383 .icr_write = native_apic_icr_write,
329 .wait_icr_idle = native_apic_wait_icr_idle, 384 .wait_icr_idle = native_apic_wait_icr_idle,
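The apic_flat_64.c changes restore per-driver vector_allocation_domain() callbacks. The two policies differ in how many CPUs a vector may target: logical flat marks all eight logical IDs, physical flat marks exactly the CPU the vector is allocated for. A side-by-side sketch of the two callbacks (names illustrative; APIC_ALL_CPUS is the 0xff logical mask used above):

    static void flat_domain(int cpu, struct cpumask *retmask)
    {
            cpumask_clear(retmask);
            cpumask_bits(retmask)[0] = APIC_ALL_CPUS;   /* all logical IDs */
    }

    static void phys_domain(int cpu, struct cpumask *retmask)
    {
            cpumask_clear(retmask);
            cpumask_set_cpu(cpu, retmask);              /* only this CPU */
    }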
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index e145f28b409..775b82bc655 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -100,12 +100,12 @@ static unsigned long noop_check_apicid_present(int bit)
100 return physid_isset(bit, phys_cpu_present_map); 100 return physid_isset(bit, phys_cpu_present_map);
101} 101}
102 102
103static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask, 103static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask)
104 const struct cpumask *mask)
105{ 104{
106 if (cpu != 0) 105 if (cpu != 0)
107 pr_warning("APIC: Vector allocated for non-BSP cpu\n"); 106 pr_warning("APIC: Vector allocated for non-BSP cpu\n");
108 cpumask_copy(retmask, cpumask_of(cpu)); 107 cpumask_clear(retmask);
108 cpumask_set_cpu(cpu, retmask);
109} 109}
110 110
111static u32 noop_apic_read(u32 reg) 111static u32 noop_apic_read(u32 reg)
@@ -124,7 +124,6 @@ struct apic apic_noop = {
124 .probe = noop_probe, 124 .probe = noop_probe,
125 .acpi_madt_oem_check = NULL, 125 .acpi_madt_oem_check = NULL,
126 126
127 .apic_id_valid = default_apic_id_valid,
128 .apic_id_registered = noop_apic_id_registered, 127 .apic_id_registered = noop_apic_id_registered,
129 128
130 .irq_delivery_mode = dest_LowestPrio, 129 .irq_delivery_mode = dest_LowestPrio,
@@ -159,7 +158,8 @@ struct apic apic_noop = {
159 .set_apic_id = NULL, 158 .set_apic_id = NULL,
160 .apic_id_mask = 0x0F << 24, 159 .apic_id_mask = 0x0F << 24,
161 160
162 .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, 161 .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
162 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
163 163
164 .send_IPI_mask = noop_send_IPI_mask, 164 .send_IPI_mask = noop_send_IPI_mask,
165 .send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself, 165 .send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself,
@@ -180,7 +180,6 @@ struct apic apic_noop = {
180 180
181 .read = noop_apic_read, 181 .read = noop_apic_read,
182 .write = noop_apic_write, 182 .write = noop_apic_write,
183 .eoi_write = noop_apic_write,
184 .icr_read = noop_apic_icr_read, 183 .icr_read = noop_apic_icr_read,
185 .icr_write = noop_apic_icr_write, 184 .icr_write = noop_apic_icr_write,
186 .wait_icr_idle = noop_apic_wait_icr_idle, 185 .wait_icr_idle = noop_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
deleted file mode 100644
index 9c2aa89a11c..00000000000
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ /dev/null
@@ -1,263 +0,0 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * Numascale NumaConnect-Specific APIC Code
7 *
8 * Copyright (C) 2011 Numascale AS. All rights reserved.
9 *
10 * Send feedback to <support@numascale.com>
11 *
12 */
13
14#include <linux/errno.h>
15#include <linux/threads.h>
16#include <linux/cpumask.h>
17#include <linux/string.h>
18#include <linux/kernel.h>
19#include <linux/module.h>
20#include <linux/ctype.h>
21#include <linux/init.h>
22#include <linux/hardirq.h>
23#include <linux/delay.h>
24
25#include <asm/numachip/numachip.h>
26#include <asm/numachip/numachip_csr.h>
27#include <asm/smp.h>
28#include <asm/apic.h>
29#include <asm/ipi.h>
30#include <asm/apic_flat_64.h>
31
32static int numachip_system __read_mostly;
33
34static const struct apic apic_numachip __read_mostly;
35
36static unsigned int get_apic_id(unsigned long x)
37{
38 unsigned long value;
39 unsigned int id;
40
41 rdmsrl(MSR_FAM10H_NODE_ID, value);
42 id = ((x >> 24) & 0xffU) | ((value << 2) & 0x3f00U);
43
44 return id;
45}
46
47static unsigned long set_apic_id(unsigned int id)
48{
49 unsigned long x;
50
51 x = ((id & 0xffU) << 24);
52 return x;
53}
54
55static unsigned int read_xapic_id(void)
56{
57 return get_apic_id(apic_read(APIC_ID));
58}
59
60static int numachip_apic_id_valid(int apicid)
61{
62 /* Trust what bootloader passes in MADT */
63 return 1;
64}
65
66static int numachip_apic_id_registered(void)
67{
68 return physid_isset(read_xapic_id(), phys_cpu_present_map);
69}
70
71static int numachip_phys_pkg_id(int initial_apic_id, int index_msb)
72{
73 return initial_apic_id >> index_msb;
74}
75
76static int __cpuinit numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip)
77{
78 union numachip_csr_g3_ext_irq_gen int_gen;
79
80 int_gen.s._destination_apic_id = phys_apicid;
81 int_gen.s._vector = 0;
82 int_gen.s._msgtype = APIC_DM_INIT >> 8;
83 int_gen.s._index = 0;
84
85 write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v);
86
87 int_gen.s._msgtype = APIC_DM_STARTUP >> 8;
88 int_gen.s._vector = start_rip >> 12;
89
90 write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v);
91
92 atomic_set(&init_deasserted, 1);
93 return 0;
94}
95
96static void numachip_send_IPI_one(int cpu, int vector)
97{
98 union numachip_csr_g3_ext_irq_gen int_gen;
99 int apicid = per_cpu(x86_cpu_to_apicid, cpu);
100
101 int_gen.s._destination_apic_id = apicid;
102 int_gen.s._vector = vector;
103 int_gen.s._msgtype = (vector == NMI_VECTOR ? APIC_DM_NMI : APIC_DM_FIXED) >> 8;
104 int_gen.s._index = 0;
105
106 write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v);
107}
108
109static void numachip_send_IPI_mask(const struct cpumask *mask, int vector)
110{
111 unsigned int cpu;
112
113 for_each_cpu(cpu, mask)
114 numachip_send_IPI_one(cpu, vector);
115}
116
117static void numachip_send_IPI_mask_allbutself(const struct cpumask *mask,
118 int vector)
119{
120 unsigned int this_cpu = smp_processor_id();
121 unsigned int cpu;
122
123 for_each_cpu(cpu, mask) {
124 if (cpu != this_cpu)
125 numachip_send_IPI_one(cpu, vector);
126 }
127}
128
129static void numachip_send_IPI_allbutself(int vector)
130{
131 unsigned int this_cpu = smp_processor_id();
132 unsigned int cpu;
133
134 for_each_online_cpu(cpu) {
135 if (cpu != this_cpu)
136 numachip_send_IPI_one(cpu, vector);
137 }
138}
139
140static void numachip_send_IPI_all(int vector)
141{
142 numachip_send_IPI_mask(cpu_online_mask, vector);
143}
144
145static void numachip_send_IPI_self(int vector)
146{
147 __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
148}
149
150static int __init numachip_probe(void)
151{
152 return apic == &apic_numachip;
153}
154
155static void __init map_csrs(void)
156{
157 printk(KERN_INFO "NumaChip: Mapping local CSR space (%016llx - %016llx)\n",
158 NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_BASE + NUMACHIP_LCSR_SIZE - 1);
159 init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE);
160
161 printk(KERN_INFO "NumaChip: Mapping global CSR space (%016llx - %016llx)\n",
162 NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_BASE + NUMACHIP_GCSR_SIZE - 1);
163 init_extra_mapping_uc(NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_SIZE);
164}
165
166static void fixup_cpu_id(struct cpuinfo_x86 *c, int node)
167{
168
169 if (c->phys_proc_id != node) {
170 c->phys_proc_id = node;
171 per_cpu(cpu_llc_id, smp_processor_id()) = node;
172 }
173}
174
175static int __init numachip_system_init(void)
176{
177 unsigned int val;
178
179 if (!numachip_system)
180 return 0;
181
182 x86_cpuinit.fixup_cpu_id = fixup_cpu_id;
183 x86_init.pci.arch_init = pci_numachip_init;
184
185 map_csrs();
186
187 val = read_lcsr(CSR_G0_NODE_IDS);
188 printk(KERN_INFO "NumaChip: Local NodeID = %08x\n", val);
189
190 return 0;
191}
192early_initcall(numachip_system_init);
193
194static int numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
195{
196 if (!strncmp(oem_id, "NUMASC", 6)) {
197 numachip_system = 1;
198 return 1;
199 }
200
201 return 0;
202}
203
204static const struct apic apic_numachip __refconst = {
205
206 .name = "NumaConnect system",
207 .probe = numachip_probe,
208 .acpi_madt_oem_check = numachip_acpi_madt_oem_check,
209 .apic_id_valid = numachip_apic_id_valid,
210 .apic_id_registered = numachip_apic_id_registered,
211
212 .irq_delivery_mode = dest_Fixed,
213 .irq_dest_mode = 0, /* physical */
214
215 .target_cpus = online_target_cpus,
216 .disable_esr = 0,
217 .dest_logical = 0,
218 .check_apicid_used = NULL,
219 .check_apicid_present = NULL,
220
221 .vector_allocation_domain = default_vector_allocation_domain,
222 .init_apic_ldr = flat_init_apic_ldr,
223
224 .ioapic_phys_id_map = NULL,
225 .setup_apic_routing = NULL,
226 .multi_timer_check = NULL,
227 .cpu_present_to_apicid = default_cpu_present_to_apicid,
228 .apicid_to_cpu_present = NULL,
229 .setup_portio_remap = NULL,
230 .check_phys_apicid_present = default_check_phys_apicid_present,
231 .enable_apic_mode = NULL,
232 .phys_pkg_id = numachip_phys_pkg_id,
233 .mps_oem_check = NULL,
234
235 .get_apic_id = get_apic_id,
236 .set_apic_id = set_apic_id,
237 .apic_id_mask = 0xffU << 24,
238
239 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
240
241 .send_IPI_mask = numachip_send_IPI_mask,
242 .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself,
243 .send_IPI_allbutself = numachip_send_IPI_allbutself,
244 .send_IPI_all = numachip_send_IPI_all,
245 .send_IPI_self = numachip_send_IPI_self,
246
247 .wakeup_secondary_cpu = numachip_wakeup_secondary,
248 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
249 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
250 .wait_for_init_deassert = NULL,
251 .smp_callin_clear_local_apic = NULL,
252 .inquire_remote_apic = NULL, /* REMRD not supported */
253
254 .read = native_apic_mem_read,
255 .write = native_apic_mem_write,
256 .eoi_write = native_apic_mem_write,
257 .icr_read = native_apic_icr_read,
258 .icr_write = native_apic_icr_write,
259 .wait_icr_idle = native_apic_wait_icr_idle,
260 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
261};
262apic_driver(apic_numachip);
263
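The deleted apic_numachip.c driver sends IPIs by packing destination, vector and message type into a single NumaChip CSR write rather than going through the local APIC's ICR. A sketch of that per-CPU IPI path, lifted from the deleted numachip_send_IPI_one() (illustrative name):

    /* Send one IPI through the NumaChip external-IRQ-generation CSR. */
    static void numachip_ipi_one(int cpu, int vector)
    {
            union numachip_csr_g3_ext_irq_gen int_gen;

            int_gen.s._destination_apic_id = per_cpu(x86_cpu_to_apicid, cpu);
            int_gen.s._vector  = vector;
            int_gen.s._msgtype = (vector == NMI_VECTOR ? APIC_DM_NMI
                                                       : APIC_DM_FIXED) >> 8;
            int_gen.s._index   = 0;

            write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v);
    }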
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index d50e3640d5a..521bead0113 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -26,6 +26,15 @@ static int bigsmp_apic_id_registered(void)
26 return 1; 26 return 1;
27} 27}
28 28
29static const struct cpumask *bigsmp_target_cpus(void)
30{
31#ifdef CONFIG_SMP
32 return cpu_online_mask;
33#else
34 return cpumask_of(0);
35#endif
36}
37
29static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid) 38static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid)
30{ 39{
31 return 0; 40 return 0;
@@ -96,6 +105,32 @@ static int bigsmp_check_phys_apicid_present(int phys_apicid)
96 return 1; 105 return 1;
97} 106}
98 107
108/* As we are using a single CPU as the destination, pick only one CPU here */
109static unsigned int bigsmp_cpu_mask_to_apicid(const struct cpumask *cpumask)
110{
111 int cpu = cpumask_first(cpumask);
112
113 if (cpu < nr_cpu_ids)
114 return cpu_physical_id(cpu);
115 return BAD_APICID;
116}
117
118static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
119 const struct cpumask *andmask)
120{
121 int cpu;
122
123 /*
124 * We're using fixed IRQ delivery, can only return one phys APIC ID.
125 * May as well be the first.
126 */
127 for_each_cpu_and(cpu, cpumask, andmask) {
128 if (cpumask_test_cpu(cpu, cpu_online_mask))
129 return cpu_physical_id(cpu);
130 }
131 return BAD_APICID;
132}
133
99static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) 134static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
100{ 135{
101 return cpuid_apic >> index_msb; 136 return cpuid_apic >> index_msb;
@@ -142,6 +177,12 @@ static const struct dmi_system_id bigsmp_dmi_table[] = {
142 { } /* NULL entry stops DMI scanning */ 177 { } /* NULL entry stops DMI scanning */
143}; 178};
144 179
180static void bigsmp_vector_allocation_domain(int cpu, struct cpumask *retmask)
181{
182 cpumask_clear(retmask);
183 cpumask_set_cpu(cpu, retmask);
184}
185
145static int probe_bigsmp(void) 186static int probe_bigsmp(void)
146{ 187{
147 if (def_to_bigsmp) 188 if (def_to_bigsmp)
@@ -157,20 +198,19 @@ static struct apic apic_bigsmp = {
157 .name = "bigsmp", 198 .name = "bigsmp",
158 .probe = probe_bigsmp, 199 .probe = probe_bigsmp,
159 .acpi_madt_oem_check = NULL, 200 .acpi_madt_oem_check = NULL,
160 .apic_id_valid = default_apic_id_valid,
161 .apic_id_registered = bigsmp_apic_id_registered, 201 .apic_id_registered = bigsmp_apic_id_registered,
162 202
163 .irq_delivery_mode = dest_Fixed, 203 .irq_delivery_mode = dest_Fixed,
164 /* phys delivery to target CPU: */ 204 /* phys delivery to target CPU: */
165 .irq_dest_mode = 0, 205 .irq_dest_mode = 0,
166 206
167 .target_cpus = default_target_cpus, 207 .target_cpus = bigsmp_target_cpus,
168 .disable_esr = 1, 208 .disable_esr = 1,
169 .dest_logical = 0, 209 .dest_logical = 0,
170 .check_apicid_used = bigsmp_check_apicid_used, 210 .check_apicid_used = bigsmp_check_apicid_used,
171 .check_apicid_present = bigsmp_check_apicid_present, 211 .check_apicid_present = bigsmp_check_apicid_present,
172 212
173 .vector_allocation_domain = default_vector_allocation_domain, 213 .vector_allocation_domain = bigsmp_vector_allocation_domain,
174 .init_apic_ldr = bigsmp_init_apic_ldr, 214 .init_apic_ldr = bigsmp_init_apic_ldr,
175 215
176 .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map, 216 .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map,
@@ -188,7 +228,8 @@ static struct apic apic_bigsmp = {
188 .set_apic_id = NULL, 228 .set_apic_id = NULL,
189 .apic_id_mask = 0xFF << 24, 229 .apic_id_mask = 0xFF << 24,
190 230
191 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, 231 .cpu_mask_to_apicid = bigsmp_cpu_mask_to_apicid,
232 .cpu_mask_to_apicid_and = bigsmp_cpu_mask_to_apicid_and,
192 233
193 .send_IPI_mask = bigsmp_send_IPI_mask, 234 .send_IPI_mask = bigsmp_send_IPI_mask,
194 .send_IPI_mask_allbutself = NULL, 235 .send_IPI_mask_allbutself = NULL,
@@ -206,7 +247,6 @@ static struct apic apic_bigsmp = {
206 247
207 .read = native_apic_mem_read, 248 .read = native_apic_mem_read,
208 .write = native_apic_mem_write, 249 .write = native_apic_mem_write,
209 .eoi_write = native_apic_mem_write,
210 .icr_read = native_apic_icr_read, 250 .icr_read = native_apic_icr_read,
211 .icr_write = native_apic_icr_write, 251 .icr_write = native_apic_icr_write,
212 .wait_icr_idle = native_apic_wait_icr_idle, 252 .wait_icr_idle = native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 0874799a98c..5d513bc47b6 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -394,6 +394,21 @@ static void es7000_enable_apic_mode(void)
394 WARN(1, "Command failed, status = %x\n", mip_status); 394 WARN(1, "Command failed, status = %x\n", mip_status);
395} 395}
396 396
397static void es7000_vector_allocation_domain(int cpu, struct cpumask *retmask)
398{
399 /* Careful. Some cpus do not strictly honor the set of cpus
400 * specified in the interrupt destination when using lowest
401 * priority interrupt delivery mode.
402 *
403 * In particular there was a hyperthreading cpu observed to
404 * deliver interrupts to the wrong hyperthread when only one
405 * hyperthread was specified in the interrupt destination.
406 */
407 cpumask_clear(retmask);
408 cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
409}
410
411
397static void es7000_wait_for_init_deassert(atomic_t *deassert) 412static void es7000_wait_for_init_deassert(atomic_t *deassert)
398{ 413{
399 while (!atomic_read(deassert)) 414 while (!atomic_read(deassert))
@@ -525,49 +540,45 @@ static int es7000_check_phys_apicid_present(int cpu_physical_apicid)
525 return 1; 540 return 1;
526} 541}
527 542
528static inline int 543static unsigned int es7000_cpu_mask_to_apicid(const struct cpumask *cpumask)
529es7000_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id)
530{ 544{
531 unsigned int round = 0; 545 unsigned int round = 0;
532 unsigned int cpu, uninitialized_var(apicid); 546 int cpu, uninitialized_var(apicid);
533 547
534 /* 548 /*
535 * The cpus in the mask must all be on the apic cluster. 549 * The cpus in the mask must all be on the apic cluster.
536 */ 550 */
537 for_each_cpu_and(cpu, cpumask, cpu_online_mask) { 551 for_each_cpu(cpu, cpumask) {
538 int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); 552 int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
539 553
540 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { 554 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
541 WARN(1, "Not a valid mask!"); 555 WARN(1, "Not a valid mask!");
542 556
543 return -EINVAL; 557 return BAD_APICID;
544 } 558 }
545 apicid |= new_apicid; 559 apicid = new_apicid;
546 round++; 560 round++;
547 } 561 }
548 if (!round) 562 return apicid;
549 return -EINVAL;
550 *dest_id = apicid;
551 return 0;
552} 563}
553 564
554static int 565static unsigned int
555es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask, 566es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask,
556 const struct cpumask *andmask, 567 const struct cpumask *andmask)
557 unsigned int *apicid)
558{ 568{
569 int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
559 cpumask_var_t cpumask; 570 cpumask_var_t cpumask;
560 *apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
561 571
562 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) 572 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
563 return 0; 573 return apicid;
564 574
565 cpumask_and(cpumask, inmask, andmask); 575 cpumask_and(cpumask, inmask, andmask);
566 es7000_cpu_mask_to_apicid(cpumask, apicid); 576 cpumask_and(cpumask, cpumask, cpu_online_mask);
577 apicid = es7000_cpu_mask_to_apicid(cpumask);
567 578
568 free_cpumask_var(cpumask); 579 free_cpumask_var(cpumask);
569 580
570 return 0; 581 return apicid;
571} 582}
572 583
573static int es7000_phys_pkg_id(int cpuid_apic, int index_msb) 584static int es7000_phys_pkg_id(int cpuid_apic, int index_msb)
@@ -614,7 +625,6 @@ static struct apic __refdata apic_es7000_cluster = {
614 .name = "es7000", 625 .name = "es7000",
615 .probe = probe_es7000, 626 .probe = probe_es7000,
616 .acpi_madt_oem_check = es7000_acpi_madt_oem_check_cluster, 627 .acpi_madt_oem_check = es7000_acpi_madt_oem_check_cluster,
617 .apic_id_valid = default_apic_id_valid,
618 .apic_id_registered = es7000_apic_id_registered, 628 .apic_id_registered = es7000_apic_id_registered,
619 629
620 .irq_delivery_mode = dest_LowestPrio, 630 .irq_delivery_mode = dest_LowestPrio,
@@ -627,7 +637,7 @@ static struct apic __refdata apic_es7000_cluster = {
627 .check_apicid_used = es7000_check_apicid_used, 637 .check_apicid_used = es7000_check_apicid_used,
628 .check_apicid_present = es7000_check_apicid_present, 638 .check_apicid_present = es7000_check_apicid_present,
629 639
630 .vector_allocation_domain = flat_vector_allocation_domain, 640 .vector_allocation_domain = es7000_vector_allocation_domain,
631 .init_apic_ldr = es7000_init_apic_ldr_cluster, 641 .init_apic_ldr = es7000_init_apic_ldr_cluster,
632 642
633 .ioapic_phys_id_map = es7000_ioapic_phys_id_map, 643 .ioapic_phys_id_map = es7000_ioapic_phys_id_map,
@@ -645,6 +655,7 @@ static struct apic __refdata apic_es7000_cluster = {
645 .set_apic_id = NULL, 655 .set_apic_id = NULL,
646 .apic_id_mask = 0xFF << 24, 656 .apic_id_mask = 0xFF << 24,
647 657
658 .cpu_mask_to_apicid = es7000_cpu_mask_to_apicid,
648 .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and, 659 .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and,
649 660
650 .send_IPI_mask = es7000_send_IPI_mask, 661 .send_IPI_mask = es7000_send_IPI_mask,
@@ -666,7 +677,6 @@ static struct apic __refdata apic_es7000_cluster = {
666 677
667 .read = native_apic_mem_read, 678 .read = native_apic_mem_read,
668 .write = native_apic_mem_write, 679 .write = native_apic_mem_write,
669 .eoi_write = native_apic_mem_write,
670 .icr_read = native_apic_icr_read, 680 .icr_read = native_apic_icr_read,
671 .icr_write = native_apic_icr_write, 681 .icr_write = native_apic_icr_write,
672 .wait_icr_idle = native_apic_wait_icr_idle, 682 .wait_icr_idle = native_apic_wait_icr_idle,
@@ -680,7 +690,6 @@ static struct apic __refdata apic_es7000 = {
680 .name = "es7000", 690 .name = "es7000",
681 .probe = probe_es7000, 691 .probe = probe_es7000,
682 .acpi_madt_oem_check = es7000_acpi_madt_oem_check, 692 .acpi_madt_oem_check = es7000_acpi_madt_oem_check,
683 .apic_id_valid = default_apic_id_valid,
684 .apic_id_registered = es7000_apic_id_registered, 693 .apic_id_registered = es7000_apic_id_registered,
685 694
686 .irq_delivery_mode = dest_Fixed, 695 .irq_delivery_mode = dest_Fixed,
@@ -693,7 +702,7 @@ static struct apic __refdata apic_es7000 = {
693 .check_apicid_used = es7000_check_apicid_used, 702 .check_apicid_used = es7000_check_apicid_used,
694 .check_apicid_present = es7000_check_apicid_present, 703 .check_apicid_present = es7000_check_apicid_present,
695 704
696 .vector_allocation_domain = flat_vector_allocation_domain, 705 .vector_allocation_domain = es7000_vector_allocation_domain,
697 .init_apic_ldr = es7000_init_apic_ldr, 706 .init_apic_ldr = es7000_init_apic_ldr,
698 707
699 .ioapic_phys_id_map = es7000_ioapic_phys_id_map, 708 .ioapic_phys_id_map = es7000_ioapic_phys_id_map,
@@ -711,6 +720,7 @@ static struct apic __refdata apic_es7000 = {
711 .set_apic_id = NULL, 720 .set_apic_id = NULL,
712 .apic_id_mask = 0xFF << 24, 721 .apic_id_mask = 0xFF << 24,
713 722
723 .cpu_mask_to_apicid = es7000_cpu_mask_to_apicid,
714 .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and, 724 .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and,
715 725
716 .send_IPI_mask = es7000_send_IPI_mask, 726 .send_IPI_mask = es7000_send_IPI_mask,
@@ -730,7 +740,6 @@ static struct apic __refdata apic_es7000 = {
730 740
731 .read = native_apic_mem_read, 741 .read = native_apic_mem_read,
732 .write = native_apic_mem_write, 742 .write = native_apic_mem_write,
733 .eoi_write = native_apic_mem_write,
734 .icr_read = native_apic_icr_read, 743 .icr_read = native_apic_icr_read,
735 .icr_write = native_apic_icr_write, 744 .icr_write = native_apic_icr_write,
736 .wait_icr_idle = native_apic_wait_icr_idle, 745 .wait_icr_idle = native_apic_wait_icr_idle,
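The es7000 mask-to-APIC-ID conversion above requires every CPU in the mask to live in the same APIC cluster; otherwise the mask cannot be expressed as a single logical destination. A sketch of that consistency walk (illustrative name, initialisation simplified):

    /* Return the logical APIC ID for a mask whose CPUs all share one
     * cluster, or BAD_APICID if the mask spans clusters. */
    static unsigned int mask_to_cluster_apicid(const struct cpumask *cpumask)
    {
            unsigned int round = 0;
            int cpu, apicid = 0;

            for_each_cpu(cpu, cpumask) {
                    int new = early_per_cpu(x86_cpu_to_logical_apicid, cpu);

                    if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new))
                            return BAD_APICID;      /* CPUs span two clusters */

                    apicid = new;
                    round++;
            }
            return apicid;
    }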
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 31cb9ae992b..d5e57db0f7b 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -60,10 +60,22 @@ void arch_trigger_all_cpu_backtrace(void)
60} 60}
61 61
62static int __kprobes 62static int __kprobes
63arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs) 63arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
64 unsigned long cmd, void *__args)
64{ 65{
66 struct die_args *args = __args;
67 struct pt_regs *regs;
65 int cpu; 68 int cpu;
66 69
70 switch (cmd) {
71 case DIE_NMI:
72 break;
73
74 default:
75 return NOTIFY_DONE;
76 }
77
78 regs = args->regs;
67 cpu = smp_processor_id(); 79 cpu = smp_processor_id();
68 80
69 if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { 81 if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
@@ -74,16 +86,21 @@ arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs)
74 show_regs(regs); 86 show_regs(regs);
75 arch_spin_unlock(&lock); 87 arch_spin_unlock(&lock);
76 cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); 88 cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
77 return NMI_HANDLED; 89 return NOTIFY_STOP;
78 } 90 }
79 91
80 return NMI_DONE; 92 return NOTIFY_DONE;
81} 93}
82 94
95static __read_mostly struct notifier_block backtrace_notifier = {
96 .notifier_call = arch_trigger_all_cpu_backtrace_handler,
97 .next = NULL,
98 .priority = NMI_LOCAL_LOW_PRIOR,
99};
100
83static int __init register_trigger_all_cpu_backtrace(void) 101static int __init register_trigger_all_cpu_backtrace(void)
84{ 102{
85 register_nmi_handler(NMI_LOCAL, arch_trigger_all_cpu_backtrace_handler, 103 register_die_notifier(&backtrace_notifier);
86 0, "arch_bt");
87 return 0; 104 return 0;
88} 105}
89early_initcall(register_trigger_all_cpu_backtrace); 106early_initcall(register_trigger_all_cpu_backtrace);
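The hw_nmi.c hunk swaps between two ways of hooking NMIs: the register_nmi_handler() interface with NMI_HANDLED/NMI_DONE return codes, and the older die-notifier chain keyed on DIE_NMI. A sketch of the register_nmi_handler() style shown in this hunk; work_for_this_cpu() and do_backtrace() are illustrative placeholders, not kernel functions:

    /* Minimal NMI handler registered via register_nmi_handler(). */
    static int my_nmi_handler(unsigned int cmd, struct pt_regs *regs)
    {
            if (!work_for_this_cpu())       /* placeholder predicate */
                    return NMI_DONE;        /* not ours, let others look */

            do_backtrace(regs);             /* placeholder action */
            return NMI_HANDLED;
    }

    static int __init my_nmi_init(void)
    {
            register_nmi_handler(NMI_LOCAL, my_nmi_handler, 0, "my_nmi");
            return 0;
    }
    early_initcall(my_nmi_init);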
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index b739d398bb2..620da6fed6b 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -64,26 +64,9 @@
64#include <asm/apic.h> 64#include <asm/apic.h>
65 65
66#define __apicdebuginit(type) static type __init 66#define __apicdebuginit(type) static type __init
67
68#define for_each_irq_pin(entry, head) \ 67#define for_each_irq_pin(entry, head) \
69 for (entry = head; entry; entry = entry->next) 68 for (entry = head; entry; entry = entry->next)
70 69
71#ifdef CONFIG_IRQ_REMAP
72static void irq_remap_modify_chip_defaults(struct irq_chip *chip);
73static inline bool irq_remapped(struct irq_cfg *cfg)
74{
75 return cfg->irq_2_iommu.iommu != NULL;
76}
77#else
78static inline bool irq_remapped(struct irq_cfg *cfg)
79{
80 return false;
81}
82static inline void irq_remap_modify_chip_defaults(struct irq_chip *chip)
83{
84}
85#endif
86
87/* 70/*
88 * Is the SiS APIC rmw bug present ? 71 * Is the SiS APIC rmw bug present ?
89 * -1 = don't know, 0 = no, 1 = yes 72 * -1 = don't know, 0 = no, 1 = yes
@@ -109,21 +92,21 @@ static struct ioapic {
109 DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); 92 DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
110} ioapics[MAX_IO_APICS]; 93} ioapics[MAX_IO_APICS];
111 94
112#define mpc_ioapic_ver(ioapic_idx) ioapics[ioapic_idx].mp_config.apicver 95#define mpc_ioapic_ver(id) ioapics[id].mp_config.apicver
113 96
114int mpc_ioapic_id(int ioapic_idx) 97int mpc_ioapic_id(int id)
115{ 98{
116 return ioapics[ioapic_idx].mp_config.apicid; 99 return ioapics[id].mp_config.apicid;
117} 100}
118 101
119unsigned int mpc_ioapic_addr(int ioapic_idx) 102unsigned int mpc_ioapic_addr(int id)
120{ 103{
121 return ioapics[ioapic_idx].mp_config.apicaddr; 104 return ioapics[id].mp_config.apicaddr;
122} 105}
123 106
124struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic_idx) 107struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int id)
125{ 108{
126 return &ioapics[ioapic_idx].gsi_config; 109 return &ioapics[id].gsi_config;
127} 110}
128 111
129int nr_ioapics; 112int nr_ioapics;
@@ -140,7 +123,7 @@ int mp_irq_entries;
140/* GSI interrupts */ 123/* GSI interrupts */
141static int nr_irqs_gsi = NR_IRQS_LEGACY; 124static int nr_irqs_gsi = NR_IRQS_LEGACY;
142 125
143#ifdef CONFIG_EISA 126#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
144int mp_bus_id_to_type[MAX_MP_BUSSES]; 127int mp_bus_id_to_type[MAX_MP_BUSSES];
145#endif 128#endif
146 129
@@ -203,15 +186,21 @@ static struct irq_pin_list *alloc_irq_pin_list(int node)
203 186
204 187
205/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ 188/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
189#ifdef CONFIG_SPARSE_IRQ
206static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY]; 190static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY];
191#else
192static struct irq_cfg irq_cfgx[NR_IRQS];
193#endif
207 194
208int __init arch_early_irq_init(void) 195int __init arch_early_irq_init(void)
209{ 196{
210 struct irq_cfg *cfg; 197 struct irq_cfg *cfg;
211 int count, node, i; 198 int count, node, i;
212 199
213 if (!legacy_pic->nr_legacy_irqs) 200 if (!legacy_pic->nr_legacy_irqs) {
201 nr_irqs_gsi = 0;
214 io_apic_irqs = ~0UL; 202 io_apic_irqs = ~0UL;
203 }
215 204
216 for (i = 0; i < nr_ioapics; i++) { 205 for (i = 0; i < nr_ioapics; i++) {
217 ioapics[i].saved_registers = 206 ioapics[i].saved_registers =
@@ -234,17 +223,18 @@ int __init arch_early_irq_init(void)
234 zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node); 223 zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node);
235 /* 224 /*
236 * For legacy IRQs, start with assigning irq0 to irq15 to 225 * For legacy IRQs, start with assigning irq0 to irq15 to
237 * IRQ0_VECTOR to IRQ15_VECTOR for all CPUs. 226 * IRQ0_VECTOR to IRQ15_VECTOR on cpu 0.
238 */ 227 */
239 if (i < legacy_pic->nr_legacy_irqs) { 228 if (i < legacy_pic->nr_legacy_irqs) {
240 cfg[i].vector = IRQ0_VECTOR + i; 229 cfg[i].vector = IRQ0_VECTOR + i;
241 cpumask_setall(cfg[i].domain); 230 cpumask_set_cpu(0, cfg[i].domain);
242 } 231 }
243 } 232 }
244 233
245 return 0; 234 return 0;
246} 235}
247 236
237#ifdef CONFIG_SPARSE_IRQ
248static struct irq_cfg *irq_cfg(unsigned int irq) 238static struct irq_cfg *irq_cfg(unsigned int irq)
249{ 239{
250 return irq_get_chip_data(irq); 240 return irq_get_chip_data(irq);
@@ -279,6 +269,22 @@ static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg)
279 kfree(cfg); 269 kfree(cfg);
280} 270}
281 271
272#else
273
274struct irq_cfg *irq_cfg(unsigned int irq)
275{
276 return irq < nr_irqs ? irq_cfgx + irq : NULL;
277}
278
279static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node)
280{
281 return irq_cfgx + irq;
282}
283
284static inline void free_irq_cfg(unsigned int at, struct irq_cfg *cfg) { }
285
286#endif
287
282static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node) 288static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
283{ 289{
284 int res = irq_alloc_desc_at(at, node); 290 int res = irq_alloc_desc_at(at, node);
@@ -311,7 +317,6 @@ static void free_irq_at(unsigned int at, struct irq_cfg *cfg)
311 irq_free_desc(at); 317 irq_free_desc(at);
312} 318}
313 319
314
315struct io_apic { 320struct io_apic {
316 unsigned int index; 321 unsigned int index;
317 unsigned int unused[3]; 322 unsigned int unused[3];
@@ -332,17 +337,16 @@ static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
332 writel(vector, &io_apic->eoi); 337 writel(vector, &io_apic->eoi);
333} 338}
334 339
335unsigned int native_io_apic_read(unsigned int apic, unsigned int reg) 340static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
336{ 341{
337 struct io_apic __iomem *io_apic = io_apic_base(apic); 342 struct io_apic __iomem *io_apic = io_apic_base(apic);
338 writel(reg, &io_apic->index); 343 writel(reg, &io_apic->index);
339 return readl(&io_apic->data); 344 return readl(&io_apic->data);
340} 345}
341 346
342void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) 347static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
343{ 348{
344 struct io_apic __iomem *io_apic = io_apic_base(apic); 349 struct io_apic __iomem *io_apic = io_apic_base(apic);
345
346 writel(reg, &io_apic->index); 350 writel(reg, &io_apic->index);
347 writel(value, &io_apic->data); 351 writel(value, &io_apic->data);
348} 352}
@@ -353,7 +357,7 @@ void native_io_apic_write(unsigned int apic, unsigned int reg, unsigned int valu
353 * 357 *
354 * Older SiS APIC requires we rewrite the index register 358 * Older SiS APIC requires we rewrite the index register
355 */ 359 */
356void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) 360static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
357{ 361{
358 struct io_apic __iomem *io_apic = io_apic_base(apic); 362 struct io_apic __iomem *io_apic = io_apic_base(apic);
359 363
@@ -362,30 +366,42 @@ void native_io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val
362 writel(value, &io_apic->data); 366 writel(value, &io_apic->data);
363} 367}
364 368
365union entry_union { 369static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
366 struct { u32 w1, w2; };
367 struct IO_APIC_route_entry entry;
368};
369
370static struct IO_APIC_route_entry __ioapic_read_entry(int apic, int pin)
371{ 370{
372 union entry_union eu; 371 struct irq_pin_list *entry;
372 unsigned long flags;
373 373
374 eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); 374 raw_spin_lock_irqsave(&ioapic_lock, flags);
375 eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); 375 for_each_irq_pin(entry, cfg->irq_2_pin) {
376 unsigned int reg;
377 int pin;
376 378
377 return eu.entry; 379 pin = entry->pin;
380 reg = io_apic_read(entry->apic, 0x10 + pin*2);
381 /* Is the remote IRR bit set? */
382 if (reg & IO_APIC_REDIR_REMOTE_IRR) {
383 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
384 return true;
385 }
386 }
387 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
388
389 return false;
378} 390}
379 391
392union entry_union {
393 struct { u32 w1, w2; };
394 struct IO_APIC_route_entry entry;
395};
396
380static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) 397static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
381{ 398{
382 union entry_union eu; 399 union entry_union eu;
383 unsigned long flags; 400 unsigned long flags;
384
385 raw_spin_lock_irqsave(&ioapic_lock, flags); 401 raw_spin_lock_irqsave(&ioapic_lock, flags);
386 eu.entry = __ioapic_read_entry(apic, pin); 402 eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
403 eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
387 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 404 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
388
389 return eu.entry; 405 return eu.entry;
390} 406}
391 407
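
ioapic_read_entry() relies on the entry_union idiom: two 32-bit reads at offsets 0x10 + 2*pin and 0x11 + 2*pin are overlaid on a single 64-bit routing entry, so no manual shifting is needed. Here is a hedged sketch with a simplified stand-in for IO_APIC_route_entry; route_entry, entry_words and read_entry are illustrative names, and the real structure is a dense bitfield rather than a raw 64-bit word.

/*
 * Illustrative only: assembling a redirection entry from its two
 * register halves through a union, as ioapic_read_entry() does above.
 */
#include <stdint.h>
#include <stdio.h>

struct route_entry {               /* simplified stand-in for IO_APIC_route_entry */
	uint64_t raw;
};

union entry_words {
	struct { uint32_t w1, w2; };   /* low dword, high dword */
	struct route_entry entry;
};

static struct route_entry read_entry(const uint32_t *regs, int pin)
{
	union entry_words eu;

	eu.w1 = regs[0x10 + 2 * pin];  /* vector, delivery mode, mask, ... */
	eu.w2 = regs[0x11 + 2 * pin];  /* destination field                */
	return eu.entry;
}

int main(void)
{
	uint32_t regs[0x40] = { 0 };

	regs[0x10] = 0x0000a030;       /* pin 0, low dword  */
	regs[0x11] = 0x01000000;       /* pin 0, high dword */
	printf("entry = %#llx\n", (unsigned long long)read_entry(regs, 0).raw);
	return 0;
}
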
@@ -395,7 +411,8 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
395 * the interrupt, and we need to make sure the entry is fully populated 411 * the interrupt, and we need to make sure the entry is fully populated
396 * before that happens. 412 * before that happens.
397 */ 413 */
398static void __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) 414static void
415__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
399{ 416{
400 union entry_union eu = {{0, 0}}; 417 union entry_union eu = {{0, 0}};
401 418
@@ -407,7 +424,6 @@ static void __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e
407static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) 424static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
408{ 425{
409 unsigned long flags; 426 unsigned long flags;
410
411 raw_spin_lock_irqsave(&ioapic_lock, flags); 427 raw_spin_lock_irqsave(&ioapic_lock, flags);
412 __ioapic_write_entry(apic, pin, e); 428 __ioapic_write_entry(apic, pin, e);
413 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 429 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
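
The comment above this hunk explains why __ioapic_write_entry() stores the high dword before the low dword: the low dword carries the mask bit, so it has to go last if the entry is never to be live with a half-written destination. A compact sketch of that ordering, reusing the same illustrative register-array convention as the read sketch above:

/*
 * Illustrative only: high dword (destination) first, then the low dword
 * that holds mask/trigger/vector.
 */
#include <stdint.h>
#include <stdio.h>

static void write_rte(uint32_t *regs, int pin, uint32_t lo, uint32_t hi)
{
	regs[0x11 + 2 * pin] = hi;   /* destination first           */
	regs[0x10 + 2 * pin] = lo;   /* mask/trigger/vector second  */
}

int main(void)
{
	uint32_t regs[0x40] = { 0 };

	write_rte(regs, 0, 0x0000a031, 0x01000000);
	printf("lo=%#x hi=%#x\n", (unsigned)regs[0x10], (unsigned)regs[0x11]);
	return 0;
}
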
@@ -434,7 +450,8 @@ static void ioapic_mask_entry(int apic, int pin)
434 * shared ISA-space IRQs, so we have to support them. We are super 450 * shared ISA-space IRQs, so we have to support them. We are super
435 * fast in the common case, and fast for shared ISA-space IRQs. 451 * fast in the common case, and fast for shared ISA-space IRQs.
436 */ 452 */
437static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) 453static int
454__add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
438{ 455{
439 struct irq_pin_list **last, *entry; 456 struct irq_pin_list **last, *entry;
440 457
@@ -448,8 +465,8 @@ static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pi
448 465
449 entry = alloc_irq_pin_list(node); 466 entry = alloc_irq_pin_list(node);
450 if (!entry) { 467 if (!entry) {
451 pr_err("can not alloc irq_pin_list (%d,%d,%d)\n", 468 printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n",
452 node, apic, pin); 469 node, apic, pin);
453 return -ENOMEM; 470 return -ENOMEM;
454 } 471 }
455 entry->apic = apic; 472 entry->apic = apic;
@@ -512,6 +529,18 @@ static void io_apic_modify_irq(struct irq_cfg *cfg,
512 __io_apic_modify_irq(entry, mask_and, mask_or, final); 529 __io_apic_modify_irq(entry, mask_and, mask_or, final);
513} 530}
514 531
532static void __mask_and_edge_IO_APIC_irq(struct irq_pin_list *entry)
533{
534 __io_apic_modify_irq(entry, ~IO_APIC_REDIR_LEVEL_TRIGGER,
535 IO_APIC_REDIR_MASKED, NULL);
536}
537
538static void __unmask_and_level_IO_APIC_irq(struct irq_pin_list *entry)
539{
540 __io_apic_modify_irq(entry, ~IO_APIC_REDIR_MASKED,
541 IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
542}
543
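
The two helpers added here are thin wrappers around __io_apic_modify_irq(entry, mask_and, mask_or, final), whose effect on the low dword is reg = (reg & mask_and) | mask_or: the first clears the level-trigger bit and sets the mask bit, the second does the reverse. A sketch of that convention on a plain word; the bit positions (mask in bit 16, trigger mode in bit 15 of the RTE low dword) follow the IO-APIC register layout, and the helper names are illustrative.

/*
 * Illustrative only: the (reg & mask_and) | mask_or convention behind
 * __mask_and_edge_IO_APIC_irq() and __unmask_and_level_IO_APIC_irq().
 */
#include <stdint.h>
#include <stdio.h>

#define REDIR_MASKED        (1u << 16)
#define REDIR_LEVEL_TRIGGER (1u << 15)

static uint32_t modify(uint32_t reg, uint32_t mask_and, uint32_t mask_or)
{
	return (reg & mask_and) | mask_or;
}

int main(void)
{
	uint32_t reg = REDIR_LEVEL_TRIGGER | 0x30;   /* level triggered, vector 0x30 */

	/* mask_and_edge: drop to edge mode and mask the pin */
	reg = modify(reg, ~REDIR_LEVEL_TRIGGER, REDIR_MASKED);
	printf("masked+edge:    %#x\n", reg);

	/* unmask_and_level: restore level mode and unmask */
	reg = modify(reg, ~REDIR_MASKED, REDIR_LEVEL_TRIGGER);
	printf("level+unmasked: %#x\n", reg);
	return 0;
}
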
515static void io_apic_sync(struct irq_pin_list *entry) 544static void io_apic_sync(struct irq_pin_list *entry)
516{ 545{
517 /* 546 /*
@@ -519,7 +548,6 @@ static void io_apic_sync(struct irq_pin_list *entry)
519 * a dummy read from the IO-APIC 548 * a dummy read from the IO-APIC
520 */ 549 */
521 struct io_apic __iomem *io_apic; 550 struct io_apic __iomem *io_apic;
522
523 io_apic = io_apic_base(entry->apic); 551 io_apic = io_apic_base(entry->apic);
524 readl(&io_apic->data); 552 readl(&io_apic->data);
525} 553}
@@ -557,66 +585,6 @@ static void unmask_ioapic_irq(struct irq_data *data)
557 unmask_ioapic(data->chip_data); 585 unmask_ioapic(data->chip_data);
558} 586}
559 587
560/*
561 * IO-APIC versions below 0x20 don't support EOI register.
562 * For the record, here is the information about various versions:
563 * 0Xh 82489DX
564 * 1Xh I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant
565 * 2Xh I/O(x)APIC which is PCI 2.2 Compliant
566 * 30h-FFh Reserved
567 *
568 * Some of the Intel ICH Specs (ICH2 to ICH5) document the io-apic
569 * version as 0x2. This is an error in the documentation and these ICH chips
570 * use io-apics of version 0x20.
571 *
572 * For IO-APIC's with EOI register, we use that to do an explicit EOI.
573 * Otherwise, we simulate the EOI message manually by changing the trigger
574 * mode to edge and then back to level, with RTE being masked during this.
575 */
576static void __eoi_ioapic_pin(int apic, int pin, int vector, struct irq_cfg *cfg)
577{
578 if (mpc_ioapic_ver(apic) >= 0x20) {
579 /*
580 * Intr-remapping uses pin number as the virtual vector
581 * in the RTE. Actual vector is programmed in
582 * intr-remapping table entry. Hence for the io-apic
583 * EOI we use the pin number.
584 */
585 if (cfg && irq_remapped(cfg))
586 io_apic_eoi(apic, pin);
587 else
588 io_apic_eoi(apic, vector);
589 } else {
590 struct IO_APIC_route_entry entry, entry1;
591
592 entry = entry1 = __ioapic_read_entry(apic, pin);
593
594 /*
595 * Mask the entry and change the trigger mode to edge.
596 */
597 entry1.mask = 1;
598 entry1.trigger = IOAPIC_EDGE;
599
600 __ioapic_write_entry(apic, pin, entry1);
601
602 /*
603 * Restore the previous level triggered entry.
604 */
605 __ioapic_write_entry(apic, pin, entry);
606 }
607}
608
609static void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
610{
611 struct irq_pin_list *entry;
612 unsigned long flags;
613
614 raw_spin_lock_irqsave(&ioapic_lock, flags);
615 for_each_irq_pin(entry, cfg->irq_2_pin)
616 __eoi_ioapic_pin(entry->apic, entry->pin, cfg->vector, cfg);
617 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
618}
619
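
The __eoi_ioapic_pin()/eoi_ioapic_irq() pair being removed here, together with the version comment above it, describes the fallback for IO-APICs older than 0x20 that have no EOI register: mask the entry, program it to edge mode so the chip drops its Remote IRR state, then write the saved level-triggered entry back. The tiny model below mimics that sequence in software; the rte structure and write_rte name are purely illustrative, and the assumption that edge mode clears the latched Remote IRR is taken from the comment above.

/*
 * Illustrative only: the manual EOI sequence for IO-APICs without an
 * EOI register (mask, flip to edge, restore the saved level entry).
 */
#include <stdbool.h>
#include <stdio.h>

struct rte {
	bool mask;
	bool level;       /* true = level triggered */
	bool remote_irr;  /* latched level interrupt awaiting EOI */
};

static void write_rte(struct rte *hw, struct rte val)
{
	hw->mask  = val.mask;
	hw->level = val.level;
	/* Remote IRR is read-only to software; in this model it is dropped
	 * when the pin is programmed to edge mode, which is the behaviour
	 * the manual-EOI path relies on. */
	if (!hw->level)
		hw->remote_irr = false;
}

int main(void)
{
	struct rte hw = { .mask = false, .level = true, .remote_irr = true };
	struct rte saved = hw, tmp = hw;

	tmp.mask  = true;         /* 1. mask the pin                  */
	tmp.level = false;        /* 2. flip it to edge mode          */
	write_rte(&hw, tmp);      /*    -> Remote IRR is dropped      */
	write_rte(&hw, saved);    /* 3. restore the saved level entry */

	printf("remote_irr now %d\n", hw.remote_irr);
	return 0;
}
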
620static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) 588static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
621{ 589{
622 struct IO_APIC_route_entry entry; 590 struct IO_APIC_route_entry entry;
@@ -625,44 +593,10 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
625 entry = ioapic_read_entry(apic, pin); 593 entry = ioapic_read_entry(apic, pin);
626 if (entry.delivery_mode == dest_SMI) 594 if (entry.delivery_mode == dest_SMI)
627 return; 595 return;
628
629 /*
630 * Make sure the entry is masked and re-read the contents to check
631 * if it is a level triggered pin and if the remote-IRR is set.
632 */
633 if (!entry.mask) {
634 entry.mask = 1;
635 ioapic_write_entry(apic, pin, entry);
636 entry = ioapic_read_entry(apic, pin);
637 }
638
639 if (entry.irr) {
640 unsigned long flags;
641
642 /*
643 * Make sure the trigger mode is set to level. Explicit EOI
644 * doesn't clear the remote-IRR if the trigger mode is not
645 * set to level.
646 */
647 if (!entry.trigger) {
648 entry.trigger = IOAPIC_LEVEL;
649 ioapic_write_entry(apic, pin, entry);
650 }
651
652 raw_spin_lock_irqsave(&ioapic_lock, flags);
653 __eoi_ioapic_pin(apic, pin, entry.vector, NULL);
654 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
655 }
656
657 /* 596 /*
658 * Clear the rest of the bits in the IO-APIC RTE except for the mask 597 * Disable it in the IO-APIC irq-routing table:
659 * bit.
660 */ 598 */
661 ioapic_mask_entry(apic, pin); 599 ioapic_mask_entry(apic, pin);
662 entry = ioapic_read_entry(apic, pin);
663 if (entry.irr)
664 pr_err("Unable to reset IRR for apic: %d, pin :%d\n",
665 mpc_ioapic_id(apic), pin);
666} 600}
667 601
668static void clear_IO_APIC (void) 602static void clear_IO_APIC (void)
@@ -778,13 +712,13 @@ int restore_ioapic_entries(void)
778/* 712/*
779 * Find the IRQ entry number of a certain pin. 713 * Find the IRQ entry number of a certain pin.
780 */ 714 */
781static int find_irq_entry(int ioapic_idx, int pin, int type) 715static int find_irq_entry(int apic, int pin, int type)
782{ 716{
783 int i; 717 int i;
784 718
785 for (i = 0; i < mp_irq_entries; i++) 719 for (i = 0; i < mp_irq_entries; i++)
786 if (mp_irqs[i].irqtype == type && 720 if (mp_irqs[i].irqtype == type &&
787 (mp_irqs[i].dstapic == mpc_ioapic_id(ioapic_idx) || 721 (mp_irqs[i].dstapic == mpc_ioapic_id(apic) ||
788 mp_irqs[i].dstapic == MP_APIC_ALL) && 722 mp_irqs[i].dstapic == MP_APIC_ALL) &&
789 mp_irqs[i].dstirq == pin) 723 mp_irqs[i].dstirq == pin)
790 return i; 724 return i;
@@ -823,19 +757,18 @@ static int __init find_isa_irq_apic(int irq, int type)
823 (mp_irqs[i].srcbusirq == irq)) 757 (mp_irqs[i].srcbusirq == irq))
824 break; 758 break;
825 } 759 }
826
827 if (i < mp_irq_entries) { 760 if (i < mp_irq_entries) {
828 int ioapic_idx; 761 int apic;
829 762 for(apic = 0; apic < nr_ioapics; apic++) {
830 for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) 763 if (mpc_ioapic_id(apic) == mp_irqs[i].dstapic)
831 if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic) 764 return apic;
832 return ioapic_idx; 765 }
833 } 766 }
834 767
835 return -1; 768 return -1;
836} 769}
837 770
838#ifdef CONFIG_EISA 771#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
839/* 772/*
840 * EISA Edge/Level control register, ELCR 773 * EISA Edge/Level control register, ELCR
841 */ 774 */
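
The ELCR consulted by the EISA_ELCR() helper in the next hunk is a pair of I/O ports, 0x4d0 for IRQs 0-7 and 0x4d1 for IRQs 8-15, with one bit per IRQ and a set bit meaning level triggered. The sketch below works on two bytes already read from those ports; elcr_irq_is_level is an illustrative name and the port values are invented.

/*
 * Illustrative only: deciding edge vs. level for an ISA/EISA IRQ from
 * the two ELCR bytes.
 */
#include <stdbool.h>
#include <stdio.h>

static bool elcr_irq_is_level(unsigned char elcr_lo, unsigned char elcr_hi,
			      unsigned int irq)
{
	unsigned char byte = (irq < 8) ? elcr_lo : elcr_hi;

	return byte & (1u << (irq & 7));
}

int main(void)
{
	/* say port 0x4d0 read back 0x20: only IRQ 5 is level triggered */
	printf("IRQ 5 level? %d\n", elcr_irq_is_level(0x20, 0x00, 5));
	printf("IRQ 3 level? %d\n", elcr_irq_is_level(0x20, 0x00, 3));
	return 0;
}
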
@@ -872,6 +805,12 @@ static int EISA_ELCR(unsigned int irq)
872#define default_PCI_trigger(idx) (1) 805#define default_PCI_trigger(idx) (1)
873#define default_PCI_polarity(idx) (1) 806#define default_PCI_polarity(idx) (1)
874 807
808/* MCA interrupts are always polarity zero level triggered,
809 * when listed as conforming in the MP table. */
810
811#define default_MCA_trigger(idx) (1)
812#define default_MCA_polarity(idx) default_ISA_polarity(idx)
813
875static int irq_polarity(int idx) 814static int irq_polarity(int idx)
876{ 815{
877 int bus = mp_irqs[idx].srcbus; 816 int bus = mp_irqs[idx].srcbus;
@@ -895,7 +834,7 @@ static int irq_polarity(int idx)
895 } 834 }
896 case 2: /* reserved */ 835 case 2: /* reserved */
897 { 836 {
898 pr_warn("broken BIOS!!\n"); 837 printk(KERN_WARNING "broken BIOS!!\n");
899 polarity = 1; 838 polarity = 1;
900 break; 839 break;
901 } 840 }
@@ -906,7 +845,7 @@ static int irq_polarity(int idx)
906 } 845 }
907 default: /* invalid */ 846 default: /* invalid */
908 { 847 {
909 pr_warn("broken BIOS!!\n"); 848 printk(KERN_WARNING "broken BIOS!!\n");
910 polarity = 1; 849 polarity = 1;
911 break; 850 break;
912 } 851 }
@@ -929,7 +868,7 @@ static int irq_trigger(int idx)
929 trigger = default_ISA_trigger(idx); 868 trigger = default_ISA_trigger(idx);
930 else 869 else
931 trigger = default_PCI_trigger(idx); 870 trigger = default_PCI_trigger(idx);
932#ifdef CONFIG_EISA 871#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
933 switch (mp_bus_id_to_type[bus]) { 872 switch (mp_bus_id_to_type[bus]) {
934 case MP_BUS_ISA: /* ISA pin */ 873 case MP_BUS_ISA: /* ISA pin */
935 { 874 {
@@ -946,9 +885,14 @@ static int irq_trigger(int idx)
946 /* set before the switch */ 885 /* set before the switch */
947 break; 886 break;
948 } 887 }
888 case MP_BUS_MCA: /* MCA pin */
889 {
890 trigger = default_MCA_trigger(idx);
891 break;
892 }
949 default: 893 default:
950 { 894 {
951 pr_warn("broken BIOS!!\n"); 895 printk(KERN_WARNING "broken BIOS!!\n");
952 trigger = 1; 896 trigger = 1;
953 break; 897 break;
954 } 898 }
@@ -962,7 +906,7 @@ static int irq_trigger(int idx)
962 } 906 }
963 case 2: /* reserved */ 907 case 2: /* reserved */
964 { 908 {
965 pr_warn("broken BIOS!!\n"); 909 printk(KERN_WARNING "broken BIOS!!\n");
966 trigger = 1; 910 trigger = 1;
967 break; 911 break;
968 } 912 }
@@ -973,7 +917,7 @@ static int irq_trigger(int idx)
973 } 917 }
974 default: /* invalid */ 918 default: /* invalid */
975 { 919 {
976 pr_warn("broken BIOS!!\n"); 920 printk(KERN_WARNING "broken BIOS!!\n");
977 trigger = 0; 921 trigger = 0;
978 break; 922 break;
979 } 923 }
@@ -991,7 +935,7 @@ static int pin_2_irq(int idx, int apic, int pin)
991 * Debugging check, we are in big trouble if this message pops up! 935 * Debugging check, we are in big trouble if this message pops up!
992 */ 936 */
993 if (mp_irqs[idx].dstirq != pin) 937 if (mp_irqs[idx].dstirq != pin)
994 pr_err("broken BIOS or MPTABLE parser, ayiee!!\n"); 938 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
995 939
996 if (test_bit(bus, mp_bus_not_pci)) { 940 if (test_bit(bus, mp_bus_not_pci)) {
997 irq = mp_irqs[idx].srcbusirq; 941 irq = mp_irqs[idx].srcbusirq;
@@ -1033,7 +977,7 @@ static int pin_2_irq(int idx, int apic, int pin)
1033int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin, 977int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
1034 struct io_apic_irq_attr *irq_attr) 978 struct io_apic_irq_attr *irq_attr)
1035{ 979{
1036 int ioapic_idx, i, best_guess = -1; 980 int apic, i, best_guess = -1;
1037 981
1038 apic_printk(APIC_DEBUG, 982 apic_printk(APIC_DEBUG,
1039 "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", 983 "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
@@ -1046,8 +990,8 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
1046 for (i = 0; i < mp_irq_entries; i++) { 990 for (i = 0; i < mp_irq_entries; i++) {
1047 int lbus = mp_irqs[i].srcbus; 991 int lbus = mp_irqs[i].srcbus;
1048 992
1049 for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) 993 for (apic = 0; apic < nr_ioapics; apic++)
1050 if (mpc_ioapic_id(ioapic_idx) == mp_irqs[i].dstapic || 994 if (mpc_ioapic_id(apic) == mp_irqs[i].dstapic ||
1051 mp_irqs[i].dstapic == MP_APIC_ALL) 995 mp_irqs[i].dstapic == MP_APIC_ALL)
1052 break; 996 break;
1053 997
@@ -1055,13 +999,13 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
1055 !mp_irqs[i].irqtype && 999 !mp_irqs[i].irqtype &&
1056 (bus == lbus) && 1000 (bus == lbus) &&
1057 (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) { 1001 (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
1058 int irq = pin_2_irq(i, ioapic_idx, mp_irqs[i].dstirq); 1002 int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq);
1059 1003
1060 if (!(ioapic_idx || IO_APIC_IRQ(irq))) 1004 if (!(apic || IO_APIC_IRQ(irq)))
1061 continue; 1005 continue;
1062 1006
1063 if (pin == (mp_irqs[i].srcbusirq & 3)) { 1007 if (pin == (mp_irqs[i].srcbusirq & 3)) {
1064 set_io_apic_irq_attr(irq_attr, ioapic_idx, 1008 set_io_apic_irq_attr(irq_attr, apic,
1065 mp_irqs[i].dstirq, 1009 mp_irqs[i].dstirq,
1066 irq_trigger(i), 1010 irq_trigger(i),
1067 irq_polarity(i)); 1011 irq_polarity(i));
@@ -1072,7 +1016,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
1072 * best-guess fuzzy result for broken mptables. 1016 * best-guess fuzzy result for broken mptables.
1073 */ 1017 */
1074 if (best_guess < 0) { 1018 if (best_guess < 0) {
1075 set_io_apic_irq_attr(irq_attr, ioapic_idx, 1019 set_io_apic_irq_attr(irq_attr, apic,
1076 mp_irqs[i].dstirq, 1020 mp_irqs[i].dstirq,
1077 irq_trigger(i), 1021 irq_trigger(i),
1078 irq_polarity(i)); 1022 irq_polarity(i));
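
IO_APIC_get_PCI_irq_vector() above relies on how srcbusirq is packed for PCI buses: bits 1:0 hold the interrupt pin (zero based, so 0 stands for INTA here) and bits 6:2 hold the device/slot number, which is exactly what the (srcbusirq >> 2) & 0x1f and (srcbusirq & 3) tests extract. A small decoder for that encoding; pci_src and decode_srcbusirq are illustrative names.

/*
 * Illustrative only: decoding the MP-table srcbusirq field for a PCI bus.
 */
#include <stdio.h>

struct pci_src { unsigned slot, pin; };

static struct pci_src decode_srcbusirq(unsigned srcbusirq)
{
	struct pci_src s = {
		.slot = (srcbusirq >> 2) & 0x1f,
		.pin  =  srcbusirq       & 0x3,
	};
	return s;
}

int main(void)
{
	struct pci_src s = decode_srcbusirq(0x3a);  /* 0b111010 */

	printf("slot %u, pin INT%c\n", s.slot, 'A' + s.pin);
	return 0;
}
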
@@ -1112,7 +1056,8 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1112 * 0x80, because int 0x80 is hm, kind of importantish. ;) 1056 * 0x80, because int 0x80 is hm, kind of importantish. ;)
1113 */ 1057 */
1114 static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START; 1058 static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
1115 static int current_offset = VECTOR_OFFSET_START % 16; 1059 static int current_offset = VECTOR_OFFSET_START % 8;
1060 unsigned int old_vector;
1116 int cpu, err; 1061 int cpu, err;
1117 cpumask_var_t tmp_mask; 1062 cpumask_var_t tmp_mask;
1118 1063
@@ -1122,46 +1067,35 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1122 if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) 1067 if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
1123 return -ENOMEM; 1068 return -ENOMEM;
1124 1069
1070 old_vector = cfg->vector;
1071 if (old_vector) {
1072 cpumask_and(tmp_mask, mask, cpu_online_mask);
1073 cpumask_and(tmp_mask, cfg->domain, tmp_mask);
1074 if (!cpumask_empty(tmp_mask)) {
1075 free_cpumask_var(tmp_mask);
1076 return 0;
1077 }
1078 }
1079
1125 /* Only try and allocate irqs on cpus that are present */ 1080 /* Only try and allocate irqs on cpus that are present */
1126 err = -ENOSPC; 1081 err = -ENOSPC;
1127 cpumask_clear(cfg->old_domain); 1082 for_each_cpu_and(cpu, mask, cpu_online_mask) {
1128 cpu = cpumask_first_and(mask, cpu_online_mask); 1083 int new_cpu;
1129 while (cpu < nr_cpu_ids) { 1084 int vector, offset;
1130 int new_cpu, vector, offset;
1131 1085
1132 apic->vector_allocation_domain(cpu, tmp_mask, mask); 1086 apic->vector_allocation_domain(cpu, tmp_mask);
1133
1134 if (cpumask_subset(tmp_mask, cfg->domain)) {
1135 err = 0;
1136 if (cpumask_equal(tmp_mask, cfg->domain))
1137 break;
1138 /*
1139 * New cpumask using the vector is a proper subset of
1140 * the current in use mask. So cleanup the vector
1141 * allocation for the members that are not used anymore.
1142 */
1143 cpumask_andnot(cfg->old_domain, cfg->domain, tmp_mask);
1144 cfg->move_in_progress =
1145 cpumask_intersects(cfg->old_domain, cpu_online_mask);
1146 cpumask_and(cfg->domain, cfg->domain, tmp_mask);
1147 break;
1148 }
1149 1087
1150 vector = current_vector; 1088 vector = current_vector;
1151 offset = current_offset; 1089 offset = current_offset;
1152next: 1090next:
1153 vector += 16; 1091 vector += 8;
1154 if (vector >= first_system_vector) { 1092 if (vector >= first_system_vector) {
1155 offset = (offset + 1) % 16; 1093 /* If out of vectors on large boxen, must share them. */
1094 offset = (offset + 1) % 8;
1156 vector = FIRST_EXTERNAL_VECTOR + offset; 1095 vector = FIRST_EXTERNAL_VECTOR + offset;
1157 } 1096 }
1158 1097 if (unlikely(current_vector == vector))
1159 if (unlikely(current_vector == vector)) {
1160 cpumask_or(cfg->old_domain, cfg->old_domain, tmp_mask);
1161 cpumask_andnot(tmp_mask, mask, cfg->old_domain);
1162 cpu = cpumask_first_and(tmp_mask, cpu_online_mask);
1163 continue; 1098 continue;
1164 }
1165 1099
1166 if (test_bit(vector, used_vectors)) 1100 if (test_bit(vector, used_vectors))
1167 goto next; 1101 goto next;
@@ -1172,10 +1106,9 @@ next:
1172 /* Found one! */ 1106 /* Found one! */
1173 current_vector = vector; 1107 current_vector = vector;
1174 current_offset = offset; 1108 current_offset = offset;
1175 if (cfg->vector) { 1109 if (old_vector) {
1110 cfg->move_in_progress = 1;
1176 cpumask_copy(cfg->old_domain, cfg->domain); 1111 cpumask_copy(cfg->old_domain, cfg->domain);
1177 cfg->move_in_progress =
1178 cpumask_intersects(cfg->old_domain, cpu_online_mask);
1179 } 1112 }
1180 for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) 1113 for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
1181 per_cpu(vector_irq, new_cpu)[vector] = irq; 1114 per_cpu(vector_irq, new_cpu)[vector] = irq;
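
The search in __assign_irq_vector() above probes vectors starting just after the last one handed out, stepping by a fixed stride (16 on one side of this hunk, 8 on the other) so successive allocations are spread across the vector space; when the top is reached the walk wraps to the next offset, and a return to the starting vector means the space is exhausted. Reduced to a plain array it looks like the sketch below; the constants are illustrative, not the kernel's.

/*
 * Illustrative only: the round-robin vector search, without cpumasks or
 * system-vector exclusions.
 */
#include <stdbool.h>
#include <stdio.h>

#define FIRST_VECTOR 0x20
#define NR_VECTORS   0x100
#define STRIDE       16

static int current_vector = FIRST_VECTOR + STRIDE;
static int current_offset;

static int alloc_vector(const bool *used)
{
	int vector = current_vector, offset = current_offset;

	do {
		vector += STRIDE;
		if (vector >= NR_VECTORS) {          /* wrap to the next offset */
			offset = (offset + 1) % STRIDE;
			vector = FIRST_VECTOR + offset;
		}
		if (vector == current_vector)        /* walked the whole space */
			return -1;
	} while (used[vector]);

	current_vector = vector;
	current_offset = offset;
	return vector;
}

int main(void)
{
	bool used[NR_VECTORS] = { false };
	int v1 = alloc_vector(used);
	int v2 = alloc_vector(used);

	printf("handed out 0x%x then 0x%x\n", (unsigned)v1, (unsigned)v2);
	return 0;
}
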
@@ -1243,6 +1176,12 @@ void __setup_vector_irq(int cpu)
1243 cfg = irq_get_chip_data(irq); 1176 cfg = irq_get_chip_data(irq);
1244 if (!cfg) 1177 if (!cfg)
1245 continue; 1178 continue;
1179 /*
1180 * If it is a legacy IRQ handled by the legacy PIC, this cpu
1181 * will be part of the irq_cfg's domain.
1182 */
1183 if (irq < legacy_pic->nr_legacy_irqs && !IO_APIC_IRQ(irq))
1184 cpumask_set_cpu(cpu, cfg->domain);
1246 1185
1247 if (!cpumask_test_cpu(cpu, cfg->domain)) 1186 if (!cpumask_test_cpu(cpu, cfg->domain))
1248 continue; 1187 continue;
@@ -1315,100 +1254,142 @@ static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg,
1315 fasteoi ? "fasteoi" : "edge"); 1254 fasteoi ? "fasteoi" : "edge");
1316} 1255}
1317 1256
1318static int setup_ioapic_entry(int irq, struct IO_APIC_route_entry *entry, 1257static int setup_ioapic_entry(int apic_id, int irq,
1319 unsigned int destination, int vector, 1258 struct IO_APIC_route_entry *entry,
1320 struct io_apic_irq_attr *attr) 1259 unsigned int destination, int trigger,
1260 int polarity, int vector, int pin)
1321{ 1261{
1322 if (irq_remapping_enabled) 1262 /*
1323 return setup_ioapic_remapped_entry(irq, entry, destination, 1263 * add it to the IO-APIC irq-routing table:
1324 vector, attr); 1264 */
1265 memset(entry,0,sizeof(*entry));
1325 1266
1326 memset(entry, 0, sizeof(*entry)); 1267 if (intr_remapping_enabled) {
1268 struct intel_iommu *iommu = map_ioapic_to_ir(apic_id);
1269 struct irte irte;
1270 struct IR_IO_APIC_route_entry *ir_entry =
1271 (struct IR_IO_APIC_route_entry *) entry;
1272 int index;
1327 1273
1328 entry->delivery_mode = apic->irq_delivery_mode; 1274 if (!iommu)
1329 entry->dest_mode = apic->irq_dest_mode; 1275 panic("No mapping iommu for ioapic %d\n", apic_id);
1330 entry->dest = destination;
1331 entry->vector = vector;
1332 entry->mask = 0; /* enable IRQ */
1333 entry->trigger = attr->trigger;
1334 entry->polarity = attr->polarity;
1335 1276
1336 /* 1277 index = alloc_irte(iommu, irq, 1);
1337 * Mask level triggered irqs. 1278 if (index < 0)
1279 panic("Failed to allocate IRTE for ioapic %d\n", apic_id);
1280
1281 prepare_irte(&irte, vector, destination);
1282
1283 /* Set source-id of interrupt request */
1284 set_ioapic_sid(&irte, apic_id);
1285
1286 modify_irte(irq, &irte);
1287
1288 ir_entry->index2 = (index >> 15) & 0x1;
1289 ir_entry->zero = 0;
1290 ir_entry->format = 1;
1291 ir_entry->index = (index & 0x7fff);
1292 /*
1293 * IO-APIC RTE will be configured with virtual vector.
1294 * irq handler will do the explicit EOI to the io-apic.
1295 */
1296 ir_entry->vector = pin;
1297
1298 apic_printk(APIC_VERBOSE, KERN_DEBUG "IOAPIC[%d]: "
1299 "Set IRTE entry (P:%d FPD:%d Dst_Mode:%d "
1300 "Redir_hint:%d Trig_Mode:%d Dlvry_Mode:%X "
1301 "Avail:%X Vector:%02X Dest:%08X "
1302 "SID:%04X SQ:%X SVT:%X)\n",
1303 apic_id, irte.present, irte.fpd, irte.dst_mode,
1304 irte.redir_hint, irte.trigger_mode, irte.dlvry_mode,
1305 irte.avail, irte.vector, irte.dest_id,
1306 irte.sid, irte.sq, irte.svt);
1307 } else {
1308 entry->delivery_mode = apic->irq_delivery_mode;
1309 entry->dest_mode = apic->irq_dest_mode;
1310 entry->dest = destination;
1311 entry->vector = vector;
1312 }
1313
1314 entry->mask = 0; /* enable IRQ */
1315 entry->trigger = trigger;
1316 entry->polarity = polarity;
1317
1318 /* Mask level triggered irqs.
1338 * Use IRQ_DELAYED_DISABLE for edge triggered irqs. 1319 * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
1339 */ 1320 */
1340 if (attr->trigger) 1321 if (trigger)
1341 entry->mask = 1; 1322 entry->mask = 1;
1342
1343 return 0; 1323 return 0;
1344} 1324}
1345 1325
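
On the interrupt-remapped path of setup_ioapic_entry() above, the IRTE handle returned by alloc_irte() is stored as a 15-bit index field plus a separate index2 bit in the IR format of the RTE, so the value is split and later reassembled. A stand-alone sketch of that split; ir_fields, split_irte_index and join_irte_index are illustrative names.

/*
 * Illustrative only: splitting a 16-bit IRTE handle into index (low 15
 * bits) and index2 (bit 15), and joining it back.
 */
#include <stdint.h>
#include <stdio.h>

struct ir_fields {
	uint16_t index;   /* low 15 bits of the IRTE handle */
	uint8_t  index2;  /* bit 15 of the IRTE handle      */
};

static struct ir_fields split_irte_index(unsigned int handle)
{
	struct ir_fields f = {
		.index  = handle & 0x7fff,
		.index2 = (handle >> 15) & 0x1,
	};
	return f;
}

static unsigned int join_irte_index(struct ir_fields f)
{
	return ((unsigned int)f.index2 << 15) | f.index;
}

int main(void)
{
	struct ir_fields f = split_irte_index(0x9abc);

	printf("index=%#x index2=%u rejoined=%#x\n",
	       (unsigned)f.index, (unsigned)f.index2, join_irte_index(f));
	return 0;
}
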
1346static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg, 1326static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq,
1347 struct io_apic_irq_attr *attr) 1327 struct irq_cfg *cfg, int trigger, int polarity)
1348{ 1328{
1349 struct IO_APIC_route_entry entry; 1329 struct IO_APIC_route_entry entry;
1350 unsigned int dest; 1330 unsigned int dest;
1351 1331
1352 if (!IO_APIC_IRQ(irq)) 1332 if (!IO_APIC_IRQ(irq))
1353 return; 1333 return;
1334 /*
1335 * For legacy irqs, cfg->domain starts with cpu 0 for legacy
1336 * controllers like 8259. Now that IO-APIC can handle this irq, update
1337 * the cfg->domain.
1338 */
1339 if (irq < legacy_pic->nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain))
1340 apic->vector_allocation_domain(0, cfg->domain);
1354 1341
1355 if (assign_irq_vector(irq, cfg, apic->target_cpus())) 1342 if (assign_irq_vector(irq, cfg, apic->target_cpus()))
1356 return; 1343 return;
1357 1344
1358 if (apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus(), 1345 dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
1359 &dest)) {
1360 pr_warn("Failed to obtain apicid for ioapic %d, pin %d\n",
1361 mpc_ioapic_id(attr->ioapic), attr->ioapic_pin);
1362 __clear_irq_vector(irq, cfg);
1363
1364 return;
1365 }
1366 1346
1367 apic_printk(APIC_VERBOSE,KERN_DEBUG 1347 apic_printk(APIC_VERBOSE,KERN_DEBUG
1368 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " 1348 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
1369 "IRQ %d Mode:%i Active:%i Dest:%d)\n", 1349 "IRQ %d Mode:%i Active:%i Dest:%d)\n",
1370 attr->ioapic, mpc_ioapic_id(attr->ioapic), attr->ioapic_pin, 1350 apic_id, mpc_ioapic_id(apic_id), pin, cfg->vector,
1371 cfg->vector, irq, attr->trigger, attr->polarity, dest); 1351 irq, trigger, polarity, dest);
1372 1352
1373 if (setup_ioapic_entry(irq, &entry, dest, cfg->vector, attr)) {
1374 pr_warn("Failed to setup ioapic entry for ioapic %d, pin %d\n",
1375 mpc_ioapic_id(attr->ioapic), attr->ioapic_pin);
1376 __clear_irq_vector(irq, cfg);
1377 1353
1354 if (setup_ioapic_entry(mpc_ioapic_id(apic_id), irq, &entry,
1355 dest, trigger, polarity, cfg->vector, pin)) {
1356 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
1357 mpc_ioapic_id(apic_id), pin);
1358 __clear_irq_vector(irq, cfg);
1378 return; 1359 return;
1379 } 1360 }
1380 1361
1381 ioapic_register_intr(irq, cfg, attr->trigger); 1362 ioapic_register_intr(irq, cfg, trigger);
1382 if (irq < legacy_pic->nr_legacy_irqs) 1363 if (irq < legacy_pic->nr_legacy_irqs)
1383 legacy_pic->mask(irq); 1364 legacy_pic->mask(irq);
1384 1365
1385 ioapic_write_entry(attr->ioapic, attr->ioapic_pin, entry); 1366 ioapic_write_entry(apic_id, pin, entry);
1386} 1367}
1387 1368
1388static bool __init io_apic_pin_not_connected(int idx, int ioapic_idx, int pin) 1369static bool __init io_apic_pin_not_connected(int idx, int apic_id, int pin)
1389{ 1370{
1390 if (idx != -1) 1371 if (idx != -1)
1391 return false; 1372 return false;
1392 1373
1393 apic_printk(APIC_VERBOSE, KERN_DEBUG " apic %d pin %d not connected\n", 1374 apic_printk(APIC_VERBOSE, KERN_DEBUG " apic %d pin %d not connected\n",
1394 mpc_ioapic_id(ioapic_idx), pin); 1375 mpc_ioapic_id(apic_id), pin);
1395 return true; 1376 return true;
1396} 1377}
1397 1378
1398static void __init __io_apic_setup_irqs(unsigned int ioapic_idx) 1379static void __init __io_apic_setup_irqs(unsigned int apic_id)
1399{ 1380{
1400 int idx, node = cpu_to_node(0); 1381 int idx, node = cpu_to_node(0);
1401 struct io_apic_irq_attr attr; 1382 struct io_apic_irq_attr attr;
1402 unsigned int pin, irq; 1383 unsigned int pin, irq;
1403 1384
1404 for (pin = 0; pin < ioapics[ioapic_idx].nr_registers; pin++) { 1385 for (pin = 0; pin < ioapics[apic_id].nr_registers; pin++) {
1405 idx = find_irq_entry(ioapic_idx, pin, mp_INT); 1386 idx = find_irq_entry(apic_id, pin, mp_INT);
1406 if (io_apic_pin_not_connected(idx, ioapic_idx, pin)) 1387 if (io_apic_pin_not_connected(idx, apic_id, pin))
1407 continue; 1388 continue;
1408 1389
1409 irq = pin_2_irq(idx, ioapic_idx, pin); 1390 irq = pin_2_irq(idx, apic_id, pin);
1410 1391
1411 if ((ioapic_idx > 0) && (irq > 16)) 1392 if ((apic_id > 0) && (irq > 16))
1412 continue; 1393 continue;
1413 1394
1414 /* 1395 /*
@@ -1416,10 +1397,10 @@ static void __init __io_apic_setup_irqs(unsigned int ioapic_idx)
1416 * installed and if it returns 1: 1397 * installed and if it returns 1:
1417 */ 1398 */
1418 if (apic->multi_timer_check && 1399 if (apic->multi_timer_check &&
1419 apic->multi_timer_check(ioapic_idx, irq)) 1400 apic->multi_timer_check(apic_id, irq))
1420 continue; 1401 continue;
1421 1402
1422 set_io_apic_irq_attr(&attr, ioapic_idx, pin, irq_trigger(idx), 1403 set_io_apic_irq_attr(&attr, apic_id, pin, irq_trigger(idx),
1423 irq_polarity(idx)); 1404 irq_polarity(idx));
1424 1405
1425 io_apic_setup_irq_pin(irq, node, &attr); 1406 io_apic_setup_irq_pin(irq, node, &attr);
@@ -1428,12 +1409,12 @@ static void __init __io_apic_setup_irqs(unsigned int ioapic_idx)
1428 1409
1429static void __init setup_IO_APIC_irqs(void) 1410static void __init setup_IO_APIC_irqs(void)
1430{ 1411{
1431 unsigned int ioapic_idx; 1412 unsigned int apic_id;
1432 1413
1433 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); 1414 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
1434 1415
1435 for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) 1416 for (apic_id = 0; apic_id < nr_ioapics; apic_id++)
1436 __io_apic_setup_irqs(ioapic_idx); 1417 __io_apic_setup_irqs(apic_id);
1437} 1418}
1438 1419
1439/* 1420/*
@@ -1443,28 +1424,28 @@ static void __init setup_IO_APIC_irqs(void)
1443 */ 1424 */
1444void setup_IO_APIC_irq_extra(u32 gsi) 1425void setup_IO_APIC_irq_extra(u32 gsi)
1445{ 1426{
1446 int ioapic_idx = 0, pin, idx, irq, node = cpu_to_node(0); 1427 int apic_id = 0, pin, idx, irq, node = cpu_to_node(0);
1447 struct io_apic_irq_attr attr; 1428 struct io_apic_irq_attr attr;
1448 1429
1449 /* 1430 /*
1450 * Convert 'gsi' to 'ioapic.pin'. 1431 * Convert 'gsi' to 'ioapic.pin'.
1451 */ 1432 */
1452 ioapic_idx = mp_find_ioapic(gsi); 1433 apic_id = mp_find_ioapic(gsi);
1453 if (ioapic_idx < 0) 1434 if (apic_id < 0)
1454 return; 1435 return;
1455 1436
1456 pin = mp_find_ioapic_pin(ioapic_idx, gsi); 1437 pin = mp_find_ioapic_pin(apic_id, gsi);
1457 idx = find_irq_entry(ioapic_idx, pin, mp_INT); 1438 idx = find_irq_entry(apic_id, pin, mp_INT);
1458 if (idx == -1) 1439 if (idx == -1)
1459 return; 1440 return;
1460 1441
1461 irq = pin_2_irq(idx, ioapic_idx, pin); 1442 irq = pin_2_irq(idx, apic_id, pin);
1462 1443
1463 /* Only handle the non legacy irqs on secondary ioapics */ 1444 /* Only handle the non legacy irqs on secondary ioapics */
1464 if (ioapic_idx == 0 || irq < NR_IRQS_LEGACY) 1445 if (apic_id == 0 || irq < NR_IRQS_LEGACY)
1465 return; 1446 return;
1466 1447
1467 set_io_apic_irq_attr(&attr, ioapic_idx, pin, irq_trigger(idx), 1448 set_io_apic_irq_attr(&attr, apic_id, pin, irq_trigger(idx),
1468 irq_polarity(idx)); 1449 irq_polarity(idx));
1469 1450
1470 io_apic_setup_irq_pin_once(irq, node, &attr); 1451 io_apic_setup_irq_pin_once(irq, node, &attr);
@@ -1473,13 +1454,12 @@ void setup_IO_APIC_irq_extra(u32 gsi)
1473/* 1454/*
1474 * Set up the timer pin, possibly with the 8259A-master behind. 1455 * Set up the timer pin, possibly with the 8259A-master behind.
1475 */ 1456 */
1476static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx, 1457static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin,
1477 unsigned int pin, int vector) 1458 int vector)
1478{ 1459{
1479 struct IO_APIC_route_entry entry; 1460 struct IO_APIC_route_entry entry;
1480 unsigned int dest;
1481 1461
1482 if (irq_remapping_enabled) 1462 if (intr_remapping_enabled)
1483 return; 1463 return;
1484 1464
1485 memset(&entry, 0, sizeof(entry)); 1465 memset(&entry, 0, sizeof(entry));
@@ -1488,13 +1468,9 @@ static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx,
1488 * We use logical delivery to get the timer IRQ 1468 * We use logical delivery to get the timer IRQ
1489 * to the first CPU. 1469 * to the first CPU.
1490 */ 1470 */
1491 if (unlikely(apic->cpu_mask_to_apicid_and(apic->target_cpus(),
1492 apic->target_cpus(), &dest)))
1493 dest = BAD_APICID;
1494
1495 entry.dest_mode = apic->irq_dest_mode; 1471 entry.dest_mode = apic->irq_dest_mode;
1496 entry.mask = 0; /* don't mask IRQ for edge */ 1472 entry.mask = 0; /* don't mask IRQ for edge */
1497 entry.dest = dest; 1473 entry.dest = apic->cpu_mask_to_apicid(apic->target_cpus());
1498 entry.delivery_mode = apic->irq_delivery_mode; 1474 entry.delivery_mode = apic->irq_delivery_mode;
1499 entry.polarity = 0; 1475 entry.polarity = 0;
1500 entry.trigger = 0; 1476 entry.trigger = 0;
@@ -1510,28 +1486,45 @@ static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx,
1510 /* 1486 /*
1511 * Add it to the IO-APIC irq-routing table: 1487 * Add it to the IO-APIC irq-routing table:
1512 */ 1488 */
1513 ioapic_write_entry(ioapic_idx, pin, entry); 1489 ioapic_write_entry(apic_id, pin, entry);
1514} 1490}
1515 1491
1516__apicdebuginit(void) print_IO_APIC(int ioapic_idx) 1492
1493__apicdebuginit(void) print_IO_APIC(void)
1517{ 1494{
1518 int i; 1495 int apic, i;
1519 union IO_APIC_reg_00 reg_00; 1496 union IO_APIC_reg_00 reg_00;
1520 union IO_APIC_reg_01 reg_01; 1497 union IO_APIC_reg_01 reg_01;
1521 union IO_APIC_reg_02 reg_02; 1498 union IO_APIC_reg_02 reg_02;
1522 union IO_APIC_reg_03 reg_03; 1499 union IO_APIC_reg_03 reg_03;
1523 unsigned long flags; 1500 unsigned long flags;
1501 struct irq_cfg *cfg;
1502 unsigned int irq;
1503
1504 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
1505 for (i = 0; i < nr_ioapics; i++)
1506 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
1507 mpc_ioapic_id(i), ioapics[i].nr_registers);
1508
1509 /*
1510 * We are a bit conservative about what we expect. We have to
1511 * know about every hardware change ASAP.
1512 */
1513 printk(KERN_INFO "testing the IO APIC.......................\n");
1514
1515 for (apic = 0; apic < nr_ioapics; apic++) {
1524 1516
1525 raw_spin_lock_irqsave(&ioapic_lock, flags); 1517 raw_spin_lock_irqsave(&ioapic_lock, flags);
1526 reg_00.raw = io_apic_read(ioapic_idx, 0); 1518 reg_00.raw = io_apic_read(apic, 0);
1527 reg_01.raw = io_apic_read(ioapic_idx, 1); 1519 reg_01.raw = io_apic_read(apic, 1);
1528 if (reg_01.bits.version >= 0x10) 1520 if (reg_01.bits.version >= 0x10)
1529 reg_02.raw = io_apic_read(ioapic_idx, 2); 1521 reg_02.raw = io_apic_read(apic, 2);
1530 if (reg_01.bits.version >= 0x20) 1522 if (reg_01.bits.version >= 0x20)
1531 reg_03.raw = io_apic_read(ioapic_idx, 3); 1523 reg_03.raw = io_apic_read(apic, 3);
1532 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 1524 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
1533 1525
1534 printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx)); 1526 printk("\n");
1527 printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(apic));
1535 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); 1528 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
1536 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); 1529 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
1537 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); 1530 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
@@ -1568,7 +1561,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx)
1568 1561
1569 printk(KERN_DEBUG ".... IRQ redirection table:\n"); 1562 printk(KERN_DEBUG ".... IRQ redirection table:\n");
1570 1563
1571 if (irq_remapping_enabled) { 1564 if (intr_remapping_enabled) {
1572 printk(KERN_DEBUG " NR Indx Fmt Mask Trig IRR" 1565 printk(KERN_DEBUG " NR Indx Fmt Mask Trig IRR"
1573 " Pol Stat Indx2 Zero Vect:\n"); 1566 " Pol Stat Indx2 Zero Vect:\n");
1574 } else { 1567 } else {
@@ -1577,17 +1570,17 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx)
1577 } 1570 }
1578 1571
1579 for (i = 0; i <= reg_01.bits.entries; i++) { 1572 for (i = 0; i <= reg_01.bits.entries; i++) {
1580 if (irq_remapping_enabled) { 1573 if (intr_remapping_enabled) {
1581 struct IO_APIC_route_entry entry; 1574 struct IO_APIC_route_entry entry;
1582 struct IR_IO_APIC_route_entry *ir_entry; 1575 struct IR_IO_APIC_route_entry *ir_entry;
1583 1576
1584 entry = ioapic_read_entry(ioapic_idx, i); 1577 entry = ioapic_read_entry(apic, i);
1585 ir_entry = (struct IR_IO_APIC_route_entry *) &entry; 1578 ir_entry = (struct IR_IO_APIC_route_entry *) &entry;
1586 printk(KERN_DEBUG " %02x %04X ", 1579 printk(KERN_DEBUG " %02x %04X ",
1587 i, 1580 i,
1588 ir_entry->index 1581 ir_entry->index
1589 ); 1582 );
1590 pr_cont("%1d %1d %1d %1d %1d " 1583 printk("%1d %1d %1d %1d %1d "
1591 "%1d %1d %X %02X\n", 1584 "%1d %1d %X %02X\n",
1592 ir_entry->format, 1585 ir_entry->format,
1593 ir_entry->mask, 1586 ir_entry->mask,
@@ -1602,12 +1595,12 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx)
1602 } else { 1595 } else {
1603 struct IO_APIC_route_entry entry; 1596 struct IO_APIC_route_entry entry;
1604 1597
1605 entry = ioapic_read_entry(ioapic_idx, i); 1598 entry = ioapic_read_entry(apic, i);
1606 printk(KERN_DEBUG " %02x %02X ", 1599 printk(KERN_DEBUG " %02x %02X ",
1607 i, 1600 i,
1608 entry.dest 1601 entry.dest
1609 ); 1602 );
1610 pr_cont("%1d %1d %1d %1d %1d " 1603 printk("%1d %1d %1d %1d %1d "
1611 "%1d %1d %02X\n", 1604 "%1d %1d %02X\n",
1612 entry.mask, 1605 entry.mask,
1613 entry.trigger, 1606 entry.trigger,
@@ -1620,38 +1613,12 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx)
1620 ); 1613 );
1621 } 1614 }
1622 } 1615 }
1623} 1616 }
1624
1625__apicdebuginit(void) print_IO_APICs(void)
1626{
1627 int ioapic_idx;
1628 struct irq_cfg *cfg;
1629 unsigned int irq;
1630 struct irq_chip *chip;
1631
1632 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
1633 for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
1634 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
1635 mpc_ioapic_id(ioapic_idx),
1636 ioapics[ioapic_idx].nr_registers);
1637
1638 /*
1639 * We are a bit conservative about what we expect. We have to
1640 * know about every hardware change ASAP.
1641 */
1642 printk(KERN_INFO "testing the IO APIC.......................\n");
1643
1644 for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++)
1645 print_IO_APIC(ioapic_idx);
1646 1617
1647 printk(KERN_DEBUG "IRQ to pin mappings:\n"); 1618 printk(KERN_DEBUG "IRQ to pin mappings:\n");
1648 for_each_active_irq(irq) { 1619 for_each_active_irq(irq) {
1649 struct irq_pin_list *entry; 1620 struct irq_pin_list *entry;
1650 1621
1651 chip = irq_get_chip(irq);
1652 if (chip != &ioapic_chip)
1653 continue;
1654
1655 cfg = irq_get_chip_data(irq); 1622 cfg = irq_get_chip_data(irq);
1656 if (!cfg) 1623 if (!cfg)
1657 continue; 1624 continue;
@@ -1660,11 +1627,13 @@ __apicdebuginit(void) print_IO_APICs(void)
1660 continue; 1627 continue;
1661 printk(KERN_DEBUG "IRQ%d ", irq); 1628 printk(KERN_DEBUG "IRQ%d ", irq);
1662 for_each_irq_pin(entry, cfg->irq_2_pin) 1629 for_each_irq_pin(entry, cfg->irq_2_pin)
1663 pr_cont("-> %d:%d", entry->apic, entry->pin); 1630 printk("-> %d:%d", entry->apic, entry->pin);
1664 pr_cont("\n"); 1631 printk("\n");
1665 } 1632 }
1666 1633
1667 printk(KERN_INFO ".................................... done.\n"); 1634 printk(KERN_INFO ".................................... done.\n");
1635
1636 return;
1668} 1637}
1669 1638
1670__apicdebuginit(void) print_APIC_field(int base) 1639__apicdebuginit(void) print_APIC_field(int base)
@@ -1674,9 +1643,9 @@ __apicdebuginit(void) print_APIC_field(int base)
1674 printk(KERN_DEBUG); 1643 printk(KERN_DEBUG);
1675 1644
1676 for (i = 0; i < 8; i++) 1645 for (i = 0; i < 8; i++)
1677 pr_cont("%08x", apic_read(base + i*0x10)); 1646 printk(KERN_CONT "%08x", apic_read(base + i*0x10));
1678 1647
1679 pr_cont("\n"); 1648 printk(KERN_CONT "\n");
1680} 1649}
1681 1650
1682__apicdebuginit(void) print_local_APIC(void *dummy) 1651__apicdebuginit(void) print_local_APIC(void *dummy)
@@ -1778,7 +1747,7 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
1778 printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v); 1747 printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v);
1779 } 1748 }
1780 } 1749 }
1781 pr_cont("\n"); 1750 printk("\n");
1782} 1751}
1783 1752
1784__apicdebuginit(void) print_local_APICs(int maxcpu) 1753__apicdebuginit(void) print_local_APICs(int maxcpu)
@@ -1858,7 +1827,7 @@ __apicdebuginit(int) print_ICs(void)
1858 return 0; 1827 return 0;
1859 1828
1860 print_local_APICs(show_lapic); 1829 print_local_APICs(show_lapic);
1861 print_IO_APICs(); 1830 print_IO_APIC();
1862 1831
1863 return 0; 1832 return 0;
1864} 1833}
@@ -1944,7 +1913,7 @@ void disable_IO_APIC(void)
1944 * IOAPIC RTE as well as interrupt-remapping table entry). 1913 * IOAPIC RTE as well as interrupt-remapping table entry).
1945 * As this gets called during crash dump, keep this simple for now. 1914 * As this gets called during crash dump, keep this simple for now.
1946 */ 1915 */
1947 if (ioapic_i8259.pin != -1 && !irq_remapping_enabled) { 1916 if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) {
1948 struct IO_APIC_route_entry entry; 1917 struct IO_APIC_route_entry entry;
1949 1918
1950 memset(&entry, 0, sizeof(entry)); 1919 memset(&entry, 0, sizeof(entry));
@@ -1968,7 +1937,7 @@ void disable_IO_APIC(void)
1968 * Use virtual wire A mode when interrupt remapping is enabled. 1937 * Use virtual wire A mode when interrupt remapping is enabled.
1969 */ 1938 */
1970 if (cpu_has_apic || apic_from_smp_config()) 1939 if (cpu_has_apic || apic_from_smp_config())
1971 disconnect_bsp_APIC(!irq_remapping_enabled && 1940 disconnect_bsp_APIC(!intr_remapping_enabled &&
1972 ioapic_i8259.pin != -1); 1941 ioapic_i8259.pin != -1);
1973} 1942}
1974 1943
@@ -1983,7 +1952,7 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
1983{ 1952{
1984 union IO_APIC_reg_00 reg_00; 1953 union IO_APIC_reg_00 reg_00;
1985 physid_mask_t phys_id_present_map; 1954 physid_mask_t phys_id_present_map;
1986 int ioapic_idx; 1955 int apic_id;
1987 int i; 1956 int i;
1988 unsigned char old_id; 1957 unsigned char old_id;
1989 unsigned long flags; 1958 unsigned long flags;
@@ -1997,20 +1966,21 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
1997 /* 1966 /*
1998 * Set the IOAPIC ID to the value stored in the MPC table. 1967 * Set the IOAPIC ID to the value stored in the MPC table.
1999 */ 1968 */
2000 for (ioapic_idx = 0; ioapic_idx < nr_ioapics; ioapic_idx++) { 1969 for (apic_id = 0; apic_id < nr_ioapics; apic_id++) {
1970
2001 /* Read the register 0 value */ 1971 /* Read the register 0 value */
2002 raw_spin_lock_irqsave(&ioapic_lock, flags); 1972 raw_spin_lock_irqsave(&ioapic_lock, flags);
2003 reg_00.raw = io_apic_read(ioapic_idx, 0); 1973 reg_00.raw = io_apic_read(apic_id, 0);
2004 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 1974 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2005 1975
2006 old_id = mpc_ioapic_id(ioapic_idx); 1976 old_id = mpc_ioapic_id(apic_id);
2007 1977
2008 if (mpc_ioapic_id(ioapic_idx) >= get_physical_broadcast()) { 1978 if (mpc_ioapic_id(apic_id) >= get_physical_broadcast()) {
2009 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", 1979 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
2010 ioapic_idx, mpc_ioapic_id(ioapic_idx)); 1980 apic_id, mpc_ioapic_id(apic_id));
2011 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", 1981 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
2012 reg_00.bits.ID); 1982 reg_00.bits.ID);
2013 ioapics[ioapic_idx].mp_config.apicid = reg_00.bits.ID; 1983 ioapics[apic_id].mp_config.apicid = reg_00.bits.ID;
2014 } 1984 }
2015 1985
2016 /* 1986 /*
@@ -2019,9 +1989,9 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
2019 * 'stuck on smp_invalidate_needed IPI wait' messages. 1989 * 'stuck on smp_invalidate_needed IPI wait' messages.
2020 */ 1990 */
2021 if (apic->check_apicid_used(&phys_id_present_map, 1991 if (apic->check_apicid_used(&phys_id_present_map,
2022 mpc_ioapic_id(ioapic_idx))) { 1992 mpc_ioapic_id(apic_id))) {
2023 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", 1993 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
2024 ioapic_idx, mpc_ioapic_id(ioapic_idx)); 1994 apic_id, mpc_ioapic_id(apic_id));
2025 for (i = 0; i < get_physical_broadcast(); i++) 1995 for (i = 0; i < get_physical_broadcast(); i++)
2026 if (!physid_isset(i, phys_id_present_map)) 1996 if (!physid_isset(i, phys_id_present_map))
2027 break; 1997 break;
@@ -2030,14 +2000,14 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
2030 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", 2000 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
2031 i); 2001 i);
2032 physid_set(i, phys_id_present_map); 2002 physid_set(i, phys_id_present_map);
2033 ioapics[ioapic_idx].mp_config.apicid = i; 2003 ioapics[apic_id].mp_config.apicid = i;
2034 } else { 2004 } else {
2035 physid_mask_t tmp; 2005 physid_mask_t tmp;
2036 apic->apicid_to_cpu_present(mpc_ioapic_id(ioapic_idx), 2006 apic->apicid_to_cpu_present(mpc_ioapic_id(apic_id),
2037 &tmp); 2007 &tmp);
2038 apic_printk(APIC_VERBOSE, "Setting %d in the " 2008 apic_printk(APIC_VERBOSE, "Setting %d in the "
2039 "phys_id_present_map\n", 2009 "phys_id_present_map\n",
2040 mpc_ioapic_id(ioapic_idx)); 2010 mpc_ioapic_id(apic_id));
2041 physids_or(phys_id_present_map, phys_id_present_map, tmp); 2011 physids_or(phys_id_present_map, phys_id_present_map, tmp);
2042 } 2012 }
2043 2013
@@ -2045,36 +2015,36 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
2045 * We need to adjust the IRQ routing table 2015 * We need to adjust the IRQ routing table
2046 * if the ID changed. 2016 * if the ID changed.
2047 */ 2017 */
2048 if (old_id != mpc_ioapic_id(ioapic_idx)) 2018 if (old_id != mpc_ioapic_id(apic_id))
2049 for (i = 0; i < mp_irq_entries; i++) 2019 for (i = 0; i < mp_irq_entries; i++)
2050 if (mp_irqs[i].dstapic == old_id) 2020 if (mp_irqs[i].dstapic == old_id)
2051 mp_irqs[i].dstapic 2021 mp_irqs[i].dstapic
2052 = mpc_ioapic_id(ioapic_idx); 2022 = mpc_ioapic_id(apic_id);
2053 2023
2054 /* 2024 /*
2055 * Update the ID register according to the right value 2025 * Update the ID register according to the right value
2056 * from the MPC table if they are different. 2026 * from the MPC table if they are different.
2057 */ 2027 */
2058 if (mpc_ioapic_id(ioapic_idx) == reg_00.bits.ID) 2028 if (mpc_ioapic_id(apic_id) == reg_00.bits.ID)
2059 continue; 2029 continue;
2060 2030
2061 apic_printk(APIC_VERBOSE, KERN_INFO 2031 apic_printk(APIC_VERBOSE, KERN_INFO
2062 "...changing IO-APIC physical APIC ID to %d ...", 2032 "...changing IO-APIC physical APIC ID to %d ...",
2063 mpc_ioapic_id(ioapic_idx)); 2033 mpc_ioapic_id(apic_id));
2064 2034
2065 reg_00.bits.ID = mpc_ioapic_id(ioapic_idx); 2035 reg_00.bits.ID = mpc_ioapic_id(apic_id);
2066 raw_spin_lock_irqsave(&ioapic_lock, flags); 2036 raw_spin_lock_irqsave(&ioapic_lock, flags);
2067 io_apic_write(ioapic_idx, 0, reg_00.raw); 2037 io_apic_write(apic_id, 0, reg_00.raw);
2068 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 2038 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2069 2039
2070 /* 2040 /*
2071 * Sanity check 2041 * Sanity check
2072 */ 2042 */
2073 raw_spin_lock_irqsave(&ioapic_lock, flags); 2043 raw_spin_lock_irqsave(&ioapic_lock, flags);
2074 reg_00.raw = io_apic_read(ioapic_idx, 0); 2044 reg_00.raw = io_apic_read(apic_id, 0);
2075 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 2045 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2076 if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx)) 2046 if (reg_00.bits.ID != mpc_ioapic_id(apic_id))
2077 pr_cont("could not set ID!\n"); 2047 printk("could not set ID!\n");
2078 else 2048 else
2079 apic_printk(APIC_VERBOSE, " ok.\n"); 2049 apic_printk(APIC_VERBOSE, " ok.\n");
2080 } 2050 }
@@ -2185,11 +2155,9 @@ static int ioapic_retrigger_irq(struct irq_data *data)
2185{ 2155{
2186 struct irq_cfg *cfg = data->chip_data; 2156 struct irq_cfg *cfg = data->chip_data;
2187 unsigned long flags; 2157 unsigned long flags;
2188 int cpu;
2189 2158
2190 raw_spin_lock_irqsave(&vector_lock, flags); 2159 raw_spin_lock_irqsave(&vector_lock, flags);
2191 cpu = cpumask_first_and(cfg->domain, cpu_online_mask); 2160 apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector);
2192 apic->send_IPI_mask(cpumask_of(cpu), cfg->vector);
2193 raw_spin_unlock_irqrestore(&vector_lock, flags); 2161 raw_spin_unlock_irqrestore(&vector_lock, flags);
2194 2162
2195 return 1; 2163 return 1;
@@ -2221,13 +2189,143 @@ void send_cleanup_vector(struct irq_cfg *cfg)
2221 cfg->move_in_progress = 0; 2189 cfg->move_in_progress = 0;
2222} 2190}
2223 2191
2192static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
2193{
2194 int apic, pin;
2195 struct irq_pin_list *entry;
2196 u8 vector = cfg->vector;
2197
2198 for_each_irq_pin(entry, cfg->irq_2_pin) {
2199 unsigned int reg;
2200
2201 apic = entry->apic;
2202 pin = entry->pin;
2203 /*
2204 * With interrupt-remapping, destination information comes
2205 * from interrupt-remapping table entry.
2206 */
2207 if (!irq_remapped(cfg))
2208 io_apic_write(apic, 0x11 + pin*2, dest);
2209 reg = io_apic_read(apic, 0x10 + pin*2);
2210 reg &= ~IO_APIC_REDIR_VECTOR_MASK;
2211 reg |= vector;
2212 io_apic_modify(apic, 0x10 + pin*2, reg);
2213 }
2214}
2215
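
__target_IO_APIC_irq() above retargets an IRQ pin by writing the new destination into the high dword of the redirection entry (skipped when interrupt remapping owns the destination) and then doing a read-modify-write of the vector field, the low 8 bits of the low dword. A sketch of that per-pin rewrite using the same illustrative register-array convention as the earlier sketches; retarget_pin is an assumed name.

/*
 * Illustrative only: new destination into the high dword, new vector
 * spliced into the low 8 bits of the low dword.
 */
#include <stdint.h>
#include <stdio.h>

#define REDIR_VECTOR_MASK 0xffu

static void retarget_pin(uint32_t *regs, int pin, uint32_t dest_dword, uint8_t vector)
{
	uint32_t lo;

	regs[0x11 + 2 * pin] = dest_dword;        /* high dword holding the destination field */

	lo = regs[0x10 + 2 * pin];
	lo &= ~REDIR_VECTOR_MASK;                 /* drop the old vector ...      */
	lo |= vector;                             /* ... and program the new one  */
	regs[0x10 + 2 * pin] = lo;
}

int main(void)
{
	uint32_t regs[0x40] = { 0 };

	regs[0x10] = 0x0000a031;                  /* pin 0 currently uses vector 0x31 */
	retarget_pin(regs, 0, 0x02000000, 0x41);
	printf("lo=%#x hi=%#x\n", (unsigned)regs[0x10], (unsigned)regs[0x11]);
	return 0;
}
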
2216/*
2217 * Either sets data->affinity to a valid value, and returns
2218 * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and
2219 * leaves data->affinity untouched.
2220 */
2221int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
2222 unsigned int *dest_id)
2223{
2224 struct irq_cfg *cfg = data->chip_data;
2225
2226 if (!cpumask_intersects(mask, cpu_online_mask))
2227 return -1;
2228
2229 if (assign_irq_vector(data->irq, data->chip_data, mask))
2230 return -1;
2231
2232 cpumask_copy(data->affinity, mask);
2233
2234 *dest_id = apic->cpu_mask_to_apicid_and(mask, cfg->domain);
2235 return 0;
2236}
2237
2238static int
2239ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
2240 bool force)
2241{
2242 unsigned int dest, irq = data->irq;
2243 unsigned long flags;
2244 int ret;
2245
2246 raw_spin_lock_irqsave(&ioapic_lock, flags);
2247 ret = __ioapic_set_affinity(data, mask, &dest);
2248 if (!ret) {
2249 /* Only the high 8 bits are valid. */
2250 dest = SET_APIC_LOGICAL_ID(dest);
2251 __target_IO_APIC_irq(irq, dest, data->chip_data);
2252 }
2253 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2254 return ret;
2255}
2256
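
The "Only the high 8 bits are valid" step in ioapic_set_affinity() above shifts the 8-bit logical APIC ID into bits 31:24 of the destination dword before it is written to the IO-APIC, which is what SET_APIC_LOGICAL_ID() amounts to. A one-liner sketch; set_apic_logical_id is an illustrative name for the same shift.

/*
 * Illustrative only: placing a logical APIC id in bits 31:24 of the
 * destination field.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t set_apic_logical_id(uint8_t id)
{
	return (uint32_t)id << 24;     /* the destination field occupies bits 31:24 */
}

int main(void)
{
	printf("dest field for id 0x0f: %#x\n", (unsigned)set_apic_logical_id(0x0f));
	return 0;
}
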
2257#ifdef CONFIG_IRQ_REMAP
2258
2259/*
2260 * Migrate the IO-APIC irq in the presence of intr-remapping.
2261 *
2262 * For both level and edge triggered, irq migration is a simple atomic
2263 * update(of vector and cpu destination) of IRTE and flush the hardware cache.
2264 *
2265 * For level triggered, we eliminate the io-apic RTE modification (with the
2266 * updated vector information), by using a virtual vector (io-apic pin number).
2267 * Real vector that is used for interrupting cpu will be coming from
2268 * the interrupt-remapping table entry.
2269 *
2270 * As the migration is a simple atomic update of IRTE, the same mechanism
2271 * is used to migrate MSI irq's in the presence of interrupt-remapping.
2272 */
2273static int
2274ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
2275 bool force)
2276{
2277 struct irq_cfg *cfg = data->chip_data;
2278 unsigned int dest, irq = data->irq;
2279 struct irte irte;
2280
2281 if (!cpumask_intersects(mask, cpu_online_mask))
2282 return -EINVAL;
2283
2284 if (get_irte(irq, &irte))
2285 return -EBUSY;
2286
2287 if (assign_irq_vector(irq, cfg, mask))
2288 return -EBUSY;
2289
2290 dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
2291
2292 irte.vector = cfg->vector;
2293 irte.dest_id = IRTE_DEST(dest);
2294
2295 /*
2296 * Atomically updates the IRTE with the new destination, vector
2297 * and flushes the interrupt entry cache.
2298 */
2299 modify_irte(irq, &irte);
2300
2301 /*
2302 * After this point, all the interrupts will start arriving
2303 * at the new destination. So, time to cleanup the previous
2304 * vector allocation.
2305 */
2306 if (cfg->move_in_progress)
2307 send_cleanup_vector(cfg);
2308
2309 cpumask_copy(data->affinity, mask);
2310 return 0;
2311}
2312
2313#else
2314static inline int
2315ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
2316 bool force)
2317{
2318 return 0;
2319}
2320#endif
2321
2224asmlinkage void smp_irq_move_cleanup_interrupt(void) 2322asmlinkage void smp_irq_move_cleanup_interrupt(void)
2225{ 2323{
2226 unsigned vector, me; 2324 unsigned vector, me;
2227 2325
2228 ack_APIC_irq(); 2326 ack_APIC_irq();
2229 irq_enter();
2230 exit_idle(); 2327 exit_idle();
2328 irq_enter();
2231 2329
2232 me = smp_processor_id(); 2330 me = smp_processor_id();
2233 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { 2331 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
@@ -2245,9 +2343,6 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2245 continue; 2343 continue;
2246 2344
2247 cfg = irq_cfg(irq); 2345 cfg = irq_cfg(irq);
2248 if (!cfg)
2249 continue;
2250
2251 raw_spin_lock(&desc->lock); 2346 raw_spin_lock(&desc->lock);
2252 2347
2253 /* 2348 /*
@@ -2311,87 +2406,6 @@ void irq_force_complete_move(int irq)
2311static inline void irq_complete_move(struct irq_cfg *cfg) { } 2406static inline void irq_complete_move(struct irq_cfg *cfg) { }
2312#endif 2407#endif
2313 2408
2314static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
2315{
2316 int apic, pin;
2317 struct irq_pin_list *entry;
2318 u8 vector = cfg->vector;
2319
2320 for_each_irq_pin(entry, cfg->irq_2_pin) {
2321 unsigned int reg;
2322
2323 apic = entry->apic;
2324 pin = entry->pin;
2325 /*
2326 * With interrupt-remapping, destination information comes
2327 * from interrupt-remapping table entry.
2328 */
2329 if (!irq_remapped(cfg))
2330 io_apic_write(apic, 0x11 + pin*2, dest);
2331 reg = io_apic_read(apic, 0x10 + pin*2);
2332 reg &= ~IO_APIC_REDIR_VECTOR_MASK;
2333 reg |= vector;
2334 io_apic_modify(apic, 0x10 + pin*2, reg);
2335 }
2336}
2337
2338/*
2339 * Either sets data->affinity to a valid value, and returns
2340 * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and
2341 * leaves data->affinity untouched.
2342 */
2343int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
2344 unsigned int *dest_id)
2345{
2346 struct irq_cfg *cfg = data->chip_data;
2347 unsigned int irq = data->irq;
2348 int err;
2349
2350 if (!config_enabled(CONFIG_SMP))
2351 return -1;
2352
2353 if (!cpumask_intersects(mask, cpu_online_mask))
2354 return -EINVAL;
2355
2356 err = assign_irq_vector(irq, cfg, mask);
2357 if (err)
2358 return err;
2359
2360 err = apic->cpu_mask_to_apicid_and(mask, cfg->domain, dest_id);
2361 if (err) {
2362 if (assign_irq_vector(irq, cfg, data->affinity))
2363 pr_err("Failed to recover vector for irq %d\n", irq);
2364 return err;
2365 }
2366
2367 cpumask_copy(data->affinity, mask);
2368
2369 return 0;
2370}
2371
2372static int
2373ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
2374 bool force)
2375{
2376 unsigned int dest, irq = data->irq;
2377 unsigned long flags;
2378 int ret;
2379
2380 if (!config_enabled(CONFIG_SMP))
2381 return -1;
2382
2383 raw_spin_lock_irqsave(&ioapic_lock, flags);
2384 ret = __ioapic_set_affinity(data, mask, &dest);
2385 if (!ret) {
2386 /* Only the high 8 bits are valid. */
2387 dest = SET_APIC_LOGICAL_ID(dest);
2388 __target_IO_APIC_irq(irq, dest, data->chip_data);
2389 ret = IRQ_SET_MASK_OK_NOCOPY;
2390 }
2391 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2392 return ret;
2393}
2394
2395static void ack_apic_edge(struct irq_data *data) 2409static void ack_apic_edge(struct irq_data *data)
2396{ 2410{
2397 irq_complete_move(data->chip_data); 2411 irq_complete_move(data->chip_data);
@@ -2401,95 +2415,62 @@ static void ack_apic_edge(struct irq_data *data)
2401 2415
2402atomic_t irq_mis_count; 2416atomic_t irq_mis_count;
2403 2417
2404#ifdef CONFIG_GENERIC_PENDING_IRQ 2418/*
2405static bool io_apic_level_ack_pending(struct irq_cfg *cfg) 2419 * IO-APIC versions below 0x20 don't support EOI register.
2420 * For the record, here is the information about various versions:
2421 * 0Xh 82489DX
2422 * 1Xh I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant
2423 * 2Xh I/O(x)APIC which is PCI 2.2 Compliant
2424 * 30h-FFh Reserved
2425 *
2426 * Some of the Intel ICH Specs (ICH2 to ICH5) document the io-apic
2427 * version as 0x2. This is an error in the documentation and these ICH chips
2428 * use io-apics of version 0x20.
2429 *
2430 * For IO-APIC's with EOI register, we use that to do an explicit EOI.
2431 * Otherwise, we simulate the EOI message manually by changing the trigger
2432 * mode to edge and then back to level, with RTE being masked during this.
2433*/
2434static void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2406{ 2435{
2407 struct irq_pin_list *entry; 2436 struct irq_pin_list *entry;
2408 unsigned long flags; 2437 unsigned long flags;
2409 2438
2410 raw_spin_lock_irqsave(&ioapic_lock, flags); 2439 raw_spin_lock_irqsave(&ioapic_lock, flags);
2411 for_each_irq_pin(entry, cfg->irq_2_pin) { 2440 for_each_irq_pin(entry, cfg->irq_2_pin) {
2412 unsigned int reg; 2441 if (mpc_ioapic_ver(entry->apic) >= 0x20) {
2413 int pin; 2442 /*
2414 2443 * Intr-remapping uses pin number as the virtual vector
2415 pin = entry->pin; 2444 * in the RTE. Actual vector is programmed in
2416 reg = io_apic_read(entry->apic, 0x10 + pin*2); 2445 * intr-remapping table entry. Hence for the io-apic
2417 /* Is the remote IRR bit set? */ 2446 * EOI we use the pin number.
2418 if (reg & IO_APIC_REDIR_REMOTE_IRR) { 2447 */
2419 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 2448 if (irq_remapped(cfg))
2420 return true; 2449 io_apic_eoi(entry->apic, entry->pin);
2450 else
2451 io_apic_eoi(entry->apic, cfg->vector);
2452 } else {
2453 __mask_and_edge_IO_APIC_irq(entry);
2454 __unmask_and_level_IO_APIC_irq(entry);
2421 } 2455 }
2422 } 2456 }
2423 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 2457 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2424
2425 return false;
2426} 2458}
2427 2459
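As a side note for readers following the eoi_ioapic_irq() hunk above, here is a minimal userspace sketch of the same decision: an IO-APIC reporting version 0x20 or newer gets a directed EOI write (the pin number when interrupt remapping is in use, the vector otherwise), while older parts get the mask/edge-then-level workaround. The helpers below are illustrative stand-ins, not kernel APIs.

#include <stdio.h>

/* Illustrative stand-ins for the per-pin operations in the hunk above. */
static void directed_eoi(int apic, int value)
{
        printf("IO-APIC %d: EOI register write, value 0x%x\n", apic, value);
}

static void mask_edge_then_unmask_level(int apic, int pin)
{
        printf("IO-APIC %d pin %d: mask + edge, then unmask + level\n", apic, pin);
}

/* Same branch structure as eoi_ioapic_irq(): version >= 0x20 has an EOI register. */
static void eoi_pin(int apic_ver, int apic, int pin, int vector, int remapped)
{
        if (apic_ver >= 0x20)
                directed_eoi(apic, remapped ? pin : vector);
        else
                mask_edge_then_unmask_level(apic, pin);
}

int main(void)
{
        eoi_pin(0x20, 0, 9, 0x41, 0);   /* modern IO-APIC: plain vector EOI    */
        eoi_pin(0x11, 0, 9, 0x41, 0);   /* old IO-APIC: simulate EOI manually  */
        return 0;
}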
2428static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg)
2429{
2430 /* If we are moving the irq we need to mask it */
2431 if (unlikely(irqd_is_setaffinity_pending(data))) {
2432 mask_ioapic(cfg);
2433 return true;
2434 }
2435 return false;
2436}
2437
2438static inline void ioapic_irqd_unmask(struct irq_data *data,
2439 struct irq_cfg *cfg, bool masked)
2440{
2441 if (unlikely(masked)) {
2442 /* Only migrate the irq if the ack has been received.
2443 *
2444 * On rare occasions the broadcast level triggered ack gets
2445 * delayed going to ioapics, and if we reprogram the
2446 * vector while Remote IRR is still set the irq will never
2447 * fire again.
2448 *
2449 * To prevent this scenario we read the Remote IRR bit
2450 * of the ioapic. This has two effects.
2451 * - On any sane system the read of the ioapic will
2452 * flush writes (and acks) going to the ioapic from
2453 * this cpu.
2454 * - We get to see if the ACK has actually been delivered.
2455 *
2456 * Based on failed experiments of reprogramming the
2457 * ioapic entry from outside of irq context starting
2458 * with masking the ioapic entry and then polling until
2459 * Remote IRR was clear before reprogramming the
2460 * ioapic I don't trust the Remote IRR bit to be
2461 * completely accurate.
2462 *
2463 * However there appears to be no other way to plug
2464 * this race, so if the Remote IRR bit is not
2465 * accurate and is causing problems then it is a hardware bug
2466 * and you can go talk to the chipset vendor about it.
2467 */
2468 if (!io_apic_level_ack_pending(cfg))
2469 irq_move_masked_irq(data);
2470 unmask_ioapic(cfg);
2471 }
2472}
2473#else
2474static inline bool ioapic_irqd_mask(struct irq_data *data, struct irq_cfg *cfg)
2475{
2476 return false;
2477}
2478static inline void ioapic_irqd_unmask(struct irq_data *data,
2479 struct irq_cfg *cfg, bool masked)
2480{
2481}
2482#endif
2483
2484static void ack_apic_level(struct irq_data *data) 2460static void ack_apic_level(struct irq_data *data)
2485{ 2461{
2486 struct irq_cfg *cfg = data->chip_data; 2462 struct irq_cfg *cfg = data->chip_data;
2487 int i, irq = data->irq; 2463 int i, do_unmask_irq = 0, irq = data->irq;
2488 unsigned long v; 2464 unsigned long v;
2489 bool masked;
2490 2465
2491 irq_complete_move(cfg); 2466 irq_complete_move(cfg);
2492 masked = ioapic_irqd_mask(data, cfg); 2467#ifdef CONFIG_GENERIC_PENDING_IRQ
2468 /* If we are moving the irq we need to mask it */
2469 if (unlikely(irqd_is_setaffinity_pending(data))) {
2470 do_unmask_irq = 1;
2471 mask_ioapic(cfg);
2472 }
2473#endif
2493 2474
2494 /* 2475 /*
2495 * It appears there is an erratum which affects at least version 0x11 2476 * It appears there is an erratum which affects at least version 0x11
@@ -2545,7 +2526,38 @@ static void ack_apic_level(struct irq_data *data)
2545 eoi_ioapic_irq(irq, cfg); 2526 eoi_ioapic_irq(irq, cfg);
2546 } 2527 }
2547 2528
2548 ioapic_irqd_unmask(data, cfg, masked); 2529 /* Now we can move and re-enable the irq */
2530 if (unlikely(do_unmask_irq)) {
2531 /* Only migrate the irq if the ack has been received.
2532 *
2533 * On rare occasions the broadcast level triggered ack gets
2534 * delayed going to ioapics, and if we reprogram the
2535 * vector while Remote IRR is still set the irq will never
2536 * fire again.
2537 *
2538 * To prevent this scenario we read the Remote IRR bit
2539 * of the ioapic. This has two effects.
2540 * - On any sane system the read of the ioapic will
2541 * flush writes (and acks) going to the ioapic from
2542 * this cpu.
2543 * - We get to see if the ACK has actually been delivered.
2544 *
2545 * Based on failed experiments of reprogramming the
2546 * ioapic entry from outside of irq context starting
2547 * with masking the ioapic entry and then polling until
2548 * Remote IRR was clear before reprogramming the
2549 * ioapic I don't trust the Remote IRR bit to be
2550 * completely accurate.
2551 *
2552 * However there appears to be no other way to plug
2553 * this race, so if the Remote IRR bit is not
2554 * accurate and is causing problems then it is a hardware bug
2555 * and you can go talk to the chipset vendor about it.
2556 */
2557 if (!io_apic_level_ack_pending(cfg))
2558 irq_move_masked_irq(data);
2559 unmask_ioapic(cfg);
2560 }
2549} 2561}
2550 2562
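The block restored in ack_apic_level() above is easier to read as an ordered sequence: mask the RTE if an affinity change is pending, EOI, migrate only once the level-triggered ack is known to have landed (Remote IRR clear), then unmask. A hedged sketch of that ordering with stand-in helpers; none of these names are real kernel functions.

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for the kernel state and helpers used in the hunk; not real APIs. */
static bool affinity_change_pending = true;
static bool remote_irr_set;

static void mask_rte(void)     { puts("mask RTE"); }
static void unmask_rte(void)   { puts("unmask RTE"); }
static void send_eoi(void)     { puts("EOI the level interrupt"); }
static void migrate_irq(void)  { puts("reprogram vector/destination"); }

static void ack_level_irq(void)
{
        bool masked = false;

        if (affinity_change_pending) {  /* moving the irq: mask it first */
                mask_rte();
                masked = true;
        }

        send_eoi();

        if (masked) {
                /* Reading Remote IRR both flushes earlier writes to the
                 * IO-APIC and tells us whether the broadcast ack really
                 * arrived; only then is it safe to reprogram the entry. */
                if (!remote_irr_set)
                        migrate_irq();
                unmask_rte();
        }
}

int main(void)
{
        ack_level_irq();
        return 0;
}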
2551#ifdef CONFIG_IRQ_REMAP 2563#ifdef CONFIG_IRQ_REMAP
@@ -2571,7 +2583,9 @@ static void irq_remap_modify_chip_defaults(struct irq_chip *chip)
2571 chip->irq_ack = ir_ack_apic_edge; 2583 chip->irq_ack = ir_ack_apic_edge;
2572 chip->irq_eoi = ir_ack_apic_level; 2584 chip->irq_eoi = ir_ack_apic_level;
2573 2585
2574 chip->irq_set_affinity = set_remapped_irq_affinity; 2586#ifdef CONFIG_SMP
2587 chip->irq_set_affinity = ir_ioapic_set_affinity;
2588#endif
2575} 2589}
2576#endif /* CONFIG_IRQ_REMAP */ 2590#endif /* CONFIG_IRQ_REMAP */
2577 2591
@@ -2582,7 +2596,9 @@ static struct irq_chip ioapic_chip __read_mostly = {
2582 .irq_unmask = unmask_ioapic_irq, 2596 .irq_unmask = unmask_ioapic_irq,
2583 .irq_ack = ack_apic_edge, 2597 .irq_ack = ack_apic_edge,
2584 .irq_eoi = ack_apic_level, 2598 .irq_eoi = ack_apic_level,
2599#ifdef CONFIG_SMP
2585 .irq_set_affinity = ioapic_set_affinity, 2600 .irq_set_affinity = ioapic_set_affinity,
2601#endif
2586 .irq_retrigger = ioapic_retrigger_irq, 2602 .irq_retrigger = ioapic_retrigger_irq,
2587}; 2603};
2588 2604
@@ -2781,7 +2797,7 @@ static inline void __init check_timer(void)
2781 * 8259A. 2797 * 8259A.
2782 */ 2798 */
2783 if (pin1 == -1) { 2799 if (pin1 == -1) {
2784 if (irq_remapping_enabled) 2800 if (intr_remapping_enabled)
2785 panic("BIOS bug: timer not connected to IO-APIC"); 2801 panic("BIOS bug: timer not connected to IO-APIC");
2786 pin1 = pin2; 2802 pin1 = pin2;
2787 apic1 = apic2; 2803 apic1 = apic2;
@@ -2814,7 +2830,7 @@ static inline void __init check_timer(void)
2814 clear_IO_APIC_pin(0, pin1); 2830 clear_IO_APIC_pin(0, pin1);
2815 goto out; 2831 goto out;
2816 } 2832 }
2817 if (irq_remapping_enabled) 2833 if (intr_remapping_enabled)
2818 panic("timer doesn't work through Interrupt-remapped IO-APIC"); 2834 panic("timer doesn't work through Interrupt-remapped IO-APIC");
2819 local_irq_disable(); 2835 local_irq_disable();
2820 clear_IO_APIC_pin(apic1, pin1); 2836 clear_IO_APIC_pin(apic1, pin1);
@@ -2877,10 +2893,6 @@ static inline void __init check_timer(void)
2877 } 2893 }
2878 local_irq_disable(); 2894 local_irq_disable();
2879 apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); 2895 apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
2880 if (x2apic_preenabled)
2881 apic_printk(APIC_QUIET, KERN_INFO
2882 "Perhaps problem with the pre-enabled x2apic mode\n"
2883 "Try booting with x2apic and interrupt-remapping disabled in the bios.\n");
2884 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " 2896 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
2885 "report. Then try booting with the 'noapic' option.\n"); 2897 "report. Then try booting with the 'noapic' option.\n");
2886out: 2898out:
@@ -2941,26 +2953,27 @@ static int __init io_apic_bug_finalize(void)
2941 2953
2942late_initcall(io_apic_bug_finalize); 2954late_initcall(io_apic_bug_finalize);
2943 2955
2944static void resume_ioapic_id(int ioapic_idx) 2956static void resume_ioapic_id(int ioapic_id)
2945{ 2957{
2946 unsigned long flags; 2958 unsigned long flags;
2947 union IO_APIC_reg_00 reg_00; 2959 union IO_APIC_reg_00 reg_00;
2948 2960
2961
2949 raw_spin_lock_irqsave(&ioapic_lock, flags); 2962 raw_spin_lock_irqsave(&ioapic_lock, flags);
2950 reg_00.raw = io_apic_read(ioapic_idx, 0); 2963 reg_00.raw = io_apic_read(ioapic_id, 0);
2951 if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx)) { 2964 if (reg_00.bits.ID != mpc_ioapic_id(ioapic_id)) {
2952 reg_00.bits.ID = mpc_ioapic_id(ioapic_idx); 2965 reg_00.bits.ID = mpc_ioapic_id(ioapic_id);
2953 io_apic_write(ioapic_idx, 0, reg_00.raw); 2966 io_apic_write(ioapic_id, 0, reg_00.raw);
2954 } 2967 }
2955 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 2968 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2956} 2969}
2957 2970
2958static void ioapic_resume(void) 2971static void ioapic_resume(void)
2959{ 2972{
2960 int ioapic_idx; 2973 int ioapic_id;
2961 2974
2962 for (ioapic_idx = nr_ioapics - 1; ioapic_idx >= 0; ioapic_idx--) 2975 for (ioapic_id = nr_ioapics - 1; ioapic_id >= 0; ioapic_id--)
2963 resume_ioapic_id(ioapic_idx); 2976 resume_ioapic_id(ioapic_id);
2964 2977
2965 restore_ioapic_entries(); 2978 restore_ioapic_entries();
2966} 2979}
@@ -3038,7 +3051,7 @@ void destroy_irq(unsigned int irq)
3038 irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE); 3051 irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE);
3039 3052
3040 if (irq_remapped(cfg)) 3053 if (irq_remapped(cfg))
3041 free_remapped_irq(irq); 3054 free_irte(irq);
3042 raw_spin_lock_irqsave(&vector_lock, flags); 3055 raw_spin_lock_irqsave(&vector_lock, flags);
3043 __clear_irq_vector(irq, cfg); 3056 __clear_irq_vector(irq, cfg);
3044 raw_spin_unlock_irqrestore(&vector_lock, flags); 3057 raw_spin_unlock_irqrestore(&vector_lock, flags);
@@ -3064,43 +3077,61 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
3064 if (err) 3077 if (err)
3065 return err; 3078 return err;
3066 3079
3067 err = apic->cpu_mask_to_apicid_and(cfg->domain, 3080 dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
3068 apic->target_cpus(), &dest);
3069 if (err)
3070 return err;
3071 3081
3072 if (irq_remapped(cfg)) { 3082 if (irq_remapped(cfg)) {
3073 compose_remapped_msi_msg(pdev, irq, dest, msg, hpet_id); 3083 struct irte irte;
3074 return err; 3084 int ir_index;
3075 } 3085 u16 sub_handle;
3076 3086
3077 if (x2apic_enabled()) 3087 ir_index = map_irq_to_irte_handle(irq, &sub_handle);
3078 msg->address_hi = MSI_ADDR_BASE_HI | 3088 BUG_ON(ir_index == -1);
3079 MSI_ADDR_EXT_DEST_ID(dest); 3089
3080 else 3090 prepare_irte(&irte, cfg->vector, dest);
3081 msg->address_hi = MSI_ADDR_BASE_HI; 3091
3092 /* Set source-id of interrupt request */
3093 if (pdev)
3094 set_msi_sid(&irte, pdev);
3095 else
3096 set_hpet_sid(&irte, hpet_id);
3082 3097
3083 msg->address_lo = 3098 modify_irte(irq, &irte);
3084 MSI_ADDR_BASE_LO |
3085 ((apic->irq_dest_mode == 0) ?
3086 MSI_ADDR_DEST_MODE_PHYSICAL:
3087 MSI_ADDR_DEST_MODE_LOGICAL) |
3088 ((apic->irq_delivery_mode != dest_LowestPrio) ?
3089 MSI_ADDR_REDIRECTION_CPU:
3090 MSI_ADDR_REDIRECTION_LOWPRI) |
3091 MSI_ADDR_DEST_ID(dest);
3092
3093 msg->data =
3094 MSI_DATA_TRIGGER_EDGE |
3095 MSI_DATA_LEVEL_ASSERT |
3096 ((apic->irq_delivery_mode != dest_LowestPrio) ?
3097 MSI_DATA_DELIVERY_FIXED:
3098 MSI_DATA_DELIVERY_LOWPRI) |
3099 MSI_DATA_VECTOR(cfg->vector);
3100 3099
3100 msg->address_hi = MSI_ADDR_BASE_HI;
3101 msg->data = sub_handle;
3102 msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
3103 MSI_ADDR_IR_SHV |
3104 MSI_ADDR_IR_INDEX1(ir_index) |
3105 MSI_ADDR_IR_INDEX2(ir_index);
3106 } else {
3107 if (x2apic_enabled())
3108 msg->address_hi = MSI_ADDR_BASE_HI |
3109 MSI_ADDR_EXT_DEST_ID(dest);
3110 else
3111 msg->address_hi = MSI_ADDR_BASE_HI;
3112
3113 msg->address_lo =
3114 MSI_ADDR_BASE_LO |
3115 ((apic->irq_dest_mode == 0) ?
3116 MSI_ADDR_DEST_MODE_PHYSICAL:
3117 MSI_ADDR_DEST_MODE_LOGICAL) |
3118 ((apic->irq_delivery_mode != dest_LowestPrio) ?
3119 MSI_ADDR_REDIRECTION_CPU:
3120 MSI_ADDR_REDIRECTION_LOWPRI) |
3121 MSI_ADDR_DEST_ID(dest);
3122
3123 msg->data =
3124 MSI_DATA_TRIGGER_EDGE |
3125 MSI_DATA_LEVEL_ASSERT |
3126 ((apic->irq_delivery_mode != dest_LowestPrio) ?
3127 MSI_DATA_DELIVERY_FIXED:
3128 MSI_DATA_DELIVERY_LOWPRI) |
3129 MSI_DATA_VECTOR(cfg->vector);
3130 }
3101 return err; 3131 return err;
3102} 3132}
3103 3133
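The non-remapped branch of msi_compose_msg() above packs the destination APIC ID, destination/delivery mode and vector into the MSI address/data pair. The sketch below shows the same bit layout for a physical, fixed-delivery message using the architectural field positions (base 0xFEE00000, destination ID in address bits 19:12, vector in data bits 7:0); the macros are illustrative rather than the kernel's MSI_* definitions.

#include <stdint.h>
#include <stdio.h>

/* Architectural x86 MSI layout (illustrative macros, not the kernel's MSI_*). */
#define MSI_ADDR_BASE          0xfee00000u
#define MSI_ADDR_DEST_ID(d)    (((uint32_t)(d) & 0xffu) << 12)
#define MSI_DATA_DELIVERY(m)   (((uint32_t)(m) & 0x7u) << 8)   /* 0 = fixed */
#define MSI_DATA_VECTOR(v)     ((uint32_t)(v) & 0xffu)

struct msi_msg { uint32_t address_hi, address_lo, data; };

/* Roughly what the non-remapped branch above computes for a physical,
 * fixed-delivery MSI aimed at APIC ID 'dest' with vector 'vec'. */
static void compose_msi(struct msi_msg *msg, uint8_t dest, uint8_t vec)
{
        msg->address_hi = 0;                            /* no extended dest ID */
        msg->address_lo = MSI_ADDR_BASE | MSI_ADDR_DEST_ID(dest);
        msg->data       = MSI_DATA_DELIVERY(0) | MSI_DATA_VECTOR(vec);
}

int main(void)
{
        struct msi_msg msg;

        compose_msi(&msg, 0x04, 0x59);
        printf("address_lo 0x%08x data 0x%08x\n",
               (unsigned int)msg.address_lo, (unsigned int)msg.data);
        return 0;
}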
3134#ifdef CONFIG_SMP
3104static int 3135static int
3105msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) 3136msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
3106{ 3137{
@@ -3120,8 +3151,9 @@ msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
3120 3151
3121 __write_msi_msg(data->msi_desc, &msg); 3152 __write_msi_msg(data->msi_desc, &msg);
3122 3153
3123 return IRQ_SET_MASK_OK_NOCOPY; 3154 return 0;
3124} 3155}
3156#endif /* CONFIG_SMP */
3125 3157
3126/* 3158/*
3127 * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, 3159 * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
@@ -3132,10 +3164,39 @@ static struct irq_chip msi_chip = {
3132 .irq_unmask = unmask_msi_irq, 3164 .irq_unmask = unmask_msi_irq,
3133 .irq_mask = mask_msi_irq, 3165 .irq_mask = mask_msi_irq,
3134 .irq_ack = ack_apic_edge, 3166 .irq_ack = ack_apic_edge,
3167#ifdef CONFIG_SMP
3135 .irq_set_affinity = msi_set_affinity, 3168 .irq_set_affinity = msi_set_affinity,
3169#endif
3136 .irq_retrigger = ioapic_retrigger_irq, 3170 .irq_retrigger = ioapic_retrigger_irq,
3137}; 3171};
3138 3172
3173/*
3174 * Map the PCI dev to the corresponding remapping hardware unit
3175 * and allocate 'nvec' consecutive interrupt-remapping table entries
3176 * in it.
3177 */
3178static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
3179{
3180 struct intel_iommu *iommu;
3181 int index;
3182
3183 iommu = map_dev_to_ir(dev);
3184 if (!iommu) {
3185 printk(KERN_ERR
3186 "Unable to map PCI %s to iommu\n", pci_name(dev));
3187 return -ENOENT;
3188 }
3189
3190 index = alloc_irte(iommu, irq, nvec);
3191 if (index < 0) {
3192 printk(KERN_ERR
3193 "Unable to allocate %d IRTE for PCI %s\n", nvec,
3194 pci_name(dev));
3195 return -ENOSPC;
3196 }
3197 return index;
3198}
3199
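msi_alloc_irte() above reserves a contiguous block of interrupt-remapping table entries per device; native_setup_msi_irqs() further down then addresses individual vectors as sub-handles relative to the returned base index. A toy bump allocator showing that base-plus-sub-handle pattern; the table size and names are made up for illustration.

#include <stdio.h>

#define IRTE_TABLE_SIZE 64      /* made-up table size for the sketch */

static int next_free;           /* trivial bump allocator over a fake IRTE table */

/* Reserve 'nvec' consecutive entries and return the base index, or -1 if
 * the table is exhausted - the shape of msi_alloc_irte() above. */
static int alloc_irte_block(int nvec)
{
        if (nvec < 1 || next_free + nvec > IRTE_TABLE_SIZE)
                return -1;
        next_free += nvec;
        return next_free - nvec;
}

int main(void)
{
        int base = alloc_irte_block(4);         /* done once, for sub_handle 0 */
        int sub_handle;

        if (base < 0)
                return 1;
        for (sub_handle = 0; sub_handle < 4; sub_handle++)
                printf("MSI-X vector %d -> IRTE %d\n", sub_handle, base + sub_handle);
        return 0;
}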
3139static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) 3200static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3140{ 3201{
3141 struct irq_chip *chip = &msi_chip; 3202 struct irq_chip *chip = &msi_chip;
@@ -3166,6 +3227,7 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3166 int node, ret, sub_handle, index = 0; 3227 int node, ret, sub_handle, index = 0;
3167 unsigned int irq, irq_want; 3228 unsigned int irq, irq_want;
3168 struct msi_desc *msidesc; 3229 struct msi_desc *msidesc;
3230 struct intel_iommu *iommu = NULL;
3169 3231
3170 /* x86 doesn't support multiple MSI yet */ 3232 /* x86 doesn't support multiple MSI yet */
3171 if (type == PCI_CAP_ID_MSI && nvec > 1) 3233 if (type == PCI_CAP_ID_MSI && nvec > 1)
@@ -3179,7 +3241,7 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3179 if (irq == 0) 3241 if (irq == 0)
3180 return -1; 3242 return -1;
3181 irq_want = irq + 1; 3243 irq_want = irq + 1;
3182 if (!irq_remapping_enabled) 3244 if (!intr_remapping_enabled)
3183 goto no_ir; 3245 goto no_ir;
3184 3246
3185 if (!sub_handle) { 3247 if (!sub_handle) {
@@ -3187,16 +3249,23 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3187 * allocate the consecutive block of IRTE's 3249 * allocate the consecutive block of IRTE's
3188 * for 'nvec' 3250 * for 'nvec'
3189 */ 3251 */
3190 index = msi_alloc_remapped_irq(dev, irq, nvec); 3252 index = msi_alloc_irte(dev, irq, nvec);
3191 if (index < 0) { 3253 if (index < 0) {
3192 ret = index; 3254 ret = index;
3193 goto error; 3255 goto error;
3194 } 3256 }
3195 } else { 3257 } else {
3196 ret = msi_setup_remapped_irq(dev, irq, index, 3258 iommu = map_dev_to_ir(dev);
3197 sub_handle); 3259 if (!iommu) {
3198 if (ret < 0) 3260 ret = -ENOENT;
3199 goto error; 3261 goto error;
3262 }
3263 /*
3264 * setup the mapping between the irq and the IRTE
3265 * base index, the sub_handle pointing to the
3266 * appropriate interrupt remap table entry.
3267 */
3268 set_irte_irq(irq, iommu, index, sub_handle);
3200 } 3269 }
3201no_ir: 3270no_ir:
3202 ret = setup_msi_irq(dev, msidesc, irq); 3271 ret = setup_msi_irq(dev, msidesc, irq);
@@ -3217,6 +3286,7 @@ void native_teardown_msi_irq(unsigned int irq)
3217} 3286}
3218 3287
3219#ifdef CONFIG_DMAR_TABLE 3288#ifdef CONFIG_DMAR_TABLE
3289#ifdef CONFIG_SMP
3220static int 3290static int
3221dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, 3291dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
3222 bool force) 3292 bool force)
@@ -3238,15 +3308,19 @@ dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
3238 3308
3239 dmar_msi_write(irq, &msg); 3309 dmar_msi_write(irq, &msg);
3240 3310
3241 return IRQ_SET_MASK_OK_NOCOPY; 3311 return 0;
3242} 3312}
3243 3313
3314#endif /* CONFIG_SMP */
3315
3244static struct irq_chip dmar_msi_type = { 3316static struct irq_chip dmar_msi_type = {
3245 .name = "DMAR_MSI", 3317 .name = "DMAR_MSI",
3246 .irq_unmask = dmar_msi_unmask, 3318 .irq_unmask = dmar_msi_unmask,
3247 .irq_mask = dmar_msi_mask, 3319 .irq_mask = dmar_msi_mask,
3248 .irq_ack = ack_apic_edge, 3320 .irq_ack = ack_apic_edge,
3321#ifdef CONFIG_SMP
3249 .irq_set_affinity = dmar_msi_set_affinity, 3322 .irq_set_affinity = dmar_msi_set_affinity,
3323#endif
3250 .irq_retrigger = ioapic_retrigger_irq, 3324 .irq_retrigger = ioapic_retrigger_irq,
3251}; 3325};
3252 3326
@@ -3267,6 +3341,7 @@ int arch_setup_dmar_msi(unsigned int irq)
3267 3341
3268#ifdef CONFIG_HPET_TIMER 3342#ifdef CONFIG_HPET_TIMER
3269 3343
3344#ifdef CONFIG_SMP
3270static int hpet_msi_set_affinity(struct irq_data *data, 3345static int hpet_msi_set_affinity(struct irq_data *data,
3271 const struct cpumask *mask, bool force) 3346 const struct cpumask *mask, bool force)
3272{ 3347{
@@ -3286,15 +3361,19 @@ static int hpet_msi_set_affinity(struct irq_data *data,
3286 3361
3287 hpet_msi_write(data->handler_data, &msg); 3362 hpet_msi_write(data->handler_data, &msg);
3288 3363
3289 return IRQ_SET_MASK_OK_NOCOPY; 3364 return 0;
3290} 3365}
3291 3366
3367#endif /* CONFIG_SMP */
3368
3292static struct irq_chip hpet_msi_type = { 3369static struct irq_chip hpet_msi_type = {
3293 .name = "HPET_MSI", 3370 .name = "HPET_MSI",
3294 .irq_unmask = hpet_msi_unmask, 3371 .irq_unmask = hpet_msi_unmask,
3295 .irq_mask = hpet_msi_mask, 3372 .irq_mask = hpet_msi_mask,
3296 .irq_ack = ack_apic_edge, 3373 .irq_ack = ack_apic_edge,
3374#ifdef CONFIG_SMP
3297 .irq_set_affinity = hpet_msi_set_affinity, 3375 .irq_set_affinity = hpet_msi_set_affinity,
3376#endif
3298 .irq_retrigger = ioapic_retrigger_irq, 3377 .irq_retrigger = ioapic_retrigger_irq,
3299}; 3378};
3300 3379
@@ -3304,10 +3383,16 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
3304 struct msi_msg msg; 3383 struct msi_msg msg;
3305 int ret; 3384 int ret;
3306 3385
3307 if (irq_remapping_enabled) { 3386 if (intr_remapping_enabled) {
3308 ret = setup_hpet_msi_remapped(irq, id); 3387 struct intel_iommu *iommu = map_hpet_to_ir(id);
3309 if (ret) 3388 int index;
3310 return ret; 3389
3390 if (!iommu)
3391 return -1;
3392
3393 index = alloc_irte(iommu, irq, 1);
3394 if (index < 0)
3395 return -1;
3311 } 3396 }
3312 3397
3313 ret = msi_compose_msg(NULL, irq, &msg, id); 3398 ret = msi_compose_msg(NULL, irq, &msg, id);
@@ -3330,6 +3415,8 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
3330 */ 3415 */
3331#ifdef CONFIG_HT_IRQ 3416#ifdef CONFIG_HT_IRQ
3332 3417
3418#ifdef CONFIG_SMP
3419
3333static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) 3420static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
3334{ 3421{
3335 struct ht_irq_msg msg; 3422 struct ht_irq_msg msg;
@@ -3354,23 +3441,25 @@ ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
3354 return -1; 3441 return -1;
3355 3442
3356 target_ht_irq(data->irq, dest, cfg->vector); 3443 target_ht_irq(data->irq, dest, cfg->vector);
3357 return IRQ_SET_MASK_OK_NOCOPY; 3444 return 0;
3358} 3445}
3359 3446
3447#endif
3448
3360static struct irq_chip ht_irq_chip = { 3449static struct irq_chip ht_irq_chip = {
3361 .name = "PCI-HT", 3450 .name = "PCI-HT",
3362 .irq_mask = mask_ht_irq, 3451 .irq_mask = mask_ht_irq,
3363 .irq_unmask = unmask_ht_irq, 3452 .irq_unmask = unmask_ht_irq,
3364 .irq_ack = ack_apic_edge, 3453 .irq_ack = ack_apic_edge,
3454#ifdef CONFIG_SMP
3365 .irq_set_affinity = ht_set_affinity, 3455 .irq_set_affinity = ht_set_affinity,
3456#endif
3366 .irq_retrigger = ioapic_retrigger_irq, 3457 .irq_retrigger = ioapic_retrigger_irq,
3367}; 3458};
3368 3459
3369int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) 3460int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3370{ 3461{
3371 struct irq_cfg *cfg; 3462 struct irq_cfg *cfg;
3372 struct ht_irq_msg msg;
3373 unsigned dest;
3374 int err; 3463 int err;
3375 3464
3376 if (disable_apic) 3465 if (disable_apic)
@@ -3378,37 +3467,36 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3378 3467
3379 cfg = irq_cfg(irq); 3468 cfg = irq_cfg(irq);
3380 err = assign_irq_vector(irq, cfg, apic->target_cpus()); 3469 err = assign_irq_vector(irq, cfg, apic->target_cpus());
3381 if (err) 3470 if (!err) {
3382 return err; 3471 struct ht_irq_msg msg;
3472 unsigned dest;
3383 3473
3384 err = apic->cpu_mask_to_apicid_and(cfg->domain, 3474 dest = apic->cpu_mask_to_apicid_and(cfg->domain,
3385 apic->target_cpus(), &dest); 3475 apic->target_cpus());
3386 if (err)
3387 return err;
3388 3476
3389 msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); 3477 msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
3390
3391 msg.address_lo =
3392 HT_IRQ_LOW_BASE |
3393 HT_IRQ_LOW_DEST_ID(dest) |
3394 HT_IRQ_LOW_VECTOR(cfg->vector) |
3395 ((apic->irq_dest_mode == 0) ?
3396 HT_IRQ_LOW_DM_PHYSICAL :
3397 HT_IRQ_LOW_DM_LOGICAL) |
3398 HT_IRQ_LOW_RQEOI_EDGE |
3399 ((apic->irq_delivery_mode != dest_LowestPrio) ?
3400 HT_IRQ_LOW_MT_FIXED :
3401 HT_IRQ_LOW_MT_ARBITRATED) |
3402 HT_IRQ_LOW_IRQ_MASKED;
3403 3478
3404 write_ht_irq_msg(irq, &msg); 3479 msg.address_lo =
3480 HT_IRQ_LOW_BASE |
3481 HT_IRQ_LOW_DEST_ID(dest) |
3482 HT_IRQ_LOW_VECTOR(cfg->vector) |
3483 ((apic->irq_dest_mode == 0) ?
3484 HT_IRQ_LOW_DM_PHYSICAL :
3485 HT_IRQ_LOW_DM_LOGICAL) |
3486 HT_IRQ_LOW_RQEOI_EDGE |
3487 ((apic->irq_delivery_mode != dest_LowestPrio) ?
3488 HT_IRQ_LOW_MT_FIXED :
3489 HT_IRQ_LOW_MT_ARBITRATED) |
3490 HT_IRQ_LOW_IRQ_MASKED;
3405 3491
3406 irq_set_chip_and_handler_name(irq, &ht_irq_chip, 3492 write_ht_irq_msg(irq, &msg);
3407 handle_edge_irq, "edge");
3408 3493
3409 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq); 3494 irq_set_chip_and_handler_name(irq, &ht_irq_chip,
3495 handle_edge_irq, "edge");
3410 3496
3411 return 0; 3497 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq);
3498 }
3499 return err;
3412} 3500}
3413#endif /* CONFIG_HT_IRQ */ 3501#endif /* CONFIG_HT_IRQ */
3414 3502
@@ -3422,25 +3510,26 @@ io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr)
3422 return -EINVAL; 3510 return -EINVAL;
3423 ret = __add_pin_to_irq_node(cfg, node, attr->ioapic, attr->ioapic_pin); 3511 ret = __add_pin_to_irq_node(cfg, node, attr->ioapic, attr->ioapic_pin);
3424 if (!ret) 3512 if (!ret)
3425 setup_ioapic_irq(irq, cfg, attr); 3513 setup_ioapic_irq(attr->ioapic, attr->ioapic_pin, irq, cfg,
3514 attr->trigger, attr->polarity);
3426 return ret; 3515 return ret;
3427} 3516}
3428 3517
3429int io_apic_setup_irq_pin_once(unsigned int irq, int node, 3518int io_apic_setup_irq_pin_once(unsigned int irq, int node,
3430 struct io_apic_irq_attr *attr) 3519 struct io_apic_irq_attr *attr)
3431{ 3520{
3432 unsigned int ioapic_idx = attr->ioapic, pin = attr->ioapic_pin; 3521 unsigned int id = attr->ioapic, pin = attr->ioapic_pin;
3433 int ret; 3522 int ret;
3434 3523
3435 /* Avoid redundant programming */ 3524 /* Avoid redundant programming */
3436 if (test_bit(pin, ioapics[ioapic_idx].pin_programmed)) { 3525 if (test_bit(pin, ioapics[id].pin_programmed)) {
3437 pr_debug("Pin %d-%d already programmed\n", 3526 pr_debug("Pin %d-%d already programmed\n",
3438 mpc_ioapic_id(ioapic_idx), pin); 3527 mpc_ioapic_id(id), pin);
3439 return 0; 3528 return 0;
3440 } 3529 }
3441 ret = io_apic_setup_irq_pin(irq, node, attr); 3530 ret = io_apic_setup_irq_pin(irq, node, attr);
3442 if (!ret) 3531 if (!ret)
3443 set_bit(pin, ioapics[ioapic_idx].pin_programmed); 3532 set_bit(pin, ioapics[id].pin_programmed);
3444 return ret; 3533 return ret;
3445} 3534}
3446 3535
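io_apic_setup_irq_pin_once() above guards against programming the same pin twice with a per-IOAPIC pin_programmed bitmap: test the bit first, and set it only after a successful setup. The same idempotency pattern in miniature, with plain bit operations standing in for test_bit/set_bit.

#include <stdio.h>

#define MAX_PINS 24                     /* typical IO-APIC pin count */

static unsigned long pin_programmed;    /* one bit per pin, like the bitmap above */

static int setup_pin_once(int pin)
{
        if (pin < 0 || pin >= MAX_PINS)
                return -1;
        if (pin_programmed & (1UL << pin)) {
                printf("pin %d already programmed, skipping\n", pin);
                return 0;
        }
        printf("programming pin %d\n", pin);
        pin_programmed |= 1UL << pin;   /* marked only after the real setup */
        return 0;
}

int main(void)
{
        setup_pin_once(9);
        setup_pin_once(9);              /* second call is a no-op */
        return 0;
}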
@@ -3476,6 +3565,7 @@ int get_nr_irqs_gsi(void)
3476 return nr_irqs_gsi; 3565 return nr_irqs_gsi;
3477} 3566}
3478 3567
3568#ifdef CONFIG_SPARSE_IRQ
3479int __init arch_probe_nr_irqs(void) 3569int __init arch_probe_nr_irqs(void)
3480{ 3570{
3481 int nr; 3571 int nr;
@@ -3495,6 +3585,7 @@ int __init arch_probe_nr_irqs(void)
3495 3585
3496 return NR_IRQS_LEGACY; 3586 return NR_IRQS_LEGACY;
3497} 3587}
3588#endif
3498 3589
3499int io_apic_set_pci_routing(struct device *dev, int irq, 3590int io_apic_set_pci_routing(struct device *dev, int irq,
3500 struct io_apic_irq_attr *irq_attr) 3591 struct io_apic_irq_attr *irq_attr)
@@ -3576,8 +3667,7 @@ static int __init io_apic_get_unique_id(int ioapic, int apic_id)
3576 3667
3577 /* Sanity check */ 3668 /* Sanity check */
3578 if (reg_00.bits.ID != apic_id) { 3669 if (reg_00.bits.ID != apic_id) {
3579 pr_err("IOAPIC[%d]: Unable to change apic_id!\n", 3670 printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
3580 ioapic);
3581 return -1; 3671 return -1;
3582 } 3672 }
3583 } 3673 }
@@ -3683,8 +3773,8 @@ void __init setup_ioapic_dest(void)
3683 else 3773 else
3684 mask = apic->target_cpus(); 3774 mask = apic->target_cpus();
3685 3775
3686 if (irq_remapping_enabled) 3776 if (intr_remapping_enabled)
3687 set_remapped_irq_affinity(idata, mask, false); 3777 ir_ioapic_set_affinity(idata, mask, false);
3688 else 3778 else
3689 ioapic_set_affinity(idata, mask, false); 3779 ioapic_set_affinity(idata, mask, false);
3690 } 3780 }
@@ -3726,7 +3816,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics)
3726 return res; 3816 return res;
3727} 3817}
3728 3818
3729void __init native_io_apic_init_mappings(void) 3819void __init ioapic_and_gsi_init(void)
3730{ 3820{
3731 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; 3821 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
3732 struct resource *ioapic_res; 3822 struct resource *ioapic_res;
@@ -3822,33 +3912,15 @@ int mp_find_ioapic_pin(int ioapic, u32 gsi)
3822static __init int bad_ioapic(unsigned long address) 3912static __init int bad_ioapic(unsigned long address)
3823{ 3913{
3824 if (nr_ioapics >= MAX_IO_APICS) { 3914 if (nr_ioapics >= MAX_IO_APICS) {
3825 pr_warn("WARNING: Max # of I/O APICs (%d) exceeded (found %d), skipping\n", 3915 printk(KERN_WARNING "WARNING: Max # of I/O APICs (%d) exceeded "
3826 MAX_IO_APICS, nr_ioapics); 3916 "(found %d), skipping\n", MAX_IO_APICS, nr_ioapics);
3827 return 1; 3917 return 1;
3828 } 3918 }
3829 if (!address) { 3919 if (!address) {
3830 pr_warn("WARNING: Bogus (zero) I/O APIC address found in table, skipping!\n"); 3920 printk(KERN_WARNING "WARNING: Bogus (zero) I/O APIC address"
3831 return 1; 3921 " found in table, skipping!\n");
3832 }
3833 return 0;
3834}
3835
3836static __init int bad_ioapic_register(int idx)
3837{
3838 union IO_APIC_reg_00 reg_00;
3839 union IO_APIC_reg_01 reg_01;
3840 union IO_APIC_reg_02 reg_02;
3841
3842 reg_00.raw = io_apic_read(idx, 0);
3843 reg_01.raw = io_apic_read(idx, 1);
3844 reg_02.raw = io_apic_read(idx, 2);
3845
3846 if (reg_00.raw == -1 && reg_01.raw == -1 && reg_02.raw == -1) {
3847 pr_warn("I/O APIC 0x%x registers return all ones, skipping!\n",
3848 mpc_ioapic_addr(idx));
3849 return 1; 3922 return 1;
3850 } 3923 }
3851
3852 return 0; 3924 return 0;
3853} 3925}
3854 3926
@@ -3868,12 +3940,6 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
3868 ioapics[idx].mp_config.apicaddr = address; 3940 ioapics[idx].mp_config.apicaddr = address;
3869 3941
3870 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); 3942 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
3871
3872 if (bad_ioapic_register(idx)) {
3873 clear_fixmap(FIX_IO_APIC_BASE_0 + idx);
3874 return;
3875 }
3876
3877 ioapics[idx].mp_config.apicid = io_apic_unique_id(id); 3943 ioapics[idx].mp_config.apicid = io_apic_unique_id(id);
3878 ioapics[idx].mp_config.apicver = io_apic_get_version(idx); 3944 ioapics[idx].mp_config.apicver = io_apic_get_version(idx);
3879 3945
@@ -3894,10 +3960,10 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
3894 if (gsi_cfg->gsi_end >= gsi_top) 3960 if (gsi_cfg->gsi_end >= gsi_top)
3895 gsi_top = gsi_cfg->gsi_end + 1; 3961 gsi_top = gsi_cfg->gsi_end + 1;
3896 3962
3897 pr_info("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, GSI %d-%d\n", 3963 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
3898 idx, mpc_ioapic_id(idx), 3964 "GSI %d-%d\n", idx, mpc_ioapic_id(idx),
3899 mpc_ioapic_ver(idx), mpc_ioapic_addr(idx), 3965 mpc_ioapic_ver(idx), mpc_ioapic_addr(idx),
3900 gsi_cfg->gsi_base, gsi_cfg->gsi_end); 3966 gsi_cfg->gsi_base, gsi_cfg->gsi_end);
3901 3967
3902 nr_ioapics++; 3968 nr_ioapics++;
3903} 3969}
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index d661ee95cab..c4a61ca1349 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -406,13 +406,16 @@ static inline int numaq_check_phys_apicid_present(int phys_apicid)
406 * We use physical apicids here, not logical, so just return the default 406 * We use physical apicids here, not logical, so just return the default
407 * physical broadcast to stop people from breaking us 407 * physical broadcast to stop people from breaking us
408 */ 408 */
409static int 409static unsigned int numaq_cpu_mask_to_apicid(const struct cpumask *cpumask)
410{
411 return 0x0F;
412}
413
414static inline unsigned int
410numaq_cpu_mask_to_apicid_and(const struct cpumask *cpumask, 415numaq_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
411 const struct cpumask *andmask, 416 const struct cpumask *andmask)
412 unsigned int *apicid)
413{ 417{
414 *apicid = 0x0F; 418 return 0x0F;
415 return 0;
416} 419}
417 420
418/* No NUMA-Q box has a HT CPU, but it can't hurt to use the default code. */ 421/* No NUMA-Q box has a HT CPU, but it can't hurt to use the default code. */
@@ -438,6 +441,20 @@ static int probe_numaq(void)
438 return found_numaq; 441 return found_numaq;
439} 442}
440 443
444static void numaq_vector_allocation_domain(int cpu, struct cpumask *retmask)
445{
446 /* Careful. Some cpus do not strictly honor the set of cpus
447 * specified in the interrupt destination when using lowest
448 * priority interrupt delivery mode.
449 *
450 * In particular there was a hyperthreading cpu observed to
451 * deliver interrupts to the wrong hyperthread when only one
452 * hyperthread was specified in the interrupt destination.
453 */
454 cpumask_clear(retmask);
455 cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
456}
457
441static void numaq_setup_portio_remap(void) 458static void numaq_setup_portio_remap(void)
442{ 459{
443 int num_quads = num_online_nodes(); 460 int num_quads = num_online_nodes();
@@ -461,7 +478,6 @@ static struct apic __refdata apic_numaq = {
461 .name = "NUMAQ", 478 .name = "NUMAQ",
462 .probe = probe_numaq, 479 .probe = probe_numaq,
463 .acpi_madt_oem_check = NULL, 480 .acpi_madt_oem_check = NULL,
464 .apic_id_valid = default_apic_id_valid,
465 .apic_id_registered = numaq_apic_id_registered, 481 .apic_id_registered = numaq_apic_id_registered,
466 482
467 .irq_delivery_mode = dest_LowestPrio, 483 .irq_delivery_mode = dest_LowestPrio,
@@ -474,7 +490,7 @@ static struct apic __refdata apic_numaq = {
474 .check_apicid_used = numaq_check_apicid_used, 490 .check_apicid_used = numaq_check_apicid_used,
475 .check_apicid_present = numaq_check_apicid_present, 491 .check_apicid_present = numaq_check_apicid_present,
476 492
477 .vector_allocation_domain = flat_vector_allocation_domain, 493 .vector_allocation_domain = numaq_vector_allocation_domain,
478 .init_apic_ldr = numaq_init_apic_ldr, 494 .init_apic_ldr = numaq_init_apic_ldr,
479 495
480 .ioapic_phys_id_map = numaq_ioapic_phys_id_map, 496 .ioapic_phys_id_map = numaq_ioapic_phys_id_map,
@@ -492,6 +508,7 @@ static struct apic __refdata apic_numaq = {
492 .set_apic_id = NULL, 508 .set_apic_id = NULL,
493 .apic_id_mask = 0x0F << 24, 509 .apic_id_mask = 0x0F << 24,
494 510
511 .cpu_mask_to_apicid = numaq_cpu_mask_to_apicid,
495 .cpu_mask_to_apicid_and = numaq_cpu_mask_to_apicid_and, 512 .cpu_mask_to_apicid_and = numaq_cpu_mask_to_apicid_and,
496 513
497 .send_IPI_mask = numaq_send_IPI_mask, 514 .send_IPI_mask = numaq_send_IPI_mask,
@@ -512,7 +529,6 @@ static struct apic __refdata apic_numaq = {
512 529
513 .read = native_apic_mem_read, 530 .read = native_apic_mem_read,
514 .write = native_apic_mem_write, 531 .write = native_apic_mem_write,
515 .eoi_write = native_apic_mem_write,
516 .icr_read = native_apic_icr_read, 532 .icr_read = native_apic_icr_read,
517 .icr_write = native_apic_icr_write, 533 .icr_write = native_apic_icr_write,
518 .wait_icr_idle = native_apic_wait_icr_idle, 534 .wait_icr_idle = native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index eb35ef9ee63..0787bb3412f 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -66,6 +66,21 @@ static void setup_apic_flat_routing(void)
66#endif 66#endif
67} 67}
68 68
69static void default_vector_allocation_domain(int cpu, struct cpumask *retmask)
70{
71 /*
72 * Careful. Some cpus do not strictly honor the set of cpus
73 * specified in the interrupt destination when using lowest
74 * priority interrupt delivery mode.
75 *
76 * In particular there was a hyperthreading cpu observed to
77 * deliver interrupts to the wrong hyperthread when only one
78 * hyperthread was specified in the interrupt destination.
79 */
80 cpumask_clear(retmask);
81 cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
82}
83
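All three vector_allocation_domain helpers reintroduced in this patch widen the allocation domain to the whole flat-logical cluster instead of trusting lowest-priority delivery to honor a narrow mask. Assuming APIC_ALL_CPUS is the usual 0xFF used by these 32-bit drivers, the bitmap manipulation amounts to the following sketch, with a plain unsigned long standing in for struct cpumask.

#include <stdio.h>

#define APIC_ALL_CPUS 0xFFu     /* assumed flat-logical value: CPUs 0-7 */

int main(void)
{
        unsigned long retmask[1] = { 0 };       /* stands in for cpumask bits */
        int cpu;

        retmask[0] = APIC_ALL_CPUS;             /* what the helpers above do  */

        for (cpu = 0; cpu < 8; cpu++)
                if (retmask[0] & (1UL << cpu))
                        printf("cpu %d is in the allocation domain\n", cpu);
        return 0;
}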
69/* should be called last. */ 84/* should be called last. */
70static int probe_default(void) 85static int probe_default(void)
71{ 86{
@@ -77,7 +92,6 @@ static struct apic apic_default = {
77 .name = "default", 92 .name = "default",
78 .probe = probe_default, 93 .probe = probe_default,
79 .acpi_madt_oem_check = NULL, 94 .acpi_madt_oem_check = NULL,
80 .apic_id_valid = default_apic_id_valid,
81 .apic_id_registered = default_apic_id_registered, 95 .apic_id_registered = default_apic_id_registered,
82 96
83 .irq_delivery_mode = dest_LowestPrio, 97 .irq_delivery_mode = dest_LowestPrio,
@@ -90,7 +104,7 @@ static struct apic apic_default = {
90 .check_apicid_used = default_check_apicid_used, 104 .check_apicid_used = default_check_apicid_used,
91 .check_apicid_present = default_check_apicid_present, 105 .check_apicid_present = default_check_apicid_present,
92 106
93 .vector_allocation_domain = flat_vector_allocation_domain, 107 .vector_allocation_domain = default_vector_allocation_domain,
94 .init_apic_ldr = default_init_apic_ldr, 108 .init_apic_ldr = default_init_apic_ldr,
95 109
96 .ioapic_phys_id_map = default_ioapic_phys_id_map, 110 .ioapic_phys_id_map = default_ioapic_phys_id_map,
@@ -108,7 +122,8 @@ static struct apic apic_default = {
108 .set_apic_id = NULL, 122 .set_apic_id = NULL,
109 .apic_id_mask = 0x0F << 24, 123 .apic_id_mask = 0x0F << 24,
110 124
111 .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, 125 .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
126 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
112 127
113 .send_IPI_mask = default_send_IPI_mask_logical, 128 .send_IPI_mask = default_send_IPI_mask_logical,
114 .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_logical, 129 .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_logical,
@@ -126,7 +141,6 @@ static struct apic apic_default = {
126 141
127 .read = native_apic_mem_read, 142 .read = native_apic_mem_read,
128 .write = native_apic_mem_write, 143 .write = native_apic_mem_write,
129 .eoi_write = native_apic_mem_write,
130 .icr_read = native_apic_icr_read, 144 .icr_read = native_apic_icr_read,
131 .icr_write = native_apic_icr_write, 145 .icr_write = native_apic_icr_write,
132 .wait_icr_idle = native_apic_wait_icr_idle, 146 .wait_icr_idle = native_apic_wait_icr_idle,
@@ -192,9 +206,6 @@ void __init default_setup_apic_routing(void)
192 206
193 if (apic->setup_apic_routing) 207 if (apic->setup_apic_routing)
194 apic->setup_apic_routing(); 208 apic->setup_apic_routing();
195
196 if (x86_platform.apic_post_init)
197 x86_platform.apic_post_init();
198} 209}
199 210
200void __init generic_apic_probe(void) 211void __init generic_apic_probe(void)
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index 1793dba7a74..3fe98669892 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -23,6 +23,11 @@
23#include <asm/ipi.h> 23#include <asm/ipi.h>
24#include <asm/setup.h> 24#include <asm/setup.h>
25 25
26static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
27{
28 return hard_smp_processor_id() >> index_msb;
29}
30
26/* 31/*
27 * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. 32 * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
28 */ 33 */
@@ -43,8 +48,10 @@ void __init default_setup_apic_routing(void)
43 } 48 }
44 } 49 }
45 50
46 if (x86_platform.apic_post_init) 51 if (is_vsmp_box()) {
47 x86_platform.apic_post_init(); 52 /* need to update phys_pkg_id */
53 apic->phys_pkg_id = apicid_phys_pkg_id;
54 }
48} 55}
49 56
50/* Same for both flat and physical. */ 57/* Same for both flat and physical. */
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 77c95c0e1bf..19114423c58 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -26,8 +26,6 @@
26 * 26 *
27 */ 27 */
28 28
29#define pr_fmt(fmt) "summit: %s: " fmt, __func__
30
31#include <linux/mm.h> 29#include <linux/mm.h>
32#include <linux/init.h> 30#include <linux/init.h>
33#include <asm/io.h> 31#include <asm/io.h>
@@ -237,8 +235,8 @@ static int summit_apic_id_registered(void)
237 235
238static void summit_setup_apic_routing(void) 236static void summit_setup_apic_routing(void)
239{ 237{
240 pr_info("Enabling APIC mode: Summit. Using %d I/O APICs\n", 238 printk("Enabling APIC mode: Summit. Using %d I/O APICs\n",
241 nr_ioapics); 239 nr_ioapics);
242} 240}
243 241
244static int summit_cpu_present_to_apicid(int mps_cpu) 242static int summit_cpu_present_to_apicid(int mps_cpu)
@@ -265,48 +263,43 @@ static int summit_check_phys_apicid_present(int physical_apicid)
265 return 1; 263 return 1;
266} 264}
267 265
268static inline int 266static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask)
269summit_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id)
270{ 267{
271 unsigned int round = 0; 268 unsigned int round = 0;
272 unsigned int cpu, apicid = 0; 269 int cpu, apicid = 0;
273 270
274 /* 271 /*
275 * The cpus in the mask must all be on the apic cluster. 272 * The cpus in the mask must all be on the apic cluster.
276 */ 273 */
277 for_each_cpu_and(cpu, cpumask, cpu_online_mask) { 274 for_each_cpu(cpu, cpumask) {
278 int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); 275 int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
279 276
280 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { 277 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
281 pr_err("Not a valid mask!\n"); 278 printk("%s: Not a valid mask!\n", __func__);
282 return -EINVAL; 279 return BAD_APICID;
283 } 280 }
284 apicid |= new_apicid; 281 apicid |= new_apicid;
285 round++; 282 round++;
286 } 283 }
287 if (!round) 284 return apicid;
288 return -EINVAL;
289 *dest_id = apicid;
290 return 0;
291} 285}
292 286
293static int 287static unsigned int summit_cpu_mask_to_apicid_and(const struct cpumask *inmask,
294summit_cpu_mask_to_apicid_and(const struct cpumask *inmask, 288 const struct cpumask *andmask)
295 const struct cpumask *andmask,
296 unsigned int *apicid)
297{ 289{
290 int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
298 cpumask_var_t cpumask; 291 cpumask_var_t cpumask;
299 *apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
300 292
301 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) 293 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
302 return 0; 294 return apicid;
303 295
304 cpumask_and(cpumask, inmask, andmask); 296 cpumask_and(cpumask, inmask, andmask);
305 summit_cpu_mask_to_apicid(cpumask, apicid); 297 cpumask_and(cpumask, cpumask, cpu_online_mask);
298 apicid = summit_cpu_mask_to_apicid(cpumask);
306 299
307 free_cpumask_var(cpumask); 300 free_cpumask_var(cpumask);
308 301
309 return 0; 302 return apicid;
310} 303}
311 304
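summit_cpu_mask_to_apicid() above builds a multi-CPU logical destination by OR-ing per-CPU logical IDs and bails out with BAD_APICID as soon as two CPUs land in different APIC clusters. A compact model of that merge; taking APIC_CLUSTER() to be the high nibble of the 8-bit logical ID is an assumption made here for illustration.

#include <stdio.h>

#define BAD_APICID        0xFFu
#define APIC_CLUSTER(id)  ((id) & 0xF0u)   /* assumed: high nibble = cluster */

/* OR together logical IDs, refusing masks that span APIC clusters,
 * mirroring the summit helper in the hunk above. */
static unsigned int merge_logical_ids(const unsigned int *ids, int n)
{
        unsigned int apicid = 0;
        int round, i;

        for (round = 0, i = 0; i < n; i++, round++) {
                if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(ids[i]))
                        return BAD_APICID;      /* not a valid mask */
                apicid |= ids[i];
        }
        return apicid;
}

int main(void)
{
        unsigned int same_cluster[]  = { 0x11, 0x12, 0x14 };
        unsigned int cross_cluster[] = { 0x11, 0x21 };

        printf("0x%02x\n", merge_logical_ids(same_cluster, 3));   /* 0x17 */
        printf("0x%02x\n", merge_logical_ids(cross_cluster, 2));  /* 0xff */
        return 0;
}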
312/* 305/*
@@ -327,6 +320,20 @@ static int probe_summit(void)
327 return 0; 320 return 0;
328} 321}
329 322
323static void summit_vector_allocation_domain(int cpu, struct cpumask *retmask)
324{
325 /* Careful. Some cpus do not strictly honor the set of cpus
326 * specified in the interrupt destination when using lowest
327 * priority interrupt delivery mode.
328 *
329 * In particular there was a hyperthreading cpu observed to
330 * deliver interrupts to the wrong hyperthread when only one
331 * hyperthread was specified in the interrupt destination.
332 */
333 cpumask_clear(retmask);
334 cpumask_bits(retmask)[0] = APIC_ALL_CPUS;
335}
336
330#ifdef CONFIG_X86_SUMMIT_NUMA 337#ifdef CONFIG_X86_SUMMIT_NUMA
331static struct rio_table_hdr *rio_table_hdr; 338static struct rio_table_hdr *rio_table_hdr;
332static struct scal_detail *scal_devs[MAX_NUMNODES]; 339static struct scal_detail *scal_devs[MAX_NUMNODES];
@@ -348,7 +355,7 @@ static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
348 } 355 }
349 } 356 }
350 if (i == rio_table_hdr->num_rio_dev) { 357 if (i == rio_table_hdr->num_rio_dev) {
351 pr_err("Couldn't find owner Cyclone for Winnipeg!\n"); 358 printk(KERN_ERR "%s: Couldn't find owner Cyclone for Winnipeg!\n", __func__);
352 return last_bus; 359 return last_bus;
353 } 360 }
354 361
@@ -359,7 +366,7 @@ static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
359 } 366 }
360 } 367 }
361 if (i == rio_table_hdr->num_scal_dev) { 368 if (i == rio_table_hdr->num_scal_dev) {
362 pr_err("Couldn't find owner Twister for Cyclone!\n"); 369 printk(KERN_ERR "%s: Couldn't find owner Twister for Cyclone!\n", __func__);
363 return last_bus; 370 return last_bus;
364 } 371 }
365 372
@@ -389,7 +396,7 @@ static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
389 num_buses = 9; 396 num_buses = 9;
390 break; 397 break;
391 default: 398 default:
392 pr_info("Unsupported Winnipeg type!\n"); 399 printk(KERN_INFO "%s: Unsupported Winnipeg type!\n", __func__);
393 return last_bus; 400 return last_bus;
394 } 401 }
395 402
@@ -404,15 +411,13 @@ static int build_detail_arrays(void)
404 int i, scal_detail_size, rio_detail_size; 411 int i, scal_detail_size, rio_detail_size;
405 412
406 if (rio_table_hdr->num_scal_dev > MAX_NUMNODES) { 413 if (rio_table_hdr->num_scal_dev > MAX_NUMNODES) {
407 pr_warn("MAX_NUMNODES too low! Defined as %d, but system has %d nodes\n", 414 printk(KERN_WARNING "%s: MAX_NUMNODES too low! Defined as %d, but system has %d nodes.\n", __func__, MAX_NUMNODES, rio_table_hdr->num_scal_dev);
408 MAX_NUMNODES, rio_table_hdr->num_scal_dev);
409 return 0; 415 return 0;
410 } 416 }
411 417
412 switch (rio_table_hdr->version) { 418 switch (rio_table_hdr->version) {
413 default: 419 default:
414 pr_warn("Invalid Rio Grande Table Version: %d\n", 420 printk(KERN_WARNING "%s: Invalid Rio Grande Table Version: %d\n", __func__, rio_table_hdr->version);
415 rio_table_hdr->version);
416 return 0; 421 return 0;
417 case 2: 422 case 2:
418 scal_detail_size = 11; 423 scal_detail_size = 11;
@@ -457,7 +462,7 @@ void setup_summit(void)
457 offset = *((unsigned short *)(ptr + offset)); 462 offset = *((unsigned short *)(ptr + offset));
458 } 463 }
459 if (!rio_table_hdr) { 464 if (!rio_table_hdr) {
460 pr_err("Unable to locate Rio Grande Table in EBDA - bailing!\n"); 465 printk(KERN_ERR "%s: Unable to locate Rio Grande Table in EBDA - bailing!\n", __func__);
461 return; 466 return;
462 } 467 }
463 468
@@ -491,7 +496,6 @@ static struct apic apic_summit = {
491 .name = "summit", 496 .name = "summit",
492 .probe = probe_summit, 497 .probe = probe_summit,
493 .acpi_madt_oem_check = summit_acpi_madt_oem_check, 498 .acpi_madt_oem_check = summit_acpi_madt_oem_check,
494 .apic_id_valid = default_apic_id_valid,
495 .apic_id_registered = summit_apic_id_registered, 499 .apic_id_registered = summit_apic_id_registered,
496 500
497 .irq_delivery_mode = dest_LowestPrio, 501 .irq_delivery_mode = dest_LowestPrio,
@@ -504,7 +508,7 @@ static struct apic apic_summit = {
504 .check_apicid_used = summit_check_apicid_used, 508 .check_apicid_used = summit_check_apicid_used,
505 .check_apicid_present = summit_check_apicid_present, 509 .check_apicid_present = summit_check_apicid_present,
506 510
507 .vector_allocation_domain = flat_vector_allocation_domain, 511 .vector_allocation_domain = summit_vector_allocation_domain,
508 .init_apic_ldr = summit_init_apic_ldr, 512 .init_apic_ldr = summit_init_apic_ldr,
509 513
510 .ioapic_phys_id_map = summit_ioapic_phys_id_map, 514 .ioapic_phys_id_map = summit_ioapic_phys_id_map,
@@ -522,6 +526,7 @@ static struct apic apic_summit = {
522 .set_apic_id = NULL, 526 .set_apic_id = NULL,
523 .apic_id_mask = 0xFF << 24, 527 .apic_id_mask = 0xFF << 24,
524 528
529 .cpu_mask_to_apicid = summit_cpu_mask_to_apicid,
525 .cpu_mask_to_apicid_and = summit_cpu_mask_to_apicid_and, 530 .cpu_mask_to_apicid_and = summit_cpu_mask_to_apicid_and,
526 531
527 .send_IPI_mask = summit_send_IPI_mask, 532 .send_IPI_mask = summit_send_IPI_mask,
@@ -540,7 +545,6 @@ static struct apic apic_summit = {
540 545
541 .read = native_apic_mem_read, 546 .read = native_apic_mem_read,
542 .write = native_apic_mem_write, 547 .write = native_apic_mem_write,
543 .eoi_write = native_apic_mem_write,
544 .icr_read = native_apic_icr_read, 548 .icr_read = native_apic_icr_read,
545 .icr_write = native_apic_icr_write, 549 .icr_write = native_apic_icr_write,
546 .wait_icr_idle = native_apic_wait_icr_idle, 550 .wait_icr_idle = native_apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index c88baa4ff0e..50079587582 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -81,7 +81,7 @@ static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
81} 81}
82 82
83static void 83static void
84x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) 84 x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
85{ 85{
86 __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT); 86 __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
87} 87}
@@ -96,37 +96,36 @@ static void x2apic_send_IPI_all(int vector)
96 __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC); 96 __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
97} 97}
98 98
99static int 99static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
100x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
101 const struct cpumask *andmask,
102 unsigned int *apicid)
103{ 100{
104 u32 dest = 0; 101 /*
105 u16 cluster; 102 * We're using fixed IRQ delivery, can only return one logical APIC ID.
106 int i; 103 * May as well be the first.
104 */
105 int cpu = cpumask_first(cpumask);
107 106
108 for_each_cpu_and(i, cpumask, andmask) { 107 if ((unsigned)cpu < nr_cpu_ids)
109 if (!cpumask_test_cpu(i, cpu_online_mask)) 108 return per_cpu(x86_cpu_to_logical_apicid, cpu);
110 continue; 109 else
111 dest = per_cpu(x86_cpu_to_logical_apicid, i); 110 return BAD_APICID;
112 cluster = x2apic_cluster(i); 111}
113 break;
114 }
115 112
116 if (!dest) 113static unsigned int
117 return -EINVAL; 114x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
115 const struct cpumask *andmask)
116{
117 int cpu;
118 118
119 for_each_cpu_and(i, cpumask, andmask) { 119 /*
120 if (!cpumask_test_cpu(i, cpu_online_mask)) 120 * We're using fixed IRQ delivery, can only return one logical APIC ID.
121 continue; 121 * May as well be the first.
122 if (cluster != x2apic_cluster(i)) 122 */
123 continue; 123 for_each_cpu_and(cpu, cpumask, andmask) {
124 dest |= per_cpu(x86_cpu_to_logical_apicid, i); 124 if (cpumask_test_cpu(cpu, cpu_online_mask))
125 break;
125 } 126 }
126 127
127 *apicid = dest; 128 return per_cpu(x86_cpu_to_logical_apicid, cpu);
128
129 return 0;
130} 129}
131 130
132static void init_x2apic_ldr(void) 131static void init_x2apic_ldr(void)
@@ -209,50 +208,23 @@ static int x2apic_cluster_probe(void)
209 return 0; 208 return 0;
210} 209}
211 210
212static const struct cpumask *x2apic_cluster_target_cpus(void)
213{
214 return cpu_all_mask;
215}
216
217/*
218 * Each x2apic cluster is an allocation domain.
219 */
220static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask,
221 const struct cpumask *mask)
222{
223 /*
224 * To minimize vector pressure, default case of boot, device bringup
225 * etc will use a single cpu for the interrupt destination.
226 *
227 * On explicit migration requests coming from irqbalance etc,
228 * interrupts will be routed to the x2apic cluster (cluster-id
229 * derived from the first cpu in the mask) members specified
230 * in the mask.
231 */
232 if (mask == x2apic_cluster_target_cpus())
233 cpumask_copy(retmask, cpumask_of(cpu));
234 else
235 cpumask_and(retmask, mask, per_cpu(cpus_in_cluster, cpu));
236}
237
238static struct apic apic_x2apic_cluster = { 211static struct apic apic_x2apic_cluster = {
239 212
240 .name = "cluster x2apic", 213 .name = "cluster x2apic",
241 .probe = x2apic_cluster_probe, 214 .probe = x2apic_cluster_probe,
242 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, 215 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
243 .apic_id_valid = x2apic_apic_id_valid,
244 .apic_id_registered = x2apic_apic_id_registered, 216 .apic_id_registered = x2apic_apic_id_registered,
245 217
246 .irq_delivery_mode = dest_LowestPrio, 218 .irq_delivery_mode = dest_LowestPrio,
247 .irq_dest_mode = 1, /* logical */ 219 .irq_dest_mode = 1, /* logical */
248 220
249 .target_cpus = x2apic_cluster_target_cpus, 221 .target_cpus = x2apic_target_cpus,
250 .disable_esr = 0, 222 .disable_esr = 0,
251 .dest_logical = APIC_DEST_LOGICAL, 223 .dest_logical = APIC_DEST_LOGICAL,
252 .check_apicid_used = NULL, 224 .check_apicid_used = NULL,
253 .check_apicid_present = NULL, 225 .check_apicid_present = NULL,
254 226
255 .vector_allocation_domain = cluster_vector_allocation_domain, 227 .vector_allocation_domain = x2apic_vector_allocation_domain,
256 .init_apic_ldr = init_x2apic_ldr, 228 .init_apic_ldr = init_x2apic_ldr,
257 229
258 .ioapic_phys_id_map = NULL, 230 .ioapic_phys_id_map = NULL,
@@ -270,6 +242,7 @@ static struct apic apic_x2apic_cluster = {
270 .set_apic_id = x2apic_set_apic_id, 242 .set_apic_id = x2apic_set_apic_id,
271 .apic_id_mask = 0xFFFFFFFFu, 243 .apic_id_mask = 0xFFFFFFFFu,
272 244
245 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
273 .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and, 246 .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
274 247
275 .send_IPI_mask = x2apic_send_IPI_mask, 248 .send_IPI_mask = x2apic_send_IPI_mask,
@@ -286,7 +259,6 @@ static struct apic apic_x2apic_cluster = {
286 259
287 .read = native_apic_msr_read, 260 .read = native_apic_msr_read,
288 .write = native_apic_msr_write, 261 .write = native_apic_msr_write,
289 .eoi_write = native_apic_msr_eoi_write,
290 .icr_read = native_x2apic_icr_read, 262 .icr_read = native_x2apic_icr_read,
291 .icr_write = native_x2apic_icr_write, 263 .icr_write = native_x2apic_icr_write,
292 .wait_icr_idle = native_x2apic_wait_icr_idle, 264 .wait_icr_idle = native_x2apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index e03a1e180e8..f5373dfde21 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -24,12 +24,6 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
24{ 24{
25 if (x2apic_phys) 25 if (x2apic_phys)
26 return x2apic_enabled(); 26 return x2apic_enabled();
27 else if ((acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) &&
28 (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL) &&
29 x2apic_enabled()) {
30 printk(KERN_DEBUG "System requires x2apic physical mode\n");
31 return 1;
32 }
33 else 27 else
34 return 0; 28 return 0;
35} 29}
@@ -76,6 +70,38 @@ static void x2apic_send_IPI_all(int vector)
76 __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC); 70 __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
77} 71}
78 72
73static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
74{
75 /*
76 * We're using fixed IRQ delivery, can only return one phys APIC ID.
77 * May as well be the first.
78 */
79 int cpu = cpumask_first(cpumask);
80
81 if ((unsigned)cpu < nr_cpu_ids)
82 return per_cpu(x86_cpu_to_apicid, cpu);
83 else
84 return BAD_APICID;
85}
86
87static unsigned int
88x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
89 const struct cpumask *andmask)
90{
91 int cpu;
92
93 /*
94 * We're using fixed IRQ delivery, can only return one phys APIC ID.
95 * May as well be the first.
96 */
97 for_each_cpu_and(cpu, cpumask, andmask) {
98 if (cpumask_test_cpu(cpu, cpu_online_mask))
99 break;
100 }
101
102 return per_cpu(x86_cpu_to_apicid, cpu);
103}
104
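The helpers added to x2apic_phys.c above reduce a cpumask to a single physical APIC ID: fixed delivery can encode only one destination, so the first online CPU present in both masks wins. A toy version over plain arrays, with invented per-CPU tables and BAD_APICID as the 'nothing matched' sentinel.

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS     4
#define BAD_APICID  0xFFFFFFFFu

/* Invented stand-ins for the x86_cpu_to_apicid per-cpu map and online mask. */
static const unsigned int cpu_to_apicid[NR_CPUS] = { 0x00, 0x02, 0x04, 0x06 };
static const bool cpu_online[NR_CPUS]            = { true, true, false, true };

/* Fixed delivery can target only one CPU, so pick the first online CPU
 * present in both masks - the policy of the helpers added above. */
static unsigned int mask_and_to_apicid(const bool *mask, const bool *andmask)
{
        int cpu;

        for (cpu = 0; cpu < NR_CPUS; cpu++)
                if (mask[cpu] && andmask[cpu] && cpu_online[cpu])
                        return cpu_to_apicid[cpu];
        return BAD_APICID;
}

int main(void)
{
        bool requested[NR_CPUS] = { false, false, true, true };
        bool allowed[NR_CPUS]   = { true,  true,  true, true  };

        printf("apicid 0x%x\n", mask_and_to_apicid(requested, allowed)); /* 0x6 */
        return 0;
}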
79static void init_x2apic_ldr(void) 105static void init_x2apic_ldr(void)
80{ 106{
81} 107}
@@ -93,19 +119,18 @@ static struct apic apic_x2apic_phys = {
93 .name = "physical x2apic", 119 .name = "physical x2apic",
94 .probe = x2apic_phys_probe, 120 .probe = x2apic_phys_probe,
95 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, 121 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
96 .apic_id_valid = x2apic_apic_id_valid,
97 .apic_id_registered = x2apic_apic_id_registered, 122 .apic_id_registered = x2apic_apic_id_registered,
98 123
99 .irq_delivery_mode = dest_Fixed, 124 .irq_delivery_mode = dest_Fixed,
100 .irq_dest_mode = 0, /* physical */ 125 .irq_dest_mode = 0, /* physical */
101 126
102 .target_cpus = online_target_cpus, 127 .target_cpus = x2apic_target_cpus,
103 .disable_esr = 0, 128 .disable_esr = 0,
104 .dest_logical = 0, 129 .dest_logical = 0,
105 .check_apicid_used = NULL, 130 .check_apicid_used = NULL,
106 .check_apicid_present = NULL, 131 .check_apicid_present = NULL,
107 132
108 .vector_allocation_domain = default_vector_allocation_domain, 133 .vector_allocation_domain = x2apic_vector_allocation_domain,
109 .init_apic_ldr = init_x2apic_ldr, 134 .init_apic_ldr = init_x2apic_ldr,
110 135
111 .ioapic_phys_id_map = NULL, 136 .ioapic_phys_id_map = NULL,
@@ -123,7 +148,8 @@ static struct apic apic_x2apic_phys = {
123 .set_apic_id = x2apic_set_apic_id, 148 .set_apic_id = x2apic_set_apic_id,
124 .apic_id_mask = 0xFFFFFFFFu, 149 .apic_id_mask = 0xFFFFFFFFu,
125 150
126 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, 151 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
152 .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and,
127 153
128 .send_IPI_mask = x2apic_send_IPI_mask, 154 .send_IPI_mask = x2apic_send_IPI_mask,
129 .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself, 155 .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
@@ -139,7 +165,6 @@ static struct apic apic_x2apic_phys = {
139 165
140 .read = native_apic_msr_read, 166 .read = native_apic_msr_read,
141 .write = native_apic_msr_write, 167 .write = native_apic_msr_write,
142 .eoi_write = native_apic_msr_eoi_write,
143 .icr_read = native_x2apic_icr_read, 168 .icr_read = native_x2apic_icr_read,
144 .icr_write = native_x2apic_icr_write, 169 .icr_write = native_x2apic_icr_write,
145 .wait_icr_idle = native_x2apic_wait_icr_idle, 170 .wait_icr_idle = native_x2apic_wait_icr_idle,
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 8cfade9510a..cfeb978f49f 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -93,8 +93,6 @@ static int __init early_get_pnodeid(void)
93 93
94 if (node_id.s.part_number == UV2_HUB_PART_NUMBER) 94 if (node_id.s.part_number == UV2_HUB_PART_NUMBER)
95 uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1; 95 uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1;
96 if (node_id.s.part_number == UV2_HUB_PART_NUMBER_X)
97 uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1;
98 96
99 uv_hub_info->hub_revision = uv_min_hub_revision_id; 97 uv_hub_info->hub_revision = uv_min_hub_revision_id;
100 pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1); 98 pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1);
@@ -185,6 +183,17 @@ EXPORT_SYMBOL_GPL(uv_possible_blades);
185unsigned long sn_rtc_cycles_per_second; 183unsigned long sn_rtc_cycles_per_second;
186EXPORT_SYMBOL(sn_rtc_cycles_per_second); 184EXPORT_SYMBOL(sn_rtc_cycles_per_second);
187 185
186static const struct cpumask *uv_target_cpus(void)
187{
188 return cpu_online_mask;
189}
190
191static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
192{
193 cpumask_clear(retmask);
194 cpumask_set_cpu(cpu, retmask);
195}
196
188static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) 197static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
189{ 198{
190#ifdef CONFIG_SMP 199#ifdef CONFIG_SMP
@@ -255,26 +264,34 @@ static void uv_send_IPI_all(int vector)
255 uv_send_IPI_mask(cpu_online_mask, vector); 264 uv_send_IPI_mask(cpu_online_mask, vector);
256} 265}
257 266
258static int uv_apic_id_valid(int apicid) 267static int uv_apic_id_registered(void)
259{ 268{
260 return 1; 269 return 1;
261} 270}
262 271
263static int uv_apic_id_registered(void) 272static void uv_init_apic_ldr(void)
264{ 273{
265 return 1;
266} 274}
267 275
268static void uv_init_apic_ldr(void) 276static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask)
269{ 277{
278 /*
279 * We're using fixed IRQ delivery, can only return one phys APIC ID.
280 * May as well be the first.
281 */
282 int cpu = cpumask_first(cpumask);
283
284 if ((unsigned)cpu < nr_cpu_ids)
285 return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits;
286 else
287 return BAD_APICID;
270} 288}
271 289
272static int 290static unsigned int
273uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, 291uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
274 const struct cpumask *andmask, 292 const struct cpumask *andmask)
275 unsigned int *apicid)
276{ 293{
277 int unsigned cpu; 294 int cpu;
278 295
279 /* 296 /*
280 * We're using fixed IRQ delivery, can only return one phys APIC ID. 297 * We're using fixed IRQ delivery, can only return one phys APIC ID.
@@ -284,13 +301,7 @@ uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
284 if (cpumask_test_cpu(cpu, cpu_online_mask)) 301 if (cpumask_test_cpu(cpu, cpu_online_mask))
285 break; 302 break;
286 } 303 }
287 304 return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits;
288 if (likely(cpu < nr_cpu_ids)) {
289 *apicid = per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits;
290 return 0;
291 }
292
293 return -EINVAL;
294} 305}
295 306
296static unsigned int x2apic_get_apic_id(unsigned long x) 307static unsigned int x2apic_get_apic_id(unsigned long x)
@@ -338,19 +349,18 @@ static struct apic __refdata apic_x2apic_uv_x = {
338 .name = "UV large system", 349 .name = "UV large system",
339 .probe = uv_probe, 350 .probe = uv_probe,
340 .acpi_madt_oem_check = uv_acpi_madt_oem_check, 351 .acpi_madt_oem_check = uv_acpi_madt_oem_check,
341 .apic_id_valid = uv_apic_id_valid,
342 .apic_id_registered = uv_apic_id_registered, 352 .apic_id_registered = uv_apic_id_registered,
343 353
344 .irq_delivery_mode = dest_Fixed, 354 .irq_delivery_mode = dest_Fixed,
345 .irq_dest_mode = 0, /* physical */ 355 .irq_dest_mode = 0, /* physical */
346 356
347 .target_cpus = online_target_cpus, 357 .target_cpus = uv_target_cpus,
348 .disable_esr = 0, 358 .disable_esr = 0,
349 .dest_logical = APIC_DEST_LOGICAL, 359 .dest_logical = APIC_DEST_LOGICAL,
350 .check_apicid_used = NULL, 360 .check_apicid_used = NULL,
351 .check_apicid_present = NULL, 361 .check_apicid_present = NULL,
352 362
353 .vector_allocation_domain = default_vector_allocation_domain, 363 .vector_allocation_domain = uv_vector_allocation_domain,
354 .init_apic_ldr = uv_init_apic_ldr, 364 .init_apic_ldr = uv_init_apic_ldr,
355 365
356 .ioapic_phys_id_map = NULL, 366 .ioapic_phys_id_map = NULL,
@@ -368,6 +378,7 @@ static struct apic __refdata apic_x2apic_uv_x = {
368 .set_apic_id = set_apic_id, 378 .set_apic_id = set_apic_id,
369 .apic_id_mask = 0xFFFFFFFFu, 379 .apic_id_mask = 0xFFFFFFFFu,
370 380
381 .cpu_mask_to_apicid = uv_cpu_mask_to_apicid,
371 .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and, 382 .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and,
372 383
373 .send_IPI_mask = uv_send_IPI_mask, 384 .send_IPI_mask = uv_send_IPI_mask,
@@ -385,7 +396,6 @@ static struct apic __refdata apic_x2apic_uv_x = {
385 396
386 .read = native_apic_msr_read, 397 .read = native_apic_msr_read,
387 .write = native_apic_msr_write, 398 .write = native_apic_msr_write,
388 .eoi_write = native_apic_msr_eoi_write,
389 .icr_read = native_x2apic_icr_read, 399 .icr_read = native_x2apic_icr_read,
390 .icr_write = native_x2apic_icr_write, 400 .icr_write = native_x2apic_icr_write,
391 .wait_icr_idle = native_x2apic_wait_icr_idle, 401 .wait_icr_idle = native_x2apic_wait_icr_idle,
@@ -662,11 +672,18 @@ void __cpuinit uv_cpu_init(void)
662/* 672/*
663 * When NMI is received, print a stack trace. 673 * When NMI is received, print a stack trace.
664 */ 674 */
665int uv_handle_nmi(unsigned int reason, struct pt_regs *regs) 675int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
666{ 676{
667 unsigned long real_uv_nmi; 677 unsigned long real_uv_nmi;
668 int bid; 678 int bid;
669 679
680 if (reason != DIE_NMIUNKNOWN)
681 return NOTIFY_OK;
682
683 if (in_crash_kexec)
684 /* do nothing if entering the crash kernel */
685 return NOTIFY_OK;
686
670 /* 687 /*
671 * Each blade has an MMR that indicates when an NMI has been sent 688 * Each blade has an MMR that indicates when an NMI has been sent
672 * to cpus on the blade. If an NMI is detected, atomically 689 * to cpus on the blade. If an NMI is detected, atomically
@@ -687,7 +704,7 @@ int uv_handle_nmi(unsigned int reason, struct pt_regs *regs)
687 } 704 }
688 705
689 if (likely(__get_cpu_var(cpu_last_nmi_count) == uv_blade_info[bid].nmi_count)) 706 if (likely(__get_cpu_var(cpu_last_nmi_count) == uv_blade_info[bid].nmi_count))
690 return NMI_DONE; 707 return NOTIFY_DONE;
691 708
692 __get_cpu_var(cpu_last_nmi_count) = uv_blade_info[bid].nmi_count; 709 __get_cpu_var(cpu_last_nmi_count) = uv_blade_info[bid].nmi_count;
693 710
@@ -700,12 +717,17 @@ int uv_handle_nmi(unsigned int reason, struct pt_regs *regs)
700 dump_stack(); 717 dump_stack();
701 spin_unlock(&uv_nmi_lock); 718 spin_unlock(&uv_nmi_lock);
702 719
703 return NMI_HANDLED; 720 return NOTIFY_STOP;
704} 721}
705 722
723static struct notifier_block uv_dump_stack_nmi_nb = {
724 .notifier_call = uv_handle_nmi,
725 .priority = NMI_LOCAL_LOW_PRIOR - 1,
726};
727
706void uv_register_nmi_notifier(void) 728void uv_register_nmi_notifier(void)
707{ 729{
708 if (register_nmi_handler(NMI_UNKNOWN, uv_handle_nmi, 0, "uv")) 730 if (register_die_notifier(&uv_dump_stack_nmi_nb))
709 printk(KERN_WARNING "UV NMI handler failed to register\n"); 731 printk(KERN_WARNING "UV NMI handler failed to register\n");
710} 732}
711 733
@@ -757,12 +779,7 @@ void __init uv_system_init(void)
757 for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) 779 for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++)
758 uv_possible_blades += 780 uv_possible_blades +=
759 hweight64(uv_read_local_mmr( UVH_NODE_PRESENT_TABLE + i * 8)); 781 hweight64(uv_read_local_mmr( UVH_NODE_PRESENT_TABLE + i * 8));
760 782 printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades());
761 /* uv_num_possible_blades() is really the hub count */
762 printk(KERN_INFO "UV: Found %d blades, %d hubs\n",
763 is_uv1_hub() ? uv_num_possible_blades() :
764 (uv_num_possible_blades() + 1) / 2,
765 uv_num_possible_blades());
766 783
767 bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); 784 bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
768 uv_blade_info = kzalloc(bytes, GFP_KERNEL); 785 uv_blade_info = kzalloc(bytes, GFP_KERNEL);
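The two uv_cpu_mask_to_apicid_and() versions in the hunk above differ mainly in calling convention: the newer code returns 0 or -EINVAL and hands the APIC ID back through an out-parameter, while the restored older code returns the APIC ID directly and signals failure with BAD_APICID. As an illustrative aside (not part of the patch), the user-space sketch below models the two conventions with plain integers; the function names and the BAD_APICID value are stand-ins, not kernel definitions.

    /* User-space model of the two calling conventions seen above.
     * Not kernel code: BAD_APICID's value and the cpu argument are
     * simplified stand-ins for the per_cpu(x86_cpu_to_apicid, cpu)
     * lookup that the real functions perform.
     */
    #include <errno.h>
    #include <stdio.h>

    #define BAD_APICID 0xffffu  /* illustrative value */

    /* older style: result and error share the return value */
    static unsigned int mask_to_apicid_old(int first_online_cpu)
    {
        if (first_online_cpu < 0)
            return BAD_APICID;
        return (unsigned int)first_online_cpu;
    }

    /* newer style: error code returned, result via out-parameter */
    static int mask_to_apicid_new(int first_online_cpu, unsigned int *apicid)
    {
        if (first_online_cpu < 0)
            return -EINVAL;
        *apicid = (unsigned int)first_online_cpu;
        return 0;
    }

    int main(void)
    {
        unsigned int id;

        printf("old style, empty mask: %#x\n", mask_to_apicid_old(-1));
        if (mask_to_apicid_new(3, &id) == 0)
            printf("new style, cpu 3 -> apicid %#x\n", id);
        return 0;
    }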
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index d65464e4350..0371c484bb8 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -201,8 +201,6 @@
201 * http://www.microsoft.com/whdc/archive/amp_12.mspx] 201 * http://www.microsoft.com/whdc/archive/amp_12.mspx]
202 */ 202 */
203 203
204#define pr_fmt(fmt) "apm: " fmt
205
206#include <linux/module.h> 204#include <linux/module.h>
207 205
208#include <linux/poll.h> 206#include <linux/poll.h>
@@ -233,6 +231,7 @@
233#include <linux/syscore_ops.h> 231#include <linux/syscore_ops.h>
234#include <linux/i8253.h> 232#include <linux/i8253.h>
235 233
234#include <asm/system.h>
236#include <asm/uaccess.h> 235#include <asm/uaccess.h>
237#include <asm/desc.h> 236#include <asm/desc.h>
238#include <asm/olpc.h> 237#include <asm/olpc.h>
@@ -250,6 +249,8 @@ extern int (*console_blank_hook)(int);
250#define APM_MINOR_DEV 134 249#define APM_MINOR_DEV 134
251 250
252/* 251/*
252 * See Documentation/Config.help for the configuration options.
253 *
253 * Various options can be changed at boot time as follows: 254 * Various options can be changed at boot time as follows:
254 * (We allow underscores for compatibility with the modules code) 255 * (We allow underscores for compatibility with the modules code)
255 * apm=on/off enable/disable APM 256 * apm=on/off enable/disable APM
@@ -384,21 +385,21 @@ static int ignore_sys_suspend;
384static int ignore_normal_resume; 385static int ignore_normal_resume;
385static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL; 386static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL;
386 387
387static bool debug __read_mostly; 388static int debug __read_mostly;
388static bool smp __read_mostly; 389static int smp __read_mostly;
389static int apm_disabled = -1; 390static int apm_disabled = -1;
390#ifdef CONFIG_SMP 391#ifdef CONFIG_SMP
391static bool power_off; 392static int power_off;
392#else 393#else
393static bool power_off = 1; 394static int power_off = 1;
394#endif 395#endif
395static bool realmode_power_off; 396static int realmode_power_off;
396#ifdef CONFIG_APM_ALLOW_INTS 397#ifdef CONFIG_APM_ALLOW_INTS
397static bool allow_ints = 1; 398static int allow_ints = 1;
398#else 399#else
399static bool allow_ints; 400static int allow_ints;
400#endif 401#endif
401static bool broken_psr; 402static int broken_psr;
402 403
403static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue); 404static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue);
404static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); 405static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue);
@@ -487,11 +488,11 @@ static void apm_error(char *str, int err)
487 if (error_table[i].key == err) 488 if (error_table[i].key == err)
488 break; 489 break;
489 if (i < ERROR_COUNT) 490 if (i < ERROR_COUNT)
490 pr_notice("%s: %s\n", str, error_table[i].msg); 491 printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg);
491 else if (err < 0) 492 else if (err < 0)
492 pr_notice("%s: linux error code %i\n", str, err); 493 printk(KERN_NOTICE "apm: %s: linux error code %i\n", str, err);
493 else 494 else
494 pr_notice("%s: unknown error code %#2.2x\n", 495 printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n",
495 str, err); 496 str, err);
496} 497}
497 498
@@ -1186,7 +1187,7 @@ static void queue_event(apm_event_t event, struct apm_user *sender)
1186 static int notified; 1187 static int notified;
1187 1188
1188 if (notified++ == 0) 1189 if (notified++ == 0)
1189 pr_err("an event queue overflowed\n"); 1190 printk(KERN_ERR "apm: an event queue overflowed\n");
1190 if (++as->event_tail >= APM_MAX_EVENTS) 1191 if (++as->event_tail >= APM_MAX_EVENTS)
1191 as->event_tail = 0; 1192 as->event_tail = 0;
1192 } 1193 }
@@ -1235,7 +1236,8 @@ static int suspend(int vetoable)
1235 struct apm_user *as; 1236 struct apm_user *as;
1236 1237
1237 dpm_suspend_start(PMSG_SUSPEND); 1238 dpm_suspend_start(PMSG_SUSPEND);
1238 dpm_suspend_end(PMSG_SUSPEND); 1239
1240 dpm_suspend_noirq(PMSG_SUSPEND);
1239 1241
1240 local_irq_disable(); 1242 local_irq_disable();
1241 syscore_suspend(); 1243 syscore_suspend();
@@ -1259,9 +1261,9 @@ static int suspend(int vetoable)
1259 syscore_resume(); 1261 syscore_resume();
1260 local_irq_enable(); 1262 local_irq_enable();
1261 1263
1262 dpm_resume_start(PMSG_RESUME); 1264 dpm_resume_noirq(PMSG_RESUME);
1263 dpm_resume_end(PMSG_RESUME);
1264 1265
1266 dpm_resume_end(PMSG_RESUME);
1265 queue_event(APM_NORMAL_RESUME, NULL); 1267 queue_event(APM_NORMAL_RESUME, NULL);
1266 spin_lock(&user_list_lock); 1268 spin_lock(&user_list_lock);
1267 for (as = user_list; as != NULL; as = as->next) { 1269 for (as = user_list; as != NULL; as = as->next) {
@@ -1277,7 +1279,7 @@ static void standby(void)
1277{ 1279{
1278 int err; 1280 int err;
1279 1281
1280 dpm_suspend_end(PMSG_SUSPEND); 1282 dpm_suspend_noirq(PMSG_SUSPEND);
1281 1283
1282 local_irq_disable(); 1284 local_irq_disable();
1283 syscore_suspend(); 1285 syscore_suspend();
@@ -1291,7 +1293,7 @@ static void standby(void)
1291 syscore_resume(); 1293 syscore_resume();
1292 local_irq_enable(); 1294 local_irq_enable();
1293 1295
1294 dpm_resume_start(PMSG_RESUME); 1296 dpm_resume_noirq(PMSG_RESUME);
1295} 1297}
1296 1298
1297static apm_event_t get_event(void) 1299static apm_event_t get_event(void)
@@ -1449,7 +1451,7 @@ static void apm_mainloop(void)
1449static int check_apm_user(struct apm_user *as, const char *func) 1451static int check_apm_user(struct apm_user *as, const char *func)
1450{ 1452{
1451 if (as == NULL || as->magic != APM_BIOS_MAGIC) { 1453 if (as == NULL || as->magic != APM_BIOS_MAGIC) {
1452 pr_err("%s passed bad filp\n", func); 1454 printk(KERN_ERR "apm: %s passed bad filp\n", func);
1453 return 1; 1455 return 1;
1454 } 1456 }
1455 return 0; 1457 return 0;
@@ -1588,7 +1590,7 @@ static int do_release(struct inode *inode, struct file *filp)
1588 as1 = as1->next) 1590 as1 = as1->next)
1589 ; 1591 ;
1590 if (as1 == NULL) 1592 if (as1 == NULL)
1591 pr_err("filp not in user list\n"); 1593 printk(KERN_ERR "apm: filp not in user list\n");
1592 else 1594 else
1593 as1->next = as->next; 1595 as1->next = as->next;
1594 } 1596 }
@@ -1602,9 +1604,11 @@ static int do_open(struct inode *inode, struct file *filp)
1602 struct apm_user *as; 1604 struct apm_user *as;
1603 1605
1604 as = kmalloc(sizeof(*as), GFP_KERNEL); 1606 as = kmalloc(sizeof(*as), GFP_KERNEL);
1605 if (as == NULL) 1607 if (as == NULL) {
1608 printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n",
1609 sizeof(*as));
1606 return -ENOMEM; 1610 return -ENOMEM;
1607 1611 }
1608 as->magic = APM_BIOS_MAGIC; 1612 as->magic = APM_BIOS_MAGIC;
1609 as->event_tail = as->event_head = 0; 1613 as->event_tail = as->event_head = 0;
1610 as->suspends_pending = as->standbys_pending = 0; 1614 as->suspends_pending = as->standbys_pending = 0;
@@ -2313,16 +2317,16 @@ static int __init apm_init(void)
2313 } 2317 }
2314 2318
2315 if (apm_info.disabled) { 2319 if (apm_info.disabled) {
2316 pr_notice("disabled on user request.\n"); 2320 printk(KERN_NOTICE "apm: disabled on user request.\n");
2317 return -ENODEV; 2321 return -ENODEV;
2318 } 2322 }
2319 if ((num_online_cpus() > 1) && !power_off && !smp) { 2323 if ((num_online_cpus() > 1) && !power_off && !smp) {
2320 pr_notice("disabled - APM is not SMP safe.\n"); 2324 printk(KERN_NOTICE "apm: disabled - APM is not SMP safe.\n");
2321 apm_info.disabled = 1; 2325 apm_info.disabled = 1;
2322 return -ENODEV; 2326 return -ENODEV;
2323 } 2327 }
2324 if (!acpi_disabled) { 2328 if (!acpi_disabled) {
2325 pr_notice("overridden by ACPI.\n"); 2329 printk(KERN_NOTICE "apm: overridden by ACPI.\n");
2326 apm_info.disabled = 1; 2330 apm_info.disabled = 1;
2327 return -ENODEV; 2331 return -ENODEV;
2328 } 2332 }
@@ -2356,7 +2360,8 @@ static int __init apm_init(void)
2356 2360
2357 kapmd_task = kthread_create(apm, NULL, "kapmd"); 2361 kapmd_task = kthread_create(apm, NULL, "kapmd");
2358 if (IS_ERR(kapmd_task)) { 2362 if (IS_ERR(kapmd_task)) {
2359 pr_err("disabled - Unable to start kernel thread\n"); 2363 printk(KERN_ERR "apm: disabled - Unable to start kernel "
2364 "thread.\n");
2360 err = PTR_ERR(kapmd_task); 2365 err = PTR_ERR(kapmd_task);
2361 kapmd_task = NULL; 2366 kapmd_task = NULL;
2362 remove_proc_entry("apm", NULL); 2367 remove_proc_entry("apm", NULL);
@@ -2400,7 +2405,7 @@ static void __exit apm_exit(void)
2400 * (pm_idle), Wait for all processors to update cached/local 2405 * (pm_idle), Wait for all processors to update cached/local
2401 * copies of pm_idle before proceeding. 2406 * copies of pm_idle before proceeding.
2402 */ 2407 */
2403 kick_all_cpus_sync(); 2408 cpu_idle_wait();
2404 } 2409 }
2405 if (((apm_info.bios.flags & APM_BIOS_DISENGAGED) == 0) 2410 if (((apm_info.bios.flags & APM_BIOS_DISENGAGED) == 0)
2406 && (apm_info.connection_version > 0x0100)) { 2411 && (apm_info.connection_version > 0x0100)) {
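The apm_32.c hunk above reverts the driver from the pr_fmt()/pr_notice() helpers back to open-coded printk(KERN_NOTICE "apm: ...") calls. As an aside, the small user-space model below shows what the pr_fmt convention buys: defining pr_fmt before the printing macro makes every message pick up the subsystem prefix automatically. The printf-based macro is a stand-in for the real one in include/linux/printk.h, and the ## __VA_ARGS__ form assumes a GNU-compatible compiler.

    /* Minimal stand-in for the kernel's pr_fmt/pr_notice pairing.
     * The hunk above removes the pr_fmt define, which is why every
     * restored message has to spell the "apm: " prefix out by hand.
     */
    #include <stdio.h>

    #define pr_fmt(fmt) "apm: " fmt
    #define pr_notice(fmt, ...) printf("NOTICE " pr_fmt(fmt), ##__VA_ARGS__)

    int main(void)
    {
        pr_notice("overridden by ACPI.\n");            /* "NOTICE apm: overridden by ACPI." */
        printf("NOTICE " "apm: %s\n", "spelled out");  /* pre-pr_fmt equivalent */
        return 0;
    }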
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 28610822fb3..4f13fafc526 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -67,9 +67,4 @@ void common(void) {
67 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); 67 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
68 OFFSET(BP_version, boot_params, hdr.version); 68 OFFSET(BP_version, boot_params, hdr.version);
69 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); 69 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
70 OFFSET(BP_pref_address, boot_params, hdr.pref_address);
71 OFFSET(BP_code32_start, boot_params, hdr.code32_start);
72
73 BLANK();
74 DEFINE(PTREGS_SIZE, sizeof(struct pt_regs));
75} 70}
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 85d98ab15cd..395a10e6806 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -3,11 +3,6 @@
3#include <linux/lguest.h> 3#include <linux/lguest.h>
4#include "../../../drivers/lguest/lg.h" 4#include "../../../drivers/lguest/lg.h"
5 5
6#define __SYSCALL_I386(nr, sym, compat) [nr] = 1,
7static char syscalls[] = {
8#include <asm/syscalls_32.h>
9};
10
11/* workaround for a warning with -Wmissing-prototypes */ 6/* workaround for a warning with -Wmissing-prototypes */
12void foo(void); 7void foo(void);
13 8
@@ -81,7 +76,4 @@ void foo(void)
81 OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode); 76 OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
82 OFFSET(LGUEST_PAGES_regs, lguest_pages, regs); 77 OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
83#endif 78#endif
84 BLANK();
85 DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
86 DEFINE(NR_syscalls, sizeof(syscalls));
87} 79}
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 1b4754f82ba..e72a1194af2 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -1,18 +1,11 @@
1#include <asm/ia32.h> 1#include <asm/ia32.h>
2 2
3#define __SYSCALL_64(nr, sym, compat) [nr] = 1, 3#define __NO_STUBS 1
4#define __SYSCALL_COMMON(nr, sym, compat) [nr] = 1, 4#undef __SYSCALL
5#ifdef CONFIG_X86_X32_ABI 5#undef _ASM_X86_UNISTD_64_H
6# define __SYSCALL_X32(nr, sym, compat) [nr] = 1, 6#define __SYSCALL(nr, sym) [nr] = 1,
7#else 7static char syscalls[] = {
8# define __SYSCALL_X32(nr, sym, compat) /* nothing */ 8#include <asm/unistd.h>
9#endif
10static char syscalls_64[] = {
11#include <asm/syscalls_64.h>
12};
13#define __SYSCALL_I386(nr, sym, compat) [nr] = 1,
14static char syscalls_ia32[] = {
15#include <asm/syscalls_32.h>
16}; 9};
17 10
18int main(void) 11int main(void)
@@ -79,11 +72,7 @@ int main(void)
79 OFFSET(TSS_ist, tss_struct, x86_tss.ist); 72 OFFSET(TSS_ist, tss_struct, x86_tss.ist);
80 BLANK(); 73 BLANK();
81 74
82 DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); 75 DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
83 DEFINE(NR_syscalls, sizeof(syscalls_64));
84
85 DEFINE(__NR_ia32_syscall_max, sizeof(syscalls_ia32) - 1);
86 DEFINE(IA32_NR_syscalls, sizeof(syscalls_ia32));
87 76
88 return 0; 77 return 0;
89} 78}
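Both the old and new asm-offsets_64.c above rely on the same sizing trick: every __SYSCALL() line expands to a designated initializer [nr] = 1, so the length of the resulting char array becomes the highest syscall number plus one, which the build then exports as __NR_syscall_max and NR_syscalls. The stand-alone sketch below (syscall numbers and names invented for illustration) shows the mechanism outside the kernel build.

    /* The array is sized by its largest designated initializer, so
     * sizeof() yields max_nr + 1 at compile time; the sym argument is
     * never used and only needs to be a valid token.
     */
    #include <stdio.h>

    #define __SYSCALL(nr, sym) [nr] = 1,

    static char syscalls[] = {
        __SYSCALL(0, sys_read)
        __SYSCALL(1, sys_write)
        __SYSCALL(59, sys_execve)   /* highest entry decides the size */
    };

    int main(void)
    {
        printf("__NR_syscall_max = %zu\n", sizeof(syscalls) - 1);  /* 59 */
        printf("NR_syscalls      = %zu\n", sizeof(syscalls));      /* 60 */
        return 0;
    }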
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index e2dbcb7dabd..452932d3473 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -27,29 +27,21 @@ static int num_scan_areas;
27 27
28static __init int set_corruption_check(char *arg) 28static __init int set_corruption_check(char *arg)
29{ 29{
30 ssize_t ret; 30 char *end;
31 unsigned long val;
32 31
33 ret = kstrtoul(arg, 10, &val); 32 memory_corruption_check = simple_strtol(arg, &end, 10);
34 if (ret)
35 return ret;
36 33
37 memory_corruption_check = val; 34 return (*end == 0) ? 0 : -EINVAL;
38 return 0;
39} 35}
40early_param("memory_corruption_check", set_corruption_check); 36early_param("memory_corruption_check", set_corruption_check);
41 37
42static __init int set_corruption_check_period(char *arg) 38static __init int set_corruption_check_period(char *arg)
43{ 39{
44 ssize_t ret; 40 char *end;
45 unsigned long val;
46 41
47 ret = kstrtoul(arg, 10, &val); 42 corruption_check_period = simple_strtoul(arg, &end, 10);
48 if (ret)
49 return ret;
50 43
51 corruption_check_period = val; 44 return (*end == 0) ? 0 : -EINVAL;
52 return 0;
53} 45}
54early_param("memory_corruption_check_period", set_corruption_check_period); 46early_param("memory_corruption_check_period", set_corruption_check_period);
55 47
@@ -70,8 +62,7 @@ early_param("memory_corruption_check_size", set_corruption_check_size);
70 62
71void __init setup_bios_corruption_check(void) 63void __init setup_bios_corruption_check(void)
72{ 64{
73 phys_addr_t start, end; 65 u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */
74 u64 i;
75 66
76 if (memory_corruption_check == -1) { 67 if (memory_corruption_check == -1) {
77 memory_corruption_check = 68 memory_corruption_check =
@@ -91,23 +82,28 @@ void __init setup_bios_corruption_check(void)
91 82
92 corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); 83 corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
93 84
94 for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) { 85 while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) {
95 start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE), 86 u64 size;
96 PAGE_SIZE, corruption_check_size); 87 addr = memblock_x86_find_in_range_size(addr, &size, PAGE_SIZE);
97 end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE), 88
98 PAGE_SIZE, corruption_check_size); 89 if (addr == MEMBLOCK_ERROR)
99 if (start >= end) 90 break;
100 continue; 91
92 if (addr >= corruption_check_size)
93 break;
101 94
102 memblock_reserve(start, end - start); 95 if ((addr + size) > corruption_check_size)
103 scan_areas[num_scan_areas].addr = start; 96 size = corruption_check_size - addr;
104 scan_areas[num_scan_areas].size = end - start; 97
98 memblock_x86_reserve_range(addr, addr + size, "SCAN RAM");
99 scan_areas[num_scan_areas].addr = addr;
100 scan_areas[num_scan_areas].size = size;
101 num_scan_areas++;
105 102
106 /* Assume we've already mapped this early memory */ 103 /* Assume we've already mapped this early memory */
107 memset(__va(start), 0, end - start); 104 memset(__va(addr), 0, size);
108 105
109 if (++num_scan_areas >= MAX_SCAN_AREAS) 106 addr += size;
110 break;
111 } 107 }
112 108
113 if (num_scan_areas) 109 if (num_scan_areas)
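The check.c hunk swaps kstrtoul(), which rejects trailing junk itself, for the older simple_strtol()/simple_strtoul() pattern that validates the end pointer by hand. The user-space sketch below models that end-pointer check with strtol(); the helper name mirrors the kernel parameter handler but is otherwise hypothetical.

    /* strtol() stands in for simple_strtol(); the point is the manual
     * "*end must be NUL" check that the older early_param handlers do.
     */
    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    static int set_corruption_check(const char *arg, long *out)
    {
        char *end;
        long val = strtol(arg, &end, 10);

        if (*end != '\0')   /* trailing garbage -> reject, like -EINVAL */
            return -EINVAL;
        *out = val;
        return 0;
    }

    int main(void)
    {
        long val;

        printf("\"64\"  -> %d\n", set_corruption_check("64", &val));   /* 0 */
        printf("\"64k\" -> %d\n", set_corruption_check("64k", &val));  /* -EINVAL */
        return 0;
    }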
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index a0e067d3d96..6042981d030 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -14,9 +14,7 @@ CFLAGS_common.o := $(nostackp)
14 14
15obj-y := intel_cacheinfo.o scattered.o topology.o 15obj-y := intel_cacheinfo.o scattered.o topology.o
16obj-y += proc.o capflags.o powerflags.o common.o 16obj-y += proc.o capflags.o powerflags.o common.o
17obj-y += vmware.o hypervisor.o mshyperv.o 17obj-y += vmware.o hypervisor.o sched.o mshyperv.o
18obj-y += rdrand.o
19obj-y += match.o
20 18
21obj-$(CONFIG_X86_32) += bugs.o 19obj-$(CONFIG_X86_32) += bugs.o
22obj-$(CONFIG_X86_64) += bugs_64.o 20obj-$(CONFIG_X86_64) += bugs_64.o
@@ -30,17 +28,10 @@ obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
30 28
31obj-$(CONFIG_PERF_EVENTS) += perf_event.o 29obj-$(CONFIG_PERF_EVENTS) += perf_event.o
32 30
33ifdef CONFIG_PERF_EVENTS
34obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd.o
35obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_knc.o perf_event_p4.o
36obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
37obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o
38endif
39
40obj-$(CONFIG_X86_MCE) += mcheck/ 31obj-$(CONFIG_X86_MCE) += mcheck/
41obj-$(CONFIG_MTRR) += mtrr/ 32obj-$(CONFIG_MTRR) += mtrr/
42 33
43obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o perf_event_amd_ibs.o 34obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
44 35
45quiet_cmd_mkcapflags = MKCAP $@ 36quiet_cmd_mkcapflags = MKCAP $@
46 cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ 37 cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 15239fffd6f..b13ed393dfc 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1,11 +1,8 @@
1#include <linux/export.h>
2#include <linux/init.h> 1#include <linux/init.h>
3#include <linux/bitops.h> 2#include <linux/bitops.h>
4#include <linux/elf.h>
5#include <linux/mm.h> 3#include <linux/mm.h>
6 4
7#include <linux/io.h> 5#include <linux/io.h>
8#include <linux/sched.h>
9#include <asm/processor.h> 6#include <asm/processor.h>
10#include <asm/apic.h> 7#include <asm/apic.h>
11#include <asm/cpu.h> 8#include <asm/cpu.h>
@@ -19,39 +16,6 @@
19 16
20#include "cpu.h" 17#include "cpu.h"
21 18
22static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
23{
24 struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
25 u32 gprs[8] = { 0 };
26 int err;
27
28 WARN_ONCE((c->x86 != 0xf), "%s should only be used on K8!\n", __func__);
29
30 gprs[1] = msr;
31 gprs[7] = 0x9c5a203a;
32
33 err = rdmsr_safe_regs(gprs);
34
35 *p = gprs[0] | ((u64)gprs[2] << 32);
36
37 return err;
38}
39
40static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val)
41{
42 struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
43 u32 gprs[8] = { 0 };
44
45 WARN_ONCE((c->x86 != 0xf), "%s should only be used on K8!\n", __func__);
46
47 gprs[0] = (u32)val;
48 gprs[1] = msr;
49 gprs[2] = val >> 32;
50 gprs[7] = 0x9c5a203a;
51
52 return wrmsr_safe_regs(gprs);
53}
54
55#ifdef CONFIG_X86_32 19#ifdef CONFIG_X86_32
56/* 20/*
57 * B step AMD K6 before B 9730xxxx have hardware bugs that can cause 21 * B step AMD K6 before B 9730xxxx have hardware bugs that can cause
@@ -59,8 +23,7 @@ static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val)
59 * contact AMD for precise details and a CPU swap. 23 * contact AMD for precise details and a CPU swap.
60 * 24 *
61 * See http://www.multimania.com/poulot/k6bug.html 25 * See http://www.multimania.com/poulot/k6bug.html
62 * and section 2.6.2 of "AMD-K6 Processor Revision Guide - Model 6" 26 * http://www.amd.com/K6/k6docs/revgd.html
63 * (Publication # 21266 Issue Date: August 1998)
64 * 27 *
65 * The following test is erm.. interesting. AMD neglected to up 28 * The following test is erm.. interesting. AMD neglected to up
66 * the chip setting when fixing the bug but they also tweaked some 29 * the chip setting when fixing the bug but they also tweaked some
@@ -128,6 +91,7 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
128 "system stability may be impaired when more than 32 MB are used.\n"); 91 "system stability may be impaired when more than 32 MB are used.\n");
129 else 92 else
130 printk(KERN_CONT "probably OK (after B9730xxxx).\n"); 93 printk(KERN_CONT "probably OK (after B9730xxxx).\n");
94 printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n");
131 } 95 }
132 96
133 /* K6 with old style WHCR */ 97 /* K6 with old style WHCR */
@@ -182,6 +146,7 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
182 146
183static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) 147static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
184{ 148{
149#ifdef CONFIG_SMP
185 /* calling is from identify_secondary_cpu() ? */ 150 /* calling is from identify_secondary_cpu() ? */
186 if (!c->cpu_index) 151 if (!c->cpu_index)
187 return; 152 return;
@@ -225,6 +190,7 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
225 190
226valid_k7: 191valid_k7:
227 ; 192 ;
193#endif
228} 194}
229 195
230static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) 196static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
@@ -304,7 +270,7 @@ static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c)
304 int cpu = smp_processor_id(); 270 int cpu = smp_processor_id();
305 271
306 /* get information required for multi-node processors */ 272 /* get information required for multi-node processors */
307 if (cpu_has_topoext) { 273 if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
308 u32 eax, ebx, ecx, edx; 274 u32 eax, ebx, ecx, edx;
309 275
310 cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); 276 cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
@@ -385,14 +351,6 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
385 if (node == NUMA_NO_NODE) 351 if (node == NUMA_NO_NODE)
386 node = per_cpu(cpu_llc_id, cpu); 352 node = per_cpu(cpu_llc_id, cpu);
387 353
388 /*
389 * On multi-fabric platform (e.g. Numascale NumaChip) a
390 * platform-specific handler needs to be called to fixup some
391 * IDs of the CPU.
392 */
393 if (x86_cpuinit.fixup_cpu_id)
394 x86_cpuinit.fixup_cpu_id(c, node);
395
396 if (!node_online(node)) { 354 if (!node_online(node)) {
397 /* 355 /*
398 * Two possibilities here: 356 * Two possibilities here:
@@ -452,34 +410,6 @@ static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
452#endif 410#endif
453} 411}
454 412
455static void __cpuinit bsp_init_amd(struct cpuinfo_x86 *c)
456{
457 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
458
459 if (c->x86 > 0x10 ||
460 (c->x86 == 0x10 && c->x86_model >= 0x2)) {
461 u64 val;
462
463 rdmsrl(MSR_K7_HWCR, val);
464 if (!(val & BIT(24)))
465 printk(KERN_WARNING FW_BUG "TSC doesn't count "
466 "with P0 frequency!\n");
467 }
468 }
469
470 if (c->x86 == 0x15) {
471 unsigned long upperbit;
472 u32 cpuid, assoc;
473
474 cpuid = cpuid_edx(0x80000005);
475 assoc = cpuid >> 16 & 0xff;
476 upperbit = ((cpuid >> 24) << 10) / assoc;
477
478 va_align.mask = (upperbit - 1) & PAGE_MASK;
479 va_align.flags = ALIGN_VA_32 | ALIGN_VA_64;
480 }
481}
482
483static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) 413static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
484{ 414{
485 early_init_amd_mc(c); 415 early_init_amd_mc(c);
@@ -491,8 +421,6 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
491 if (c->x86_power & (1 << 8)) { 421 if (c->x86_power & (1 << 8)) {
492 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 422 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
493 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); 423 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
494 if (!check_tsc_unstable())
495 sched_clock_stable = 1;
496 } 424 }
497 425
498#ifdef CONFIG_X86_64 426#ifdef CONFIG_X86_64
@@ -513,12 +441,27 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
513 set_cpu_cap(c, X86_FEATURE_EXTD_APICID); 441 set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
514 } 442 }
515#endif 443#endif
444
445 /* We need to do the following only once */
446 if (c != &boot_cpu_data)
447 return;
448
449 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
450
451 if (c->x86 > 0x10 ||
452 (c->x86 == 0x10 && c->x86_model >= 0x2)) {
453 u64 val;
454
455 rdmsrl(MSR_K7_HWCR, val);
456 if (!(val & BIT(24)))
457 printk(KERN_WARNING FW_BUG "TSC doesn't count "
458 "with P0 frequency!\n");
459 }
460 }
516} 461}
517 462
518static void __cpuinit init_amd(struct cpuinfo_x86 *c) 463static void __cpuinit init_amd(struct cpuinfo_x86 *c)
519{ 464{
520 u32 dummy;
521
522#ifdef CONFIG_SMP 465#ifdef CONFIG_SMP
523 unsigned long long value; 466 unsigned long long value;
524 467
@@ -613,38 +556,6 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
613 } 556 }
614 } 557 }
615 558
616 /* re-enable TopologyExtensions if switched off by BIOS */
617 if ((c->x86 == 0x15) &&
618 (c->x86_model >= 0x10) && (c->x86_model <= 0x1f) &&
619 !cpu_has(c, X86_FEATURE_TOPOEXT)) {
620 u64 val;
621
622 if (!rdmsrl_safe(0xc0011005, &val)) {
623 val |= 1ULL << 54;
624 wrmsrl_safe(0xc0011005, val);
625 rdmsrl(0xc0011005, val);
626 if (val & (1ULL << 54)) {
627 set_cpu_cap(c, X86_FEATURE_TOPOEXT);
628 printk(KERN_INFO FW_INFO "CPU: Re-enabling "
629 "disabled Topology Extensions Support\n");
630 }
631 }
632 }
633
634 /*
635 * The way access filter has a performance penalty on some workloads.
636 * Disable it on the affected CPUs.
637 */
638 if ((c->x86 == 0x15) &&
639 (c->x86_model >= 0x02) && (c->x86_model < 0x20)) {
640 u64 val;
641
642 if (!rdmsrl_safe(0xc0011021, &val) && !(val & 0x1E)) {
643 val |= 0x1E;
644 wrmsrl_safe(0xc0011021, val);
645 }
646 }
647
648 cpu_detect_cache_sizes(c); 559 cpu_detect_cache_sizes(c);
649 560
650 /* Multi core CPU? */ 561 /* Multi core CPU? */
@@ -657,7 +568,12 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
657 detect_ht(c); 568 detect_ht(c);
658#endif 569#endif
659 570
660 init_amd_cacheinfo(c); 571 if (c->extended_cpuid_level >= 0x80000006) {
572 if (cpuid_edx(0x80000006) & 0xf000)
573 num_cache_leaves = 4;
574 else
575 num_cache_leaves = 3;
576 }
661 577
662 if (c->x86 >= 0xf) 578 if (c->x86 >= 0xf)
663 set_cpu_cap(c, X86_FEATURE_K8); 579 set_cpu_cap(c, X86_FEATURE_K8);
@@ -721,11 +637,9 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
721 err = rdmsrl_safe(MSR_AMD64_MCx_MASK(4), &mask); 637 err = rdmsrl_safe(MSR_AMD64_MCx_MASK(4), &mask);
722 if (err == 0) { 638 if (err == 0) {
723 mask |= (1 << 10); 639 mask |= (1 << 10);
724 wrmsrl_safe(MSR_AMD64_MCx_MASK(4), mask); 640 checking_wrmsrl(MSR_AMD64_MCx_MASK(4), mask);
725 } 641 }
726 } 642 }
727
728 rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
729} 643}
730 644
731#ifdef CONFIG_X86_32 645#ifdef CONFIG_X86_32
@@ -746,69 +660,6 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c,
746} 660}
747#endif 661#endif
748 662
749static void __cpuinit cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c)
750{
751 tlb_flushall_shift = 5;
752
753 if (c->x86 <= 0x11)
754 tlb_flushall_shift = 4;
755}
756
757static void __cpuinit cpu_detect_tlb_amd(struct cpuinfo_x86 *c)
758{
759 u32 ebx, eax, ecx, edx;
760 u16 mask = 0xfff;
761
762 if (c->x86 < 0xf)
763 return;
764
765 if (c->extended_cpuid_level < 0x80000006)
766 return;
767
768 cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
769
770 tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask;
771 tlb_lli_4k[ENTRIES] = ebx & mask;
772
773 /*
774 * K8 doesn't have 2M/4M entries in the L2 TLB so read out the L1 TLB
775 * characteristics from the CPUID function 0x80000005 instead.
776 */
777 if (c->x86 == 0xf) {
778 cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
779 mask = 0xff;
780 }
781
782 /* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
783 if (!((eax >> 16) & mask)) {
784 u32 a, b, c, d;
785
786 cpuid(0x80000005, &a, &b, &c, &d);
787 tlb_lld_2m[ENTRIES] = (a >> 16) & 0xff;
788 } else {
789 tlb_lld_2m[ENTRIES] = (eax >> 16) & mask;
790 }
791
792 /* a 4M entry uses two 2M entries */
793 tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1;
794
795 /* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */
796 if (!(eax & mask)) {
797 /* Erratum 658 */
798 if (c->x86 == 0x15 && c->x86_model <= 0x1f) {
799 tlb_lli_2m[ENTRIES] = 1024;
800 } else {
801 cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
802 tlb_lli_2m[ENTRIES] = eax & 0xff;
803 }
804 } else
805 tlb_lli_2m[ENTRIES] = eax & mask;
806
807 tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1;
808
809 cpu_set_tlb_flushall_shift(c);
810}
811
812static const struct cpu_dev __cpuinitconst amd_cpu_dev = { 663static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
813 .c_vendor = "AMD", 664 .c_vendor = "AMD",
814 .c_ident = { "AuthenticAMD" }, 665 .c_ident = { "AuthenticAMD" },
@@ -828,8 +679,6 @@ static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
828 .c_size_cache = amd_size_cache, 679 .c_size_cache = amd_size_cache,
829#endif 680#endif
830 .c_early_init = early_init_amd, 681 .c_early_init = early_init_amd,
831 .c_detect_tlb = cpu_detect_tlb_amd,
832 .c_bsp_init = bsp_init_amd,
833 .c_init = init_amd, 682 .c_init = init_amd,
834 .c_x86_vendor = X86_VENDOR_AMD, 683 .c_x86_vendor = X86_VENDOR_AMD,
835}; 684};
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 92dfec986a4..46674fbb62b 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -55,8 +55,8 @@ static void __init check_fpu(void)
55 55
56 if (!boot_cpu_data.hard_math) { 56 if (!boot_cpu_data.hard_math) {
57#ifndef CONFIG_MATH_EMULATION 57#ifndef CONFIG_MATH_EMULATION
58 pr_emerg("No coprocessor found and no math emulation present\n"); 58 printk(KERN_EMERG "No coprocessor found and no math emulation present.\n");
59 pr_emerg("Giving up\n"); 59 printk(KERN_EMERG "Giving up.\n");
60 for (;;) ; 60 for (;;) ;
61#endif 61#endif
62 return; 62 return;
@@ -86,7 +86,7 @@ static void __init check_fpu(void)
86 86
87 boot_cpu_data.fdiv_bug = fdiv_bug; 87 boot_cpu_data.fdiv_bug = fdiv_bug;
88 if (boot_cpu_data.fdiv_bug) 88 if (boot_cpu_data.fdiv_bug)
89 pr_warn("Hmm, FPU with FDIV bug\n"); 89 printk(KERN_WARNING "Hmm, FPU with FDIV bug.\n");
90} 90}
91 91
92static void __init check_hlt(void) 92static void __init check_hlt(void)
@@ -94,30 +94,66 @@ static void __init check_hlt(void)
94 if (boot_cpu_data.x86 >= 5 || paravirt_enabled()) 94 if (boot_cpu_data.x86 >= 5 || paravirt_enabled())
95 return; 95 return;
96 96
97 pr_info("Checking 'hlt' instruction... "); 97 printk(KERN_INFO "Checking 'hlt' instruction... ");
98 if (!boot_cpu_data.hlt_works_ok) { 98 if (!boot_cpu_data.hlt_works_ok) {
99 pr_cont("disabled\n"); 99 printk("disabled\n");
100 return; 100 return;
101 } 101 }
102 halt(); 102 halt();
103 halt(); 103 halt();
104 halt(); 104 halt();
105 halt(); 105 halt();
106 pr_cont("OK\n"); 106 printk(KERN_CONT "OK.\n");
107}
108
109/*
110 * Most 386 processors have a bug where a POPAD can lock the
111 * machine even from user space.
112 */
113
114static void __init check_popad(void)
115{
116#ifndef CONFIG_X86_POPAD_OK
117 int res, inp = (int) &res;
118
119 printk(KERN_INFO "Checking for popad bug... ");
120 __asm__ __volatile__(
121 "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx "
122 : "=&a" (res)
123 : "d" (inp)
124 : "ecx", "edi");
125 /*
126 * If this fails, it means that any user program may lock the
127 * CPU hard. Too bad.
128 */
129 if (res != 12345678)
130 printk(KERN_CONT "Buggy.\n");
131 else
132 printk(KERN_CONT "OK.\n");
133#endif
107} 134}
108 135
109/* 136/*
110 * Check whether we are able to run this kernel safely on SMP. 137 * Check whether we are able to run this kernel safely on SMP.
111 * 138 *
112 * - i386 is no longer supported. 139 * - In order to run on a i386, we need to be compiled for i386
140 * (for due to lack of "invlpg" and working WP on a i386)
113 * - In order to run on anything without a TSC, we need to be 141 * - In order to run on anything without a TSC, we need to be
114 * compiled for a i486. 142 * compiled for a i486.
115 */ 143 */
116 144
117static void __init check_config(void) 145static void __init check_config(void)
118{ 146{
119 if (boot_cpu_data.x86 < 4) 147/*
148 * We'd better not be a i386 if we're configured to use some
149 * i486+ only features! (WP works in supervisor mode and the
150 * new "invlpg" and "bswap" instructions)
151 */
152#if defined(CONFIG_X86_WP_WORKS_OK) || defined(CONFIG_X86_INVLPG) || \
153 defined(CONFIG_X86_BSWAP)
154 if (boot_cpu_data.x86 == 3)
120 panic("Kernel requires i486+ for 'invlpg' and other features"); 155 panic("Kernel requires i486+ for 'invlpg' and other features");
156#endif
121} 157}
122 158
123 159
@@ -125,18 +161,14 @@ void __init check_bugs(void)
125{ 161{
126 identify_boot_cpu(); 162 identify_boot_cpu();
127#ifndef CONFIG_SMP 163#ifndef CONFIG_SMP
128 pr_info("CPU: "); 164 printk(KERN_INFO "CPU: ");
129 print_cpu_info(&boot_cpu_data); 165 print_cpu_info(&boot_cpu_data);
130#endif 166#endif
131 check_config(); 167 check_config();
168 check_fpu();
132 check_hlt(); 169 check_hlt();
170 check_popad();
133 init_utsname()->machine[1] = 171 init_utsname()->machine[1] =
134 '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); 172 '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);
135 alternative_instructions(); 173 alternative_instructions();
136
137 /*
138 * kernel_fpu_begin/end() in check_fpu() relies on the patched
139 * alternative instructions.
140 */
141 check_fpu();
142} 174}
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index 159103c0b1f..e58d978e075 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -278,7 +278,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
278 } 278 }
279#ifdef CONFIG_X86_32 279#ifdef CONFIG_X86_32
280 /* Cyrix III family needs CX8 & PGE explicitly enabled. */ 280 /* Cyrix III family needs CX8 & PGE explicitly enabled. */
281 if (c->x86_model >= 6 && c->x86_model <= 13) { 281 if (c->x86_model >= 6 && c->x86_model <= 9) {
282 rdmsr(MSR_VIA_FCR, lo, hi); 282 rdmsr(MSR_VIA_FCR, lo, hi);
283 lo |= (1<<1 | 1<<7); 283 lo |= (1<<1 | 1<<7);
284 wrmsr(MSR_VIA_FCR, lo, hi); 284 wrmsr(MSR_VIA_FCR, lo, hi);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 9c3ab43a695..62184390a60 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -15,10 +15,8 @@
15#include <asm/stackprotector.h> 15#include <asm/stackprotector.h>
16#include <asm/perf_event.h> 16#include <asm/perf_event.h>
17#include <asm/mmu_context.h> 17#include <asm/mmu_context.h>
18#include <asm/archrandom.h>
19#include <asm/hypervisor.h> 18#include <asm/hypervisor.h>
20#include <asm/processor.h> 19#include <asm/processor.h>
21#include <asm/debugreg.h>
22#include <asm/sections.h> 20#include <asm/sections.h>
23#include <linux/topology.h> 21#include <linux/topology.h>
24#include <linux/cpumask.h> 22#include <linux/cpumask.h>
@@ -29,7 +27,6 @@
29#include <asm/apic.h> 27#include <asm/apic.h>
30#include <asm/desc.h> 28#include <asm/desc.h>
31#include <asm/i387.h> 29#include <asm/i387.h>
32#include <asm/fpu-internal.h>
33#include <asm/mtrr.h> 30#include <asm/mtrr.h>
34#include <linux/numa.h> 31#include <linux/numa.h>
35#include <asm/asm.h> 32#include <asm/asm.h>
@@ -144,8 +141,6 @@ static int __init x86_xsave_setup(char *s)
144{ 141{
145 setup_clear_cpu_cap(X86_FEATURE_XSAVE); 142 setup_clear_cpu_cap(X86_FEATURE_XSAVE);
146 setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); 143 setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
147 setup_clear_cpu_cap(X86_FEATURE_AVX);
148 setup_clear_cpu_cap(X86_FEATURE_AVX2);
149 return 1; 144 return 1;
150} 145}
151__setup("noxsave", x86_xsave_setup); 146__setup("noxsave", x86_xsave_setup);
@@ -259,36 +254,23 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
259} 254}
260#endif 255#endif
261 256
257static int disable_smep __cpuinitdata;
262static __init int setup_disable_smep(char *arg) 258static __init int setup_disable_smep(char *arg)
263{ 259{
264 setup_clear_cpu_cap(X86_FEATURE_SMEP); 260 disable_smep = 1;
265 return 1; 261 return 1;
266} 262}
267__setup("nosmep", setup_disable_smep); 263__setup("nosmep", setup_disable_smep);
268 264
269static __always_inline void setup_smep(struct cpuinfo_x86 *c) 265static __cpuinit void setup_smep(struct cpuinfo_x86 *c)
270{ 266{
271 if (cpu_has(c, X86_FEATURE_SMEP)) 267 if (cpu_has(c, X86_FEATURE_SMEP)) {
272 set_in_cr4(X86_CR4_SMEP); 268 if (unlikely(disable_smep)) {
273} 269 setup_clear_cpu_cap(X86_FEATURE_SMEP);
274 270 clear_in_cr4(X86_CR4_SMEP);
275static __init int setup_disable_smap(char *arg) 271 } else
276{ 272 set_in_cr4(X86_CR4_SMEP);
277 setup_clear_cpu_cap(X86_FEATURE_SMAP); 273 }
278 return 1;
279}
280__setup("nosmap", setup_disable_smap);
281
282static __always_inline void setup_smap(struct cpuinfo_x86 *c)
283{
284 unsigned long eflags;
285
286 /* This should have been cleared long ago */
287 raw_local_save_flags(eflags);
288 BUG_ON(eflags & X86_EFLAGS_AC);
289
290 if (cpu_has(c, X86_FEATURE_SMAP))
291 set_in_cr4(X86_CR4_SMAP);
292} 274}
293 275
294/* 276/*
@@ -467,35 +449,6 @@ void __cpuinit cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
467 c->x86_cache_size = l2size; 449 c->x86_cache_size = l2size;
468} 450}
469 451
470u16 __read_mostly tlb_lli_4k[NR_INFO];
471u16 __read_mostly tlb_lli_2m[NR_INFO];
472u16 __read_mostly tlb_lli_4m[NR_INFO];
473u16 __read_mostly tlb_lld_4k[NR_INFO];
474u16 __read_mostly tlb_lld_2m[NR_INFO];
475u16 __read_mostly tlb_lld_4m[NR_INFO];
476
477/*
478 * tlb_flushall_shift shows the balance point in replacing cr3 write
479 * with multiple 'invlpg'. It will do this replacement when
480 * flush_tlb_lines <= active_lines/2^tlb_flushall_shift.
481 * If tlb_flushall_shift is -1, means the replacement will be disabled.
482 */
483s8 __read_mostly tlb_flushall_shift = -1;
484
485void __cpuinit cpu_detect_tlb(struct cpuinfo_x86 *c)
486{
487 if (this_cpu->c_detect_tlb)
488 this_cpu->c_detect_tlb(c);
489
490 printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \
491 "Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \
492 "tlb_flushall_shift: %d\n",
493 tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],
494 tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES],
495 tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES],
496 tlb_flushall_shift);
497}
498
499void __cpuinit detect_ht(struct cpuinfo_x86 *c) 452void __cpuinit detect_ht(struct cpuinfo_x86 *c)
500{ 453{
501#ifdef CONFIG_X86_HT 454#ifdef CONFIG_X86_HT
@@ -722,11 +675,12 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
722 if (this_cpu->c_early_init) 675 if (this_cpu->c_early_init)
723 this_cpu->c_early_init(c); 676 this_cpu->c_early_init(c);
724 677
678#ifdef CONFIG_SMP
725 c->cpu_index = 0; 679 c->cpu_index = 0;
680#endif
726 filter_cpuid_features(c, false); 681 filter_cpuid_features(c, false);
727 682
728 if (this_cpu->c_bsp_init) 683 setup_smep(c);
729 this_cpu->c_bsp_init(c);
730} 684}
731 685
732void __init early_cpu_init(void) 686void __init early_cpu_init(void)
@@ -806,9 +760,14 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
806 c->apicid = c->initial_apicid; 760 c->apicid = c->initial_apicid;
807# endif 761# endif
808#endif 762#endif
763
764#ifdef CONFIG_X86_HT
809 c->phys_proc_id = c->initial_apicid; 765 c->phys_proc_id = c->initial_apicid;
766#endif
810 } 767 }
811 768
769 setup_smep(c);
770
812 get_model_name(c); /* Default name */ 771 get_model_name(c); /* Default name */
813 772
814 detect_nopl(c); 773 detect_nopl(c);
@@ -873,10 +832,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
873 /* Disable the PN if appropriate */ 832 /* Disable the PN if appropriate */
874 squash_the_stupid_serial_number(c); 833 squash_the_stupid_serial_number(c);
875 834
876 /* Set up SMEP/SMAP */
877 setup_smep(c);
878 setup_smap(c);
879
880 /* 835 /*
881 * The vendor-specific functions might have changed features. 836 * The vendor-specific functions might have changed features.
882 * Now we do "generic changes." 837 * Now we do "generic changes."
@@ -902,7 +857,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
902#endif 857#endif
903 858
904 init_hypervisor(c); 859 init_hypervisor(c);
905 x86_init_rdrand(c);
906 860
907 /* 861 /*
908 * Clear/Set all flags overriden by options, need do it 862 * Clear/Set all flags overriden by options, need do it
@@ -955,7 +909,6 @@ void __init identify_boot_cpu(void)
955#else 909#else
956 vgetcpu_set_mode(); 910 vgetcpu_set_mode();
957#endif 911#endif
958 cpu_detect_tlb(&boot_cpu_data);
959} 912}
960 913
961void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) 914void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
@@ -980,7 +933,7 @@ static const struct msr_range msr_range_array[] __cpuinitconst = {
980 { 0xc0011000, 0xc001103b}, 933 { 0xc0011000, 0xc001103b},
981}; 934};
982 935
983static void __cpuinit __print_cpu_msr(void) 936static void __cpuinit print_cpu_msr(void)
984{ 937{
985 unsigned index_min, index_max; 938 unsigned index_min, index_max;
986 unsigned index; 939 unsigned index;
@@ -992,7 +945,7 @@ static void __cpuinit __print_cpu_msr(void)
992 index_max = msr_range_array[i].max; 945 index_max = msr_range_array[i].max;
993 946
994 for (index = index_min; index < index_max; index++) { 947 for (index = index_min; index < index_max; index++) {
995 if (rdmsrl_safe(index, &val)) 948 if (rdmsrl_amd_safe(index, &val))
996 continue; 949 continue;
997 printk(KERN_INFO " MSR%08x: %016llx\n", index, val); 950 printk(KERN_INFO " MSR%08x: %016llx\n", index, val);
998 } 951 }
@@ -1035,24 +988,22 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
1035 printk(KERN_CONT "%s ", vendor); 988 printk(KERN_CONT "%s ", vendor);
1036 989
1037 if (c->x86_model_id[0]) 990 if (c->x86_model_id[0])
1038 printk(KERN_CONT "%s", strim(c->x86_model_id)); 991 printk(KERN_CONT "%s", c->x86_model_id);
1039 else 992 else
1040 printk(KERN_CONT "%d86", c->x86); 993 printk(KERN_CONT "%d86", c->x86);
1041 994
1042 printk(KERN_CONT " (fam: %02x, model: %02x", c->x86, c->x86_model);
1043
1044 if (c->x86_mask || c->cpuid_level >= 0) 995 if (c->x86_mask || c->cpuid_level >= 0)
1045 printk(KERN_CONT ", stepping: %02x)\n", c->x86_mask); 996 printk(KERN_CONT " stepping %02x\n", c->x86_mask);
1046 else 997 else
1047 printk(KERN_CONT ")\n"); 998 printk(KERN_CONT "\n");
1048
1049 print_cpu_msr(c);
1050}
1051 999
1052void __cpuinit print_cpu_msr(struct cpuinfo_x86 *c) 1000#ifdef CONFIG_SMP
1053{
1054 if (c->cpu_index < show_msr) 1001 if (c->cpu_index < show_msr)
1055 __print_cpu_msr(); 1002 print_cpu_msr();
1003#else
1004 if (show_msr)
1005 print_cpu_msr();
1006#endif
1056} 1007}
1057 1008
1058static __init int setup_disablecpuid(char *arg) 1009static __init int setup_disablecpuid(char *arg)
@@ -1070,8 +1021,6 @@ __setup("clearcpuid=", setup_disablecpuid);
1070 1021
1071#ifdef CONFIG_X86_64 1022#ifdef CONFIG_X86_64
1072struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table }; 1023struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table };
1073struct desc_ptr nmi_idt_descr = { NR_VECTORS * 16 - 1,
1074 (unsigned long) nmi_idt_table };
1075 1024
1076DEFINE_PER_CPU_FIRST(union irq_stack_union, 1025DEFINE_PER_CPU_FIRST(union irq_stack_union,
1077 irq_stack_union) __aligned(PAGE_SIZE); 1026 irq_stack_union) __aligned(PAGE_SIZE);
@@ -1093,8 +1042,6 @@ DEFINE_PER_CPU(char *, irq_stack_ptr) =
1093 1042
1094DEFINE_PER_CPU(unsigned int, irq_count) = -1; 1043DEFINE_PER_CPU(unsigned int, irq_count) = -1;
1095 1044
1096DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
1097
1098/* 1045/*
1099 * Special IST stacks which the CPU switches to when it calls 1046 * Special IST stacks which the CPU switches to when it calls
1100 * an IST-marked descriptor entry. Up to 7 stacks (hardware 1047 * an IST-marked descriptor entry. Up to 7 stacks (hardware
@@ -1127,52 +1074,35 @@ void syscall_init(void)
1127 1074
1128 /* Flags to clear on syscall */ 1075 /* Flags to clear on syscall */
1129 wrmsrl(MSR_SYSCALL_MASK, 1076 wrmsrl(MSR_SYSCALL_MASK,
1130 X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF| 1077 X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
1131 X86_EFLAGS_IOPL|X86_EFLAGS_AC);
1132} 1078}
1133 1079
1080unsigned long kernel_eflags;
1081
1134/* 1082/*
1135 * Copies of the original ist values from the tss are only accessed during 1083 * Copies of the original ist values from the tss are only accessed during
1136 * debugging, no special alignment required. 1084 * debugging, no special alignment required.
1137 */ 1085 */
1138DEFINE_PER_CPU(struct orig_ist, orig_ist); 1086DEFINE_PER_CPU(struct orig_ist, orig_ist);
1139 1087
1140static DEFINE_PER_CPU(unsigned long, debug_stack_addr);
1141DEFINE_PER_CPU(int, debug_stack_usage);
1142
1143int is_debug_stack(unsigned long addr)
1144{
1145 return __get_cpu_var(debug_stack_usage) ||
1146 (addr <= __get_cpu_var(debug_stack_addr) &&
1147 addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ));
1148}
1149
1150static DEFINE_PER_CPU(u32, debug_stack_use_ctr);
1151
1152void debug_stack_set_zero(void)
1153{
1154 this_cpu_inc(debug_stack_use_ctr);
1155 load_idt((const struct desc_ptr *)&nmi_idt_descr);
1156}
1157
1158void debug_stack_reset(void)
1159{
1160 if (WARN_ON(!this_cpu_read(debug_stack_use_ctr)))
1161 return;
1162 if (this_cpu_dec_return(debug_stack_use_ctr) == 0)
1163 load_idt((const struct desc_ptr *)&idt_descr);
1164}
1165
1166#else /* CONFIG_X86_64 */ 1088#else /* CONFIG_X86_64 */
1167 1089
1168DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; 1090DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
1169EXPORT_PER_CPU_SYMBOL(current_task); 1091EXPORT_PER_CPU_SYMBOL(current_task);
1170DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
1171 1092
1172#ifdef CONFIG_CC_STACKPROTECTOR 1093#ifdef CONFIG_CC_STACKPROTECTOR
1173DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); 1094DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
1174#endif 1095#endif
1175 1096
1097/* Make sure %fs and %gs are initialized properly in idle threads */
1098struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
1099{
1100 memset(regs, 0, sizeof(struct pt_regs));
1101 regs->fs = __KERNEL_PERCPU;
1102 regs->gs = __KERNEL_STACK_CANARY;
1103
1104 return regs;
1105}
1176#endif /* CONFIG_X86_64 */ 1106#endif /* CONFIG_X86_64 */
1177 1107
1178/* 1108/*
@@ -1228,7 +1158,7 @@ void __cpuinit cpu_init(void)
1228 oist = &per_cpu(orig_ist, cpu); 1158 oist = &per_cpu(orig_ist, cpu);
1229 1159
1230#ifdef CONFIG_NUMA 1160#ifdef CONFIG_NUMA
1231 if (this_cpu_read(numa_node) == 0 && 1161 if (cpu != 0 && percpu_read(numa_node) == 0 &&
1232 early_cpu_to_node(cpu) != NUMA_NO_NODE) 1162 early_cpu_to_node(cpu) != NUMA_NO_NODE)
1233 set_numa_node(early_cpu_to_node(cpu)); 1163 set_numa_node(early_cpu_to_node(cpu));
1234#endif 1164#endif
@@ -1260,7 +1190,8 @@ void __cpuinit cpu_init(void)
1260 barrier(); 1190 barrier();
1261 1191
1262 x86_configure_nx(); 1192 x86_configure_nx();
1263 enable_x2apic(); 1193 if (cpu != 0)
1194 enable_x2apic();
1264 1195
1265 /* 1196 /*
1266 * set up and load the per-CPU TSS 1197 * set up and load the per-CPU TSS
@@ -1272,8 +1203,6 @@ void __cpuinit cpu_init(void)
1272 estacks += exception_stack_sizes[v]; 1203 estacks += exception_stack_sizes[v];
1273 oist->ist[v] = t->x86_tss.ist[v] = 1204 oist->ist[v] = t->x86_tss.ist[v] =
1274 (unsigned long)estacks; 1205 (unsigned long)estacks;
1275 if (v == DEBUG_STACK-1)
1276 per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks;
1277 } 1206 }
1278 } 1207 }
1279 1208
@@ -1300,6 +1229,9 @@ void __cpuinit cpu_init(void)
1300 dbg_restore_debug_regs(); 1229 dbg_restore_debug_regs();
1301 1230
1302 fpu_init(); 1231 fpu_init();
1232 xsave_init();
1233
1234 raw_local_save_flags(kernel_eflags);
1303 1235
1304 if (is_uv_system()) 1236 if (is_uv_system())
1305 uv_cpu_init(); 1237 uv_cpu_init();
@@ -1352,5 +1284,6 @@ void __cpuinit cpu_init(void)
1352 dbg_restore_debug_regs(); 1284 dbg_restore_debug_regs();
1353 1285
1354 fpu_init(); 1286 fpu_init();
1287 xsave_init();
1355} 1288}
1356#endif 1289#endif
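The common.c hunk above reintroduces a module-local disable_smep flag driven by the __setup("nosmep", ...) boot parameter instead of clearing the CPU capability bit up front. As a rough illustration of the __setup() registration idea, the user-space model below keeps a (name, handler) table and runs the handler when the option appears on a fake command line; the table, parser, and exact-match lookup are invented for the sketch and are much simpler than the real .init.setup machinery.

    /* Toy model of boot-option dispatch: only the "register a handler
     * for a named option" idea mirrors __setup(); everything else here
     * is made up for illustration.
     */
    #include <stdio.h>
    #include <string.h>

    struct setup_entry {
        const char *name;
        int (*handler)(char *arg);
    };

    static int disable_smep;

    static int setup_disable_smep(char *arg)
    {
        (void)arg;
        disable_smep = 1;
        return 1;
    }

    /* stands in for the .init.setup section that __setup() populates */
    static const struct setup_entry setup_table[] = {
        { "nosmep", setup_disable_smep },
    };

    static void parse_cmdline(char *cmdline)
    {
        char *tok = strtok(cmdline, " ");

        for (; tok; tok = strtok(NULL, " "))
            for (size_t i = 0; i < sizeof(setup_table) / sizeof(setup_table[0]); i++)
                if (!strcmp(tok, setup_table[i].name))
                    setup_table[i].handler(tok);
    }

    int main(void)
    {
        char cmdline[] = "root=/dev/sda1 nosmep quiet";

        parse_cmdline(cmdline);
        printf("disable_smep = %d\n", disable_smep);  /* 1 */
        return 0;
    }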
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 4041c24ae7d..e765633f210 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -1,4 +1,5 @@
1#ifndef ARCH_X86_CPU_H 1#ifndef ARCH_X86_CPU_H
2
2#define ARCH_X86_CPU_H 3#define ARCH_X86_CPU_H
3 4
4struct cpu_model_info { 5struct cpu_model_info {
@@ -17,22 +18,12 @@ struct cpu_dev {
17 struct cpu_model_info c_models[4]; 18 struct cpu_model_info c_models[4];
18 19
19 void (*c_early_init)(struct cpuinfo_x86 *); 20 void (*c_early_init)(struct cpuinfo_x86 *);
20 void (*c_bsp_init)(struct cpuinfo_x86 *);
21 void (*c_init)(struct cpuinfo_x86 *); 21 void (*c_init)(struct cpuinfo_x86 *);
22 void (*c_identify)(struct cpuinfo_x86 *); 22 void (*c_identify)(struct cpuinfo_x86 *);
23 void (*c_detect_tlb)(struct cpuinfo_x86 *);
24 unsigned int (*c_size_cache)(struct cpuinfo_x86 *, unsigned int); 23 unsigned int (*c_size_cache)(struct cpuinfo_x86 *, unsigned int);
25 int c_x86_vendor; 24 int c_x86_vendor;
26}; 25};
27 26
28struct _tlb_table {
29 unsigned char descriptor;
30 char tlb_type;
31 unsigned int entries;
32 /* unsigned int ways; */
33 char info[128];
34};
35
36#define cpu_dev_register(cpu_devX) \ 27#define cpu_dev_register(cpu_devX) \
37 static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \ 28 static const struct cpu_dev *const __cpu_dev_##cpu_devX __used \
38 __attribute__((__section__(".x86_cpu_dev.init"))) = \ 29 __attribute__((__section__(".x86_cpu_dev.init"))) = \
@@ -43,4 +34,6 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[],
43 34
44extern void get_cpu_cap(struct cpuinfo_x86 *c); 35extern void get_cpu_cap(struct cpuinfo_x86 *c);
45extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); 36extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
46#endif /* ARCH_X86_CPU_H */ 37extern void get_cpu_cap(struct cpuinfo_x86 *c);
38
39#endif
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index a8f8fa9769d..755f64fb074 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -37,9 +37,6 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] =
37#endif 37#endif
38 &x86_hyper_vmware, 38 &x86_hyper_vmware,
39 &x86_hyper_ms_hyperv, 39 &x86_hyper_ms_hyperv,
40#ifdef CONFIG_KVM_GUEST
41 &x86_hyper_kvm,
42#endif
43}; 40};
44 41
45const struct hypervisor_x86 *x86_hyper; 42const struct hypervisor_x86 *x86_hyper;
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index fcaabd0432c..ed6086eedf1 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -47,15 +47,6 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
47 (c->x86 == 0x6 && c->x86_model >= 0x0e)) 47 (c->x86 == 0x6 && c->x86_model >= 0x0e))
48 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 48 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
49 49
50 if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) {
51 unsigned lower_word;
52
53 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
54 /* Required by the SDM */
55 sync_core();
56 rdmsr(MSR_IA32_UCODE_REV, lower_word, c->microcode);
57 }
58
59 /* 50 /*
60 * Atom erratum AAE44/AAF40/AAG38/AAH41: 51 * Atom erratum AAE44/AAF40/AAG38/AAH41:
61 * 52 *
@@ -64,10 +55,17 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
64 * need the microcode to have already been loaded... so if it is 55 * need the microcode to have already been loaded... so if it is
65 * not, recommend a BIOS update and disable large pages. 56 * not, recommend a BIOS update and disable large pages.
66 */ 57 */
67 if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_mask <= 2 && 58 if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_mask <= 2) {
68 c->microcode < 0x20e) { 59 u32 ucode, junk;
69 printk(KERN_WARNING "Atom PSE erratum detected, BIOS microcode update recommended\n"); 60
70 clear_cpu_cap(c, X86_FEATURE_PSE); 61 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
62 sync_core();
63 rdmsr(MSR_IA32_UCODE_REV, junk, ucode);
64
65 if (ucode < 0x20e) {
66 printk(KERN_WARNING "Atom PSE erratum detected, BIOS microcode update recommended\n");
67 clear_cpu_cap(c, X86_FEATURE_PSE);
68 }
71 } 69 }
72 70
73#ifdef CONFIG_X86_64 71#ifdef CONFIG_X86_64
@@ -181,6 +179,7 @@ static void __cpuinit trap_init_f00f_bug(void)
181 179
182static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c) 180static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
183{ 181{
182#ifdef CONFIG_SMP
184 /* calling is from identify_secondary_cpu() ? */ 183 /* calling is from identify_secondary_cpu() ? */
185 if (!c->cpu_index) 184 if (!c->cpu_index)
186 return; 185 return;
@@ -197,6 +196,7 @@ static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
197 WARN_ONCE(1, "WARNING: SMP operation may be unreliable" 196 WARN_ONCE(1, "WARNING: SMP operation may be unreliable"
198 "with B stepping processors.\n"); 197 "with B stepping processors.\n");
199 } 198 }
199#endif
200} 200}
201 201
202static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) 202static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
@@ -491,181 +491,6 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i
491} 491}
492#endif 492#endif
493 493
494#define TLB_INST_4K 0x01
495#define TLB_INST_4M 0x02
496#define TLB_INST_2M_4M 0x03
497
498#define TLB_INST_ALL 0x05
499#define TLB_INST_1G 0x06
500
501#define TLB_DATA_4K 0x11
502#define TLB_DATA_4M 0x12
503#define TLB_DATA_2M_4M 0x13
504#define TLB_DATA_4K_4M 0x14
505
506#define TLB_DATA_1G 0x16
507
508#define TLB_DATA0_4K 0x21
509#define TLB_DATA0_4M 0x22
510#define TLB_DATA0_2M_4M 0x23
511
512#define STLB_4K 0x41
513
514static const struct _tlb_table intel_tlb_table[] __cpuinitconst = {
515 { 0x01, TLB_INST_4K, 32, " TLB_INST 4 KByte pages, 4-way set associative" },
516 { 0x02, TLB_INST_4M, 2, " TLB_INST 4 MByte pages, full associative" },
517 { 0x03, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way set associative" },
518 { 0x04, TLB_DATA_4M, 8, " TLB_DATA 4 MByte pages, 4-way set associative" },
519 { 0x05, TLB_DATA_4M, 32, " TLB_DATA 4 MByte pages, 4-way set associative" },
520 { 0x0b, TLB_INST_4M, 4, " TLB_INST 4 MByte pages, 4-way set associative" },
521 { 0x4f, TLB_INST_4K, 32, " TLB_INST 4 KByte pages */" },
522 { 0x50, TLB_INST_ALL, 64, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
523 { 0x51, TLB_INST_ALL, 128, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
524 { 0x52, TLB_INST_ALL, 256, " TLB_INST 4 KByte and 2-MByte or 4-MByte pages" },
525 { 0x55, TLB_INST_2M_4M, 7, " TLB_INST 2-MByte or 4-MByte pages, fully associative" },
526 { 0x56, TLB_DATA0_4M, 16, " TLB_DATA0 4 MByte pages, 4-way set associative" },
527 { 0x57, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, 4-way associative" },
528 { 0x59, TLB_DATA0_4K, 16, " TLB_DATA0 4 KByte pages, fully associative" },
529 { 0x5a, TLB_DATA0_2M_4M, 32, " TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative" },
530 { 0x5b, TLB_DATA_4K_4M, 64, " TLB_DATA 4 KByte and 4 MByte pages" },
531 { 0x5c, TLB_DATA_4K_4M, 128, " TLB_DATA 4 KByte and 4 MByte pages" },
532 { 0x5d, TLB_DATA_4K_4M, 256, " TLB_DATA 4 KByte and 4 MByte pages" },
533 { 0xb0, TLB_INST_4K, 128, " TLB_INST 4 KByte pages, 4-way set associative" },
534 { 0xb1, TLB_INST_2M_4M, 4, " TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries" },
535 { 0xb2, TLB_INST_4K, 64, " TLB_INST 4KByte pages, 4-way set associative" },
536 { 0xb3, TLB_DATA_4K, 128, " TLB_DATA 4 KByte pages, 4-way set associative" },
537 { 0xb4, TLB_DATA_4K, 256, " TLB_DATA 4 KByte pages, 4-way associative" },
538 { 0xba, TLB_DATA_4K, 64, " TLB_DATA 4 KByte pages, 4-way associative" },
539 { 0xc0, TLB_DATA_4K_4M, 8, " TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" },
540 { 0xca, STLB_4K, 512, " STLB 4 KByte pages, 4-way associative" },
541 { 0x00, 0, 0 }
542};
543
544static void __cpuinit intel_tlb_lookup(const unsigned char desc)
545{
546 unsigned char k;
547 if (desc == 0)
548 return;
549
550 /* look up this descriptor in the table */
551 for (k = 0; intel_tlb_table[k].descriptor != desc && \
552 intel_tlb_table[k].descriptor != 0; k++)
553 ;
554
555 if (intel_tlb_table[k].tlb_type == 0)
556 return;
557
558 switch (intel_tlb_table[k].tlb_type) {
559 case STLB_4K:
560 if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
561 tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
562 if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
563 tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
564 break;
565 case TLB_INST_ALL:
566 if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
567 tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
568 if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
569 tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
570 if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
571 tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
572 break;
573 case TLB_INST_4K:
574 if (tlb_lli_4k[ENTRIES] < intel_tlb_table[k].entries)
575 tlb_lli_4k[ENTRIES] = intel_tlb_table[k].entries;
576 break;
577 case TLB_INST_4M:
578 if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
579 tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
580 break;
581 case TLB_INST_2M_4M:
582 if (tlb_lli_2m[ENTRIES] < intel_tlb_table[k].entries)
583 tlb_lli_2m[ENTRIES] = intel_tlb_table[k].entries;
584 if (tlb_lli_4m[ENTRIES] < intel_tlb_table[k].entries)
585 tlb_lli_4m[ENTRIES] = intel_tlb_table[k].entries;
586 break;
587 case TLB_DATA_4K:
588 case TLB_DATA0_4K:
589 if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
590 tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
591 break;
592 case TLB_DATA_4M:
593 case TLB_DATA0_4M:
594 if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
595 tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
596 break;
597 case TLB_DATA_2M_4M:
598 case TLB_DATA0_2M_4M:
599 if (tlb_lld_2m[ENTRIES] < intel_tlb_table[k].entries)
600 tlb_lld_2m[ENTRIES] = intel_tlb_table[k].entries;
601 if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
602 tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
603 break;
604 case TLB_DATA_4K_4M:
605 if (tlb_lld_4k[ENTRIES] < intel_tlb_table[k].entries)
606 tlb_lld_4k[ENTRIES] = intel_tlb_table[k].entries;
607 if (tlb_lld_4m[ENTRIES] < intel_tlb_table[k].entries)
608 tlb_lld_4m[ENTRIES] = intel_tlb_table[k].entries;
609 break;
610 }
611}
612
613static void __cpuinit intel_tlb_flushall_shift_set(struct cpuinfo_x86 *c)
614{
615 switch ((c->x86 << 8) + c->x86_model) {
616 case 0x60f: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
617 case 0x616: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
618 case 0x617: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
619 case 0x61d: /* six-core 45 nm xeon "Dunnington" */
620 tlb_flushall_shift = -1;
621 break;
622 case 0x61a: /* 45 nm nehalem, "Bloomfield" */
623 case 0x61e: /* 45 nm nehalem, "Lynnfield" */
624 case 0x625: /* 32 nm nehalem, "Clarkdale" */
625 case 0x62c: /* 32 nm nehalem, "Gulftown" */
626 case 0x62e: /* 45 nm nehalem-ex, "Beckton" */
627 case 0x62f: /* 32 nm Xeon E7 */
628 tlb_flushall_shift = 6;
629 break;
630 case 0x62a: /* SandyBridge */
631	case 0x62d:	/* SandyBridge, "Romley-EP" */
632 tlb_flushall_shift = 5;
633 break;
634 case 0x63a: /* Ivybridge */
635 tlb_flushall_shift = 1;
636 break;
637 default:
638 tlb_flushall_shift = 6;
639 }
640}
641
642static void __cpuinit intel_detect_tlb(struct cpuinfo_x86 *c)
643{
644 int i, j, n;
645 unsigned int regs[4];
646 unsigned char *desc = (unsigned char *)regs;
647
648 if (c->cpuid_level < 2)
649 return;
650
651 /* Number of times to iterate */
652 n = cpuid_eax(2) & 0xFF;
653
654 for (i = 0 ; i < n ; i++) {
655 cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]);
656
657 /* If bit 31 is set, this is an unknown format */
658 for (j = 0 ; j < 3 ; j++)
659 if (regs[j] & (1 << 31))
660 regs[j] = 0;
661
662 /* Byte 0 is level count, not a descriptor */
663 for (j = 1 ; j < 16 ; j++)
664 intel_tlb_lookup(desc[j]);
665 }
666 intel_tlb_flushall_shift_set(c);
667}
668
669static const struct cpu_dev __cpuinitconst intel_cpu_dev = { 494static const struct cpu_dev __cpuinitconst intel_cpu_dev = {
670 .c_vendor = "Intel", 495 .c_vendor = "Intel",
671 .c_ident = { "GenuineIntel" }, 496 .c_ident = { "GenuineIntel" },
@@ -721,7 +546,6 @@ static const struct cpu_dev __cpuinitconst intel_cpu_dev = {
721 }, 546 },
722 .c_size_cache = intel_size_cache, 547 .c_size_cache = intel_size_cache,
723#endif 548#endif
724 .c_detect_tlb = intel_detect_tlb,
725 .c_early_init = early_init_intel, 549 .c_early_init = early_init_intel,
726 .c_init = init_intel, 550 .c_init = init_intel,
727 .c_x86_vendor = X86_VENDOR_INTEL, 551 .c_x86_vendor = X86_VENDOR_INTEL,
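
The intel_tlb_lookup()/intel_detect_tlb() code removed in this hunk walks the one-byte descriptors returned by CPUID leaf 2 and folds them into the TLB entry counters. As a rough user-space illustration of that walk (an assumption-laden sketch using GCC's cpuid.h, not kernel code and not part of this patch), the following prints the raw descriptor bytes; interpreting them still needs a table such as intel_tlb_table above.

/* Hedged user-space sketch of the CPUID leaf 2 walk performed by the
 * removed intel_detect_tlb(); for illustration only. */
#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int regs[4];
	unsigned char *desc = (unsigned char *)regs;
	int i, j, n;

	if (!__get_cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]))
		return 1;

	n = regs[0] & 0xff;			/* AL: number of iterations */
	for (i = 0; i < n; i++) {
		__get_cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]);

		for (j = 0; j < 4; j++)		/* bit 31 set: register is reserved */
			if (regs[j] & (1u << 31))
				regs[j] = 0;

		for (j = 1; j < 16; j++)	/* byte 0 of EAX is the count, skip it */
			if (desc[j])
				printf("TLB/cache descriptor 0x%02x\n", desc[j]);
	}
	return 0;
}
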
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index fe9edec6698..c105c533ed9 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -151,17 +151,28 @@ union _cpuid4_leaf_ecx {
151 u32 full; 151 u32 full;
152}; 152};
153 153
154struct _cpuid4_info_regs { 154struct amd_l3_cache {
155 struct amd_northbridge *nb;
156 unsigned indices;
157 u8 subcaches[4];
158};
159
160struct _cpuid4_info {
155 union _cpuid4_leaf_eax eax; 161 union _cpuid4_leaf_eax eax;
156 union _cpuid4_leaf_ebx ebx; 162 union _cpuid4_leaf_ebx ebx;
157 union _cpuid4_leaf_ecx ecx; 163 union _cpuid4_leaf_ecx ecx;
158 unsigned long size; 164 unsigned long size;
159 struct amd_northbridge *nb; 165 struct amd_l3_cache *l3;
166 DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
160}; 167};
161 168
162struct _cpuid4_info { 169/* subset of above _cpuid4_info w/o shared_cpu_map */
163 struct _cpuid4_info_regs base; 170struct _cpuid4_info_regs {
164 DECLARE_BITMAP(shared_cpu_map, NR_CPUS); 171 union _cpuid4_leaf_eax eax;
172 union _cpuid4_leaf_ebx ebx;
173 union _cpuid4_leaf_ecx ecx;
174 unsigned long size;
175 struct amd_l3_cache *l3;
165}; 176};
166 177
167unsigned short num_cache_leaves; 178unsigned short num_cache_leaves;
@@ -303,41 +314,52 @@ struct _cache_attr {
303/* 314/*
304 * L3 cache descriptors 315 * L3 cache descriptors
305 */ 316 */
306static void __cpuinit amd_calc_l3_indices(struct amd_northbridge *nb) 317static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
307{ 318{
308 struct amd_l3_cache *l3 = &nb->l3_cache;
309 unsigned int sc0, sc1, sc2, sc3; 319 unsigned int sc0, sc1, sc2, sc3;
310 u32 val = 0; 320 u32 val = 0;
311 321
312 pci_read_config_dword(nb->misc, 0x1C4, &val); 322 pci_read_config_dword(l3->nb->misc, 0x1C4, &val);
313 323
314 /* calculate subcache sizes */ 324 /* calculate subcache sizes */
315 l3->subcaches[0] = sc0 = !(val & BIT(0)); 325 l3->subcaches[0] = sc0 = !(val & BIT(0));
316 l3->subcaches[1] = sc1 = !(val & BIT(4)); 326 l3->subcaches[1] = sc1 = !(val & BIT(4));
317
318 if (boot_cpu_data.x86 == 0x15) {
319 l3->subcaches[0] = sc0 += !(val & BIT(1));
320 l3->subcaches[1] = sc1 += !(val & BIT(5));
321 }
322
323 l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9)); 327 l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9));
324 l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); 328 l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13));
325 329
326 l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; 330 l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
327} 331}
328 332
329static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index) 333static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf,
334 int index)
330{ 335{
336 static struct amd_l3_cache *__cpuinitdata l3_caches;
331 int node; 337 int node;
332 338
333 /* only for L3, and not in virtualized environments */ 339 /* only for L3, and not in virtualized environments */
334 if (index < 3) 340 if (index < 3 || amd_nb_num() == 0)
335 return; 341 return;
336 342
343 /*
344 * Strictly speaking, the amount in @size below is leaked since it is
345 * never freed but this is done only on shutdown so it doesn't matter.
346 */
347 if (!l3_caches) {
348 int size = amd_nb_num() * sizeof(struct amd_l3_cache);
349
350 l3_caches = kzalloc(size, GFP_ATOMIC);
351 if (!l3_caches)
352 return;
353 }
354
337 node = amd_get_nb_id(smp_processor_id()); 355 node = amd_get_nb_id(smp_processor_id());
338 this_leaf->nb = node_to_amd_nb(node); 356
339 if (this_leaf->nb && !this_leaf->nb->l3_cache.indices) 357 if (!l3_caches[node].nb) {
340 amd_calc_l3_indices(this_leaf->nb); 358 l3_caches[node].nb = node_to_amd_nb(node);
359 amd_calc_l3_indices(&l3_caches[node]);
360 }
361
362 this_leaf->l3 = &l3_caches[node];
341} 363}
342 364
343/* 365/*
@@ -347,11 +369,11 @@ static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int
347 * 369 *
348 * @returns: the disabled index if used or negative value if slot free. 370 * @returns: the disabled index if used or negative value if slot free.
349 */ 371 */
350int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot) 372int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot)
351{ 373{
352 unsigned int reg = 0; 374 unsigned int reg = 0;
353 375
354 pci_read_config_dword(nb->misc, 0x1BC + slot * 4, &reg); 376 pci_read_config_dword(l3->nb->misc, 0x1BC + slot * 4, &reg);
355 377
356 /* check whether this slot is activated already */ 378 /* check whether this slot is activated already */
357 if (reg & (3UL << 30)) 379 if (reg & (3UL << 30))
@@ -365,10 +387,11 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
365{ 387{
366 int index; 388 int index;
367 389
368 if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) 390 if (!this_leaf->l3 ||
391 !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
369 return -EINVAL; 392 return -EINVAL;
370 393
371 index = amd_get_l3_disable_slot(this_leaf->base.nb, slot); 394 index = amd_get_l3_disable_slot(this_leaf->l3, slot);
372 if (index >= 0) 395 if (index >= 0)
373 return sprintf(buf, "%d\n", index); 396 return sprintf(buf, "%d\n", index);
374 397
@@ -385,7 +408,7 @@ show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf, \
385SHOW_CACHE_DISABLE(0) 408SHOW_CACHE_DISABLE(0)
386SHOW_CACHE_DISABLE(1) 409SHOW_CACHE_DISABLE(1)
387 410
388static void amd_l3_disable_index(struct amd_northbridge *nb, int cpu, 411static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
389 unsigned slot, unsigned long idx) 412 unsigned slot, unsigned long idx)
390{ 413{
391 int i; 414 int i;
@@ -398,10 +421,10 @@ static void amd_l3_disable_index(struct amd_northbridge *nb, int cpu,
398 for (i = 0; i < 4; i++) { 421 for (i = 0; i < 4; i++) {
399 u32 reg = idx | (i << 20); 422 u32 reg = idx | (i << 20);
400 423
401 if (!nb->l3_cache.subcaches[i]) 424 if (!l3->subcaches[i])
402 continue; 425 continue;
403 426
404 pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); 427 pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg);
405 428
406 /* 429 /*
407 * We need to WBINVD on a core on the node containing the L3 430 * We need to WBINVD on a core on the node containing the L3
@@ -411,7 +434,7 @@ static void amd_l3_disable_index(struct amd_northbridge *nb, int cpu,
411 wbinvd_on_cpu(cpu); 434 wbinvd_on_cpu(cpu);
412 435
413 reg |= BIT(31); 436 reg |= BIT(31);
414 pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); 437 pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg);
415 } 438 }
416} 439}
417 440
@@ -425,24 +448,24 @@ static void amd_l3_disable_index(struct amd_northbridge *nb, int cpu,
425 * 448 *
426 * @return: 0 on success, error status on failure 449 * @return: 0 on success, error status on failure
427 */ 450 */
428int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, unsigned slot, 451int amd_set_l3_disable_slot(struct amd_l3_cache *l3, int cpu, unsigned slot,
429 unsigned long index) 452 unsigned long index)
430{ 453{
431 int ret = 0; 454 int ret = 0;
432 455
433 /* check if @slot is already used or the index is already disabled */ 456 /* check if @slot is already used or the index is already disabled */
434 ret = amd_get_l3_disable_slot(nb, slot); 457 ret = amd_get_l3_disable_slot(l3, slot);
435 if (ret >= 0) 458 if (ret >= 0)
436 return -EEXIST; 459 return -EINVAL;
437 460
438 if (index > nb->l3_cache.indices) 461 if (index > l3->indices)
439 return -EINVAL; 462 return -EINVAL;
440 463
441 /* check whether the other slot has disabled the same index already */ 464 /* check whether the other slot has disabled the same index already */
442 if (index == amd_get_l3_disable_slot(nb, !slot)) 465 if (index == amd_get_l3_disable_slot(l3, !slot))
443 return -EEXIST; 466 return -EINVAL;
444 467
445 amd_l3_disable_index(nb, cpu, slot, index); 468 amd_l3_disable_index(l3, cpu, slot, index);
446 469
447 return 0; 470 return 0;
448} 471}
@@ -457,7 +480,8 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
457 if (!capable(CAP_SYS_ADMIN)) 480 if (!capable(CAP_SYS_ADMIN))
458 return -EPERM; 481 return -EPERM;
459 482
460 if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) 483 if (!this_leaf->l3 ||
484 !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
461 return -EINVAL; 485 return -EINVAL;
462 486
463 cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); 487 cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
@@ -465,11 +489,11 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
465 if (strict_strtoul(buf, 10, &val) < 0) 489 if (strict_strtoul(buf, 10, &val) < 0)
466 return -EINVAL; 490 return -EINVAL;
467 491
468 err = amd_set_l3_disable_slot(this_leaf->base.nb, cpu, slot, val); 492 err = amd_set_l3_disable_slot(this_leaf->l3, cpu, slot, val);
469 if (err) { 493 if (err) {
470 if (err == -EEXIST) 494 if (err == -EEXIST)
471 pr_warning("L3 slot %d in use/index already disabled!\n", 495 printk(KERN_WARNING "L3 disable slot %d in use!\n",
472 slot); 496 slot);
473 return err; 497 return err;
474 } 498 }
475 return count; 499 return count;
@@ -494,7 +518,7 @@ static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
494static ssize_t 518static ssize_t
495show_subcaches(struct _cpuid4_info *this_leaf, char *buf, unsigned int cpu) 519show_subcaches(struct _cpuid4_info *this_leaf, char *buf, unsigned int cpu)
496{ 520{
497 if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) 521 if (!this_leaf->l3 || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
498 return -EINVAL; 522 return -EINVAL;
499 523
500 return sprintf(buf, "%x\n", amd_get_subcaches(cpu)); 524 return sprintf(buf, "%x\n", amd_get_subcaches(cpu));
@@ -509,7 +533,7 @@ store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count,
509 if (!capable(CAP_SYS_ADMIN)) 533 if (!capable(CAP_SYS_ADMIN))
510 return -EPERM; 534 return -EPERM;
511 535
512 if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) 536 if (!this_leaf->l3 || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
513 return -EINVAL; 537 return -EINVAL;
514 538
515 if (strict_strtoul(buf, 16, &val) < 0) 539 if (strict_strtoul(buf, 16, &val) < 0)
@@ -538,11 +562,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index,
538 unsigned edx; 562 unsigned edx;
539 563
540 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { 564 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
541 if (cpu_has_topoext) 565 amd_cpuid4(index, &eax, &ebx, &ecx);
542 cpuid_count(0x8000001d, index, &eax.full,
543 &ebx.full, &ecx.full, &edx);
544 else
545 amd_cpuid4(index, &eax, &ebx, &ecx);
546 amd_init_l3_cache(this_leaf, index); 566 amd_init_l3_cache(this_leaf, index);
547 } else { 567 } else {
548 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); 568 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
@@ -561,39 +581,21 @@ __cpuinit cpuid4_cache_lookup_regs(int index,
561 return 0; 581 return 0;
562} 582}
563 583
564static int __cpuinit find_num_cache_leaves(struct cpuinfo_x86 *c) 584static int __cpuinit find_num_cache_leaves(void)
565{ 585{
566 unsigned int eax, ebx, ecx, edx, op; 586 unsigned int eax, ebx, ecx, edx;
567 union _cpuid4_leaf_eax cache_eax; 587 union _cpuid4_leaf_eax cache_eax;
568 int i = -1; 588 int i = -1;
569 589
570 if (c->x86_vendor == X86_VENDOR_AMD)
571 op = 0x8000001d;
572 else
573 op = 4;
574
575 do { 590 do {
576 ++i; 591 ++i;
577 /* Do cpuid(op) loop to find out num_cache_leaves */ 592 /* Do cpuid(4) loop to find out num_cache_leaves */
578 cpuid_count(op, i, &eax, &ebx, &ecx, &edx); 593 cpuid_count(4, i, &eax, &ebx, &ecx, &edx);
579 cache_eax.full = eax; 594 cache_eax.full = eax;
580 } while (cache_eax.split.type != CACHE_TYPE_NULL); 595 } while (cache_eax.split.type != CACHE_TYPE_NULL);
581 return i; 596 return i;
582} 597}
583 598
584void __cpuinit init_amd_cacheinfo(struct cpuinfo_x86 *c)
585{
586
587 if (cpu_has_topoext) {
588 num_cache_leaves = find_num_cache_leaves(c);
589 } else if (c->extended_cpuid_level >= 0x80000006) {
590 if (cpuid_edx(0x80000006) & 0xf000)
591 num_cache_leaves = 4;
592 else
593 num_cache_leaves = 3;
594 }
595}
596
597unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) 599unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
598{ 600{
599 /* Cache sizes */ 601 /* Cache sizes */
@@ -610,7 +612,7 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
610 612
611 if (is_initialized == 0) { 613 if (is_initialized == 0) {
612 /* Init num_cache_leaves from boot CPU */ 614 /* Init num_cache_leaves from boot CPU */
613 num_cache_leaves = find_num_cache_leaves(c); 615 num_cache_leaves = find_num_cache_leaves();
614 is_initialized++; 616 is_initialized++;
615 } 617 }
616 618
@@ -637,14 +639,14 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
637 new_l2 = this_leaf.size/1024; 639 new_l2 = this_leaf.size/1024;
638 num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; 640 num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
639 index_msb = get_count_order(num_threads_sharing); 641 index_msb = get_count_order(num_threads_sharing);
640 l2_id = c->apicid & ~((1 << index_msb) - 1); 642 l2_id = c->apicid >> index_msb;
641 break; 643 break;
642 case 3: 644 case 3:
643 new_l3 = this_leaf.size/1024; 645 new_l3 = this_leaf.size/1024;
644 num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; 646 num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
645 index_msb = get_count_order( 647 index_msb = get_count_order(
646 num_threads_sharing); 648 num_threads_sharing);
647 l3_id = c->apicid & ~((1 << index_msb) - 1); 649 l3_id = c->apicid >> index_msb;
648 break; 650 break;
649 default: 651 default:
650 break; 652 break;
@@ -746,40 +748,14 @@ static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info);
746#define CPUID4_INFO_IDX(x, y) (&((per_cpu(ici_cpuid4_info, x))[y])) 748#define CPUID4_INFO_IDX(x, y) (&((per_cpu(ici_cpuid4_info, x))[y]))
747 749
748#ifdef CONFIG_SMP 750#ifdef CONFIG_SMP
749 751static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
750static int __cpuinit cache_shared_amd_cpu_map_setup(unsigned int cpu, int index)
751{ 752{
752 struct _cpuid4_info *this_leaf; 753 struct _cpuid4_info *this_leaf, *sibling_leaf;
753 int i, sibling; 754 unsigned long num_threads_sharing;
754 755 int index_msb, i, sibling;
755 if (cpu_has_topoext) { 756 struct cpuinfo_x86 *c = &cpu_data(cpu);
756 unsigned int apicid, nshared, first, last;
757
758 if (!per_cpu(ici_cpuid4_info, cpu))
759 return 0;
760
761 this_leaf = CPUID4_INFO_IDX(cpu, index);
762 nshared = this_leaf->base.eax.split.num_threads_sharing + 1;
763 apicid = cpu_data(cpu).apicid;
764 first = apicid - (apicid % nshared);
765 last = first + nshared - 1;
766
767 for_each_online_cpu(i) {
768 apicid = cpu_data(i).apicid;
769 if ((apicid < first) || (apicid > last))
770 continue;
771 if (!per_cpu(ici_cpuid4_info, i))
772 continue;
773 this_leaf = CPUID4_INFO_IDX(i, index);
774 757
775 for_each_online_cpu(sibling) { 758 if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) {
776 apicid = cpu_data(sibling).apicid;
777 if ((apicid < first) || (apicid > last))
778 continue;
779 set_bit(sibling, this_leaf->shared_cpu_map);
780 }
781 }
782 } else if (index == 3) {
783 for_each_cpu(i, cpu_llc_shared_mask(cpu)) { 759 for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
784 if (!per_cpu(ici_cpuid4_info, i)) 760 if (!per_cpu(ici_cpuid4_info, i))
785 continue; 761 continue;
@@ -790,26 +766,10 @@ static int __cpuinit cache_shared_amd_cpu_map_setup(unsigned int cpu, int index)
790 set_bit(sibling, this_leaf->shared_cpu_map); 766 set_bit(sibling, this_leaf->shared_cpu_map);
791 } 767 }
792 } 768 }
793 } else 769 return;
794 return 0;
795
796 return 1;
797}
798
799static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
800{
801 struct _cpuid4_info *this_leaf, *sibling_leaf;
802 unsigned long num_threads_sharing;
803 int index_msb, i;
804 struct cpuinfo_x86 *c = &cpu_data(cpu);
805
806 if (c->x86_vendor == X86_VENDOR_AMD) {
807 if (cache_shared_amd_cpu_map_setup(cpu, index))
808 return;
809 } 770 }
810
811 this_leaf = CPUID4_INFO_IDX(cpu, index); 771 this_leaf = CPUID4_INFO_IDX(cpu, index);
812 num_threads_sharing = 1 + this_leaf->base.eax.split.num_threads_sharing; 772 num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing;
813 773
814 if (num_threads_sharing == 1) 774 if (num_threads_sharing == 1)
815 cpumask_set_cpu(cpu, to_cpumask(this_leaf->shared_cpu_map)); 775 cpumask_set_cpu(cpu, to_cpumask(this_leaf->shared_cpu_map));
@@ -860,19 +820,29 @@ static void __cpuinit free_cache_attributes(unsigned int cpu)
860 for (i = 0; i < num_cache_leaves; i++) 820 for (i = 0; i < num_cache_leaves; i++)
861 cache_remove_shared_cpu_map(cpu, i); 821 cache_remove_shared_cpu_map(cpu, i);
862 822
823 kfree(per_cpu(ici_cpuid4_info, cpu)->l3);
863 kfree(per_cpu(ici_cpuid4_info, cpu)); 824 kfree(per_cpu(ici_cpuid4_info, cpu));
864 per_cpu(ici_cpuid4_info, cpu) = NULL; 825 per_cpu(ici_cpuid4_info, cpu) = NULL;
865} 826}
866 827
828static int
829__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
830{
831 struct _cpuid4_info_regs *leaf_regs =
832 (struct _cpuid4_info_regs *)this_leaf;
833
834 return cpuid4_cache_lookup_regs(index, leaf_regs);
835}
836
867static void __cpuinit get_cpu_leaves(void *_retval) 837static void __cpuinit get_cpu_leaves(void *_retval)
868{ 838{
869 int j, *retval = _retval, cpu = smp_processor_id(); 839 int j, *retval = _retval, cpu = smp_processor_id();
870 840
871 /* Do cpuid and store the results */ 841 /* Do cpuid and store the results */
872 for (j = 0; j < num_cache_leaves; j++) { 842 for (j = 0; j < num_cache_leaves; j++) {
873 struct _cpuid4_info *this_leaf = CPUID4_INFO_IDX(cpu, j); 843 struct _cpuid4_info *this_leaf;
874 844 this_leaf = CPUID4_INFO_IDX(cpu, j);
875 *retval = cpuid4_cache_lookup_regs(j, &this_leaf->base); 845 *retval = cpuid4_cache_lookup(j, this_leaf);
876 if (unlikely(*retval < 0)) { 846 if (unlikely(*retval < 0)) {
877 int i; 847 int i;
878 848
@@ -907,7 +877,8 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
907 877
908#include <linux/kobject.h> 878#include <linux/kobject.h>
909#include <linux/sysfs.h> 879#include <linux/sysfs.h>
910#include <linux/cpu.h> 880
881extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */
911 882
912/* pointer to kobject for cpuX/cache */ 883/* pointer to kobject for cpuX/cache */
913static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject); 884static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject);
@@ -929,16 +900,16 @@ static ssize_t show_##file_name(struct _cpuid4_info *this_leaf, char *buf, \
929 return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \ 900 return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \
930} 901}
931 902
932show_one_plus(level, base.eax.split.level, 0); 903show_one_plus(level, eax.split.level, 0);
933show_one_plus(coherency_line_size, base.ebx.split.coherency_line_size, 1); 904show_one_plus(coherency_line_size, ebx.split.coherency_line_size, 1);
934show_one_plus(physical_line_partition, base.ebx.split.physical_line_partition, 1); 905show_one_plus(physical_line_partition, ebx.split.physical_line_partition, 1);
935show_one_plus(ways_of_associativity, base.ebx.split.ways_of_associativity, 1); 906show_one_plus(ways_of_associativity, ebx.split.ways_of_associativity, 1);
936show_one_plus(number_of_sets, base.ecx.split.number_of_sets, 1); 907show_one_plus(number_of_sets, ecx.split.number_of_sets, 1);
937 908
938static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf, 909static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf,
939 unsigned int cpu) 910 unsigned int cpu)
940{ 911{
941 return sprintf(buf, "%luK\n", this_leaf->base.size / 1024); 912 return sprintf(buf, "%luK\n", this_leaf->size / 1024);
942} 913}
943 914
944static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf, 915static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
@@ -975,7 +946,7 @@ static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf,
975static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf, 946static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf,
976 unsigned int cpu) 947 unsigned int cpu)
977{ 948{
978 switch (this_leaf->base.eax.split.type) { 949 switch (this_leaf->eax.split.type) {
979 case CACHE_TYPE_DATA: 950 case CACHE_TYPE_DATA:
980 return sprintf(buf, "Data\n"); 951 return sprintf(buf, "Data\n");
981 case CACHE_TYPE_INST: 952 case CACHE_TYPE_INST:
@@ -1026,7 +997,7 @@ static struct attribute ** __cpuinit amd_l3_attrs(void)
1026 if (attrs) 997 if (attrs)
1027 return attrs; 998 return attrs;
1028 999
1029 n = ARRAY_SIZE(default_attrs); 1000 n = sizeof (default_attrs) / sizeof (struct attribute *);
1030 1001
1031 if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) 1002 if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
1032 n += 2; 1003 n += 2;
@@ -1135,9 +1106,9 @@ err_out:
1135static DECLARE_BITMAP(cache_dev_map, NR_CPUS); 1106static DECLARE_BITMAP(cache_dev_map, NR_CPUS);
1136 1107
1137/* Add/Remove cache interface for CPU device */ 1108/* Add/Remove cache interface for CPU device */
1138static int __cpuinit cache_add_dev(struct device *dev) 1109static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
1139{ 1110{
1140 unsigned int cpu = dev->id; 1111 unsigned int cpu = sys_dev->id;
1141 unsigned long i, j; 1112 unsigned long i, j;
1142 struct _index_kobject *this_object; 1113 struct _index_kobject *this_object;
1143 struct _cpuid4_info *this_leaf; 1114 struct _cpuid4_info *this_leaf;
@@ -1149,7 +1120,7 @@ static int __cpuinit cache_add_dev(struct device *dev)
1149 1120
1150 retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu), 1121 retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu),
1151 &ktype_percpu_entry, 1122 &ktype_percpu_entry,
1152 &dev->kobj, "%s", "cache"); 1123 &sys_dev->kobj, "%s", "cache");
1153 if (retval < 0) { 1124 if (retval < 0) {
1154 cpuid4_cache_sysfs_exit(cpu); 1125 cpuid4_cache_sysfs_exit(cpu);
1155 return retval; 1126 return retval;
@@ -1164,7 +1135,7 @@ static int __cpuinit cache_add_dev(struct device *dev)
1164 1135
1165 ktype_cache.default_attrs = default_attrs; 1136 ktype_cache.default_attrs = default_attrs;
1166#ifdef CONFIG_AMD_NB 1137#ifdef CONFIG_AMD_NB
1167 if (this_leaf->base.nb) 1138 if (this_leaf->l3)
1168 ktype_cache.default_attrs = amd_l3_attrs(); 1139 ktype_cache.default_attrs = amd_l3_attrs();
1169#endif 1140#endif
1170 retval = kobject_init_and_add(&(this_object->kobj), 1141 retval = kobject_init_and_add(&(this_object->kobj),
@@ -1186,9 +1157,9 @@ static int __cpuinit cache_add_dev(struct device *dev)
1186 return 0; 1157 return 0;
1187} 1158}
1188 1159
1189static void __cpuinit cache_remove_dev(struct device *dev) 1160static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
1190{ 1161{
1191 unsigned int cpu = dev->id; 1162 unsigned int cpu = sys_dev->id;
1192 unsigned long i; 1163 unsigned long i;
1193 1164
1194 if (per_cpu(ici_cpuid4_info, cpu) == NULL) 1165 if (per_cpu(ici_cpuid4_info, cpu) == NULL)
@@ -1207,17 +1178,17 @@ static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb,
1207 unsigned long action, void *hcpu) 1178 unsigned long action, void *hcpu)
1208{ 1179{
1209 unsigned int cpu = (unsigned long)hcpu; 1180 unsigned int cpu = (unsigned long)hcpu;
1210 struct device *dev; 1181 struct sys_device *sys_dev;
1211 1182
1212 dev = get_cpu_device(cpu); 1183 sys_dev = get_cpu_sysdev(cpu);
1213 switch (action) { 1184 switch (action) {
1214 case CPU_ONLINE: 1185 case CPU_ONLINE:
1215 case CPU_ONLINE_FROZEN: 1186 case CPU_ONLINE_FROZEN:
1216 cache_add_dev(dev); 1187 cache_add_dev(sys_dev);
1217 break; 1188 break;
1218 case CPU_DEAD: 1189 case CPU_DEAD:
1219 case CPU_DEAD_FROZEN: 1190 case CPU_DEAD_FROZEN:
1220 cache_remove_dev(dev); 1191 cache_remove_dev(sys_dev);
1221 break; 1192 break;
1222 } 1193 }
1223 return NOTIFY_OK; 1194 return NOTIFY_OK;
@@ -1236,9 +1207,9 @@ static int __cpuinit cache_sysfs_init(void)
1236 1207
1237 for_each_online_cpu(i) { 1208 for_each_online_cpu(i) {
1238 int err; 1209 int err;
1239 struct device *dev = get_cpu_device(i); 1210 struct sys_device *sys_dev = get_cpu_sysdev(i);
1240 1211
1241 err = cache_add_dev(dev); 1212 err = cache_add_dev(sys_dev);
1242 if (err) 1213 if (err)
1243 return err; 1214 return err;
1244 } 1215 }
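
Both the old and new sides of this hunk rely on the deterministic cache parameters of CPUID leaf 4: find_num_cache_leaves() simply iterates until the type field is CACHE_TYPE_NULL, and cpuid4_cache_lookup_regs() decodes each leaf. The sketch below is a small user-space approximation of that enumeration, offered only as illustration; the bit-field layout follows the _cpuid4_leaf_eax/ebx/ecx unions referenced above and the program is not part of this patch.

/* Hedged user-space sketch of the CPUID leaf 4 walk used by
 * find_num_cache_leaves() and cpuid4_cache_lookup_regs(). */
#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;
	int index;

	for (index = 0; ; index++) {
		__cpuid_count(4, index, eax, ebx, ecx, edx);
		if ((eax & 0x1f) == 0)		/* type 0: no more cache levels */
			break;

		unsigned int level     = (eax >> 5) & 0x7;
		unsigned int line_size = (ebx & 0xfff) + 1;
		unsigned int parts     = ((ebx >> 12) & 0x3ff) + 1;
		unsigned int ways      = ((ebx >> 22) & 0x3ff) + 1;
		unsigned int sets      = ecx + 1;

		printf("leaf 4 index %d: L%u cache, %u KB\n", index, level,
		       ways * parts * line_size * sets / 1024);
	}
	printf("num_cache_leaves would be %d\n", index);
	return 0;
}
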
diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c
deleted file mode 100644
index 36565373af8..00000000000
--- a/arch/x86/kernel/cpu/match.c
+++ /dev/null
@@ -1,91 +0,0 @@
1#include <asm/cpu_device_id.h>
2#include <asm/processor.h>
3#include <linux/cpu.h>
4#include <linux/module.h>
5#include <linux/slab.h>
6
7/**
8 * x86_match_cpu - match current CPU against an array of x86_cpu_ids
9 * @match: Pointer to array of x86_cpu_ids. Last entry terminated with
10 * {}.
11 *
12 * Return the entry if the current CPU matches the entries in the
13 * passed x86_cpu_id match table. Otherwise NULL. The match table
14 * contains vendor (X86_VENDOR_*), family, model and feature bits or
15 * respective wildcard entries.
16 *
17 * A typical table entry would be to match a specific CPU
18 * { X86_VENDOR_INTEL, 6, 0x12 }
19 * or to match a specific CPU feature
20 * { X86_FEATURE_MATCH(X86_FEATURE_FOOBAR) }
21 *
22 * Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY,
23 * %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor)
24 *
25 * Arrays used to match for this should also be declared using
26 * MODULE_DEVICE_TABLE(x86cpu, ...)
27 *
28 * This always matches against the boot cpu, assuming models and features are
29 * consistent over all CPUs.
30 */
31const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match)
32{
33 const struct x86_cpu_id *m;
34 struct cpuinfo_x86 *c = &boot_cpu_data;
35
36 for (m = match; m->vendor | m->family | m->model | m->feature; m++) {
37 if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor)
38 continue;
39 if (m->family != X86_FAMILY_ANY && c->x86 != m->family)
40 continue;
41 if (m->model != X86_MODEL_ANY && c->x86_model != m->model)
42 continue;
43 if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature))
44 continue;
45 return m;
46 }
47 return NULL;
48}
49EXPORT_SYMBOL(x86_match_cpu);
50
51ssize_t arch_print_cpu_modalias(struct device *dev,
52 struct device_attribute *attr,
53 char *bufptr)
54{
55 int size = PAGE_SIZE;
56 int i, n;
57 char *buf = bufptr;
58
59 n = snprintf(buf, size, "x86cpu:vendor:%04X:family:%04X:"
60 "model:%04X:feature:",
61 boot_cpu_data.x86_vendor,
62 boot_cpu_data.x86,
63 boot_cpu_data.x86_model);
64 size -= n;
65 buf += n;
66 size -= 1;
67 for (i = 0; i < NCAPINTS*32; i++) {
68 if (boot_cpu_has(i)) {
69 n = snprintf(buf, size, ",%04X", i);
70 if (n >= size) {
71 WARN(1, "x86 features overflow page\n");
72 break;
73 }
74 size -= n;
75 buf += n;
76 }
77 }
78 *buf++ = '\n';
79 return buf - bufptr;
80}
81
82int arch_cpu_uevent(struct device *dev, struct kobj_uevent_env *env)
83{
84 char *buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
85 if (buf) {
86 arch_print_cpu_modalias(NULL, NULL, buf);
87 add_uevent_var(env, "MODALIAS=%s", buf);
88 kfree(buf);
89 }
90 return 0;
91}
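
The kernel-doc in the file deleted above spells out how x86_match_cpu() is meant to be consumed: a driver declares a {}-terminated table of x86_cpu_id entries, exports it with MODULE_DEVICE_TABLE(x86cpu, ...) and checks it at init time. A minimal hypothetical consumer is sketched below for reference; the family/model/feature values and the example_driver name are illustrative and do not come from this patch.

/* Hedged sketch of a consumer of the x86_match_cpu() interface being
 * removed in this hunk; all names and values here are illustrative. */
#include <asm/cpu_device_id.h>
#include <linux/module.h>

static const struct x86_cpu_id example_cpu_ids[] = {
	{ X86_VENDOR_INTEL, 6, 0x1a },			/* one specific model  */
	{ X86_FEATURE_MATCH(X86_FEATURE_AES) },		/* or any CPU with AES */
	{}						/* terminating entry   */
};
MODULE_DEVICE_TABLE(x86cpu, example_cpu_ids);

static int __init example_driver_init(void)
{
	if (!x86_match_cpu(example_cpu_ids))
		return -ENODEV;
	/* ... driver setup ... */
	return 0;
}
module_init(example_driver_init);
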
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
index cd8b166a173..83930deec3c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -28,7 +28,6 @@
28 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 28 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
29 */ 29 */
30 30
31#include <linux/export.h>
32#include <linux/kernel.h> 31#include <linux/kernel.h>
33#include <linux/acpi.h> 32#include <linux/acpi.h>
34#include <linux/cper.h> 33#include <linux/cper.h>
@@ -42,8 +41,7 @@ void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err)
42 struct mce m; 41 struct mce m;
43 42
44 /* Only corrected MC is reported */ 43 /* Only corrected MC is reported */
45 if (!corrected || !(mem_err->validation_bits & 44 if (!corrected)
46 CPER_MEM_VALID_PHYSICAL_ADDRESS))
47 return; 45 return;
48 46
49 mce_setup(&m); 47 mce_setup(&m);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index ddc72f83933..0ed633c5048 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -17,7 +17,6 @@
17#include <linux/kernel.h> 17#include <linux/kernel.h>
18#include <linux/string.h> 18#include <linux/string.h>
19#include <linux/fs.h> 19#include <linux/fs.h>
20#include <linux/preempt.h>
21#include <linux/smp.h> 20#include <linux/smp.h>
22#include <linux/notifier.h> 21#include <linux/notifier.h>
23#include <linux/kdebug.h> 22#include <linux/kdebug.h>
@@ -78,33 +77,27 @@ static void raise_exception(struct mce *m, struct pt_regs *pregs)
78} 77}
79 78
80static cpumask_var_t mce_inject_cpumask; 79static cpumask_var_t mce_inject_cpumask;
81static DEFINE_MUTEX(mce_inject_mutex);
82 80
83static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs) 81static int mce_raise_notify(struct notifier_block *self,
82 unsigned long val, void *data)
84{ 83{
84 struct die_args *args = (struct die_args *)data;
85 int cpu = smp_processor_id(); 85 int cpu = smp_processor_id();
86 struct mce *m = &__get_cpu_var(injectm); 86 struct mce *m = &__get_cpu_var(injectm);
87 if (!cpumask_test_cpu(cpu, mce_inject_cpumask)) 87 if (val != DIE_NMI || !cpumask_test_cpu(cpu, mce_inject_cpumask))
88 return NMI_DONE; 88 return NOTIFY_DONE;
89 cpumask_clear_cpu(cpu, mce_inject_cpumask); 89 cpumask_clear_cpu(cpu, mce_inject_cpumask);
90 if (m->inject_flags & MCJ_EXCEPTION) 90 if (m->inject_flags & MCJ_EXCEPTION)
91 raise_exception(m, regs); 91 raise_exception(m, args->regs);
92 else if (m->status) 92 else if (m->status)
93 raise_poll(m); 93 raise_poll(m);
94 return NMI_HANDLED; 94 return NOTIFY_STOP;
95} 95}
96 96
97static void mce_irq_ipi(void *info) 97static struct notifier_block mce_raise_nb = {
98{ 98 .notifier_call = mce_raise_notify,
99 int cpu = smp_processor_id(); 99 .priority = NMI_LOCAL_NORMAL_PRIOR,
100 struct mce *m = &__get_cpu_var(injectm); 100};
101
102 if (cpumask_test_cpu(cpu, mce_inject_cpumask) &&
103 m->inject_flags & MCJ_EXCEPTION) {
104 cpumask_clear_cpu(cpu, mce_inject_cpumask);
105 raise_exception(m, NULL);
106 }
107}
108 101
109/* Inject mce on current CPU */ 102/* Inject mce on current CPU */
110static int raise_local(void) 103static int raise_local(void)
@@ -153,10 +146,9 @@ static void raise_mce(struct mce *m)
153 return; 146 return;
154 147
155#ifdef CONFIG_X86_LOCAL_APIC 148#ifdef CONFIG_X86_LOCAL_APIC
156 if (m->inject_flags & (MCJ_IRQ_BRAODCAST | MCJ_NMI_BROADCAST)) { 149 if (m->inject_flags & MCJ_NMI_BROADCAST) {
157 unsigned long start; 150 unsigned long start;
158 int cpu; 151 int cpu;
159
160 get_online_cpus(); 152 get_online_cpus();
161 cpumask_copy(mce_inject_cpumask, cpu_online_mask); 153 cpumask_copy(mce_inject_cpumask, cpu_online_mask);
162 cpumask_clear_cpu(get_cpu(), mce_inject_cpumask); 154 cpumask_clear_cpu(get_cpu(), mce_inject_cpumask);
@@ -166,25 +158,13 @@ static void raise_mce(struct mce *m)
166 MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM) 158 MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM)
167 cpumask_clear_cpu(cpu, mce_inject_cpumask); 159 cpumask_clear_cpu(cpu, mce_inject_cpumask);
168 } 160 }
169 if (!cpumask_empty(mce_inject_cpumask)) { 161 if (!cpumask_empty(mce_inject_cpumask))
170 if (m->inject_flags & MCJ_IRQ_BRAODCAST) { 162 apic->send_IPI_mask(mce_inject_cpumask, NMI_VECTOR);
171 /*
172 * don't wait because mce_irq_ipi is necessary
173 * to be sync with following raise_local
174 */
175 preempt_disable();
176 smp_call_function_many(mce_inject_cpumask,
177 mce_irq_ipi, NULL, 0);
178 preempt_enable();
179 } else if (m->inject_flags & MCJ_NMI_BROADCAST)
180 apic->send_IPI_mask(mce_inject_cpumask,
181 NMI_VECTOR);
182 }
183 start = jiffies; 163 start = jiffies;
184 while (!cpumask_empty(mce_inject_cpumask)) { 164 while (!cpumask_empty(mce_inject_cpumask)) {
185 if (!time_before(jiffies, start + 2*HZ)) { 165 if (!time_before(jiffies, start + 2*HZ)) {
186 printk(KERN_ERR 166 printk(KERN_ERR
187 "Timeout waiting for mce inject %lx\n", 167 "Timeout waiting for mce inject NMI %lx\n",
188 *cpumask_bits(mce_inject_cpumask)); 168 *cpumask_bits(mce_inject_cpumask));
189 break; 169 break;
190 } 170 }
@@ -195,11 +175,7 @@ static void raise_mce(struct mce *m)
195 put_online_cpus(); 175 put_online_cpus();
196 } else 176 } else
197#endif 177#endif
198 {
199 preempt_disable();
200 raise_local(); 178 raise_local();
201 preempt_enable();
202 }
203} 179}
204 180
205/* Error injection interface */ 181/* Error injection interface */
@@ -230,10 +206,7 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf,
230 * so do it a jiffie or two later everywhere. 206 * so do it a jiffie or two later everywhere.
231 */ 207 */
232 schedule_timeout(2); 208 schedule_timeout(2);
233
234 mutex_lock(&mce_inject_mutex);
235 raise_mce(&m); 209 raise_mce(&m);
236 mutex_unlock(&mce_inject_mutex);
237 return usize; 210 return usize;
238} 211}
239 212
@@ -242,9 +215,8 @@ static int inject_init(void)
242 if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL)) 215 if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL))
243 return -ENOMEM; 216 return -ENOMEM;
244 printk(KERN_INFO "Machine check injector initialized\n"); 217 printk(KERN_INFO "Machine check injector initialized\n");
245 register_mce_write_callback(mce_write); 218 mce_chrdev_ops.write = mce_write;
246 register_nmi_handler(NMI_LOCAL, mce_raise_notify, 0, 219 register_die_notifier(&mce_raise_nb);
247 "mce_notify");
248 return 0; 220 return 0;
249} 221}
250 222
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 5b7d4fa5d3b..fefcc69ee8b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -1,4 +1,4 @@
1#include <linux/device.h> 1#include <linux/sysdev.h>
2#include <asm/mce.h> 2#include <asm/mce.h>
3 3
4enum severity_level { 4enum severity_level {
@@ -17,26 +17,16 @@ enum severity_level {
17struct mce_bank { 17struct mce_bank {
18 u64 ctl; /* subevents to enable */ 18 u64 ctl; /* subevents to enable */
19 unsigned char init; /* initialise bank? */ 19 unsigned char init; /* initialise bank? */
20 struct device_attribute attr; /* device attribute */ 20 struct sysdev_attribute attr; /* sysdev attribute */
21 char attrname[ATTR_LEN]; /* attribute name */ 21 char attrname[ATTR_LEN]; /* attribute name */
22}; 22};
23 23
24int mce_severity(struct mce *a, int tolerant, char **msg); 24int mce_severity(struct mce *a, int tolerant, char **msg);
25struct dentry *mce_get_debugfs_dir(void); 25struct dentry *mce_get_debugfs_dir(void);
26 26
27extern struct mce_bank *mce_banks; 27extern int mce_ser;
28 28
29#ifdef CONFIG_X86_MCE_INTEL 29extern struct mce_bank *mce_banks;
30unsigned long mce_intel_adjust_timer(unsigned long interval);
31void mce_intel_cmci_poll(void);
32void mce_intel_hcpu_update(unsigned long cpu);
33#else
34# define mce_intel_adjust_timer mce_adjust_timer_default
35static inline void mce_intel_cmci_poll(void) { }
36static inline void mce_intel_hcpu_update(unsigned long cpu) { }
37#endif
38
39void mce_timer_kick(unsigned long interval);
40 30
41#ifdef CONFIG_ACPI_APEI 31#ifdef CONFIG_ACPI_APEI
42int apei_write_mce(struct mce *m); 32int apei_write_mce(struct mce *m);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index beb1f1689e5..7395d5f4272 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -54,7 +54,7 @@ static struct severity {
54#define MASK(x, y) .mask = x, .result = y 54#define MASK(x, y) .mask = x, .result = y
55#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) 55#define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S)
56#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) 56#define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR)
57#define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV) 57#define MCACOD 0xffff
58 58
59 MCESEV( 59 MCESEV(
60 NO, "Invalid", 60 NO, "Invalid",
@@ -102,34 +102,11 @@ static struct severity {
102 SER, BITCLR(MCI_STATUS_S) 102 SER, BITCLR(MCI_STATUS_S)
103 ), 103 ),
104 104
105 /* AR add known MCACODs here */
105 MCESEV( 106 MCESEV(
106 PANIC, "Action required with lost events", 107 PANIC, "Action required with lost events",
107 SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR) 108 SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR)
108 ), 109 ),
109
110 /* known AR MCACODs: */
111#ifdef CONFIG_MEMORY_FAILURE
112 MCESEV(
113 KEEP, "HT thread notices Action required: data load error",
114 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
115 MCGMASK(MCG_STATUS_EIPV, 0)
116 ),
117 MCESEV(
118 AR, "Action required: data load error",
119 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA),
120 USER
121 ),
122 MCESEV(
123 KEEP, "HT thread notices Action required: instruction fetch error",
124 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
125 MCGMASK(MCG_STATUS_EIPV, 0)
126 ),
127 MCESEV(
128 AR, "Action required: instruction fetch error",
129 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR),
130 USER
131 ),
132#endif
133 MCESEV( 110 MCESEV(
134 PANIC, "Action required: unknown MCACOD", 111 PANIC, "Action required: unknown MCACOD",
135 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR) 112 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR)
@@ -138,11 +115,11 @@ static struct severity {
138 /* known AO MCACODs: */ 115 /* known AO MCACODs: */
139 MCESEV( 116 MCESEV(
140 AO, "Action optional: memory scrubbing error", 117 AO, "Action optional: memory scrubbing error",
141 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, MCI_UC_S|MCACOD_SCRUB) 118 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|0xfff0, MCI_UC_S|0x00c0)
142 ), 119 ),
143 MCESEV( 120 MCESEV(
144 AO, "Action optional: last level cache writeback error", 121 AO, "Action optional: last level cache writeback error",
145 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|MCACOD_L3WB) 122 SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|0x017a)
146 ), 123 ),
147 MCESEV( 124 MCESEV(
148 SOME, "Action optional: unknown MCACOD", 125 SOME, "Action optional: unknown MCACOD",
@@ -168,19 +145,15 @@ static struct severity {
168}; 145};
169 146
170/* 147/*
171 * If mcgstatus indicated that ip/cs on the stack were 148 * If the EIPV bit is set, it means the saved IP is the
172 * no good, then "m->cs" will be zero and we will have 149 * instruction which caused the MCE.
173 * to assume the worst case (IN_KERNEL) as we actually
174 * have no idea what we were executing when the machine
175 * check hit.
176 * If we do have a good "m->cs" (or a faked one in the
177 * case we were executing in VM86 mode) we can use it to
178 * distinguish an exception taken in user from one
179 * taken in the kernel.
180 */ 150 */
181static int error_context(struct mce *m) 151static int error_context(struct mce *m)
182{ 152{
183 return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL; 153 if (m->mcgstatus & MCG_STATUS_EIPV)
154 return (m->ip && (m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
155 /* Unknown, assume kernel */
156 return IN_KERNEL;
184} 157}
185 158
186int mce_severity(struct mce *m, int tolerant, char **msg) 159int mce_severity(struct mce *m, int tolerant, char **msg)
@@ -193,9 +166,9 @@ int mce_severity(struct mce *m, int tolerant, char **msg)
193 continue; 166 continue;
194 if ((m->mcgstatus & s->mcgmask) != s->mcgres) 167 if ((m->mcgstatus & s->mcgmask) != s->mcgres)
195 continue; 168 continue;
196 if (s->ser == SER_REQUIRED && !mca_cfg.ser) 169 if (s->ser == SER_REQUIRED && !mce_ser)
197 continue; 170 continue;
198 if (s->ser == NO_SER && mca_cfg.ser) 171 if (s->ser == NO_SER && mce_ser)
199 continue; 172 continue;
200 if (s->context && ctx != s->context) 173 if (s->context && ctx != s->context)
201 continue; 174 continue;
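
The severity table above is evaluated as a first-match rule list: every MCESEV() entry contributes mask/result pairs (built with MASK(), BITSET() and BITCLR()) that mce_severity() compares against the status and MCG status words, taking the first rule that matches. The stand-alone sketch below mimics that matching style under assumed, simplified bit positions; the messages and bit values are illustrative, not the real MCi_STATUS encodings.

/* Hedged stand-alone sketch of first-match severity classification in
 * the style of the MCESEV() table above; bit positions are illustrative. */
#include <stdint.h>
#include <stdio.h>

struct severity_rule {
	uint64_t mask;		/* which status bits to look at */
	uint64_t result;	/* required value of those bits */
	const char *verdict;
};

#define BIT64(n) (1ULL << (n))

static const struct severity_rule table[] = {
	{ BIT64(63), 0,         "NO: not valid"         },	/* valid bit clear */
	{ BIT64(61), BIT64(61), "PANIC: uncorrected"    },	/* UC-style bit set */
	{ 0,         0,         "SOME: corrected error" },	/* catch-all */
};

static const char *classify(uint64_t status)
{
	unsigned int i;

	for (i = 0; ; i++)	/* the catch-all rule guarantees a match */
		if ((status & table[i].mask) == table[i].result)
			return table[i].verdict;
}

int main(void)
{
	printf("%s\n", classify(BIT64(63) | BIT64(61)));	/* PANIC */
	printf("%s\n", classify(BIT64(63)));			/* SOME  */
	printf("%s\n", classify(0));				/* NO    */
	return 0;
}
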
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 80dbda84f1c..08363b04212 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -7,9 +7,6 @@
7 * Copyright 2008 Intel Corporation 7 * Copyright 2008 Intel Corporation
8 * Author: Andi Kleen 8 * Author: Andi Kleen
9 */ 9 */
10
11#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
12
13#include <linux/thread_info.h> 10#include <linux/thread_info.h>
14#include <linux/capability.h> 11#include <linux/capability.h>
15#include <linux/miscdevice.h> 12#include <linux/miscdevice.h>
@@ -22,7 +19,7 @@
22#include <linux/kernel.h> 19#include <linux/kernel.h>
23#include <linux/percpu.h> 20#include <linux/percpu.h>
24#include <linux/string.h> 21#include <linux/string.h>
25#include <linux/device.h> 22#include <linux/sysdev.h>
26#include <linux/syscore_ops.h> 23#include <linux/syscore_ops.h>
27#include <linux/delay.h> 24#include <linux/delay.h>
28#include <linux/ctype.h> 25#include <linux/ctype.h>
@@ -39,8 +36,8 @@
39#include <linux/fs.h> 36#include <linux/fs.h>
40#include <linux/mm.h> 37#include <linux/mm.h>
41#include <linux/debugfs.h> 38#include <linux/debugfs.h>
39#include <linux/edac_mce.h>
42#include <linux/irq_work.h> 40#include <linux/irq_work.h>
43#include <linux/export.h>
44 41
45#include <asm/processor.h> 42#include <asm/processor.h>
46#include <asm/mce.h> 43#include <asm/mce.h>
@@ -58,26 +55,35 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex);
58#define CREATE_TRACE_POINTS 55#define CREATE_TRACE_POINTS
59#include <trace/events/mce.h> 56#include <trace/events/mce.h>
60 57
58int mce_disabled __read_mostly;
59
60#define MISC_MCELOG_MINOR 227
61
61#define SPINUNIT 100 /* 100ns */ 62#define SPINUNIT 100 /* 100ns */
62 63
63atomic_t mce_entry; 64atomic_t mce_entry;
64 65
65DEFINE_PER_CPU(unsigned, mce_exception_count); 66DEFINE_PER_CPU(unsigned, mce_exception_count);
66 67
67struct mce_bank *mce_banks __read_mostly; 68/*
68 69 * Tolerant levels:
69struct mca_config mca_cfg __read_mostly = { 70 * 0: always panic on uncorrected errors, log corrected errors
70 .bootlog = -1, 71 * 1: panic or SIGBUS on uncorrected errors, log corrected errors
71 /* 72 * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors
72 * Tolerant levels: 73 * 3: never panic or SIGBUS, log all errors (for testing only)
73 * 0: always panic on uncorrected errors, log corrected errors 74 */
74 * 1: panic or SIGBUS on uncorrected errors, log corrected errors 75static int tolerant __read_mostly = 1;
75 * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors 76static int banks __read_mostly;
76 * 3: never panic or SIGBUS, log all errors (for testing only) 77static int rip_msr __read_mostly;
77 */ 78static int mce_bootlog __read_mostly = -1;
78 .tolerant = 1, 79static int monarch_timeout __read_mostly = -1;
79 .monarch_timeout = -1 80static int mce_panic_timeout __read_mostly;
80}; 81static int mce_dont_log_ce __read_mostly;
82int mce_cmci_disabled __read_mostly;
83int mce_ignore_ce __read_mostly;
84int mce_ser __read_mostly;
85
86struct mce_bank *mce_banks __read_mostly;
81 87
82/* User mode helper program triggered by machine check event */ 88/* User mode helper program triggered by machine check event */
83static unsigned long mce_need_notify; 89static unsigned long mce_need_notify;
@@ -89,6 +95,13 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
89static DEFINE_PER_CPU(struct mce, mces_seen); 95static DEFINE_PER_CPU(struct mce, mces_seen);
90static int cpu_missing; 96static int cpu_missing;
91 97
98/*
99 * CPU/chipset specific EDAC code can register a notifier call here to print
100 * MCE errors in a human-readable form.
101 */
102ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
103EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
104
92/* MCA banks polled by the period polling timer for corrected events */ 105/* MCA banks polled by the period polling timer for corrected events */
93DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { 106DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
94 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL 107 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
@@ -96,14 +109,6 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
96 109
97static DEFINE_PER_CPU(struct work_struct, mce_work); 110static DEFINE_PER_CPU(struct work_struct, mce_work);
98 111
99static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
100
101/*
102 * CPU/chipset specific EDAC code can register a notifier call here to print
103 * MCE errors in a human-readable form.
104 */
105ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
106
107/* Do initial initialization of a struct mce */ 112/* Do initial initialization of a struct mce */
108void mce_setup(struct mce *m) 113void mce_setup(struct mce *m)
109{ 114{
@@ -114,7 +119,9 @@ void mce_setup(struct mce *m)
114 m->time = get_seconds(); 119 m->time = get_seconds();
115 m->cpuvendor = boot_cpu_data.x86_vendor; 120 m->cpuvendor = boot_cpu_data.x86_vendor;
116 m->cpuid = cpuid_eax(1); 121 m->cpuid = cpuid_eax(1);
122#ifdef CONFIG_SMP
117 m->socketid = cpu_data(m->extcpu).phys_proc_id; 123 m->socketid = cpu_data(m->extcpu).phys_proc_id;
124#endif
118 m->apicid = cpu_data(m->extcpu).initial_apicid; 125 m->apicid = cpu_data(m->extcpu).initial_apicid;
119 rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); 126 rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
120} 127}
@@ -137,20 +144,23 @@ static struct mce_log mcelog = {
137void mce_log(struct mce *mce) 144void mce_log(struct mce *mce)
138{ 145{
139 unsigned next, entry; 146 unsigned next, entry;
140 int ret = 0;
141 147
142 /* Emit the trace record: */ 148 /* Emit the trace record: */
143 trace_mce_record(mce); 149 trace_mce_record(mce);
144 150
145 ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
146 if (ret == NOTIFY_STOP)
147 return;
148
149 mce->finished = 0; 151 mce->finished = 0;
150 wmb(); 152 wmb();
151 for (;;) { 153 for (;;) {
152 entry = rcu_dereference_check_mce(mcelog.next); 154 entry = rcu_dereference_check_mce(mcelog.next);
153 for (;;) { 155 for (;;) {
156 /*
157 * If edac_mce is enabled, it will check the error type
158 * and will process it, if it is a known error.
159 * Otherwise, the error will be sent through mcelog
160 * interface
161 */
162 if (edac_mce_parse(mce))
163 return;
154 164
155 /* 165 /*
156 * When the buffer fills up discard new entries. 166 * When the buffer fills up discard new entries.
@@ -183,57 +193,6 @@ void mce_log(struct mce *mce)
183 set_bit(0, &mce_need_notify); 193 set_bit(0, &mce_need_notify);
184} 194}
185 195
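Editor's note: mce_log() above claims a slot in the fixed-size mcelog buffer with a cmpxchg loop on mcelog.next and simply drops new records once the buffer is full. A compressed userspace model of that reservation scheme using C11 atomics follows; the buffer size and field names are stand-ins.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define LOG_LEN 32                       /* MCE_LOG_LEN stand-in */

struct entry { unsigned long long status; int finished; };

static struct entry log_buf[LOG_LEN];
static atomic_uint log_next;             /* index of the next free slot */

static bool log_event(unsigned long long status)
{
    unsigned int slot;

    do {
        slot = atomic_load(&log_next);
        if (slot >= LOG_LEN)
            return false;                /* buffer full: drop the record */
    } while (!atomic_compare_exchange_weak(&log_next, &slot, slot + 1));

    log_buf[slot].status = status;
    atomic_thread_fence(memory_order_release);
    log_buf[slot].finished = 1;          /* readers wait for this flag */
    return true;
}

int main(void)
{
    for (int i = 0; i < 40; i++)
        if (!log_event(0x8000000000000000ull | i))
            printf("dropped record %d\n", i);
    return 0;
}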
186static void drain_mcelog_buffer(void)
187{
188 unsigned int next, i, prev = 0;
189
190 next = ACCESS_ONCE(mcelog.next);
191
192 do {
193 struct mce *m;
194
195 /* drain what was logged during boot */
196 for (i = prev; i < next; i++) {
197 unsigned long start = jiffies;
198 unsigned retries = 1;
199
200 m = &mcelog.entry[i];
201
202 while (!m->finished) {
203 if (time_after_eq(jiffies, start + 2*retries))
204 retries++;
205
206 cpu_relax();
207
208 if (!m->finished && retries >= 4) {
209 pr_err("skipping error being logged currently!\n");
210 break;
211 }
212 }
213 smp_rmb();
214 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
215 }
216
217 memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m));
218 prev = next;
219 next = cmpxchg(&mcelog.next, prev, 0);
220 } while (next != prev);
221}
222
223
224void mce_register_decode_chain(struct notifier_block *nb)
225{
226 atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
227 drain_mcelog_buffer();
228}
229EXPORT_SYMBOL_GPL(mce_register_decode_chain);
230
231void mce_unregister_decode_chain(struct notifier_block *nb)
232{
233 atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
234}
235EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
236
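Editor's note: drain_mcelog_buffer(), removed on the left-hand side, replays records logged before any decoder registered; it briefly waits for a producer to mark an entry finished and skips it after a few retries. A hedged sketch of that bounded wait follows; the sleep interval and retry limit are arbitrary here.

#include <stdio.h>
#include <time.h>

struct entry { int finished; unsigned long long status; };

/* Wait a short while for an in-flight record to be completed by its
 * producer; give up and skip it if it never becomes ready. */
static int wait_for_entry(volatile struct entry *e, int max_tries)
{
    struct timespec ts = { .tv_sec = 0, .tv_nsec = 1000000 };   /* 1 ms */

    for (int tries = 0; !e->finished; tries++) {
        if (tries >= max_tries) {
            fprintf(stderr, "skipping record still being written\n");
            return 0;
        }
        nanosleep(&ts, NULL);
    }
    return 1;
}

int main(void)
{
    struct entry done  = { .finished = 1, .status = 1 };
    struct entry stuck = { .finished = 0 };

    printf("%d %d\n", wait_for_entry(&done, 4), wait_for_entry(&stuck, 4));
    return 0;
}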
237static void print_mce(struct mce *m) 196static void print_mce(struct mce *m)
238{ 197{
239 int ret = 0; 198 int ret = 0;
@@ -258,13 +217,8 @@ static void print_mce(struct mce *m)
258 pr_cont("MISC %llx ", m->misc); 217 pr_cont("MISC %llx ", m->misc);
259 218
260 pr_cont("\n"); 219 pr_cont("\n");
261 /* 220 pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
262 * Note this output is parsed by external tools and old fields 221 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid);
263 * should not be changed.
264 */
265 pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n",
266 m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid,
267 cpu_data(m->extcpu).microcode);
268 222
269 /* 223 /*
270 * Print out human-readable details about the MCE error, 224 * Print out human-readable details about the MCE error,
@@ -294,7 +248,7 @@ static void wait_for_panic(void)
294 while (timeout-- > 0) 248 while (timeout-- > 0)
295 udelay(1); 249 udelay(1);
296 if (panic_timeout == 0) 250 if (panic_timeout == 0)
297 panic_timeout = mca_cfg.panic_timeout; 251 panic_timeout = mce_panic_timeout;
298 panic("Panicing machine check CPU died"); 252 panic("Panicing machine check CPU died");
299} 253}
300 254
@@ -352,7 +306,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
352 pr_emerg(HW_ERR "Machine check: %s\n", exp); 306 pr_emerg(HW_ERR "Machine check: %s\n", exp);
353 if (!fake_panic) { 307 if (!fake_panic) {
354 if (panic_timeout == 0) 308 if (panic_timeout == 0)
355 panic_timeout = mca_cfg.panic_timeout; 309 panic_timeout = mce_panic_timeout;
356 panic(msg); 310 panic(msg);
357 } else 311 } else
358 pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); 312 pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
@@ -364,7 +318,7 @@ static int msr_to_offset(u32 msr)
364{ 318{
365 unsigned bank = __this_cpu_read(injectm.bank); 319 unsigned bank = __this_cpu_read(injectm.bank);
366 320
367 if (msr == mca_cfg.rip_msr) 321 if (msr == rip_msr)
368 return offsetof(struct mce, ip); 322 return offsetof(struct mce, ip);
369 if (msr == MSR_IA32_MCx_STATUS(bank)) 323 if (msr == MSR_IA32_MCx_STATUS(bank))
370 return offsetof(struct mce, status); 324 return offsetof(struct mce, status);
@@ -433,18 +387,10 @@ static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
433 if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) { 387 if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) {
434 m->ip = regs->ip; 388 m->ip = regs->ip;
435 m->cs = regs->cs; 389 m->cs = regs->cs;
436
437 /*
438 * When in VM86 mode make the cs look like ring 3
439 * always. This is a lie, but it's better than passing
440 * the additional vm86 bit around everywhere.
441 */
442 if (v8086_mode(regs))
443 m->cs |= 3;
444 } 390 }
445 /* Use accurate RIP reporting if available. */ 391 /* Use accurate RIP reporting if available. */
446 if (mca_cfg.rip_msr) 392 if (rip_msr)
447 m->ip = mce_rdmsrl(mca_cfg.rip_msr); 393 m->ip = mce_rdmsrl(rip_msr);
448 } 394 }
449} 395}
450 396
@@ -505,7 +451,7 @@ static int mce_ring_add(unsigned long pfn)
505 451
506int mce_available(struct cpuinfo_x86 *c) 452int mce_available(struct cpuinfo_x86 *c)
507{ 453{
508 if (mca_cfg.disabled) 454 if (mce_disabled)
509 return 0; 455 return 0;
510 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); 456 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
511} 457}
@@ -544,27 +490,6 @@ static void mce_report_event(struct pt_regs *regs)
544 irq_work_queue(&__get_cpu_var(mce_irq_work)); 490 irq_work_queue(&__get_cpu_var(mce_irq_work));
545} 491}
546 492
547/*
548 * Read ADDR and MISC registers.
549 */
550static void mce_read_aux(struct mce *m, int i)
551{
552 if (m->status & MCI_STATUS_MISCV)
553 m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
554 if (m->status & MCI_STATUS_ADDRV) {
555 m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
556
557 /*
558 * Mask the reported address by the reported granularity.
559 */
560 if (mca_cfg.ser && (m->status & MCI_STATUS_MISCV)) {
561 u8 shift = MCI_MISC_ADDR_LSB(m->misc);
562 m->addr >>= shift;
563 m->addr <<= shift;
564 }
565 }
566}
567
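Editor's note: the removed mce_read_aux() masks the reported address down to the granularity advertised in MCi_MISC, since bits below the reported LSB are not meaningful. A small standalone illustration of that masking follows; treating the low 6 bits of MISC as the LSB field (as the kernel's MCI_MISC_ADDR_LSB macro does) is an assumption of the sketch.

#include <stdio.h>
#include <stdint.h>

static uint64_t mask_addr(uint64_t addr, uint64_t misc)
{
    uint8_t shift = misc & 0x3f;   /* assumed LSB field: bits 5:0 of MISC */

    addr >>= shift;                /* drop bits below the granularity */
    addr <<= shift;
    return addr;
}

int main(void)
{
    /* Reported granularity of 12 bits -> address rounded to a 4K page. */
    printf("%#llx\n", (unsigned long long)mask_addr(0x12345678, 12));
    return 0;
}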
568DEFINE_PER_CPU(unsigned, mce_poll_count); 493DEFINE_PER_CPU(unsigned, mce_poll_count);
569 494
570/* 495/*
@@ -587,11 +512,11 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
587 struct mce m; 512 struct mce m;
588 int i; 513 int i;
589 514
590 this_cpu_inc(mce_poll_count); 515 percpu_inc(mce_poll_count);
591 516
592 mce_gather_info(&m, NULL); 517 mce_gather_info(&m, NULL);
593 518
594 for (i = 0; i < mca_cfg.banks; i++) { 519 for (i = 0; i < banks; i++) {
595 if (!mce_banks[i].ctl || !test_bit(i, *b)) 520 if (!mce_banks[i].ctl || !test_bit(i, *b))
596 continue; 521 continue;
597 522
@@ -612,10 +537,13 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
612 * TBD do the same check for MCI_STATUS_EN here? 537 * TBD do the same check for MCI_STATUS_EN here?
613 */ 538 */
614 if (!(flags & MCP_UC) && 539 if (!(flags & MCP_UC) &&
615 (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC))) 540 (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)))
616 continue; 541 continue;
617 542
618 mce_read_aux(&m, i); 543 if (m.status & MCI_STATUS_MISCV)
544 m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
545 if (m.status & MCI_STATUS_ADDRV)
546 m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
619 547
620 if (!(flags & MCP_TIMESTAMP)) 548 if (!(flags & MCP_TIMESTAMP))
621 m.tsc = 0; 549 m.tsc = 0;
@@ -623,8 +551,10 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
623 * Don't get the IP here because it's unlikely to 551 * Don't get the IP here because it's unlikely to
624 * have anything to do with the actual error location. 552 * have anything to do with the actual error location.
625 */ 553 */
626 if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) 554 if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
627 mce_log(&m); 555 mce_log(&m);
556 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m);
557 }
628 558
629 /* 559 /*
630 * Clear state for this bank. 560 * Clear state for this bank.
@@ -645,22 +575,16 @@ EXPORT_SYMBOL_GPL(machine_check_poll);
645 * Do a quick check if any of the events requires a panic. 575 * Do a quick check if any of the events requires a panic.
646 * This decides if we keep the events around or clear them. 576 * This decides if we keep the events around or clear them.
647 */ 577 */
648static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, 578static int mce_no_way_out(struct mce *m, char **msg)
649 struct pt_regs *regs)
650{ 579{
651 int i, ret = 0; 580 int i;
652 581
653 for (i = 0; i < mca_cfg.banks; i++) { 582 for (i = 0; i < banks; i++) {
654 m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); 583 m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
655 if (m->status & MCI_STATUS_VAL) { 584 if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY)
656 __set_bit(i, validp); 585 return 1;
657 if (quirk_no_way_out)
658 quirk_no_way_out(i, m, regs);
659 }
660 if (mce_severity(m, mca_cfg.tolerant, msg) >= MCE_PANIC_SEVERITY)
661 ret = 1;
662 } 586 }
663 return ret; 587 return 0;
664} 588}
665 589
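Editor's note: both versions of mce_no_way_out() walk every bank to decide whether any logged error already forces a panic (the version being removed additionally records which banks held valid data and applies a quirk hook). A compact "scan everything, remember the worst" model follows; the severity values and bank contents are invented.

#include <stdio.h>

#define N_BANKS   8
#define SEV_NONE  0
#define SEV_KEEP  1
#define SEV_PANIC 2                 /* invented ordering: worst wins */

static int bank_severity[N_BANKS] = { 0, 1, 0, 2, 0, 0, 1, 0 };

static int no_way_out(void)
{
    int worst = SEV_NONE;

    for (int i = 0; i < N_BANKS; i++)
        if (bank_severity[i] > worst)
            worst = bank_severity[i];

    return worst >= SEV_PANIC;      /* 1: nothing left to do but panic */
}

int main(void)
{
    printf("no_way_out=%d\n", no_way_out());
    return 0;
}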
666/* 590/*
@@ -688,11 +612,11 @@ static int mce_timed_out(u64 *t)
688 rmb(); 612 rmb();
689 if (atomic_read(&mce_paniced)) 613 if (atomic_read(&mce_paniced))
690 wait_for_panic(); 614 wait_for_panic();
691 if (!mca_cfg.monarch_timeout) 615 if (!monarch_timeout)
692 goto out; 616 goto out;
693 if ((s64)*t < SPINUNIT) { 617 if ((s64)*t < SPINUNIT) {
694 /* CHECKME: Make panic default for 1 too? */ 618 /* CHECKME: Make panic default for 1 too? */
695 if (mca_cfg.tolerant < 1) 619 if (tolerant < 1)
696 mce_panic("Timeout synchronizing machine check over CPUs", 620 mce_panic("Timeout synchronizing machine check over CPUs",
697 NULL, NULL); 621 NULL, NULL);
698 cpu_missing = 1; 622 cpu_missing = 1;
@@ -742,8 +666,7 @@ static void mce_reign(void)
742 * Grade the severity of the errors of all the CPUs. 666 * Grade the severity of the errors of all the CPUs.
743 */ 667 */
744 for_each_possible_cpu(cpu) { 668 for_each_possible_cpu(cpu) {
745 int severity = mce_severity(&per_cpu(mces_seen, cpu), 669 int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant,
746 mca_cfg.tolerant,
747 &nmsg); 670 &nmsg);
748 if (severity > global_worst) { 671 if (severity > global_worst) {
749 msg = nmsg; 672 msg = nmsg;
@@ -757,7 +680,7 @@ static void mce_reign(void)
757 * This dumps all the mces in the log buffer and stops the 680 * This dumps all the mces in the log buffer and stops the
758 * other CPUs. 681 * other CPUs.
759 */ 682 */
760 if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) 683 if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3)
761 mce_panic("Fatal Machine check", m, msg); 684 mce_panic("Fatal Machine check", m, msg);
762 685
763 /* 686 /*
@@ -770,7 +693,7 @@ static void mce_reign(void)
770 * No machine check event found. Must be some external 693 * No machine check event found. Must be some external
771 * source or one CPU is hung. Panic. 694 * source or one CPU is hung. Panic.
772 */ 695 */
773 if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3) 696 if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3)
774 mce_panic("Machine check from unknown source", NULL, NULL); 697 mce_panic("Machine check from unknown source", NULL, NULL);
775 698
776 /* 699 /*
@@ -794,7 +717,7 @@ static int mce_start(int *no_way_out)
794{ 717{
795 int order; 718 int order;
796 int cpus = num_online_cpus(); 719 int cpus = num_online_cpus();
797 u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC; 720 u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
798 721
799 if (!timeout) 722 if (!timeout)
800 return -1; 723 return -1;
@@ -858,7 +781,7 @@ static int mce_start(int *no_way_out)
858static int mce_end(int order) 781static int mce_end(int order)
859{ 782{
860 int ret = -1; 783 int ret = -1;
861 u64 timeout = (u64)mca_cfg.monarch_timeout * NSEC_PER_USEC; 784 u64 timeout = (u64)monarch_timeout * NSEC_PER_USEC;
862 785
863 if (!timeout) 786 if (!timeout)
864 goto reset; 787 goto reset;
@@ -939,58 +862,13 @@ static void mce_clear_state(unsigned long *toclear)
939{ 862{
940 int i; 863 int i;
941 864
942 for (i = 0; i < mca_cfg.banks; i++) { 865 for (i = 0; i < banks; i++) {
943 if (test_bit(i, toclear)) 866 if (test_bit(i, toclear))
944 mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); 867 mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
945 } 868 }
946} 869}
947 870
948/* 871/*
949 * Need to save faulting physical address associated with a process
950 * in the machine check handler some place where we can grab it back
951 * later in mce_notify_process()
952 */
953#define MCE_INFO_MAX 16
954
955struct mce_info {
956 atomic_t inuse;
957 struct task_struct *t;
958 __u64 paddr;
959 int restartable;
960} mce_info[MCE_INFO_MAX];
961
962static void mce_save_info(__u64 addr, int c)
963{
964 struct mce_info *mi;
965
966 for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) {
967 if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) {
968 mi->t = current;
969 mi->paddr = addr;
970 mi->restartable = c;
971 return;
972 }
973 }
974
975 mce_panic("Too many concurrent recoverable errors", NULL, NULL);
976}
977
978static struct mce_info *mce_find_info(void)
979{
980 struct mce_info *mi;
981
982 for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++)
983 if (atomic_read(&mi->inuse) && mi->t == current)
984 return mi;
985 return NULL;
986}
987
988static void mce_clear_info(struct mce_info *mi)
989{
990 atomic_set(&mi->inuse, 0);
991}
992
993/*
994 * The actual machine check handler. This only handles real 872 * The actual machine check handler. This only handles real
995 * exceptions when something got corrupted coming in through int 18. 873 * exceptions when something got corrupted coming in through int 18.
996 * 874 *
@@ -1004,7 +882,6 @@ static void mce_clear_info(struct mce_info *mi)
1004 */ 882 */
1005void do_machine_check(struct pt_regs *regs, long error_code) 883void do_machine_check(struct pt_regs *regs, long error_code)
1006{ 884{
1007 struct mca_config *cfg = &mca_cfg;
1008 struct mce m, *final; 885 struct mce m, *final;
1009 int i; 886 int i;
1010 int worst = 0; 887 int worst = 0;
@@ -1016,7 +893,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1016 int order; 893 int order;
1017 /* 894 /*
1018 * If no_way_out gets set, there is no safe way to recover from this 895 * If no_way_out gets set, there is no safe way to recover from this
1019 * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway. 896 * MCE. If tolerant is cranked up, we'll try anyway.
1020 */ 897 */
1021 int no_way_out = 0; 898 int no_way_out = 0;
1022 /* 899 /*
@@ -1025,14 +902,16 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1025 */ 902 */
1026 int kill_it = 0; 903 int kill_it = 0;
1027 DECLARE_BITMAP(toclear, MAX_NR_BANKS); 904 DECLARE_BITMAP(toclear, MAX_NR_BANKS);
1028 DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
1029 char *msg = "Unknown"; 905 char *msg = "Unknown";
1030 906
1031 atomic_inc(&mce_entry); 907 atomic_inc(&mce_entry);
1032 908
1033 this_cpu_inc(mce_exception_count); 909 percpu_inc(mce_exception_count);
1034 910
1035 if (!cfg->banks) 911 if (notify_die(DIE_NMI, "machine check", regs, error_code,
912 18, SIGKILL) == NOTIFY_STOP)
913 goto out;
914 if (!banks)
1036 goto out; 915 goto out;
1037 916
1038 mce_gather_info(&m, regs); 917 mce_gather_info(&m, regs);
@@ -1040,15 +919,12 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1040 final = &__get_cpu_var(mces_seen); 919 final = &__get_cpu_var(mces_seen);
1041 *final = m; 920 *final = m;
1042 921
1043 memset(valid_banks, 0, sizeof(valid_banks)); 922 no_way_out = mce_no_way_out(&m, &msg);
1044 no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs);
1045 923
1046 barrier(); 924 barrier();
1047 925
1048 /* 926 /*
1049 * When no restart IP might need to kill or panic. 927 * When no restart IP must always kill or panic.
1050 * Assume the worst for now, but if we find the
1051 * severity is MCE_AR_SEVERITY we have other options.
1052 */ 928 */
1053 if (!(m.mcgstatus & MCG_STATUS_RIPV)) 929 if (!(m.mcgstatus & MCG_STATUS_RIPV))
1054 kill_it = 1; 930 kill_it = 1;
@@ -1059,10 +935,8 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1059 * because the first one to see it will clear it. 935 * because the first one to see it will clear it.
1060 */ 936 */
1061 order = mce_start(&no_way_out); 937 order = mce_start(&no_way_out);
1062 for (i = 0; i < cfg->banks; i++) { 938 for (i = 0; i < banks; i++) {
1063 __clear_bit(i, toclear); 939 __clear_bit(i, toclear);
1064 if (!test_bit(i, valid_banks))
1065 continue;
1066 if (!mce_banks[i].ctl) 940 if (!mce_banks[i].ctl)
1067 continue; 941 continue;
1068 942
@@ -1078,7 +952,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1078 * Non uncorrected or non signaled errors are handled by 952 * Non uncorrected or non signaled errors are handled by
1079 * machine_check_poll. Leave them alone, unless this panics. 953 * machine_check_poll. Leave them alone, unless this panics.
1080 */ 954 */
1081 if (!(m.status & (cfg->ser ? MCI_STATUS_S : MCI_STATUS_UC)) && 955 if (!(m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC)) &&
1082 !no_way_out) 956 !no_way_out)
1083 continue; 957 continue;
1084 958
@@ -1087,7 +961,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1087 */ 961 */
1088 add_taint(TAINT_MACHINE_CHECK); 962 add_taint(TAINT_MACHINE_CHECK);
1089 963
1090 severity = mce_severity(&m, cfg->tolerant, NULL); 964 severity = mce_severity(&m, tolerant, NULL);
1091 965
1092 /* 966 /*
1093 * When machine check was for corrected handler don't touch, 967 * When machine check was for corrected handler don't touch,
@@ -1104,14 +978,23 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1104 continue; 978 continue;
1105 } 979 }
1106 980
1107 mce_read_aux(&m, i); 981 /*
982 * Kill on action required.
983 */
984 if (severity == MCE_AR_SEVERITY)
985 kill_it = 1;
986
987 if (m.status & MCI_STATUS_MISCV)
988 m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i));
989 if (m.status & MCI_STATUS_ADDRV)
990 m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i));
1108 991
1109 /* 992 /*
1110 * Action optional error. Queue address for later processing. 993 * Action optional error. Queue address for later processing.
1111 * When the ring overflows we just ignore the AO error. 994 * When the ring overflows we just ignore the AO error.
1112 * RED-PEN add some logging mechanism when 995 * RED-PEN add some logging mechanism when
1113 * usable_address or mce_add_ring fails. 996 * usable_address or mce_add_ring fails.
1114 * RED-PEN don't ignore overflow for mca_cfg.tolerant == 0 997 * RED-PEN don't ignore overflow for tolerant == 0
1115 */ 998 */
1116 if (severity == MCE_AO_SEVERITY && mce_usable_address(&m)) 999 if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
1117 mce_ring_add(m.addr >> PAGE_SHIFT); 1000 mce_ring_add(m.addr >> PAGE_SHIFT);
@@ -1124,9 +1007,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1124 } 1007 }
1125 } 1008 }
1126 1009
1127 /* mce_clear_state will clear *final, save locally for use later */
1128 m = *final;
1129
1130 if (!no_way_out) 1010 if (!no_way_out)
1131 mce_clear_state(toclear); 1011 mce_clear_state(toclear);
1132 1012
@@ -1138,22 +1018,27 @@ void do_machine_check(struct pt_regs *regs, long error_code)
1138 no_way_out = worst >= MCE_PANIC_SEVERITY; 1018 no_way_out = worst >= MCE_PANIC_SEVERITY;
1139 1019
1140 /* 1020 /*
1141 * At insane "tolerant" levels we take no action. Otherwise 1021 * If we have decided that we just CAN'T continue, and the user
1142 * we only die if we have no other choice. For less serious 1022 * has not set tolerant to an insane level, give up and die.
1143 * issues we try to recover, or limit damage to the current 1023 *
1144 * process. 1024 * This is mainly used in the case when the system doesn't
1025 * support MCE broadcasting or it has been disabled.
1145 */ 1026 */
1146 if (cfg->tolerant < 3) { 1027 if (no_way_out && tolerant < 3)
1147 if (no_way_out) 1028 mce_panic("Fatal machine check on current CPU", final, msg);
1148 mce_panic("Fatal machine check on current CPU", &m, msg); 1029
1149 if (worst == MCE_AR_SEVERITY) { 1030 /*
1150 /* schedule action before return to userland */ 1031 * If the error seems to be unrecoverable, something should be
1151 mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV); 1032 * done. Try to kill as little as possible. If we can kill just
1152 set_thread_flag(TIF_MCE_NOTIFY); 1033 * one task, do that. If the user has set the tolerance very
1153 } else if (kill_it) { 1034 * high, don't try to do anything at all.
1154 force_sig(SIGBUS, current); 1035 */
1155 } 1036
1156 } 1037 if (kill_it && tolerant < 3)
1038 force_sig(SIGBUS, current);
1039
1040 /* notify userspace ASAP */
1041 set_thread_flag(TIF_MCE_NOTIFY);
1157 1042
1158 if (worst > 0) 1043 if (worst > 0)
1159 mce_report_event(regs); 1044 mce_report_event(regs);
@@ -1164,66 +1049,34 @@ out:
1164} 1049}
1165EXPORT_SYMBOL_GPL(do_machine_check); 1050EXPORT_SYMBOL_GPL(do_machine_check);
1166 1051
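Editor's note: the right-hand version of do_machine_check() ends with a short decision ladder: panic when no_way_out is set and tolerant allows it, otherwise SIGBUS the current task when kill_it is set, and always flag TIF_MCE_NOTIFY so process context can finish the work. The sketch below isolates that ordering from the hardware details; the function and its stand-ins for mce_panic()/force_sig() are invented.

#include <stdio.h>
#include <stdlib.h>

static void finish_machine_check(int no_way_out, int kill_it, int tolerant)
{
    if (no_way_out && tolerant < 3) {
        fprintf(stderr, "fatal machine check: panic\n");
        exit(1);                                  /* mce_panic() stand-in */
    }
    if (kill_it && tolerant < 3)
        fprintf(stderr, "unrecoverable for this task: SIGBUS\n");

    fprintf(stderr, "notify userspace (TIF_MCE_NOTIFY equivalent)\n");
}

int main(void)
{
    finish_machine_check(0, 1, 1);                /* recoverable: kill one task */
    finish_machine_check(0, 0, 1);                /* corrected: just notify */
    return 0;
}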
1167#ifndef CONFIG_MEMORY_FAILURE 1052/* dummy to break dependency. actual code is in mm/memory-failure.c */
1168int memory_failure(unsigned long pfn, int vector, int flags) 1053void __attribute__((weak)) memory_failure(unsigned long pfn, int vector)
1169{ 1054{
1170 /* mce_severity() should not hand us an ACTION_REQUIRED error */ 1055 printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn);
1171 BUG_ON(flags & MF_ACTION_REQUIRED);
1172 pr_err("Uncorrected memory error in page 0x%lx ignored\n"
1173 "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
1174 pfn);
1175
1176 return 0;
1177} 1056}
1178#endif
1179 1057
1180/* 1058/*
1181 * Called in process context that interrupted by MCE and marked with 1059 * Called after mce notification in process context. This code
1182 * TIF_MCE_NOTIFY, just before returning to erroneous userland. 1060 * is allowed to sleep. Call the high level VM handler to process
1183 * This code is allowed to sleep. 1061 * any corrupted pages.
1184 * Attempt possible recovery such as calling the high level VM handler to 1062 * Assume that the work queue code only calls this one at a time
1185 * process any corrupted pages, and kill/signal current process if required. 1063 * per CPU.
1186 * Action required errors are handled here. 1064 * Note we don't disable preemption, so this code might run on the wrong
1065 * CPU. In this case the event is picked up by the scheduled work queue.
1066 * This is merely a fast path to expedite processing in some common
1067 * cases.
1187 */ 1068 */
1188void mce_notify_process(void) 1069void mce_notify_process(void)
1189{ 1070{
1190 unsigned long pfn; 1071 unsigned long pfn;
1191 struct mce_info *mi = mce_find_info(); 1072 mce_notify_irq();
1192 int flags = MF_ACTION_REQUIRED; 1073 while (mce_ring_get(&pfn))
1193 1074 memory_failure(pfn, MCE_VECTOR);
1194 if (!mi)
1195 mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
1196 pfn = mi->paddr >> PAGE_SHIFT;
1197
1198 clear_thread_flag(TIF_MCE_NOTIFY);
1199
1200 pr_err("Uncorrected hardware memory error in user-access at %llx",
1201 mi->paddr);
1202 /*
1203 * We must call memory_failure() here even if the current process is
1204 * doomed. We still need to mark the page as poisoned and alert any
1205 * other users of the page.
1206 */
1207 if (!mi->restartable)
1208 flags |= MF_MUST_KILL;
1209 if (memory_failure(pfn, MCE_VECTOR, flags) < 0) {
1210 pr_err("Memory error not recovered");
1211 force_sig(SIGBUS, current);
1212 }
1213 mce_clear_info(mi);
1214} 1075}
1215 1076
1216/*
1217 * Action optional processing happens here (picking up
1218 * from the list of faulting pages that do_machine_check()
1219 * placed into the "ring").
1220 */
1221static void mce_process_work(struct work_struct *dummy) 1077static void mce_process_work(struct work_struct *dummy)
1222{ 1078{
1223 unsigned long pfn; 1079 mce_notify_process();
1224
1225 while (mce_ring_get(&pfn))
1226 memory_failure(pfn, MCE_VECTOR, 0);
1227} 1080}
1228 1081
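Editor's note: on the right-hand side, mce_notify_process()/mce_process_work() drain a ring of page frame numbers captured at exception time and hand each one to memory_failure() from schedulable context. A toy single-producer/single-consumer version of that deferred drain follows; the ring and the handler are placeholders.

#include <stdio.h>

#define RING_SIZE 16
static unsigned long ring[RING_SIZE];
static unsigned int ring_head, ring_tail;          /* free-running counters */

static int ring_get(unsigned long *pfn)
{
    if (ring_tail == ring_head)
        return 0;
    *pfn = ring[ring_tail++ % RING_SIZE];
    return 1;
}

static void handle_bad_page(unsigned long pfn)     /* memory_failure() stand-in */
{
    printf("poisoning pfn %#lx\n", pfn);
}

static void process_work(void)
{
    unsigned long pfn;

    while (ring_get(&pfn))                         /* drain everything queued */
        handle_bad_page(pfn);
}

int main(void)
{
    ring[ring_head++ % RING_SIZE] = 0x1234;
    ring[ring_head++ % RING_SIZE] = 0x5678;
    process_work();
    return 0;
}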
1229#ifdef CONFIG_X86_MCE_INTEL 1082#ifdef CONFIG_X86_MCE_INTEL
@@ -1256,78 +1109,35 @@ void mce_log_therm_throt_event(__u64 status)
1256 * poller finds an MCE, poll 2x faster. When the poller finds no more 1109 * poller finds an MCE, poll 2x faster. When the poller finds no more
1257 * errors, poll 2x slower (up to check_interval seconds). 1110 * errors, poll 2x slower (up to check_interval seconds).
1258 */ 1111 */
1259static unsigned long check_interval = 5 * 60; /* 5 minutes */ 1112static int check_interval = 5 * 60; /* 5 minutes */
1260 1113
1261static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ 1114static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
1262static DEFINE_PER_CPU(struct timer_list, mce_timer); 1115static DEFINE_PER_CPU(struct timer_list, mce_timer);
1263 1116
1264static unsigned long mce_adjust_timer_default(unsigned long interval) 1117static void mce_start_timer(unsigned long data)
1265{ 1118{
1266 return interval; 1119 struct timer_list *t = &per_cpu(mce_timer, data);
1267} 1120 int *n;
1268
1269static unsigned long (*mce_adjust_timer)(unsigned long interval) =
1270 mce_adjust_timer_default;
1271
1272static void mce_timer_fn(unsigned long data)
1273{
1274 struct timer_list *t = &__get_cpu_var(mce_timer);
1275 unsigned long iv;
1276 1121
1277 WARN_ON(smp_processor_id() != data); 1122 WARN_ON(smp_processor_id() != data);
1278 1123
1279 if (mce_available(__this_cpu_ptr(&cpu_info))) { 1124 if (mce_available(__this_cpu_ptr(&cpu_info))) {
1280 machine_check_poll(MCP_TIMESTAMP, 1125 machine_check_poll(MCP_TIMESTAMP,
1281 &__get_cpu_var(mce_poll_banks)); 1126 &__get_cpu_var(mce_poll_banks));
1282 mce_intel_cmci_poll();
1283 } 1127 }
1284 1128
1285 /* 1129 /*
1286 * Alert userspace if needed. If we logged an MCE, reduce the 1130 * Alert userspace if needed. If we logged an MCE, reduce the
1287 * polling interval, otherwise increase the polling interval. 1131 * polling interval, otherwise increase the polling interval.
1288 */ 1132 */
1289 iv = __this_cpu_read(mce_next_interval); 1133 n = &__get_cpu_var(mce_next_interval);
1290 if (mce_notify_irq()) { 1134 if (mce_notify_irq())
1291 iv = max(iv / 2, (unsigned long) HZ/100); 1135 *n = max(*n/2, HZ/100);
1292 } else { 1136 else
1293 iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); 1137 *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
1294 iv = mce_adjust_timer(iv);
1295 }
1296 __this_cpu_write(mce_next_interval, iv);
1297 /* Might have become 0 after CMCI storm subsided */
1298 if (iv) {
1299 t->expires = jiffies + iv;
1300 add_timer_on(t, smp_processor_id());
1301 }
1302}
1303
1304/*
1305 * Ensure that the timer is firing in @interval from now.
1306 */
1307void mce_timer_kick(unsigned long interval)
1308{
1309 struct timer_list *t = &__get_cpu_var(mce_timer);
1310 unsigned long when = jiffies + interval;
1311 unsigned long iv = __this_cpu_read(mce_next_interval);
1312
1313 if (timer_pending(t)) {
1314 if (time_before(when, t->expires))
1315 mod_timer_pinned(t, when);
1316 } else {
1317 t->expires = round_jiffies(when);
1318 add_timer_on(t, smp_processor_id());
1319 }
1320 if (interval < iv)
1321 __this_cpu_write(mce_next_interval, interval);
1322}
1323
1324/* Must not be called in IRQ context where del_timer_sync() can deadlock */
1325static void mce_timer_delete_all(void)
1326{
1327 int cpu;
1328 1138
1329 for_each_online_cpu(cpu) 1139 t->expires = jiffies + *n;
1330 del_timer_sync(&per_cpu(mce_timer, cpu)); 1140 add_timer_on(t, smp_processor_id());
1331} 1141}
1332 1142
1333static void mce_do_trigger(struct work_struct *work) 1143static void mce_do_trigger(struct work_struct *work)
@@ -1347,6 +1157,8 @@ int mce_notify_irq(void)
1347 /* Not more than two messages every minute */ 1157 /* Not more than two messages every minute */
1348 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); 1158 static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2);
1349 1159
1160 clear_thread_flag(TIF_MCE_NOTIFY);
1161
1350 if (test_and_clear_bit(0, &mce_need_notify)) { 1162 if (test_and_clear_bit(0, &mce_need_notify)) {
1351 /* wake processes polling /dev/mcelog */ 1163 /* wake processes polling /dev/mcelog */
1352 wake_up_interruptible(&mce_chrdev_wait); 1164 wake_up_interruptible(&mce_chrdev_wait);
@@ -1371,13 +1183,11 @@ EXPORT_SYMBOL_GPL(mce_notify_irq);
1371static int __cpuinit __mcheck_cpu_mce_banks_init(void) 1183static int __cpuinit __mcheck_cpu_mce_banks_init(void)
1372{ 1184{
1373 int i; 1185 int i;
1374 u8 num_banks = mca_cfg.banks;
1375 1186
1376 mce_banks = kzalloc(num_banks * sizeof(struct mce_bank), GFP_KERNEL); 1187 mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL);
1377 if (!mce_banks) 1188 if (!mce_banks)
1378 return -ENOMEM; 1189 return -ENOMEM;
1379 1190 for (i = 0; i < banks; i++) {
1380 for (i = 0; i < num_banks; i++) {
1381 struct mce_bank *b = &mce_banks[i]; 1191 struct mce_bank *b = &mce_banks[i];
1382 1192
1383 b->ctl = -1ULL; 1193 b->ctl = -1ULL;
@@ -1397,19 +1207,19 @@ static int __cpuinit __mcheck_cpu_cap_init(void)
1397 rdmsrl(MSR_IA32_MCG_CAP, cap); 1207 rdmsrl(MSR_IA32_MCG_CAP, cap);
1398 1208
1399 b = cap & MCG_BANKCNT_MASK; 1209 b = cap & MCG_BANKCNT_MASK;
1400 if (!mca_cfg.banks) 1210 if (!banks)
1401 pr_info("CPU supports %d MCE banks\n", b); 1211 printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
1402 1212
1403 if (b > MAX_NR_BANKS) { 1213 if (b > MAX_NR_BANKS) {
1404 pr_warn("Using only %u machine check banks out of %u\n", 1214 printk(KERN_WARNING
1215 "MCE: Using only %u machine check banks out of %u\n",
1405 MAX_NR_BANKS, b); 1216 MAX_NR_BANKS, b);
1406 b = MAX_NR_BANKS; 1217 b = MAX_NR_BANKS;
1407 } 1218 }
1408 1219
1409 /* Don't support asymmetric configurations today */ 1220 /* Don't support asymmetric configurations today */
1410 WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks); 1221 WARN_ON(banks != 0 && b != banks);
1411 mca_cfg.banks = b; 1222 banks = b;
1412
1413 if (!mce_banks) { 1223 if (!mce_banks) {
1414 int err = __mcheck_cpu_mce_banks_init(); 1224 int err = __mcheck_cpu_mce_banks_init();
1415 1225
@@ -1419,29 +1229,25 @@ static int __cpuinit __mcheck_cpu_cap_init(void)
1419 1229
1420 /* Use accurate RIP reporting if available. */ 1230 /* Use accurate RIP reporting if available. */
1421 if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9) 1231 if ((cap & MCG_EXT_P) && MCG_EXT_CNT(cap) >= 9)
1422 mca_cfg.rip_msr = MSR_IA32_MCG_EIP; 1232 rip_msr = MSR_IA32_MCG_EIP;
1423 1233
1424 if (cap & MCG_SER_P) 1234 if (cap & MCG_SER_P)
1425 mca_cfg.ser = true; 1235 mce_ser = 1;
1426 1236
1427 return 0; 1237 return 0;
1428} 1238}
1429 1239
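Editor's note: __mcheck_cpu_cap_init() derives everything from the MCG_CAP MSR: the bank count in the low byte (clamped to MAX_NR_BANKS) plus feature bits such as software error recovery. A userspace sketch of that unpacking follows; the bit layout mirrors the kernel's macros but should be checked against the SDM before reuse.

#include <stdio.h>
#include <stdint.h>

#define MAX_NR_BANKS   32                 /* same clamp idea as the driver */
#define MCG_BANKCNT(c) ((c) & 0xff)       /* bits 7:0 - number of banks */
#define MCG_SER_P      (1ULL << 24)       /* software error recovery supported */

struct mca_caps { unsigned banks; int ser; };

static struct mca_caps parse_mcg_cap(uint64_t cap)
{
    struct mca_caps c = { .banks = MCG_BANKCNT(cap), .ser = !!(cap & MCG_SER_P) };

    if (c.banks > MAX_NR_BANKS) {
        fprintf(stderr, "using only %u of %u banks\n", MAX_NR_BANKS, c.banks);
        c.banks = MAX_NR_BANKS;
    }
    return c;
}

int main(void)
{
    struct mca_caps c = parse_mcg_cap((1ULL << 24) | 0x16);   /* 22 banks, SER */
    printf("banks=%u ser=%d\n", c.banks, c.ser);
    return 0;
}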
1430static void __mcheck_cpu_init_generic(void) 1240static void __mcheck_cpu_init_generic(void)
1431{ 1241{
1432 enum mcp_flags m_fl = 0;
1433 mce_banks_t all_banks; 1242 mce_banks_t all_banks;
1434 u64 cap; 1243 u64 cap;
1435 int i; 1244 int i;
1436 1245
1437 if (!mca_cfg.bootlog)
1438 m_fl = MCP_DONTLOG;
1439
1440 /* 1246 /*
1441 * Log the machine checks left over from the previous reset. 1247 * Log the machine checks left over from the previous reset.
1442 */ 1248 */
1443 bitmap_fill(all_banks, MAX_NR_BANKS); 1249 bitmap_fill(all_banks, MAX_NR_BANKS);
1444 machine_check_poll(MCP_UC | m_fl, &all_banks); 1250 machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
1445 1251
1446 set_in_cr4(X86_CR4_MCE); 1252 set_in_cr4(X86_CR4_MCE);
1447 1253
@@ -1449,7 +1255,7 @@ static void __mcheck_cpu_init_generic(void)
1449 if (cap & MCG_CTL_P) 1255 if (cap & MCG_CTL_P)
1450 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); 1256 wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
1451 1257
1452 for (i = 0; i < mca_cfg.banks; i++) { 1258 for (i = 0; i < banks; i++) {
1453 struct mce_bank *b = &mce_banks[i]; 1259 struct mce_bank *b = &mce_banks[i];
1454 1260
1455 if (!b->init) 1261 if (!b->init)
@@ -1459,47 +1265,17 @@ static void __mcheck_cpu_init_generic(void)
1459 } 1265 }
1460} 1266}
1461 1267
1462/*
1463 * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and
1464 * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM
1465 * Vol 3B Table 15-20). But this confuses both the code that determines
1466 * whether the machine check occurred in kernel or user mode, and also
1467 * the severity assessment code. Pretend that EIPV was set, and take the
1468 * ip/cs values from the pt_regs that mce_gather_info() ignored earlier.
1469 */
1470static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
1471{
1472 if (bank != 0)
1473 return;
1474 if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0)
1475 return;
1476 if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC|
1477 MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV|
1478 MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR|
1479 MCACOD)) !=
1480 (MCI_STATUS_UC|MCI_STATUS_EN|
1481 MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S|
1482 MCI_STATUS_AR|MCACOD_INSTR))
1483 return;
1484
1485 m->mcgstatus |= MCG_STATUS_EIPV;
1486 m->ip = regs->ip;
1487 m->cs = regs->cs;
1488}
1489
1490/* Add per CPU specific workarounds here */ 1268/* Add per CPU specific workarounds here */
1491static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) 1269static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1492{ 1270{
1493 struct mca_config *cfg = &mca_cfg;
1494
1495 if (c->x86_vendor == X86_VENDOR_UNKNOWN) { 1271 if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1496 pr_info("unknown CPU type - not enabling MCE support\n"); 1272 pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
1497 return -EOPNOTSUPP; 1273 return -EOPNOTSUPP;
1498 } 1274 }
1499 1275
1500 /* This should be disabled by the BIOS, but isn't always */ 1276 /* This should be disabled by the BIOS, but isn't always */
1501 if (c->x86_vendor == X86_VENDOR_AMD) { 1277 if (c->x86_vendor == X86_VENDOR_AMD) {
1502 if (c->x86 == 15 && cfg->banks > 4) { 1278 if (c->x86 == 15 && banks > 4) {
1503 /* 1279 /*
1504 * disable GART TBL walk error reporting, which 1280 * disable GART TBL walk error reporting, which
1505 * trips off incorrectly with the IOMMU & 3ware 1281 * trips off incorrectly with the IOMMU & 3ware
@@ -1507,56 +1283,19 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1507 */ 1283 */
1508 clear_bit(10, (unsigned long *)&mce_banks[4].ctl); 1284 clear_bit(10, (unsigned long *)&mce_banks[4].ctl);
1509 } 1285 }
1510 if (c->x86 <= 17 && cfg->bootlog < 0) { 1286 if (c->x86 <= 17 && mce_bootlog < 0) {
1511 /* 1287 /*
1512 * Lots of broken BIOS around that don't clear them 1288 * Lots of broken BIOS around that don't clear them
1513 * by default and leave crap in there. Don't log: 1289 * by default and leave crap in there. Don't log:
1514 */ 1290 */
1515 cfg->bootlog = 0; 1291 mce_bootlog = 0;
1516 } 1292 }
1517 /* 1293 /*
1518 * Various K7s with broken bank 0 around. Always disable 1294 * Various K7s with broken bank 0 around. Always disable
1519 * by default. 1295 * by default.
1520 */ 1296 */
1521 if (c->x86 == 6 && cfg->banks > 0) 1297 if (c->x86 == 6 && banks > 0)
1522 mce_banks[0].ctl = 0; 1298 mce_banks[0].ctl = 0;
1523
1524 /*
1525 * Turn off MC4_MISC thresholding banks on those models since
1526 * they're not supported there.
1527 */
1528 if (c->x86 == 0x15 &&
1529 (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
1530 int i;
1531 u64 val, hwcr;
1532 bool need_toggle;
1533 u32 msrs[] = {
1534 0x00000413, /* MC4_MISC0 */
1535 0xc0000408, /* MC4_MISC1 */
1536 };
1537
1538 rdmsrl(MSR_K7_HWCR, hwcr);
1539
1540 /* McStatusWrEn has to be set */
1541 need_toggle = !(hwcr & BIT(18));
1542
1543 if (need_toggle)
1544 wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
1545
1546 for (i = 0; i < ARRAY_SIZE(msrs); i++) {
1547 rdmsrl(msrs[i], val);
1548
1549 /* CntP bit set? */
1550 if (val & BIT_64(62)) {
1551 val &= ~BIT_64(62);
1552 wrmsrl(msrs[i], val);
1553 }
1554 }
1555
1556 /* restore old settings */
1557 if (need_toggle)
1558 wrmsrl(MSR_K7_HWCR, hwcr);
1559 }
1560 } 1299 }
1561 1300
1562 if (c->x86_vendor == X86_VENDOR_INTEL) { 1301 if (c->x86_vendor == X86_VENDOR_INTEL) {
@@ -1569,7 +1308,7 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1569 * valid event later, merely don't write CTL0. 1308 * valid event later, merely don't write CTL0.
1570 */ 1309 */
1571 1310
1572 if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0) 1311 if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0)
1573 mce_banks[0].init = 0; 1312 mce_banks[0].init = 0;
1574 1313
1575 /* 1314 /*
@@ -1577,23 +1316,20 @@ static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1577 * synchronization with a one second timeout. 1316 * synchronization with a one second timeout.
1578 */ 1317 */
1579 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) && 1318 if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
1580 cfg->monarch_timeout < 0) 1319 monarch_timeout < 0)
1581 cfg->monarch_timeout = USEC_PER_SEC; 1320 monarch_timeout = USEC_PER_SEC;
1582 1321
1583 /* 1322 /*
1584 * There are also broken BIOSes on some Pentium M and 1323 * There are also broken BIOSes on some Pentium M and
1585 * earlier systems: 1324 * earlier systems:
1586 */ 1325 */
1587 if (c->x86 == 6 && c->x86_model <= 13 && cfg->bootlog < 0) 1326 if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0)
1588 cfg->bootlog = 0; 1327 mce_bootlog = 0;
1589
1590 if (c->x86 == 6 && c->x86_model == 45)
1591 quirk_no_way_out = quirk_sandybridge_ifu;
1592 } 1328 }
1593 if (cfg->monarch_timeout < 0) 1329 if (monarch_timeout < 0)
1594 cfg->monarch_timeout = 0; 1330 monarch_timeout = 0;
1595 if (cfg->bootlog != 0) 1331 if (mce_bootlog != 0)
1596 cfg->panic_timeout = 30; 1332 mce_panic_timeout = 30;
1597 1333
1598 return 0; 1334 return 0;
1599} 1335}
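Editor's note: __mcheck_cpu_apply_quirks() is a per-vendor fix-up pass: disable known-bad banks (K7 bank 0, GART table-walk reporting in bank 4 on family 15), pick a bootlog default, and enable a one-second monarch timeout on SMP-capable Intel parts. The kernel uses straight if/else chains; the table-driven variant below is only an illustration of the idea, and the families and actions shown are examples.

#include <stdio.h>

struct quirk {
    int vendor;                          /* 0 = AMD, 1 = Intel (sketch only) */
    int family;
    const char *why;
    void (*apply)(void);
};

static void disable_bank0(void)      { puts("bank 0 control cleared"); }
static void default_no_bootlog(void) { puts("bootlog defaults to off"); }

static const struct quirk quirks[] = {
    { 0,  6, "various K7s with broken bank 0",     disable_bank0 },
    { 0, 15, "BIOSes leave stale bootlog entries", default_no_bootlog },
};

static void apply_quirks(int vendor, int family)
{
    for (unsigned i = 0; i < sizeof(quirks) / sizeof(quirks[0]); i++)
        if (quirks[i].vendor == vendor && quirks[i].family == family) {
            printf("quirk: %s\n", quirks[i].why);
            quirks[i].apply();
        }
}

int main(void)
{
    apply_quirks(0, 6);
    return 0;
}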
@@ -1622,7 +1358,6 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1622 switch (c->x86_vendor) { 1358 switch (c->x86_vendor) {
1623 case X86_VENDOR_INTEL: 1359 case X86_VENDOR_INTEL:
1624 mce_intel_feature_init(c); 1360 mce_intel_feature_init(c);
1625 mce_adjust_timer = mce_intel_adjust_timer;
1626 break; 1361 break;
1627 case X86_VENDOR_AMD: 1362 case X86_VENDOR_AMD:
1628 mce_amd_feature_init(c); 1363 mce_amd_feature_init(c);
@@ -1632,32 +1367,27 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1632 } 1367 }
1633} 1368}
1634 1369
1635static void mce_start_timer(unsigned int cpu, struct timer_list *t) 1370static void __mcheck_cpu_init_timer(void)
1636{ 1371{
1637 unsigned long iv = mce_adjust_timer(check_interval * HZ); 1372 struct timer_list *t = &__get_cpu_var(mce_timer);
1373 int *n = &__get_cpu_var(mce_next_interval);
1638 1374
1639 __this_cpu_write(mce_next_interval, iv); 1375 setup_timer(t, mce_start_timer, smp_processor_id());
1640 1376
1641 if (mca_cfg.ignore_ce || !iv) 1377 if (mce_ignore_ce)
1642 return; 1378 return;
1643 1379
1644 t->expires = round_jiffies(jiffies + iv); 1380 *n = check_interval * HZ;
1381 if (!*n)
1382 return;
1383 t->expires = round_jiffies(jiffies + *n);
1645 add_timer_on(t, smp_processor_id()); 1384 add_timer_on(t, smp_processor_id());
1646} 1385}
1647 1386
1648static void __mcheck_cpu_init_timer(void)
1649{
1650 struct timer_list *t = &__get_cpu_var(mce_timer);
1651 unsigned int cpu = smp_processor_id();
1652
1653 setup_timer(t, mce_timer_fn, cpu);
1654 mce_start_timer(cpu, t);
1655}
1656
1657/* Handle unconfigured int18 (should never happen) */ 1387/* Handle unconfigured int18 (should never happen) */
1658static void unexpected_machine_check(struct pt_regs *regs, long error_code) 1388static void unexpected_machine_check(struct pt_regs *regs, long error_code)
1659{ 1389{
1660 pr_err("CPU#%d: Unexpected int18 (Machine Check)\n", 1390 printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
1661 smp_processor_id()); 1391 smp_processor_id());
1662} 1392}
1663 1393
@@ -1671,7 +1401,7 @@ void (*machine_check_vector)(struct pt_regs *, long error_code) =
1671 */ 1401 */
1672void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c) 1402void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
1673{ 1403{
1674 if (mca_cfg.disabled) 1404 if (mce_disabled)
1675 return; 1405 return;
1676 1406
1677 if (__mcheck_cpu_ancient_init(c)) 1407 if (__mcheck_cpu_ancient_init(c))
@@ -1681,7 +1411,7 @@ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
1681 return; 1411 return;
1682 1412
1683 if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) { 1413 if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
1684 mca_cfg.disabled = true; 1414 mce_disabled = 1;
1685 return; 1415 return;
1686 } 1416 }
1687 1417
@@ -1757,12 +1487,6 @@ static int __mce_read_apei(char __user **ubuf, size_t usize)
1757 /* Error or no more MCE record */ 1487 /* Error or no more MCE record */
1758 if (rc <= 0) { 1488 if (rc <= 0) {
1759 mce_apei_read_done = 1; 1489 mce_apei_read_done = 1;
1760 /*
1761 * When ERST is disabled, mce_chrdev_read() should return
1762 * "no record" instead of "no device."
1763 */
1764 if (rc == -ENODEV)
1765 return 0;
1766 return rc; 1490 return rc;
1767 } 1491 }
1768 rc = -EFAULT; 1492 rc = -EFAULT;
@@ -1904,35 +1628,16 @@ static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
1904 } 1628 }
1905} 1629}
1906 1630
1907static ssize_t (*mce_write)(struct file *filp, const char __user *ubuf, 1631/* Modified in mce-inject.c, so not static or const */
1908 size_t usize, loff_t *off); 1632struct file_operations mce_chrdev_ops = {
1909
1910void register_mce_write_callback(ssize_t (*fn)(struct file *filp,
1911 const char __user *ubuf,
1912 size_t usize, loff_t *off))
1913{
1914 mce_write = fn;
1915}
1916EXPORT_SYMBOL_GPL(register_mce_write_callback);
1917
1918ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf,
1919 size_t usize, loff_t *off)
1920{
1921 if (mce_write)
1922 return mce_write(filp, ubuf, usize, off);
1923 else
1924 return -EINVAL;
1925}
1926
1927static const struct file_operations mce_chrdev_ops = {
1928 .open = mce_chrdev_open, 1633 .open = mce_chrdev_open,
1929 .release = mce_chrdev_release, 1634 .release = mce_chrdev_release,
1930 .read = mce_chrdev_read, 1635 .read = mce_chrdev_read,
1931 .write = mce_chrdev_write,
1932 .poll = mce_chrdev_poll, 1636 .poll = mce_chrdev_poll,
1933 .unlocked_ioctl = mce_chrdev_ioctl, 1637 .unlocked_ioctl = mce_chrdev_ioctl,
1934 .llseek = no_llseek, 1638 .llseek = no_llseek,
1935}; 1639};
1640EXPORT_SYMBOL_GPL(mce_chrdev_ops);
1936 1641
1937static struct miscdevice mce_chrdev_device = { 1642static struct miscdevice mce_chrdev_device = {
1938 MISC_MCELOG_MINOR, 1643 MISC_MCELOG_MINOR,
@@ -1950,12 +1655,9 @@ static struct miscdevice mce_chrdev_device = {
1950 * check, or 0 to not wait 1655 * check, or 0 to not wait
1951 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. 1656 * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
1952 * mce=nobootlog Don't log MCEs from before booting. 1657 * mce=nobootlog Don't log MCEs from before booting.
1953 * mce=bios_cmci_threshold Don't program the CMCI threshold
1954 */ 1658 */
1955static int __init mcheck_enable(char *str) 1659static int __init mcheck_enable(char *str)
1956{ 1660{
1957 struct mca_config *cfg = &mca_cfg;
1958
1959 if (*str == 0) { 1661 if (*str == 0) {
1960 enable_p5_mce(); 1662 enable_p5_mce();
1961 return 1; 1663 return 1;
@@ -1963,25 +1665,24 @@ static int __init mcheck_enable(char *str)
1963 if (*str == '=') 1665 if (*str == '=')
1964 str++; 1666 str++;
1965 if (!strcmp(str, "off")) 1667 if (!strcmp(str, "off"))
1966 cfg->disabled = true; 1668 mce_disabled = 1;
1967 else if (!strcmp(str, "no_cmci")) 1669 else if (!strcmp(str, "no_cmci"))
1968 cfg->cmci_disabled = true; 1670 mce_cmci_disabled = 1;
1969 else if (!strcmp(str, "dont_log_ce")) 1671 else if (!strcmp(str, "dont_log_ce"))
1970 cfg->dont_log_ce = true; 1672 mce_dont_log_ce = 1;
1971 else if (!strcmp(str, "ignore_ce")) 1673 else if (!strcmp(str, "ignore_ce"))
1972 cfg->ignore_ce = true; 1674 mce_ignore_ce = 1;
1973 else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) 1675 else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
1974 cfg->bootlog = (str[0] == 'b'); 1676 mce_bootlog = (str[0] == 'b');
1975 else if (!strcmp(str, "bios_cmci_threshold"))
1976 cfg->bios_cmci_threshold = true;
1977 else if (isdigit(str[0])) { 1677 else if (isdigit(str[0])) {
1978 get_option(&str, &(cfg->tolerant)); 1678 get_option(&str, &tolerant);
1979 if (*str == ',') { 1679 if (*str == ',') {
1980 ++str; 1680 ++str;
1981 get_option(&str, &(cfg->monarch_timeout)); 1681 get_option(&str, &monarch_timeout);
1982 } 1682 }
1983 } else { 1683 } else {
1984 pr_info("mce argument %s ignored. Please use /sys\n", str); 1684 printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
1685 str);
1985 return 0; 1686 return 0;
1986 } 1687 }
1987 return 1; 1688 return 1;
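Editor's note: mcheck_enable() parses the mce= boot option documented in the comment before this hunk: bare keywords flip flags, while a leading digit is read as "tolerant[,monarch_timeout]". The self-contained re-implementation below uses only the C library (strtol in place of get_option) and skips the empty-string/P5 case.

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int mce_disabled, cmci_disabled, dont_log_ce, ignore_ce, bootlog = -1;
static int tolerant = 1, monarch_timeout = -1;

static int parse_mce_opt(const char *str)
{
    if (*str == '=')
        str++;
    if (!strcmp(str, "off"))
        mce_disabled = 1;
    else if (!strcmp(str, "no_cmci"))
        cmci_disabled = 1;
    else if (!strcmp(str, "dont_log_ce"))
        dont_log_ce = 1;
    else if (!strcmp(str, "ignore_ce"))
        ignore_ce = 1;
    else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
        bootlog = (str[0] == 'b');
    else if (isdigit((unsigned char)str[0])) {
        char *end;
        tolerant = (int)strtol(str, &end, 0);
        if (*end == ',')
            monarch_timeout = (int)strtol(end + 1, NULL, 0);
    } else {
        fprintf(stderr, "mce argument %s ignored\n", str);
        return 0;
    }
    return 1;
}

int main(void)
{
    parse_mce_opt("=2,1000000");
    printf("tolerant=%d monarch_timeout=%d\n", tolerant, monarch_timeout);
    return 0;
}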
@@ -2007,7 +1708,7 @@ static int mce_disable_error_reporting(void)
2007{ 1708{
2008 int i; 1709 int i;
2009 1710
2010 for (i = 0; i < mca_cfg.banks; i++) { 1711 for (i = 0; i < banks; i++) {
2011 struct mce_bank *b = &mce_banks[i]; 1712 struct mce_bank *b = &mce_banks[i];
2012 1713
2013 if (b->init) 1714 if (b->init)
@@ -2044,11 +1745,12 @@ static struct syscore_ops mce_syscore_ops = {
2044}; 1745};
2045 1746
2046/* 1747/*
2047 * mce_device: Sysfs support 1748 * mce_sysdev: Sysfs support
2048 */ 1749 */
2049 1750
2050static void mce_cpu_restart(void *data) 1751static void mce_cpu_restart(void *data)
2051{ 1752{
1753 del_timer_sync(&__get_cpu_var(mce_timer));
2052 if (!mce_available(__this_cpu_ptr(&cpu_info))) 1754 if (!mce_available(__this_cpu_ptr(&cpu_info)))
2053 return; 1755 return;
2054 __mcheck_cpu_init_generic(); 1756 __mcheck_cpu_init_generic();
@@ -2058,15 +1760,16 @@ static void mce_cpu_restart(void *data)
2058/* Reinit MCEs after user configuration changes */ 1760/* Reinit MCEs after user configuration changes */
2059static void mce_restart(void) 1761static void mce_restart(void)
2060{ 1762{
2061 mce_timer_delete_all();
2062 on_each_cpu(mce_cpu_restart, NULL, 1); 1763 on_each_cpu(mce_cpu_restart, NULL, 1);
2063} 1764}
2064 1765
2065/* Toggle features for corrected errors */ 1766/* Toggle features for corrected errors */
2066static void mce_disable_cmci(void *data) 1767static void mce_disable_ce(void *all)
2067{ 1768{
2068 if (!mce_available(__this_cpu_ptr(&cpu_info))) 1769 if (!mce_available(__this_cpu_ptr(&cpu_info)))
2069 return; 1770 return;
1771 if (all)
1772 del_timer_sync(&__get_cpu_var(mce_timer));
2070 cmci_clear(); 1773 cmci_clear();
2071} 1774}
2072 1775
@@ -2080,28 +1783,27 @@ static void mce_enable_ce(void *all)
2080 __mcheck_cpu_init_timer(); 1783 __mcheck_cpu_init_timer();
2081} 1784}
2082 1785
2083static struct bus_type mce_subsys = { 1786static struct sysdev_class mce_sysdev_class = {
2084 .name = "machinecheck", 1787 .name = "machinecheck",
2085 .dev_name = "machinecheck",
2086}; 1788};
2087 1789
2088DEFINE_PER_CPU(struct device *, mce_device); 1790DEFINE_PER_CPU(struct sys_device, mce_sysdev);
2089 1791
2090__cpuinitdata 1792__cpuinitdata
2091void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); 1793void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
2092 1794
2093static inline struct mce_bank *attr_to_bank(struct device_attribute *attr) 1795static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr)
2094{ 1796{
2095 return container_of(attr, struct mce_bank, attr); 1797 return container_of(attr, struct mce_bank, attr);
2096} 1798}
2097 1799
2098static ssize_t show_bank(struct device *s, struct device_attribute *attr, 1800static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr,
2099 char *buf) 1801 char *buf)
2100{ 1802{
2101 return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl); 1803 return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl);
2102} 1804}
2103 1805
2104static ssize_t set_bank(struct device *s, struct device_attribute *attr, 1806static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr,
2105 const char *buf, size_t size) 1807 const char *buf, size_t size)
2106{ 1808{
2107 u64 new; 1809 u64 new;
@@ -2116,14 +1818,14 @@ static ssize_t set_bank(struct device *s, struct device_attribute *attr,
2116} 1818}
2117 1819
2118static ssize_t 1820static ssize_t
2119show_trigger(struct device *s, struct device_attribute *attr, char *buf) 1821show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf)
2120{ 1822{
2121 strcpy(buf, mce_helper); 1823 strcpy(buf, mce_helper);
2122 strcat(buf, "\n"); 1824 strcat(buf, "\n");
2123 return strlen(mce_helper) + 1; 1825 return strlen(mce_helper) + 1;
2124} 1826}
2125 1827
2126static ssize_t set_trigger(struct device *s, struct device_attribute *attr, 1828static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
2127 const char *buf, size_t siz) 1829 const char *buf, size_t siz)
2128{ 1830{
2129 char *p; 1831 char *p;
@@ -2138,8 +1840,8 @@ static ssize_t set_trigger(struct device *s, struct device_attribute *attr,
2138 return strlen(mce_helper) + !!p; 1840 return strlen(mce_helper) + !!p;
2139} 1841}
2140 1842
2141static ssize_t set_ignore_ce(struct device *s, 1843static ssize_t set_ignore_ce(struct sys_device *s,
2142 struct device_attribute *attr, 1844 struct sysdev_attribute *attr,
2143 const char *buf, size_t size) 1845 const char *buf, size_t size)
2144{ 1846{
2145 u64 new; 1847 u64 new;
@@ -2147,23 +1849,22 @@ static ssize_t set_ignore_ce(struct device *s,
2147 if (strict_strtoull(buf, 0, &new) < 0) 1849 if (strict_strtoull(buf, 0, &new) < 0)
2148 return -EINVAL; 1850 return -EINVAL;
2149 1851
2150 if (mca_cfg.ignore_ce ^ !!new) { 1852 if (mce_ignore_ce ^ !!new) {
2151 if (new) { 1853 if (new) {
2152 /* disable ce features */ 1854 /* disable ce features */
2153 mce_timer_delete_all(); 1855 on_each_cpu(mce_disable_ce, (void *)1, 1);
2154 on_each_cpu(mce_disable_cmci, NULL, 1); 1856 mce_ignore_ce = 1;
2155 mca_cfg.ignore_ce = true;
2156 } else { 1857 } else {
2157 /* enable ce features */ 1858 /* enable ce features */
2158 mca_cfg.ignore_ce = false; 1859 mce_ignore_ce = 0;
2159 on_each_cpu(mce_enable_ce, (void *)1, 1); 1860 on_each_cpu(mce_enable_ce, (void *)1, 1);
2160 } 1861 }
2161 } 1862 }
2162 return size; 1863 return size;
2163} 1864}
2164 1865
2165static ssize_t set_cmci_disabled(struct device *s, 1866static ssize_t set_cmci_disabled(struct sys_device *s,
2166 struct device_attribute *attr, 1867 struct sysdev_attribute *attr,
2167 const char *buf, size_t size) 1868 const char *buf, size_t size)
2168{ 1869{
2169 u64 new; 1870 u64 new;
@@ -2171,131 +1872,122 @@ static ssize_t set_cmci_disabled(struct device *s,
2171 if (strict_strtoull(buf, 0, &new) < 0) 1872 if (strict_strtoull(buf, 0, &new) < 0)
2172 return -EINVAL; 1873 return -EINVAL;
2173 1874
2174 if (mca_cfg.cmci_disabled ^ !!new) { 1875 if (mce_cmci_disabled ^ !!new) {
2175 if (new) { 1876 if (new) {
2176 /* disable cmci */ 1877 /* disable cmci */
2177 on_each_cpu(mce_disable_cmci, NULL, 1); 1878 on_each_cpu(mce_disable_ce, NULL, 1);
2178 mca_cfg.cmci_disabled = true; 1879 mce_cmci_disabled = 1;
2179 } else { 1880 } else {
2180 /* enable cmci */ 1881 /* enable cmci */
2181 mca_cfg.cmci_disabled = false; 1882 mce_cmci_disabled = 0;
2182 on_each_cpu(mce_enable_ce, NULL, 1); 1883 on_each_cpu(mce_enable_ce, NULL, 1);
2183 } 1884 }
2184 } 1885 }
2185 return size; 1886 return size;
2186} 1887}
2187 1888
2188static ssize_t store_int_with_restart(struct device *s, 1889static ssize_t store_int_with_restart(struct sys_device *s,
2189 struct device_attribute *attr, 1890 struct sysdev_attribute *attr,
2190 const char *buf, size_t size) 1891 const char *buf, size_t size)
2191{ 1892{
2192 ssize_t ret = device_store_int(s, attr, buf, size); 1893 ssize_t ret = sysdev_store_int(s, attr, buf, size);
2193 mce_restart(); 1894 mce_restart();
2194 return ret; 1895 return ret;
2195} 1896}
2196 1897
2197static DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger); 1898static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
2198static DEVICE_INT_ATTR(tolerant, 0644, mca_cfg.tolerant); 1899static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
2199static DEVICE_INT_ATTR(monarch_timeout, 0644, mca_cfg.monarch_timeout); 1900static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout);
2200static DEVICE_BOOL_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce); 1901static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce);
2201 1902
2202static struct dev_ext_attribute dev_attr_check_interval = { 1903static struct sysdev_ext_attribute attr_check_interval = {
2203 __ATTR(check_interval, 0644, device_show_int, store_int_with_restart), 1904 _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int,
1905 store_int_with_restart),
2204 &check_interval 1906 &check_interval
2205}; 1907};
2206 1908
2207static struct dev_ext_attribute dev_attr_ignore_ce = { 1909static struct sysdev_ext_attribute attr_ignore_ce = {
2208 __ATTR(ignore_ce, 0644, device_show_bool, set_ignore_ce), 1910 _SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce),
2209 &mca_cfg.ignore_ce 1911 &mce_ignore_ce
2210}; 1912};
2211 1913
2212static struct dev_ext_attribute dev_attr_cmci_disabled = { 1914static struct sysdev_ext_attribute attr_cmci_disabled = {
2213 __ATTR(cmci_disabled, 0644, device_show_bool, set_cmci_disabled), 1915 _SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled),
2214 &mca_cfg.cmci_disabled 1916 &mce_cmci_disabled
2215}; 1917};
2216 1918
2217static struct device_attribute *mce_device_attrs[] = { 1919static struct sysdev_attribute *mce_sysdev_attrs[] = {
2218 &dev_attr_tolerant.attr, 1920 &attr_tolerant.attr,
2219 &dev_attr_check_interval.attr, 1921 &attr_check_interval.attr,
2220 &dev_attr_trigger, 1922 &attr_trigger,
2221 &dev_attr_monarch_timeout.attr, 1923 &attr_monarch_timeout.attr,
2222 &dev_attr_dont_log_ce.attr, 1924 &attr_dont_log_ce.attr,
2223 &dev_attr_ignore_ce.attr, 1925 &attr_ignore_ce.attr,
2224 &dev_attr_cmci_disabled.attr, 1926 &attr_cmci_disabled.attr,
2225 NULL 1927 NULL
2226}; 1928};
2227 1929
2228static cpumask_var_t mce_device_initialized; 1930static cpumask_var_t mce_sysdev_initialized;
2229
2230static void mce_device_release(struct device *dev)
2231{
2232 kfree(dev);
2233}
2234 1931
2235/* Per cpu device init. All of the cpus still share the same ctrl bank: */ 1932/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */
2236static __cpuinit int mce_device_create(unsigned int cpu) 1933static __cpuinit int mce_sysdev_create(unsigned int cpu)
2237{ 1934{
2238 struct device *dev; 1935 struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu);
2239 int err; 1936 int err;
2240 int i, j; 1937 int i, j;
2241 1938
2242 if (!mce_available(&boot_cpu_data)) 1939 if (!mce_available(&boot_cpu_data))
2243 return -EIO; 1940 return -EIO;
2244 1941
2245 dev = kzalloc(sizeof *dev, GFP_KERNEL); 1942 memset(&sysdev->kobj, 0, sizeof(struct kobject));
2246 if (!dev) 1943 sysdev->id = cpu;
2247 return -ENOMEM; 1944 sysdev->cls = &mce_sysdev_class;
2248 dev->id = cpu;
2249 dev->bus = &mce_subsys;
2250 dev->release = &mce_device_release;
2251 1945
2252 err = device_register(dev); 1946 err = sysdev_register(sysdev);
2253 if (err) 1947 if (err)
2254 return err; 1948 return err;
2255 1949
2256 for (i = 0; mce_device_attrs[i]; i++) { 1950 for (i = 0; mce_sysdev_attrs[i]; i++) {
2257 err = device_create_file(dev, mce_device_attrs[i]); 1951 err = sysdev_create_file(sysdev, mce_sysdev_attrs[i]);
2258 if (err) 1952 if (err)
2259 goto error; 1953 goto error;
2260 } 1954 }
2261 for (j = 0; j < mca_cfg.banks; j++) { 1955 for (j = 0; j < banks; j++) {
2262 err = device_create_file(dev, &mce_banks[j].attr); 1956 err = sysdev_create_file(sysdev, &mce_banks[j].attr);
2263 if (err) 1957 if (err)
2264 goto error2; 1958 goto error2;
2265 } 1959 }
2266 cpumask_set_cpu(cpu, mce_device_initialized); 1960 cpumask_set_cpu(cpu, mce_sysdev_initialized);
2267 per_cpu(mce_device, cpu) = dev;
2268 1961
2269 return 0; 1962 return 0;
2270error2: 1963error2:
2271 while (--j >= 0) 1964 while (--j >= 0)
2272 device_remove_file(dev, &mce_banks[j].attr); 1965 sysdev_remove_file(sysdev, &mce_banks[j].attr);
2273error: 1966error:
2274 while (--i >= 0) 1967 while (--i >= 0)
2275 device_remove_file(dev, mce_device_attrs[i]); 1968 sysdev_remove_file(sysdev, mce_sysdev_attrs[i]);
2276 1969
2277 device_unregister(dev); 1970 sysdev_unregister(sysdev);
2278 1971
2279 return err; 1972 return err;
2280} 1973}
2281 1974
2282static __cpuinit void mce_device_remove(unsigned int cpu) 1975static __cpuinit void mce_sysdev_remove(unsigned int cpu)
2283{ 1976{
2284 struct device *dev = per_cpu(mce_device, cpu); 1977 struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu);
2285 int i; 1978 int i;
2286 1979
2287 if (!cpumask_test_cpu(cpu, mce_device_initialized)) 1980 if (!cpumask_test_cpu(cpu, mce_sysdev_initialized))
2288 return; 1981 return;
2289 1982
2290 for (i = 0; mce_device_attrs[i]; i++) 1983 for (i = 0; mce_sysdev_attrs[i]; i++)
2291 device_remove_file(dev, mce_device_attrs[i]); 1984 sysdev_remove_file(sysdev, mce_sysdev_attrs[i]);
2292 1985
2293 for (i = 0; i < mca_cfg.banks; i++) 1986 for (i = 0; i < banks; i++)
2294 device_remove_file(dev, &mce_banks[i].attr); 1987 sysdev_remove_file(sysdev, &mce_banks[i].attr);
2295 1988
2296 device_unregister(dev); 1989 sysdev_unregister(sysdev);
2297 cpumask_clear_cpu(cpu, mce_device_initialized); 1990 cpumask_clear_cpu(cpu, mce_sysdev_initialized);
2298 per_cpu(mce_device, cpu) = NULL;
2299} 1991}
2300 1992
2301/* Make sure there are no machine checks on offlined CPUs. */ 1993/* Make sure there are no machine checks on offlined CPUs. */
@@ -2309,7 +2001,7 @@ static void __cpuinit mce_disable_cpu(void *h)
2309 2001
2310 if (!(action & CPU_TASKS_FROZEN)) 2002 if (!(action & CPU_TASKS_FROZEN))
2311 cmci_clear(); 2003 cmci_clear();
2312 for (i = 0; i < mca_cfg.banks; i++) { 2004 for (i = 0; i < banks; i++) {
2313 struct mce_bank *b = &mce_banks[i]; 2005 struct mce_bank *b = &mce_banks[i];
2314 2006
2315 if (b->init) 2007 if (b->init)
@@ -2327,7 +2019,7 @@ static void __cpuinit mce_reenable_cpu(void *h)
2327 2019
2328 if (!(action & CPU_TASKS_FROZEN)) 2020 if (!(action & CPU_TASKS_FROZEN))
2329 cmci_reenable(); 2021 cmci_reenable();
2330 for (i = 0; i < mca_cfg.banks; i++) { 2022 for (i = 0; i < banks; i++) {
2331 struct mce_bank *b = &mce_banks[i]; 2023 struct mce_bank *b = &mce_banks[i];
2332 2024
2333 if (b->init) 2025 if (b->init)
@@ -2342,33 +2034,38 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
2342 unsigned int cpu = (unsigned long)hcpu; 2034 unsigned int cpu = (unsigned long)hcpu;
2343 struct timer_list *t = &per_cpu(mce_timer, cpu); 2035 struct timer_list *t = &per_cpu(mce_timer, cpu);
2344 2036
2345 switch (action & ~CPU_TASKS_FROZEN) { 2037 switch (action) {
2346 case CPU_ONLINE: 2038 case CPU_ONLINE:
2347 mce_device_create(cpu); 2039 case CPU_ONLINE_FROZEN:
2040 mce_sysdev_create(cpu);
2348 if (threshold_cpu_callback) 2041 if (threshold_cpu_callback)
2349 threshold_cpu_callback(action, cpu); 2042 threshold_cpu_callback(action, cpu);
2350 break; 2043 break;
2351 case CPU_DEAD: 2044 case CPU_DEAD:
2045 case CPU_DEAD_FROZEN:
2352 if (threshold_cpu_callback) 2046 if (threshold_cpu_callback)
2353 threshold_cpu_callback(action, cpu); 2047 threshold_cpu_callback(action, cpu);
2354 mce_device_remove(cpu); 2048 mce_sysdev_remove(cpu);
2355 mce_intel_hcpu_update(cpu);
2356 break; 2049 break;
2357 case CPU_DOWN_PREPARE: 2050 case CPU_DOWN_PREPARE:
2358 smp_call_function_single(cpu, mce_disable_cpu, &action, 1); 2051 case CPU_DOWN_PREPARE_FROZEN:
2359 del_timer_sync(t); 2052 del_timer_sync(t);
2053 smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
2360 break; 2054 break;
2361 case CPU_DOWN_FAILED: 2055 case CPU_DOWN_FAILED:
2056 case CPU_DOWN_FAILED_FROZEN:
2057 if (!mce_ignore_ce && check_interval) {
2058 t->expires = round_jiffies(jiffies +
2059 __get_cpu_var(mce_next_interval));
2060 add_timer_on(t, cpu);
2061 }
2362 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); 2062 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
2363 mce_start_timer(cpu, t);
2364 break; 2063 break;
2365 } 2064 case CPU_POST_DEAD:
2366
2367 if (action == CPU_POST_DEAD) {
2368 /* intentionally ignoring frozen here */ 2065 /* intentionally ignoring frozen here */
2369 cmci_rediscover(cpu); 2066 cmci_rediscover(cpu);
2067 break;
2370 } 2068 }
2371
2372 return NOTIFY_OK; 2069 return NOTIFY_OK;
2373} 2070}
2374 2071
@@ -2380,9 +2077,9 @@ static __init void mce_init_banks(void)
2380{ 2077{
2381 int i; 2078 int i;
2382 2079
2383 for (i = 0; i < mca_cfg.banks; i++) { 2080 for (i = 0; i < banks; i++) {
2384 struct mce_bank *b = &mce_banks[i]; 2081 struct mce_bank *b = &mce_banks[i];
2385 struct device_attribute *a = &b->attr; 2082 struct sysdev_attribute *a = &b->attr;
2386 2083
2387 sysfs_attr_init(&a->attr); 2084 sysfs_attr_init(&a->attr);
2388 a->attr.name = b->attrname; 2085 a->attr.name = b->attrname;
@@ -2402,16 +2099,16 @@ static __init int mcheck_init_device(void)
2402 if (!mce_available(&boot_cpu_data)) 2099 if (!mce_available(&boot_cpu_data))
2403 return -EIO; 2100 return -EIO;
2404 2101
2405 zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL); 2102 zalloc_cpumask_var(&mce_sysdev_initialized, GFP_KERNEL);
2406 2103
2407 mce_init_banks(); 2104 mce_init_banks();
2408 2105
2409 err = subsys_system_register(&mce_subsys, NULL); 2106 err = sysdev_class_register(&mce_sysdev_class);
2410 if (err) 2107 if (err)
2411 return err; 2108 return err;
2412 2109
2413 for_each_online_cpu(i) { 2110 for_each_online_cpu(i) {
2414 err = mce_device_create(i); 2111 err = mce_sysdev_create(i);
2415 if (err) 2112 if (err)
2416 return err; 2113 return err;
2417 } 2114 }
@@ -2424,14 +2121,14 @@ static __init int mcheck_init_device(void)
2424 2121
2425 return err; 2122 return err;
2426} 2123}
2427device_initcall_sync(mcheck_init_device); 2124device_initcall(mcheck_init_device);
2428 2125
2429/* 2126/*
2430 * Old style boot options parsing. Only for compatibility. 2127 * Old style boot options parsing. Only for compatibility.
2431 */ 2128 */
2432static int __init mcheck_disable(char *str) 2129static int __init mcheck_disable(char *str)
2433{ 2130{
2434 mca_cfg.disabled = true; 2131 mce_disabled = 1;
2435 return 1; 2132 return 1;
2436} 2133}
2437__setup("nomce", mcheck_disable); 2134__setup("nomce", mcheck_disable);
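
A minimal sketch, not taken from this patch, of the old-style boot flag pattern the hunk above ends with: __setup() registers a handler that fires when the kernel command line contains the given prefix. The option name "noexample" and the variable example_disabled are placeholders.

	#include <linux/init.h>

	static int example_disabled;

	/* called early in boot if "noexample" appears on the kernel command line */
	static int __init example_disable(char *str)
	{
		example_disabled = 1;
		return 1;	/* returning 1 marks the option as handled */
	}
	__setup("noexample", example_disable);
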
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 1ac581f38df..f5474218cff 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -1,17 +1,15 @@
1/* 1/*
2 * (c) 2005-2012 Advanced Micro Devices, Inc. 2 * (c) 2005, 2006 Advanced Micro Devices, Inc.
3 * Your use of this code is subject to the terms and conditions of the 3 * Your use of this code is subject to the terms and conditions of the
4 * GNU general public license version 2. See "COPYING" or 4 * GNU general public license version 2. See "COPYING" or
5 * http://www.gnu.org/licenses/gpl.html 5 * http://www.gnu.org/licenses/gpl.html
6 * 6 *
7 * Written by Jacob Shin - AMD, Inc. 7 * Written by Jacob Shin - AMD, Inc.
8 * 8 *
9 * Maintained by: Borislav Petkov <bp@alien8.de> 9 * Support : jacob.shin@amd.com
10 * 10 *
11 * April 2006 11 * April 2006
12 * - added support for AMD Family 0x10 processors 12 * - added support for AMD Family 0x10 processors
13 * May 2012
14 * - major scrubbing
15 * 13 *
16 * All MC4_MISCi registers are shared between multi-cores 14 * All MC4_MISCi registers are shared between multi-cores
17 */ 15 */
@@ -19,6 +17,7 @@
19#include <linux/notifier.h> 17#include <linux/notifier.h>
20#include <linux/kobject.h> 18#include <linux/kobject.h>
21#include <linux/percpu.h> 19#include <linux/percpu.h>
20#include <linux/sysdev.h>
22#include <linux/errno.h> 21#include <linux/errno.h>
23#include <linux/sched.h> 22#include <linux/sched.h>
24#include <linux/sysfs.h> 23#include <linux/sysfs.h>
@@ -27,7 +26,6 @@
27#include <linux/cpu.h> 26#include <linux/cpu.h>
28#include <linux/smp.h> 27#include <linux/smp.h>
29 28
30#include <asm/amd_nb.h>
31#include <asm/apic.h> 29#include <asm/apic.h>
32#include <asm/idle.h> 30#include <asm/idle.h>
33#include <asm/mce.h> 31#include <asm/mce.h>
@@ -48,20 +46,29 @@
48#define MASK_BLKPTR_LO 0xFF000000 46#define MASK_BLKPTR_LO 0xFF000000
49#define MCG_XBLK_ADDR 0xC0000400 47#define MCG_XBLK_ADDR 0xC0000400
50 48
51static const char * const th_names[] = { 49struct threshold_block {
52 "load_store", 50 unsigned int block;
53 "insn_fetch", 51 unsigned int bank;
54 "combined_unit", 52 unsigned int cpu;
55 "", 53 u32 address;
56 "northbridge", 54 u16 interrupt_enable;
57 "execution_unit", 55 u16 threshold_limit;
56 struct kobject kobj;
57 struct list_head miscj;
58}; 58};
59 59
60struct threshold_bank {
61 struct kobject *kobj;
62 struct threshold_block *blocks;
63 cpumask_var_t cpus;
64};
60static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks); 65static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks);
61 66
67#ifdef CONFIG_SMP
62static unsigned char shared_bank[NR_BANKS] = { 68static unsigned char shared_bank[NR_BANKS] = {
63 0, 0, 0, 0, 1 69 0, 0, 0, 0, 1
64}; 70};
71#endif
65 72
66static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ 73static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
67 74
@@ -79,41 +86,6 @@ struct thresh_restart {
79 u16 old_limit; 86 u16 old_limit;
80}; 87};
81 88
82static const char * const bank4_names(struct threshold_block *b)
83{
84 switch (b->address) {
85 /* MSR4_MISC0 */
86 case 0x00000413:
87 return "dram";
88
89 case 0xc0000408:
90 return "ht_links";
91
92 case 0xc0000409:
93 return "l3_cache";
94
95 default:
96 WARN(1, "Funny MSR: 0x%08x\n", b->address);
97 return "";
98 }
99};
100
101
102static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits)
103{
104 /*
105 * bank 4 supports APIC LVT interrupts implicitly since forever.
106 */
107 if (bank == 4)
108 return true;
109
110 /*
111 * IntP: interrupt present; if this bit is set, the thresholding
112 * bank can generate APIC LVT interrupts
113 */
114 return msr_high_bits & BIT(28);
115}
116
117static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) 89static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
118{ 90{
119 int msr = (hi & MASK_LVTOFF_HI) >> 20; 91 int msr = (hi & MASK_LVTOFF_HI) >> 20;
@@ -135,10 +107,8 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
135 return 1; 107 return 1;
136}; 108};
137 109
138/* 110/* must be called with correct cpu affinity */
139 * Called via smp_call_function_single(), must be called with correct 111/* Called via smp_call_function_single() */
140 * cpu affinity.
141 */
142static void threshold_restart_bank(void *_tr) 112static void threshold_restart_bank(void *_tr)
143{ 113{
144 struct thresh_restart *tr = _tr; 114 struct thresh_restart *tr = _tr;
@@ -161,12 +131,6 @@ static void threshold_restart_bank(void *_tr)
161 (new_count & THRESHOLD_MAX); 131 (new_count & THRESHOLD_MAX);
162 } 132 }
163 133
164 /* clear IntType */
165 hi &= ~MASK_INT_TYPE_HI;
166
167 if (!tr->b->interrupt_capable)
168 goto done;
169
170 if (tr->set_lvt_off) { 134 if (tr->set_lvt_off) {
171 if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) { 135 if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) {
172 /* set new lvt offset */ 136 /* set new lvt offset */
@@ -175,10 +139,9 @@ static void threshold_restart_bank(void *_tr)
175 } 139 }
176 } 140 }
177 141
178 if (tr->b->interrupt_enable) 142 tr->b->interrupt_enable ?
179 hi |= INT_TYPE_APIC; 143 (hi = (hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
180 144 (hi &= ~MASK_INT_TYPE_HI);
181 done:
182 145
183 hi |= MASK_COUNT_EN_HI; 146 hi |= MASK_COUNT_EN_HI;
184 wrmsr(tr->b->address, lo, hi); 147 wrmsr(tr->b->address, lo, hi);
@@ -239,18 +202,18 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
239 202
240 if (!block) 203 if (!block)
241 per_cpu(bank_map, cpu) |= (1 << bank); 204 per_cpu(bank_map, cpu) |= (1 << bank);
205#ifdef CONFIG_SMP
206 if (shared_bank[bank] && c->cpu_core_id)
207 break;
208#endif
209 offset = setup_APIC_mce(offset,
210 (high & MASK_LVTOFF_HI) >> 20);
242 211
243 memset(&b, 0, sizeof(b)); 212 memset(&b, 0, sizeof(b));
244 b.cpu = cpu; 213 b.cpu = cpu;
245 b.bank = bank; 214 b.bank = bank;
246 b.block = block; 215 b.block = block;
247 b.address = address; 216 b.address = address;
248 b.interrupt_capable = lvt_interrupt_supported(bank, high);
249
250 if (b.interrupt_capable) {
251 int new = (high & MASK_LVTOFF_HI) >> 20;
252 offset = setup_APIC_mce(offset, new);
253 }
254 217
255 mce_threshold_block_init(&b, offset); 218 mce_threshold_block_init(&b, offset);
256 mce_threshold_vector = amd_threshold_interrupt; 219 mce_threshold_vector = amd_threshold_interrupt;
@@ -339,7 +302,7 @@ struct threshold_attr {
339#define SHOW_FIELDS(name) \ 302#define SHOW_FIELDS(name) \
340static ssize_t show_ ## name(struct threshold_block *b, char *buf) \ 303static ssize_t show_ ## name(struct threshold_block *b, char *buf) \
341{ \ 304{ \
342 return sprintf(buf, "%lu\n", (unsigned long) b->name); \ 305 return sprintf(buf, "%lx\n", (unsigned long) b->name); \
343} 306}
344SHOW_FIELDS(interrupt_enable) 307SHOW_FIELDS(interrupt_enable)
345SHOW_FIELDS(threshold_limit) 308SHOW_FIELDS(threshold_limit)
@@ -350,9 +313,6 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
350 struct thresh_restart tr; 313 struct thresh_restart tr;
351 unsigned long new; 314 unsigned long new;
352 315
353 if (!b->interrupt_capable)
354 return -EINVAL;
355
356 if (strict_strtoul(buf, 0, &new) < 0) 316 if (strict_strtoul(buf, 0, &new) < 0)
357 return -EINVAL; 317 return -EINVAL;
358 318
@@ -390,20 +350,37 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
390 return size; 350 return size;
391} 351}
392 352
393static ssize_t show_error_count(struct threshold_block *b, char *buf) 353struct threshold_block_cross_cpu {
354 struct threshold_block *tb;
355 long retval;
356};
357
358static void local_error_count_handler(void *_tbcc)
394{ 359{
395 u32 lo, hi; 360 struct threshold_block_cross_cpu *tbcc = _tbcc;
361 struct threshold_block *b = tbcc->tb;
362 u32 low, high;
396 363
397 rdmsr_on_cpu(b->cpu, b->address, &lo, &hi); 364 rdmsr(b->address, low, high);
365 tbcc->retval = (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit);
366}
398 367
399 return sprintf(buf, "%u\n", ((hi & THRESHOLD_MAX) - 368static ssize_t show_error_count(struct threshold_block *b, char *buf)
400 (THRESHOLD_MAX - b->threshold_limit))); 369{
370 struct threshold_block_cross_cpu tbcc = { .tb = b, };
371
372 smp_call_function_single(b->cpu, local_error_count_handler, &tbcc, 1);
373 return sprintf(buf, "%lx\n", tbcc.retval);
401} 374}
402 375
403static struct threshold_attr error_count = { 376static ssize_t store_error_count(struct threshold_block *b,
404 .attr = {.name = __stringify(error_count), .mode = 0444 }, 377 const char *buf, size_t count)
405 .show = show_error_count, 378{
406}; 379 struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 };
380
381 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
382 return 1;
383}
407 384
408#define RW_ATTR(val) \ 385#define RW_ATTR(val) \
409static struct threshold_attr val = { \ 386static struct threshold_attr val = { \
@@ -414,12 +391,13 @@ static struct threshold_attr val = { \
414 391
415RW_ATTR(interrupt_enable); 392RW_ATTR(interrupt_enable);
416RW_ATTR(threshold_limit); 393RW_ATTR(threshold_limit);
394RW_ATTR(error_count);
417 395
418static struct attribute *default_attrs[] = { 396static struct attribute *default_attrs[] = {
397 &interrupt_enable.attr,
419 &threshold_limit.attr, 398 &threshold_limit.attr,
420 &error_count.attr, 399 &error_count.attr,
421 NULL, /* possibly interrupt_enable if supported, see below */ 400 NULL
422 NULL,
423}; 401};
424 402
425#define to_block(k) container_of(k, struct threshold_block, kobj) 403#define to_block(k) container_of(k, struct threshold_block, kobj)
@@ -493,14 +471,8 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
493 b->cpu = cpu; 471 b->cpu = cpu;
494 b->address = address; 472 b->address = address;
495 b->interrupt_enable = 0; 473 b->interrupt_enable = 0;
496 b->interrupt_capable = lvt_interrupt_supported(bank, high);
497 b->threshold_limit = THRESHOLD_MAX; 474 b->threshold_limit = THRESHOLD_MAX;
498 475
499 if (b->interrupt_capable)
500 threshold_ktype.default_attrs[2] = &interrupt_enable.attr;
501 else
502 threshold_ktype.default_attrs[2] = NULL;
503
504 INIT_LIST_HEAD(&b->miscj); 476 INIT_LIST_HEAD(&b->miscj);
505 477
506 if (per_cpu(threshold_banks, cpu)[bank]->blocks) { 478 if (per_cpu(threshold_banks, cpu)[bank]->blocks) {
@@ -512,7 +484,7 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu,
512 484
513 err = kobject_init_and_add(&b->kobj, &threshold_ktype, 485 err = kobject_init_and_add(&b->kobj, &threshold_ktype,
514 per_cpu(threshold_banks, cpu)[bank]->kobj, 486 per_cpu(threshold_banks, cpu)[bank]->kobj,
515 (bank == 4 ? bank4_names(b) : th_names[bank])); 487 "misc%i", block);
516 if (err) 488 if (err)
517 goto out_free; 489 goto out_free;
518recurse: 490recurse:
@@ -543,91 +515,97 @@ out_free:
543 return err; 515 return err;
544} 516}
545 517
546static __cpuinit int __threshold_add_blocks(struct threshold_bank *b) 518static __cpuinit long
519local_allocate_threshold_blocks(int cpu, unsigned int bank)
547{ 520{
548 struct list_head *head = &b->blocks->miscj; 521 return allocate_threshold_blocks(cpu, bank, 0,
549 struct threshold_block *pos = NULL; 522 MSR_IA32_MC0_MISC + bank * 4);
550 struct threshold_block *tmp = NULL;
551 int err = 0;
552
553 err = kobject_add(&b->blocks->kobj, b->kobj, b->blocks->kobj.name);
554 if (err)
555 return err;
556
557 list_for_each_entry_safe(pos, tmp, head, miscj) {
558
559 err = kobject_add(&pos->kobj, b->kobj, pos->kobj.name);
560 if (err) {
561 list_for_each_entry_safe_reverse(pos, tmp, head, miscj)
562 kobject_del(&pos->kobj);
563
564 return err;
565 }
566 }
567 return err;
568} 523}
569 524
525/* symlinks sibling shared banks to first core. first core owns dir/files. */
570static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) 526static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
571{ 527{
572 struct device *dev = per_cpu(mce_device, cpu); 528 int i, err = 0;
573 struct amd_northbridge *nb = NULL;
574 struct threshold_bank *b = NULL; 529 struct threshold_bank *b = NULL;
575 const char *name = th_names[bank]; 530 char name[32];
576 int err = 0;
577 531
578 if (shared_bank[bank]) { 532 sprintf(name, "threshold_bank%i", bank);
579 nb = node_to_amd_nb(amd_get_nb_id(cpu)); 533
534#ifdef CONFIG_SMP
535 if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */
536 i = cpumask_first(cpu_llc_shared_mask(cpu));
537
538 /* first core not up yet */
539 if (cpu_data(i).cpu_core_id)
540 goto out;
580 541
581 /* threshold descriptor already initialized on this node? */ 542 /* already linked */
582 if (nb && nb->bank4) { 543 if (per_cpu(threshold_banks, cpu)[bank])
583 /* yes, use it */ 544 goto out;
584 b = nb->bank4;
585 err = kobject_add(b->kobj, &dev->kobj, name);
586 if (err)
587 goto out;
588 545
589 per_cpu(threshold_banks, cpu)[bank] = b; 546 b = per_cpu(threshold_banks, i)[bank];
590 atomic_inc(&b->cpus);
591 547
592 err = __threshold_add_blocks(b); 548 if (!b)
549 goto out;
593 550
551 err = sysfs_create_link(&per_cpu(mce_sysdev, cpu).kobj,
552 b->kobj, name);
553 if (err)
594 goto out; 554 goto out;
595 } 555
556 cpumask_copy(b->cpus, cpu_llc_shared_mask(cpu));
557 per_cpu(threshold_banks, cpu)[bank] = b;
558
559 goto out;
596 } 560 }
561#endif
597 562
598 b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); 563 b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
599 if (!b) { 564 if (!b) {
600 err = -ENOMEM; 565 err = -ENOMEM;
601 goto out; 566 goto out;
602 } 567 }
568 if (!zalloc_cpumask_var(&b->cpus, GFP_KERNEL)) {
569 kfree(b);
570 err = -ENOMEM;
571 goto out;
572 }
603 573
604 b->kobj = kobject_create_and_add(name, &dev->kobj); 574 b->kobj = kobject_create_and_add(name, &per_cpu(mce_sysdev, cpu).kobj);
605 if (!b->kobj) { 575 if (!b->kobj)
606 err = -EINVAL;
607 goto out_free; 576 goto out_free;
608 } 577
578#ifndef CONFIG_SMP
579 cpumask_setall(b->cpus);
580#else
581 cpumask_set_cpu(cpu, b->cpus);
582#endif
609 583
610 per_cpu(threshold_banks, cpu)[bank] = b; 584 per_cpu(threshold_banks, cpu)[bank] = b;
611 585
612 if (shared_bank[bank]) { 586 err = local_allocate_threshold_blocks(cpu, bank);
613 atomic_set(&b->cpus, 1); 587 if (err)
588 goto out_free;
614 589
615 /* nb is already initialized, see above */ 590 for_each_cpu(i, b->cpus) {
616 if (nb) { 591 if (i == cpu)
617 WARN_ON(nb->bank4); 592 continue;
618 nb->bank4 = b; 593
619 } 594 err = sysfs_create_link(&per_cpu(mce_sysdev, i).kobj,
595 b->kobj, name);
596 if (err)
597 goto out;
598
599 per_cpu(threshold_banks, i)[bank] = b;
620 } 600 }
621 601
622 err = allocate_threshold_blocks(cpu, bank, 0, 602 goto out;
623 MSR_IA32_MC0_MISC + bank * 4);
624 if (!err)
625 goto out;
626 603
627 out_free: 604out_free:
605 per_cpu(threshold_banks, cpu)[bank] = NULL;
606 free_cpumask_var(b->cpus);
628 kfree(b); 607 kfree(b);
629 608out:
630 out:
631 return err; 609 return err;
632} 610}
633 611
@@ -648,6 +626,12 @@ static __cpuinit int threshold_create_device(unsigned int cpu)
648 return err; 626 return err;
649} 627}
650 628
629/*
630 * let's be hotplug friendly.
631 * in case of multiple core processors, the first core always takes ownership
632 * of shared sysfs dir/files, and rest of the cores will be symlinked to it.
633 */
634
651static void deallocate_threshold_block(unsigned int cpu, 635static void deallocate_threshold_block(unsigned int cpu,
652 unsigned int bank) 636 unsigned int bank)
653{ 637{
@@ -668,42 +652,37 @@ static void deallocate_threshold_block(unsigned int cpu,
668 per_cpu(threshold_banks, cpu)[bank]->blocks = NULL; 652 per_cpu(threshold_banks, cpu)[bank]->blocks = NULL;
669} 653}
670 654
671static void __threshold_remove_blocks(struct threshold_bank *b)
672{
673 struct threshold_block *pos = NULL;
674 struct threshold_block *tmp = NULL;
675
676 kobject_del(b->kobj);
677
678 list_for_each_entry_safe(pos, tmp, &b->blocks->miscj, miscj)
679 kobject_del(&pos->kobj);
680}
681
682static void threshold_remove_bank(unsigned int cpu, int bank) 655static void threshold_remove_bank(unsigned int cpu, int bank)
683{ 656{
684 struct amd_northbridge *nb;
685 struct threshold_bank *b; 657 struct threshold_bank *b;
658 char name[32];
659 int i = 0;
686 660
687 b = per_cpu(threshold_banks, cpu)[bank]; 661 b = per_cpu(threshold_banks, cpu)[bank];
688 if (!b) 662 if (!b)
689 return; 663 return;
690
691 if (!b->blocks) 664 if (!b->blocks)
692 goto free_out; 665 goto free_out;
693 666
694 if (shared_bank[bank]) { 667 sprintf(name, "threshold_bank%i", bank);
695 if (!atomic_dec_and_test(&b->cpus)) { 668
696 __threshold_remove_blocks(b); 669#ifdef CONFIG_SMP
697 per_cpu(threshold_banks, cpu)[bank] = NULL; 670 /* sibling symlink */
698 return; 671 if (shared_bank[bank] && b->blocks->cpu != cpu) {
699 } else { 672 sysfs_remove_link(&per_cpu(mce_sysdev, cpu).kobj, name);
700 /* 673 per_cpu(threshold_banks, cpu)[bank] = NULL;
701 * the last CPU on this node using the shared bank is 674
702 * going away, remove that bank now. 675 return;
703 */ 676 }
704 nb = node_to_amd_nb(amd_get_nb_id(cpu)); 677#endif
705 nb->bank4 = NULL; 678
706 } 679 /* remove all sibling symlinks before unregistering */
680 for_each_cpu(i, b->cpus) {
681 if (i == cpu)
682 continue;
683
684 sysfs_remove_link(&per_cpu(mce_sysdev, i).kobj, name);
685 per_cpu(threshold_banks, i)[bank] = NULL;
707 } 686 }
708 687
709 deallocate_threshold_block(cpu, bank); 688 deallocate_threshold_block(cpu, bank);
@@ -711,6 +690,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
711free_out: 690free_out:
712 kobject_del(b->kobj); 691 kobject_del(b->kobj);
713 kobject_put(b->kobj); 692 kobject_put(b->kobj);
693 free_cpumask_var(b->cpus);
714 kfree(b); 694 kfree(b);
715 per_cpu(threshold_banks, cpu)[bank] = NULL; 695 per_cpu(threshold_banks, cpu)[bank] = NULL;
716} 696}
@@ -759,24 +739,4 @@ static __init int threshold_init_device(void)
759 739
760 return 0; 740 return 0;
761} 741}
762/* 742device_initcall(threshold_init_device);
763 * there are 3 funcs which need to be _initcalled in a logic sequence:
764 * 1. xen_late_init_mcelog
765 * 2. mcheck_init_device
766 * 3. threshold_init_device
767 *
768 * xen_late_init_mcelog must register xen_mce_chrdev_device before
769 * native mce_chrdev_device registration if running under xen platform;
770 *
771 * mcheck_init_device should be inited before threshold_init_device to
772 * initialize mce_device, otherwise a NULL ptr dereference will cause panic.
773 *
774 * so we use following _initcalls
775 * 1. device_initcall(xen_late_init_mcelog);
776 * 2. device_initcall_sync(mcheck_init_device);
777 * 3. late_initcall(threshold_init_device);
778 *
779 * when running under xen, the initcall order is 1,2,3;
780 * on baremetal, we skip 1 and we do only 2 and 3.
781 */
782late_initcall(threshold_init_device);
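
A minimal sketch, with made-up names, of the sysfs arrangement described in the right-hand column's comment: the first core owns the real threshold_bank%i directory and each sibling core only carries a symlink to it, created and torn down around CPU hotplug.

	#include <linux/kobject.h>
	#include <linux/sysfs.h>

	/* owner_dir is the kobject of the directory the first core created */
	static int link_sibling_to_owner(struct kobject *sibling_parent,
					 struct kobject *owner_dir,
					 const char *name)
	{
		/* creates sibling_parent/<name> pointing at owner_dir */
		return sysfs_create_link(sibling_parent, owner_dir, name);
	}

	static void unlink_sibling(struct kobject *sibling_parent, const char *name)
	{
		sysfs_remove_link(sibling_parent, name);
	}
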
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 402c454fbff..8694ef56459 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -15,8 +15,6 @@
15#include <asm/msr.h> 15#include <asm/msr.h>
16#include <asm/mce.h> 16#include <asm/mce.h>
17 17
18#include "mce-internal.h"
19
20/* 18/*
21 * Support for Intel Correct Machine Check Interrupts. This allows 19 * Support for Intel Correct Machine Check Interrupts. This allows
22 * the CPU to raise an interrupt when a corrected machine check happened. 20 * the CPU to raise an interrupt when a corrected machine check happened.
@@ -30,30 +28,15 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
30 * cmci_discover_lock protects against parallel discovery attempts 28 * cmci_discover_lock protects against parallel discovery attempts
31 * which could race against each other. 29 * which could race against each other.
32 */ 30 */
33static DEFINE_RAW_SPINLOCK(cmci_discover_lock); 31static DEFINE_SPINLOCK(cmci_discover_lock);
34
35#define CMCI_THRESHOLD 1
36#define CMCI_POLL_INTERVAL (30 * HZ)
37#define CMCI_STORM_INTERVAL (1 * HZ)
38#define CMCI_STORM_THRESHOLD 15
39
40static DEFINE_PER_CPU(unsigned long, cmci_time_stamp);
41static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt);
42static DEFINE_PER_CPU(unsigned int, cmci_storm_state);
43
44enum {
45 CMCI_STORM_NONE,
46 CMCI_STORM_ACTIVE,
47 CMCI_STORM_SUBSIDED,
48};
49 32
50static atomic_t cmci_storm_on_cpus; 33#define CMCI_THRESHOLD 1
51 34
52static int cmci_supported(int *banks) 35static int cmci_supported(int *banks)
53{ 36{
54 u64 cap; 37 u64 cap;
55 38
56 if (mca_cfg.cmci_disabled || mca_cfg.ignore_ce) 39 if (mce_cmci_disabled || mce_ignore_ce)
57 return 0; 40 return 0;
58 41
59 /* 42 /*
@@ -70,93 +53,6 @@ static int cmci_supported(int *banks)
70 return !!(cap & MCG_CMCI_P); 53 return !!(cap & MCG_CMCI_P);
71} 54}
72 55
73void mce_intel_cmci_poll(void)
74{
75 if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
76 return;
77 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
78}
79
80void mce_intel_hcpu_update(unsigned long cpu)
81{
82 if (per_cpu(cmci_storm_state, cpu) == CMCI_STORM_ACTIVE)
83 atomic_dec(&cmci_storm_on_cpus);
84
85 per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
86}
87
88unsigned long mce_intel_adjust_timer(unsigned long interval)
89{
90 int r;
91
92 if (interval < CMCI_POLL_INTERVAL)
93 return interval;
94
95 switch (__this_cpu_read(cmci_storm_state)) {
96 case CMCI_STORM_ACTIVE:
97 /*
98 * We switch back to interrupt mode once the poll timer has
99 * silenced itself. That means no events recorded and the
100 * timer interval is back to our poll interval.
101 */
102 __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED);
103 r = atomic_sub_return(1, &cmci_storm_on_cpus);
104 if (r == 0)
105 pr_notice("CMCI storm subsided: switching to interrupt mode\n");
106 /* FALLTHROUGH */
107
108 case CMCI_STORM_SUBSIDED:
109 /*
110 * We wait for all cpus to go back to SUBSIDED
111 * state. When that happens we switch back to
112 * interrupt mode.
113 */
114 if (!atomic_read(&cmci_storm_on_cpus)) {
115 __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
116 cmci_reenable();
117 cmci_recheck();
118 }
119 return CMCI_POLL_INTERVAL;
120 default:
121 /*
122 * We have shiny weather. Let the poll do whatever it
123 * thinks.
124 */
125 return interval;
126 }
127}
128
129static bool cmci_storm_detect(void)
130{
131 unsigned int cnt = __this_cpu_read(cmci_storm_cnt);
132 unsigned long ts = __this_cpu_read(cmci_time_stamp);
133 unsigned long now = jiffies;
134 int r;
135
136 if (__this_cpu_read(cmci_storm_state) != CMCI_STORM_NONE)
137 return true;
138
139 if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) {
140 cnt++;
141 } else {
142 cnt = 1;
143 __this_cpu_write(cmci_time_stamp, now);
144 }
145 __this_cpu_write(cmci_storm_cnt, cnt);
146
147 if (cnt <= CMCI_STORM_THRESHOLD)
148 return false;
149
150 cmci_clear();
151 __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
152 r = atomic_add_return(1, &cmci_storm_on_cpus);
153 mce_timer_kick(CMCI_POLL_INTERVAL);
154
155 if (r == 1)
156 pr_notice("CMCI storm detected: switching to poll mode\n");
157 return true;
158}
159
160/* 56/*
161 * The interrupt handler. This is called on every event. 57 * The interrupt handler. This is called on every event.
162 * Just call the poller directly to log any events. 58 * Just call the poller directly to log any events.
@@ -165,28 +61,33 @@ static bool cmci_storm_detect(void)
165 */ 61 */
166static void intel_threshold_interrupt(void) 62static void intel_threshold_interrupt(void)
167{ 63{
168 if (cmci_storm_detect())
169 return;
170 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); 64 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
171 mce_notify_irq(); 65 mce_notify_irq();
172} 66}
173 67
68static void print_update(char *type, int *hdr, int num)
69{
70 if (*hdr == 0)
71 printk(KERN_INFO "CPU %d MCA banks", smp_processor_id());
72 *hdr = 1;
73 printk(KERN_CONT " %s:%d", type, num);
74}
75
174/* 76/*
175 * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks 77 * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
176 * on this CPU. Use the algorithm recommended in the SDM to discover shared 78 * on this CPU. Use the algorithm recommended in the SDM to discover shared
177 * banks. 79 * banks.
178 */ 80 */
179static void cmci_discover(int banks) 81static void cmci_discover(int banks, int boot)
180{ 82{
181 unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned); 83 unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);
182 unsigned long flags; 84 unsigned long flags;
85 int hdr = 0;
183 int i; 86 int i;
184 int bios_wrong_thresh = 0;
185 87
186 raw_spin_lock_irqsave(&cmci_discover_lock, flags); 88 spin_lock_irqsave(&cmci_discover_lock, flags);
187 for (i = 0; i < banks; i++) { 89 for (i = 0; i < banks; i++) {
188 u64 val; 90 u64 val;
189 int bios_zero_thresh = 0;
190 91
191 if (test_bit(i, owned)) 92 if (test_bit(i, owned))
192 continue; 93 continue;
@@ -195,52 +96,29 @@ static void cmci_discover(int banks)
195 96
196 /* Already owned by someone else? */ 97 /* Already owned by someone else? */
197 if (val & MCI_CTL2_CMCI_EN) { 98 if (val & MCI_CTL2_CMCI_EN) {
198 clear_bit(i, owned); 99 if (test_and_clear_bit(i, owned) && !boot)
100 print_update("SHD", &hdr, i);
199 __clear_bit(i, __get_cpu_var(mce_poll_banks)); 101 __clear_bit(i, __get_cpu_var(mce_poll_banks));
200 continue; 102 continue;
201 } 103 }
202 104
203 if (!mca_cfg.bios_cmci_threshold) { 105 val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
204 val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; 106 val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD;
205 val |= CMCI_THRESHOLD;
206 } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) {
207 /*
208 * If bios_cmci_threshold boot option was specified
209 * but the threshold is zero, we'll try to initialize
210 * it to 1.
211 */
212 bios_zero_thresh = 1;
213 val |= CMCI_THRESHOLD;
214 }
215
216 val |= MCI_CTL2_CMCI_EN;
217 wrmsrl(MSR_IA32_MCx_CTL2(i), val); 107 wrmsrl(MSR_IA32_MCx_CTL2(i), val);
218 rdmsrl(MSR_IA32_MCx_CTL2(i), val); 108 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
219 109
220 /* Did the enable bit stick? -- the bank supports CMCI */ 110 /* Did the enable bit stick? -- the bank supports CMCI */
221 if (val & MCI_CTL2_CMCI_EN) { 111 if (val & MCI_CTL2_CMCI_EN) {
222 set_bit(i, owned); 112 if (!test_and_set_bit(i, owned) && !boot)
113 print_update("CMCI", &hdr, i);
223 __clear_bit(i, __get_cpu_var(mce_poll_banks)); 114 __clear_bit(i, __get_cpu_var(mce_poll_banks));
224 /*
225 * We are able to set thresholds for some banks that
226 * had a threshold of 0. This means the BIOS has not
227 * set the thresholds properly or does not work with
228 * this boot option. Note down now and report later.
229 */
230 if (mca_cfg.bios_cmci_threshold && bios_zero_thresh &&
231 (val & MCI_CTL2_CMCI_THRESHOLD_MASK))
232 bios_wrong_thresh = 1;
233 } else { 115 } else {
234 WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks))); 116 WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
235 } 117 }
236 } 118 }
237 raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); 119 spin_unlock_irqrestore(&cmci_discover_lock, flags);
238 if (mca_cfg.bios_cmci_threshold && bios_wrong_thresh) { 120 if (hdr)
239 pr_info_once( 121 printk(KERN_CONT "\n");
240 "bios_cmci_threshold: Some banks do not have valid thresholds set\n");
241 pr_info_once(
242 "bios_cmci_threshold: Make sure your BIOS supports this boot option\n");
243 }
244} 122}
245 123
246/* 124/*
@@ -272,28 +150,17 @@ void cmci_clear(void)
272 150
273 if (!cmci_supported(&banks)) 151 if (!cmci_supported(&banks))
274 return; 152 return;
275 raw_spin_lock_irqsave(&cmci_discover_lock, flags); 153 spin_lock_irqsave(&cmci_discover_lock, flags);
276 for (i = 0; i < banks; i++) { 154 for (i = 0; i < banks; i++) {
277 if (!test_bit(i, __get_cpu_var(mce_banks_owned))) 155 if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
278 continue; 156 continue;
279 /* Disable CMCI */ 157 /* Disable CMCI */
280 rdmsrl(MSR_IA32_MCx_CTL2(i), val); 158 rdmsrl(MSR_IA32_MCx_CTL2(i), val);
281 val &= ~MCI_CTL2_CMCI_EN; 159 val &= ~(MCI_CTL2_CMCI_EN|MCI_CTL2_CMCI_THRESHOLD_MASK);
282 wrmsrl(MSR_IA32_MCx_CTL2(i), val); 160 wrmsrl(MSR_IA32_MCx_CTL2(i), val);
283 __clear_bit(i, __get_cpu_var(mce_banks_owned)); 161 __clear_bit(i, __get_cpu_var(mce_banks_owned));
284 } 162 }
285 raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); 163 spin_unlock_irqrestore(&cmci_discover_lock, flags);
286}
287
288static long cmci_rediscover_work_func(void *arg)
289{
290 int banks;
291
292 /* Recheck banks in case CPUs don't all have the same */
293 if (cmci_supported(&banks))
294 cmci_discover(banks);
295
296 return 0;
297} 164}
298 165
299/* 166/*
@@ -302,22 +169,28 @@ static long cmci_rediscover_work_func(void *arg)
302 */ 169 */
303void cmci_rediscover(int dying) 170void cmci_rediscover(int dying)
304{ 171{
305 int cpu, banks; 172 int banks;
173 int cpu;
174 cpumask_var_t old;
306 175
307 if (!cmci_supported(&banks)) 176 if (!cmci_supported(&banks))
308 return; 177 return;
178 if (!alloc_cpumask_var(&old, GFP_KERNEL))
179 return;
180 cpumask_copy(old, &current->cpus_allowed);
309 181
310 for_each_online_cpu(cpu) { 182 for_each_online_cpu(cpu) {
311 if (cpu == dying) 183 if (cpu == dying)
312 continue; 184 continue;
313 185 if (set_cpus_allowed_ptr(current, cpumask_of(cpu)))
314 if (cpu == smp_processor_id()) {
315 cmci_rediscover_work_func(NULL);
316 continue; 186 continue;
317 } 187 /* Recheck banks in case CPUs don't all have the same */
318 188 if (cmci_supported(&banks))
319 work_on_cpu(cpu, cmci_rediscover_work_func, NULL); 189 cmci_discover(banks, 0);
320 } 190 }
191
192 set_cpus_allowed_ptr(current, old);
193 free_cpumask_var(old);
321} 194}
322 195
323/* 196/*
@@ -327,7 +200,7 @@ void cmci_reenable(void)
327{ 200{
328 int banks; 201 int banks;
329 if (cmci_supported(&banks)) 202 if (cmci_supported(&banks))
330 cmci_discover(banks); 203 cmci_discover(banks, 0);
331} 204}
332 205
333static void intel_init_cmci(void) 206static void intel_init_cmci(void)
@@ -338,7 +211,7 @@ static void intel_init_cmci(void)
338 return; 211 return;
339 212
340 mce_threshold_vector = intel_threshold_interrupt; 213 mce_threshold_vector = intel_threshold_interrupt;
341 cmci_discover(banks); 214 cmci_discover(banks, 1);
342 /* 215 /*
343 * For CPU #0 this runs with still disabled APIC, but that's 216 * For CPU #0 this runs with still disabled APIC, but that's
344 * ok because only the vector is set up. We still do another 217 * ok because only the vector is set up. We still do another
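
A condensed sketch of the discovery rule cmci_discover() applies in the hunks above, assuming the MSR helpers and MCI_CTL2_*/CMCI_THRESHOLD definitions already visible in this file: a CPU owns a bank only if the CMCI enable bit is still set after writing it and reading it back.

	#include <linux/types.h>

	static bool cmci_try_claim_bank(int i)
	{
		u64 val;

		rdmsrl(MSR_IA32_MCx_CTL2(i), val);
		if (val & MCI_CTL2_CMCI_EN)
			return false;		/* already owned by another CPU */

		val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD;
		wrmsrl(MSR_IA32_MCx_CTL2(i), val);
		rdmsrl(MSR_IA32_MCx_CTL2(i), val);

		return val & MCI_CTL2_CMCI_EN;	/* bit stuck => bank supports CMCI */
	}
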
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index 2d5454cd2c4..5c0e6533d9b 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -9,6 +9,7 @@
9#include <linux/smp.h> 9#include <linux/smp.h>
10 10
11#include <asm/processor.h> 11#include <asm/processor.h>
12#include <asm/system.h>
12#include <asm/mce.h> 13#include <asm/mce.h>
13#include <asm/msr.h> 14#include <asm/msr.h>
14 15
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 47a1870279a..27c625178bf 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -18,13 +18,14 @@
18#include <linux/jiffies.h> 18#include <linux/jiffies.h>
19#include <linux/kernel.h> 19#include <linux/kernel.h>
20#include <linux/percpu.h> 20#include <linux/percpu.h>
21#include <linux/export.h> 21#include <linux/sysdev.h>
22#include <linux/types.h> 22#include <linux/types.h>
23#include <linux/init.h> 23#include <linux/init.h>
24#include <linux/smp.h> 24#include <linux/smp.h>
25#include <linux/cpu.h> 25#include <linux/cpu.h>
26 26
27#include <asm/processor.h> 27#include <asm/processor.h>
28#include <asm/system.h>
28#include <asm/apic.h> 29#include <asm/apic.h>
29#include <asm/idle.h> 30#include <asm/idle.h>
30#include <asm/mce.h> 31#include <asm/mce.h>
@@ -67,16 +68,16 @@ static atomic_t therm_throt_en = ATOMIC_INIT(0);
67static u32 lvtthmr_init __read_mostly; 68static u32 lvtthmr_init __read_mostly;
68 69
69#ifdef CONFIG_SYSFS 70#ifdef CONFIG_SYSFS
70#define define_therm_throt_device_one_ro(_name) \ 71#define define_therm_throt_sysdev_one_ro(_name) \
71 static DEVICE_ATTR(_name, 0444, \ 72 static SYSDEV_ATTR(_name, 0444, \
72 therm_throt_device_show_##_name, \ 73 therm_throt_sysdev_show_##_name, \
73 NULL) \ 74 NULL) \
74 75
75#define define_therm_throt_device_show_func(event, name) \ 76#define define_therm_throt_sysdev_show_func(event, name) \
76 \ 77 \
77static ssize_t therm_throt_device_show_##event##_##name( \ 78static ssize_t therm_throt_sysdev_show_##event##_##name( \
78 struct device *dev, \ 79 struct sys_device *dev, \
79 struct device_attribute *attr, \ 80 struct sysdev_attribute *attr, \
80 char *buf) \ 81 char *buf) \
81{ \ 82{ \
82 unsigned int cpu = dev->id; \ 83 unsigned int cpu = dev->id; \
@@ -93,20 +94,20 @@ static ssize_t therm_throt_device_show_##event##_##name( \
93 return ret; \ 94 return ret; \
94} 95}
95 96
96define_therm_throt_device_show_func(core_throttle, count); 97define_therm_throt_sysdev_show_func(core_throttle, count);
97define_therm_throt_device_one_ro(core_throttle_count); 98define_therm_throt_sysdev_one_ro(core_throttle_count);
98 99
99define_therm_throt_device_show_func(core_power_limit, count); 100define_therm_throt_sysdev_show_func(core_power_limit, count);
100define_therm_throt_device_one_ro(core_power_limit_count); 101define_therm_throt_sysdev_one_ro(core_power_limit_count);
101 102
102define_therm_throt_device_show_func(package_throttle, count); 103define_therm_throt_sysdev_show_func(package_throttle, count);
103define_therm_throt_device_one_ro(package_throttle_count); 104define_therm_throt_sysdev_one_ro(package_throttle_count);
104 105
105define_therm_throt_device_show_func(package_power_limit, count); 106define_therm_throt_sysdev_show_func(package_power_limit, count);
106define_therm_throt_device_one_ro(package_power_limit_count); 107define_therm_throt_sysdev_one_ro(package_power_limit_count);
107 108
108static struct attribute *thermal_throttle_attrs[] = { 109static struct attribute *thermal_throttle_attrs[] = {
109 &dev_attr_core_throttle_count.attr, 110 &attr_core_throttle_count.attr,
110 NULL 111 NULL
111}; 112};
112 113
@@ -221,36 +222,36 @@ static int thresh_event_valid(int event)
221 222
222#ifdef CONFIG_SYSFS 223#ifdef CONFIG_SYSFS
223/* Add/Remove thermal_throttle interface for CPU device: */ 224/* Add/Remove thermal_throttle interface for CPU device: */
224static __cpuinit int thermal_throttle_add_dev(struct device *dev, 225static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev,
225 unsigned int cpu) 226 unsigned int cpu)
226{ 227{
227 int err; 228 int err;
228 struct cpuinfo_x86 *c = &cpu_data(cpu); 229 struct cpuinfo_x86 *c = &cpu_data(cpu);
229 230
230 err = sysfs_create_group(&dev->kobj, &thermal_attr_group); 231 err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group);
231 if (err) 232 if (err)
232 return err; 233 return err;
233 234
234 if (cpu_has(c, X86_FEATURE_PLN)) 235 if (cpu_has(c, X86_FEATURE_PLN))
235 err = sysfs_add_file_to_group(&dev->kobj, 236 err = sysfs_add_file_to_group(&sys_dev->kobj,
236 &dev_attr_core_power_limit_count.attr, 237 &attr_core_power_limit_count.attr,
237 thermal_attr_group.name); 238 thermal_attr_group.name);
238 if (cpu_has(c, X86_FEATURE_PTS)) { 239 if (cpu_has(c, X86_FEATURE_PTS)) {
239 err = sysfs_add_file_to_group(&dev->kobj, 240 err = sysfs_add_file_to_group(&sys_dev->kobj,
240 &dev_attr_package_throttle_count.attr, 241 &attr_package_throttle_count.attr,
241 thermal_attr_group.name); 242 thermal_attr_group.name);
242 if (cpu_has(c, X86_FEATURE_PLN)) 243 if (cpu_has(c, X86_FEATURE_PLN))
243 err = sysfs_add_file_to_group(&dev->kobj, 244 err = sysfs_add_file_to_group(&sys_dev->kobj,
244 &dev_attr_package_power_limit_count.attr, 245 &attr_package_power_limit_count.attr,
245 thermal_attr_group.name); 246 thermal_attr_group.name);
246 } 247 }
247 248
248 return err; 249 return err;
249} 250}
250 251
251static __cpuinit void thermal_throttle_remove_dev(struct device *dev) 252static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
252{ 253{
253 sysfs_remove_group(&dev->kobj, &thermal_attr_group); 254 sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group);
254} 255}
255 256
256/* Mutex protecting device creation against CPU hotplug: */ 257/* Mutex protecting device creation against CPU hotplug: */
@@ -263,16 +264,16 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,
263 void *hcpu) 264 void *hcpu)
264{ 265{
265 unsigned int cpu = (unsigned long)hcpu; 266 unsigned int cpu = (unsigned long)hcpu;
266 struct device *dev; 267 struct sys_device *sys_dev;
267 int err = 0; 268 int err = 0;
268 269
269 dev = get_cpu_device(cpu); 270 sys_dev = get_cpu_sysdev(cpu);
270 271
271 switch (action) { 272 switch (action) {
272 case CPU_UP_PREPARE: 273 case CPU_UP_PREPARE:
273 case CPU_UP_PREPARE_FROZEN: 274 case CPU_UP_PREPARE_FROZEN:
274 mutex_lock(&therm_cpu_lock); 275 mutex_lock(&therm_cpu_lock);
275 err = thermal_throttle_add_dev(dev, cpu); 276 err = thermal_throttle_add_dev(sys_dev, cpu);
276 mutex_unlock(&therm_cpu_lock); 277 mutex_unlock(&therm_cpu_lock);
277 WARN_ON(err); 278 WARN_ON(err);
278 break; 279 break;
@@ -281,7 +282,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,
281 case CPU_DEAD: 282 case CPU_DEAD:
282 case CPU_DEAD_FROZEN: 283 case CPU_DEAD_FROZEN:
283 mutex_lock(&therm_cpu_lock); 284 mutex_lock(&therm_cpu_lock);
284 thermal_throttle_remove_dev(dev); 285 thermal_throttle_remove_dev(sys_dev);
285 mutex_unlock(&therm_cpu_lock); 286 mutex_unlock(&therm_cpu_lock);
286 break; 287 break;
287 } 288 }
@@ -308,7 +309,7 @@ static __init int thermal_throttle_init_device(void)
308#endif 309#endif
309 /* connect live CPUs to sysfs */ 310 /* connect live CPUs to sysfs */
310 for_each_online_cpu(cpu) { 311 for_each_online_cpu(cpu) {
311 err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu); 312 err = thermal_throttle_add_dev(get_cpu_sysdev(cpu), cpu);
312 WARN_ON(err); 313 WARN_ON(err);
313 } 314 }
314#ifdef CONFIG_HOTPLUG_CPU 315#ifdef CONFIG_HOTPLUG_CPU
@@ -321,6 +322,17 @@ device_initcall(thermal_throttle_init_device);
321 322
322#endif /* CONFIG_SYSFS */ 323#endif /* CONFIG_SYSFS */
323 324
325/*
326 * Set up the most two significant bit to notify mce log that this thermal
327 * event type.
328 * This is a temp solution. May be changed in the future with mce log
329 * infrasture.
330 */
331#define CORE_THROTTLED (0)
332#define CORE_POWER_LIMIT ((__u64)1 << 62)
333#define PACKAGE_THROTTLED ((__u64)2 << 62)
334#define PACKAGE_POWER_LIMIT ((__u64)3 << 62)
335
324static void notify_thresholds(__u64 msr_val) 336static void notify_thresholds(__u64 msr_val)
325{ 337{
326 /* check whether the interrupt handler is defined; 338 /* check whether the interrupt handler is defined;
@@ -350,23 +362,27 @@ static void intel_thermal_interrupt(void)
350 if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, 362 if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
351 THERMAL_THROTTLING_EVENT, 363 THERMAL_THROTTLING_EVENT,
352 CORE_LEVEL) != 0) 364 CORE_LEVEL) != 0)
353 mce_log_therm_throt_event(msr_val); 365 mce_log_therm_throt_event(CORE_THROTTLED | msr_val);
354 366
355 if (this_cpu_has(X86_FEATURE_PLN)) 367 if (this_cpu_has(X86_FEATURE_PLN))
356 therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, 368 if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
357 POWER_LIMIT_EVENT, 369 POWER_LIMIT_EVENT,
358 CORE_LEVEL); 370 CORE_LEVEL) != 0)
371 mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val);
359 372
360 if (this_cpu_has(X86_FEATURE_PTS)) { 373 if (this_cpu_has(X86_FEATURE_PTS)) {
361 rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); 374 rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
362 therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, 375 if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
363 THERMAL_THROTTLING_EVENT, 376 THERMAL_THROTTLING_EVENT,
364 PACKAGE_LEVEL); 377 PACKAGE_LEVEL) != 0)
378 mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val);
365 if (this_cpu_has(X86_FEATURE_PLN)) 379 if (this_cpu_has(X86_FEATURE_PLN))
366 therm_throt_process(msr_val & 380 if (therm_throt_process(msr_val &
367 PACKAGE_THERM_STATUS_POWER_LIMIT, 381 PACKAGE_THERM_STATUS_POWER_LIMIT,
368 POWER_LIMIT_EVENT, 382 POWER_LIMIT_EVENT,
369 PACKAGE_LEVEL); 383 PACKAGE_LEVEL) != 0)
384 mce_log_therm_throt_event(PACKAGE_POWER_LIMIT
385 | msr_val);
370 } 386 }
371} 387}
372 388
@@ -380,8 +396,8 @@ static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
380 396
381asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) 397asmlinkage void smp_thermal_interrupt(struct pt_regs *regs)
382{ 398{
383 irq_enter();
384 exit_idle(); 399 exit_idle();
400 irq_enter();
385 inc_irq_stat(irq_thermal_count); 401 inc_irq_stat(irq_thermal_count);
386 smp_thermal_vector(); 402 smp_thermal_vector();
387 irq_exit(); 403 irq_exit();
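
A small helper sketch, not part of the patch, for the two-bit tag that the CORE_*/PACKAGE_* definitions earlier in this file fold into the top of the logged value; it assumes msr_val itself never sets bits 62/63, which is the premise of that "temp solution" comment.

	#include <linux/types.h>

	/*
	 * 0 = core throttled, 1 = core power limit,
	 * 2 = package throttled, 3 = package power limit
	 */
	static inline unsigned int therm_event_type(__u64 logged)
	{
		return logged >> 62;
	}
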
diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c
index aa578cadb94..d746df2909c 100644
--- a/arch/x86/kernel/cpu/mcheck/threshold.c
+++ b/arch/x86/kernel/cpu/mcheck/threshold.c
@@ -19,8 +19,8 @@ void (*mce_threshold_vector)(void) = default_threshold_interrupt;
19 19
20asmlinkage void smp_threshold_interrupt(void) 20asmlinkage void smp_threshold_interrupt(void)
21{ 21{
22 irq_enter();
23 exit_idle(); 22 exit_idle();
23 irq_enter();
24 inc_irq_stat(irq_threshold_count); 24 inc_irq_stat(irq_threshold_count);
25 mce_threshold_vector(); 25 mce_threshold_vector();
26 irq_exit(); 26 irq_exit();
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index 2d7998fb628..54060f56597 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -8,6 +8,7 @@
8#include <linux/init.h> 8#include <linux/init.h>
9 9
10#include <asm/processor.h> 10#include <asm/processor.h>
11#include <asm/system.h>
11#include <asm/mce.h> 12#include <asm/mce.h>
12#include <asm/msr.h> 13#include <asm/msr.h>
13 14
diff --git a/arch/x86/kernel/cpu/mkcapflags.pl b/arch/x86/kernel/cpu/mkcapflags.pl
index 091972ef49d..dfea390e160 100644
--- a/arch/x86/kernel/cpu/mkcapflags.pl
+++ b/arch/x86/kernel/cpu/mkcapflags.pl
@@ -1,4 +1,4 @@
1#!/usr/bin/perl -w 1#!/usr/bin/perl
2# 2#
3# Generate the x86_cap_flags[] array from include/asm-x86/cpufeature.h 3# Generate the x86_cap_flags[] array from include/asm-x86/cpufeature.h
4# 4#
@@ -8,41 +8,25 @@
8open(IN, "< $in\0") or die "$0: cannot open: $in: $!\n"; 8open(IN, "< $in\0") or die "$0: cannot open: $in: $!\n";
9open(OUT, "> $out\0") or die "$0: cannot create: $out: $!\n"; 9open(OUT, "> $out\0") or die "$0: cannot create: $out: $!\n";
10 10
11print OUT "#ifndef _ASM_X86_CPUFEATURE_H\n"; 11print OUT "#include <asm/cpufeature.h>\n\n";
12print OUT "#include <asm/cpufeature.h>\n";
13print OUT "#endif\n";
14print OUT "\n";
15print OUT "const char * const x86_cap_flags[NCAPINTS*32] = {\n"; 12print OUT "const char * const x86_cap_flags[NCAPINTS*32] = {\n";
16 13
17%features = ();
18$err = 0;
19
20while (defined($line = <IN>)) { 14while (defined($line = <IN>)) {
21 if ($line =~ /^\s*\#\s*define\s+(X86_FEATURE_(\S+))\s+(.*)$/) { 15 if ($line =~ /^\s*\#\s*define\s+(X86_FEATURE_(\S+))\s+(.*)$/) {
22 $macro = $1; 16 $macro = $1;
23 $feature = "\L$2"; 17 $feature = $2;
24 $tail = $3; 18 $tail = $3;
25 if ($tail =~ /\/\*\s*\"([^"]*)\".*\*\//) { 19 if ($tail =~ /\/\*\s*\"([^"]*)\".*\*\//) {
26 $feature = "\L$1"; 20 $feature = $1;
27 } 21 }
28 22
29 next if ($feature eq ''); 23 if ($feature ne '') {
30 24 printf OUT "\t%-32s = \"%s\",\n",
31 if ($features{$feature}++) { 25 "[$macro]", "\L$feature";
32 print STDERR "$in: duplicate feature name: $feature\n";
33 $err++;
34 } 26 }
35 printf OUT "\t%-32s = \"%s\",\n", "[$macro]", $feature;
36 } 27 }
37} 28}
38print OUT "};\n"; 29print OUT "};\n";
39 30
40close(IN); 31close(IN);
41close(OUT); 32close(OUT);
42
43if ($err) {
44 unlink($out);
45 exit(1);
46}
47
48exit(0);
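
For reference, an abridged sketch of the file this script generates (only three well-known entries shown; the real table covers every X86_FEATURE_* bit, and both versions of the script emit the same array shape):

	#include <asm/cpufeature.h>

	const char * const x86_cap_flags[NCAPINTS*32] = {
		[X86_FEATURE_FPU]                = "fpu",
		[X86_FEATURE_VME]                = "vme",
		[X86_FEATURE_PSE]                = "pse",
	};
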
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 0a630dd4b62..d944bf6c50e 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -11,8 +11,6 @@
11 */ 11 */
12 12
13#include <linux/types.h> 13#include <linux/types.h>
14#include <linux/time.h>
15#include <linux/clocksource.h>
16#include <linux/module.h> 14#include <linux/module.h>
17#include <asm/processor.h> 15#include <asm/processor.h>
18#include <asm/hypervisor.h> 16#include <asm/hypervisor.h>
@@ -38,25 +36,6 @@ static bool __init ms_hyperv_platform(void)
38 !memcmp("Microsoft Hv", hyp_signature, 12); 36 !memcmp("Microsoft Hv", hyp_signature, 12);
39} 37}
40 38
41static cycle_t read_hv_clock(struct clocksource *arg)
42{
43 cycle_t current_tick;
44 /*
45 * Read the partition counter to get the current tick count. This count
46 * is set to 0 when the partition is created and is incremented in
47 * 100 nanosecond units.
48 */
49 rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick);
50 return current_tick;
51}
52
53static struct clocksource hyperv_cs = {
54 .name = "hyperv_clocksource",
55 .rating = 400, /* use this when running on Hyperv*/
56 .read = read_hv_clock,
57 .mask = CLOCKSOURCE_MASK(64),
58};
59
60static void __init ms_hyperv_init_platform(void) 39static void __init ms_hyperv_init_platform(void)
61{ 40{
62 /* 41 /*
@@ -67,8 +46,6 @@ static void __init ms_hyperv_init_platform(void)
67 46
68 printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n", 47 printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n",
69 ms_hyperv.features, ms_hyperv.hints); 48 ms_hyperv.features, ms_hyperv.hints);
70
71 clocksource_register_hz(&hyperv_cs, NSEC_PER_SEC/100);
72} 49}
73 50
74const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { 51const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
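
As a quick check of the Hyper-V clocksource handled in this hunk: the partition reference counter advances in 100 ns units, so the version that registers it calls clocksource_register_hz(&hyperv_cs, NSEC_PER_SEC/100), i.e. 10^9 / 100 = 10,000,000 Hz — a 10 MHz counter, one tick per 100 ns, matching the comment above read_hv_clock().
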
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 35ffda5d072..ac140c7be39 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -258,15 +258,15 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk,
258 258
259 /* Compute the maximum size with which we can make a range: */ 259 /* Compute the maximum size with which we can make a range: */
260 if (range_startk) 260 if (range_startk)
261 max_align = __ffs(range_startk); 261 max_align = ffs(range_startk) - 1;
262 else 262 else
263 max_align = BITS_PER_LONG - 1; 263 max_align = 32;
264 264
265 align = __fls(range_sizek); 265 align = fls(range_sizek) - 1;
266 if (align > max_align) 266 if (align > max_align)
267 align = max_align; 267 align = max_align;
268 268
269 sizek = 1UL << align; 269 sizek = 1 << align;
270 if (debug_print) { 270 if (debug_print) {
271 char start_factor = 'K', size_factor = 'K'; 271 char start_factor = 'K', size_factor = 'K';
272 unsigned long start_base, size_base; 272 unsigned long start_base, size_base;
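
A worked example of the alignment computation in range_to_mtrr() above, with made-up inputs: for range_startk = 0x800 (a 2 MiB-aligned start expressed in KiB) and range_sizek = 0x1400 (5 MiB), ffs(0x800) - 1 = __ffs(0x800) = 11, so max_align = 11; fls(0x1400) - 1 = __fls(0x1400) = 12, which is clamped down to 11; sizek = 1 << 11 = 2048 KiB. The loop therefore carves out a 2 MiB MTRR first and continues with the remaining 3 MiB.
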
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index e9fe907cd24..a71efcdbb09 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -12,6 +12,7 @@
12#include <asm/processor-flags.h> 12#include <asm/processor-flags.h>
13#include <asm/cpufeature.h> 13#include <asm/cpufeature.h>
14#include <asm/tlbflush.h> 14#include <asm/tlbflush.h>
15#include <asm/system.h>
15#include <asm/mtrr.h> 16#include <asm/mtrr.h>
16#include <asm/msr.h> 17#include <asm/msr.h>
17#include <asm/pat.h> 18#include <asm/pat.h>
@@ -361,7 +362,11 @@ static void __init print_mtrr_state(void)
361 } 362 }
362 pr_debug("MTRR variable ranges %sabled:\n", 363 pr_debug("MTRR variable ranges %sabled:\n",
363 mtrr_state.enabled & 2 ? "en" : "dis"); 364 mtrr_state.enabled & 2 ? "en" : "dis");
364 high_width = (__ffs64(size_or_mask) - (32 - PAGE_SHIFT) + 3) / 4; 365 if (size_or_mask & 0xffffffffUL)
366 high_width = ffs(size_or_mask & 0xffffffffUL) - 1;
367 else
368 high_width = ffs(size_or_mask>>32) + 32 - 1;
369 high_width = (high_width - (32 - PAGE_SHIFT) + 3) / 4;
365 370
366 for (i = 0; i < num_var_ranges; ++i) { 371 for (i = 0; i < num_var_ranges; ++i) {
367 if (mtrr_state.var_ranges[i].mask_lo & (1 << 11)) 372 if (mtrr_state.var_ranges[i].mask_lo & (1 << 11))
@@ -542,7 +547,6 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
542 547
543 if (tmp != mask_lo) { 548 if (tmp != mask_lo) {
544 printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n"); 549 printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n");
545 add_taint(TAINT_FIRMWARE_WORKAROUND);
546 mask_lo = tmp; 550 mask_lo = tmp;
547 } 551 }
548 } 552 }
@@ -689,7 +693,6 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
689 693
690 /* Disable MTRRs, and set the default type to uncached */ 694 /* Disable MTRRs, and set the default type to uncached */
691 mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi); 695 mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi);
692 wbinvd();
693} 696}
694 697
695static void post_set(void) __releases(set_atomicity_lock) 698static void post_set(void) __releases(set_atomicity_lock)
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index a041e094b8b..79289632cb2 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -167,7 +167,6 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
167{ 167{
168 int err = 0; 168 int err = 0;
169 mtrr_type type; 169 mtrr_type type;
170 unsigned long base;
171 unsigned long size; 170 unsigned long size;
172 struct mtrr_sentry sentry; 171 struct mtrr_sentry sentry;
173 struct mtrr_gentry gentry; 172 struct mtrr_gentry gentry;
@@ -268,14 +267,14 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
268#endif 267#endif
269 if (gentry.regnum >= num_var_ranges) 268 if (gentry.regnum >= num_var_ranges)
270 return -EINVAL; 269 return -EINVAL;
271 mtrr_if->get(gentry.regnum, &base, &size, &type); 270 mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);
272 271
273 /* Hide entries that go above 4GB */ 272 /* Hide entries that go above 4GB */
274 if (base + size - 1 >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT)) 273 if (gentry.base + size - 1 >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT))
275 || size >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT))) 274 || size >= (1UL << (8 * sizeof(gentry.size) - PAGE_SHIFT)))
276 gentry.base = gentry.size = gentry.type = 0; 275 gentry.base = gentry.size = gentry.type = 0;
277 else { 276 else {
278 gentry.base = base << PAGE_SHIFT; 277 gentry.base <<= PAGE_SHIFT;
279 gentry.size = size << PAGE_SHIFT; 278 gentry.size = size << PAGE_SHIFT;
280 gentry.type = type; 279 gentry.type = type;
281 } 280 }
@@ -322,12 +321,11 @@ mtrr_ioctl(struct file *file, unsigned int cmd, unsigned long __arg)
322#endif 321#endif
323 if (gentry.regnum >= num_var_ranges) 322 if (gentry.regnum >= num_var_ranges)
324 return -EINVAL; 323 return -EINVAL;
325 mtrr_if->get(gentry.regnum, &base, &size, &type); 324 mtrr_if->get(gentry.regnum, &gentry.base, &size, &type);
326 /* Hide entries that would overflow */ 325 /* Hide entries that would overflow */
327 if (size != (__typeof__(gentry.size))size) 326 if (size != (__typeof__(gentry.size))size)
328 gentry.base = gentry.size = gentry.type = 0; 327 gentry.base = gentry.size = gentry.type = 0;
329 else { 328 else {
330 gentry.base = base;
331 gentry.size = size; 329 gentry.size = size;
332 gentry.type = type; 330 gentry.type = type;
333 } 331 }
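Editor's note: the two MTRRIOC*_GET_ENTRY hunks above store the page-frame base directly in gentry.base and zero the whole entry when the byte address would no longer fit the 32-bit fields. A standalone sketch of that overflow check; the struct name and the sample register contents are made up for illustration.

#include <stdio.h>

#define PAGE_SHIFT 12

/* Stand-in for the 32-bit mtrr_gentry: only the field widths matter here. */
struct gentry32 {
        unsigned int base;      /* page frame number, then bytes */
        unsigned int size;      /* pages, then bytes */
        unsigned int type;
};

static void fill_gentry(struct gentry32 *g, unsigned long base_pfn,
                        unsigned long size_pages, unsigned int type)
{
        /* Largest page count still representable once shifted back to bytes. */
        unsigned long limit = 1UL << (8 * sizeof(g->size) - PAGE_SHIFT);

        g->base = base_pfn;
        if (g->base + size_pages - 1 >= limit || size_pages >= limit) {
                /* Range reaches 4GB or above: hide it from 32-bit callers. */
                g->base = g->size = g->type = 0;
        } else {
                g->base <<= PAGE_SHIFT;
                g->size = size_pages << PAGE_SHIFT;
                g->type = type;
        }
}

int main(void)
{
        struct gentry32 g;

        fill_gentry(&g, 0xc0000, 0x10000, 0);   /* 256MB at 3GB: visible */
        printf("visible: base=%#x size=%#x\n", g.base, g.size);

        fill_gentry(&g, 0x100000, 0x10000, 0);  /* 256MB at 4GB: hidden */
        printf("hidden:  base=%#x size=%#x\n", g.base, g.size);
        return 0;
}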
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 726bf963c22..6b96110bb0c 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -606,7 +606,7 @@ void __init mtrr_bp_init(void)
606 606
607 /* 607 /*
608 * This is an AMD specific MSR, but we assume(hope?) that 608 * This is an AMD specific MSR, but we assume(hope?) that
609 * Intel will implement it too when they extend the address 609 * Intel will implement it too when they extend the address
610 * bus of the Xeon. 610 * bus of the Xeon.
611 */ 611 */
612 if (cpuid_eax(0x80000000) >= 0x80000008) { 612 if (cpuid_eax(0x80000000) >= 0x80000008) {
@@ -695,16 +695,11 @@ void mtrr_ap_init(void)
695} 695}
696 696
697/** 697/**
698 * Save current fixed-range MTRR state of the first cpu in cpu_online_mask. 698 * Save current fixed-range MTRR state of the BSP
699 */ 699 */
700void mtrr_save_state(void) 700void mtrr_save_state(void)
701{ 701{
702 int first_cpu; 702 smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1);
703
704 get_online_cpus();
705 first_cpu = cpumask_first(cpu_online_mask);
706 smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1);
707 put_online_cpus();
708} 703}
709 704
710void set_mtrr_aps_delayed_init(void) 705void set_mtrr_aps_delayed_init(void)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 4428fd178bc..cfa62ec090e 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -24,30 +24,302 @@
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/cpu.h> 25#include <linux/cpu.h>
26#include <linux/bitops.h> 26#include <linux/bitops.h>
27#include <linux/device.h>
28 27
29#include <asm/apic.h> 28#include <asm/apic.h>
30#include <asm/stacktrace.h> 29#include <asm/stacktrace.h>
31#include <asm/nmi.h> 30#include <asm/nmi.h>
31#include <asm/compat.h>
32#include <asm/smp.h> 32#include <asm/smp.h>
33#include <asm/alternative.h> 33#include <asm/alternative.h>
34#include <asm/timer.h>
35#include <asm/desc.h>
36#include <asm/ldt.h>
37 34
38#include "perf_event.h" 35#if 0
36#undef wrmsrl
37#define wrmsrl(msr, val) \
38do { \
39 trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\
40 (unsigned long)(val)); \
41 native_write_msr((msr), (u32)((u64)(val)), \
42 (u32)((u64)(val) >> 32)); \
43} while (0)
44#endif
45
46/*
47 * | NHM/WSM | SNB |
48 * register -------------------------------
49 * | HT | no HT | HT | no HT |
50 *-----------------------------------------
51 * offcore | core | core | cpu | core |
52 * lbr_sel | core | core | cpu | core |
53 * ld_lat | cpu | core | cpu | core |
54 *-----------------------------------------
55 *
56 * Given that there is a small number of shared regs,
57 * we can pre-allocate their slot in the per-cpu
58 * per-core reg tables.
59 */
60enum extra_reg_type {
61 EXTRA_REG_NONE = -1, /* not used */
62
63 EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */
64 EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */
65
66 EXTRA_REG_MAX /* number of entries needed */
67};
68
69struct event_constraint {
70 union {
71 unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
72 u64 idxmsk64;
73 };
74 u64 code;
75 u64 cmask;
76 int weight;
77};
78
79struct amd_nb {
80 int nb_id; /* NorthBridge id */
81 int refcnt; /* reference count */
82 struct perf_event *owners[X86_PMC_IDX_MAX];
83 struct event_constraint event_constraints[X86_PMC_IDX_MAX];
84};
85
86struct intel_percore;
87
88#define MAX_LBR_ENTRIES 16
89
90struct cpu_hw_events {
91 /*
92 * Generic x86 PMC bits
93 */
94 struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
95 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
96 unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
97 int enabled;
98
99 int n_events;
100 int n_added;
101 int n_txn;
102 int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
103 u64 tags[X86_PMC_IDX_MAX];
104 struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
105
106 unsigned int group_flag;
107
108 /*
109 * Intel DebugStore bits
110 */
111 struct debug_store *ds;
112 u64 pebs_enabled;
113
114 /*
115 * Intel LBR bits
116 */
117 int lbr_users;
118 void *lbr_context;
119 struct perf_branch_stack lbr_stack;
120 struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
121
122 /*
123 * manage shared (per-core, per-cpu) registers
124 * used on Intel NHM/WSM/SNB
125 */
126 struct intel_shared_regs *shared_regs;
127
128 /*
129 * AMD specific bits
130 */
131 struct amd_nb *amd_nb;
132};
133
134#define __EVENT_CONSTRAINT(c, n, m, w) {\
135 { .idxmsk64 = (n) }, \
136 .code = (c), \
137 .cmask = (m), \
138 .weight = (w), \
139}
140
141#define EVENT_CONSTRAINT(c, n, m) \
142 __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n))
143
144/*
145 * Constraint on the Event code.
146 */
147#define INTEL_EVENT_CONSTRAINT(c, n) \
148 EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)
149
150/*
151 * Constraint on the Event code + UMask + fixed-mask
152 *
153 * filter mask to validate fixed counter events.
154 * the following filters disqualify for fixed counters:
155 * - inv
156 * - edge
157 * - cnt-mask
158 * The other filters are supported by fixed counters.
159 * The any-thread option is supported starting with v3.
160 */
161#define FIXED_EVENT_CONSTRAINT(c, n) \
162 EVENT_CONSTRAINT(c, (1ULL << (32+n)), X86_RAW_EVENT_MASK)
163
164/*
165 * Constraint on the Event code + UMask
166 */
167#define INTEL_UEVENT_CONSTRAINT(c, n) \
168 EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
169
170#define EVENT_CONSTRAINT_END \
171 EVENT_CONSTRAINT(0, 0, 0)
172
173#define for_each_event_constraint(e, c) \
174 for ((e) = (c); (e)->weight; (e)++)
175
176/*
177 * Per register state.
178 */
179struct er_account {
180 raw_spinlock_t lock; /* per-core: protect structure */
181 u64 config; /* extra MSR config */
182 u64 reg; /* extra MSR number */
183 atomic_t ref; /* reference count */
184};
185
186/*
187 * Extra registers for specific events.
188 *
189 * Some events need large masks and require external MSRs.
190 * Those extra MSRs end up being shared for all events on
191 * a PMU and sometimes between PMU of sibling HT threads.
192 * In either case, the kernel needs to handle conflicting
193 * accesses to those extra, shared, regs. The data structure
194 * to manage those registers is stored in cpu_hw_event.
195 */
196struct extra_reg {
197 unsigned int event;
198 unsigned int msr;
199 u64 config_mask;
200 u64 valid_mask;
201 int idx; /* per_xxx->regs[] reg index */
202};
203
204#define EVENT_EXTRA_REG(e, ms, m, vm, i) { \
205 .event = (e), \
206 .msr = (ms), \
207 .config_mask = (m), \
208 .valid_mask = (vm), \
209 .idx = EXTRA_REG_##i \
210 }
211
212#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx) \
213 EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx)
214
215#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0)
216
217union perf_capabilities {
218 struct {
219 u64 lbr_format : 6;
220 u64 pebs_trap : 1;
221 u64 pebs_arch_reg : 1;
222 u64 pebs_format : 4;
223 u64 smm_freeze : 1;
224 };
225 u64 capabilities;
226};
227
228/*
229 * struct x86_pmu - generic x86 pmu
230 */
231struct x86_pmu {
232 /*
233 * Generic x86 PMC bits
234 */
235 const char *name;
236 int version;
237 int (*handle_irq)(struct pt_regs *);
238 void (*disable_all)(void);
239 void (*enable_all)(int added);
240 void (*enable)(struct perf_event *);
241 void (*disable)(struct perf_event *);
242 int (*hw_config)(struct perf_event *event);
243 int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
244 unsigned eventsel;
245 unsigned perfctr;
246 u64 (*event_map)(int);
247 int max_events;
248 int num_counters;
249 int num_counters_fixed;
250 int cntval_bits;
251 u64 cntval_mask;
252 int apic;
253 u64 max_period;
254 struct event_constraint *
255 (*get_event_constraints)(struct cpu_hw_events *cpuc,
256 struct perf_event *event);
257
258 void (*put_event_constraints)(struct cpu_hw_events *cpuc,
259 struct perf_event *event);
260 struct event_constraint *event_constraints;
261 void (*quirks)(void);
262 int perfctr_second_write;
263
264 int (*cpu_prepare)(int cpu);
265 void (*cpu_starting)(int cpu);
266 void (*cpu_dying)(int cpu);
267 void (*cpu_dead)(int cpu);
268
269 /*
270 * Intel Arch Perfmon v2+
271 */
272 u64 intel_ctrl;
273 union perf_capabilities intel_cap;
274
275 /*
276 * Intel DebugStore bits
277 */
278 int bts, pebs;
279 int bts_active, pebs_active;
280 int pebs_record_size;
281 void (*drain_pebs)(struct pt_regs *regs);
282 struct event_constraint *pebs_constraints;
283
284 /*
285 * Intel LBR
286 */
287 unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */
288 int lbr_nr; /* hardware stack size */
289
290 /*
291 * Extra registers for events
292 */
293 struct extra_reg *extra_regs;
294 unsigned int er_flags;
295};
39 296
40struct x86_pmu x86_pmu __read_mostly; 297#define ERF_NO_HT_SHARING 1
298#define ERF_HAS_RSP_1 2
41 299
42DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { 300static struct x86_pmu x86_pmu __read_mostly;
301
302static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
43 .enabled = 1, 303 .enabled = 1,
44}; 304};
45 305
46u64 __read_mostly hw_cache_event_ids 306static int x86_perf_event_set_period(struct perf_event *event);
307
308/*
309 * Generalized hw caching related hw_event table, filled
310 * in on a per model basis. A value of 0 means
311 * 'not supported', -1 means 'hw_event makes no sense on
312 * this CPU', any other value means the raw hw_event
313 * ID.
314 */
315
316#define C(x) PERF_COUNT_HW_CACHE_##x
317
318static u64 __read_mostly hw_cache_event_ids
47 [PERF_COUNT_HW_CACHE_MAX] 319 [PERF_COUNT_HW_CACHE_MAX]
48 [PERF_COUNT_HW_CACHE_OP_MAX] 320 [PERF_COUNT_HW_CACHE_OP_MAX]
49 [PERF_COUNT_HW_CACHE_RESULT_MAX]; 321 [PERF_COUNT_HW_CACHE_RESULT_MAX];
50u64 __read_mostly hw_cache_extra_regs 322static u64 __read_mostly hw_cache_extra_regs
51 [PERF_COUNT_HW_CACHE_MAX] 323 [PERF_COUNT_HW_CACHE_MAX]
52 [PERF_COUNT_HW_CACHE_OP_MAX] 324 [PERF_COUNT_HW_CACHE_OP_MAX]
53 [PERF_COUNT_HW_CACHE_RESULT_MAX]; 325 [PERF_COUNT_HW_CACHE_RESULT_MAX];
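Editor's note: much of the block added above (event_constraint, the EVENT_CONSTRAINT*/EVENT_EXTRA_REG macros, struct x86_pmu) is the perf_event.h content folded back into perf_event.c. As a rough illustration of how such a constraint table is consumed -- compare x86_get_event_constraints() further down in this file -- here is a minimal userspace sketch; the 0x3c event code and the counter masks are hypothetical, not taken from any real PMU table.

#include <stdio.h>
#include <stdint.h>

/* Trimmed-down event_constraint: which counters an event may use and which
 * config bits select the constraint. */
struct constraint {
        uint64_t idxmsk;        /* bit i set => counter i allowed */
        uint64_t code;          /* event code the constraint applies to */
        uint64_t cmask;         /* config bits compared against code */
};

#define EVENTSEL_EVENT 0x00ffULL        /* low byte of the config = event code */

/* Hypothetical table: event 0x3c restricted to counters 0-1; the zero entry
 * terminates the table (the real code checks ->weight instead). */
static const struct constraint table[] = {
        { .idxmsk = 0x3, .code = 0x3c, .cmask = EVENTSEL_EVENT },
        { 0, 0, 0 },
};

static uint64_t allowed_counters(uint64_t config, uint64_t all_counters)
{
        const struct constraint *c;

        for (c = table; c->idxmsk; c++)
                if ((config & c->cmask) == c->code)
                        return c->idxmsk;
        return all_counters;            /* "unconstrained" */
}

int main(void)
{
        printf("event 0x3c -> counter mask %#llx\n",
               (unsigned long long)allowed_counters(0x53003c, 0xf));
        printf("event 0xc0 -> counter mask %#llx\n",
               (unsigned long long)allowed_counters(0x5300c0, 0xf));
        return 0;
}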
@@ -57,7 +329,8 @@ u64 __read_mostly hw_cache_extra_regs
57 * Can only be executed on the CPU where the event is active. 329 * Can only be executed on the CPU where the event is active.
58 * Returns the delta events processed. 330 * Returns the delta events processed.
59 */ 331 */
60u64 x86_perf_event_update(struct perf_event *event) 332static u64
333x86_perf_event_update(struct perf_event *event)
61{ 334{
62 struct hw_perf_event *hwc = &event->hw; 335 struct hw_perf_event *hwc = &event->hw;
63 int shift = 64 - x86_pmu.cntval_bits; 336 int shift = 64 - x86_pmu.cntval_bits;
@@ -65,7 +338,7 @@ u64 x86_perf_event_update(struct perf_event *event)
65 int idx = hwc->idx; 338 int idx = hwc->idx;
66 s64 delta; 339 s64 delta;
67 340
68 if (idx == INTEL_PMC_IDX_FIXED_BTS) 341 if (idx == X86_PMC_IDX_FIXED_BTS)
69 return 0; 342 return 0;
70 343
71 /* 344 /*
@@ -77,7 +350,7 @@ u64 x86_perf_event_update(struct perf_event *event)
77 */ 350 */
78again: 351again:
79 prev_raw_count = local64_read(&hwc->prev_count); 352 prev_raw_count = local64_read(&hwc->prev_count);
80 rdpmcl(hwc->event_base_rdpmc, new_raw_count); 353 rdmsrl(hwc->event_base, new_raw_count);
81 354
82 if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, 355 if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
83 new_raw_count) != prev_raw_count) 356 new_raw_count) != prev_raw_count)
@@ -100,6 +373,30 @@ again:
100 return new_raw_count; 373 return new_raw_count;
101} 374}
102 375
376static inline int x86_pmu_addr_offset(int index)
377{
378 int offset;
379
380 /* offset = X86_FEATURE_PERFCTR_CORE ? index << 1 : index */
381 alternative_io(ASM_NOP2,
382 "shll $1, %%eax",
383 X86_FEATURE_PERFCTR_CORE,
384 "=a" (offset),
385 "a" (index));
386
387 return offset;
388}
389
390static inline unsigned int x86_pmu_config_addr(int index)
391{
392 return x86_pmu.eventsel + x86_pmu_addr_offset(index);
393}
394
395static inline unsigned int x86_pmu_event_addr(int index)
396{
397 return x86_pmu.perfctr + x86_pmu_addr_offset(index);
398}
399
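Editor's note: x86_pmu_addr_offset() above uses alternative_io() to double the register index at runtime when X86_FEATURE_PERFCTR_CORE is set, because that PMU interleaves event-select and counter MSRs. Below is a plain-C sketch of the resulting address math; the MSR bases shown are only an example (AMD's classic MSR_K7_EVNTSEL0/MSR_K7_PERFCTR0 pair) and the real bases come from x86_pmu.eventsel / x86_pmu.perfctr.

#include <stdio.h>

#define EVENTSEL_BASE 0xc0010000u       /* example base only */
#define PERFCTR_BASE  0xc0010004u       /* example base only */

/* alternative_io() patches in "shll $1, %eax" on PERFCTR_CORE CPUs,
 * i.e. the per-counter offset simply doubles. */
static unsigned int addr_offset(int index, int has_perfctr_core)
{
        return has_perfctr_core ? (unsigned int)index << 1 : (unsigned int)index;
}

int main(void)
{
        int idx;

        for (idx = 0; idx < 4; idx++)
                printf("counter %d: evsel %#x, ctr %#x (interleaved evsel %#x)\n",
                       idx,
                       EVENTSEL_BASE + addr_offset(idx, 0),
                       PERFCTR_BASE  + addr_offset(idx, 0),
                       EVENTSEL_BASE + addr_offset(idx, 1));
        return 0;
}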
103/* 400/*
104 * Find and validate any extra registers to set up. 401 * Find and validate any extra registers to set up.
105 */ 402 */
@@ -180,7 +477,7 @@ static void release_pmc_hardware(void) {}
180 477
181static bool check_hw_exists(void) 478static bool check_hw_exists(void)
182{ 479{
183 u64 val, val_new = ~0; 480 u64 val, val_new = 0;
184 int i, reg, ret = 0; 481 int i, reg, ret = 0;
185 482
186 /* 483 /*
@@ -208,16 +505,13 @@ static bool check_hw_exists(void)
208 } 505 }
209 506
210 /* 507 /*
211 * Read the current value, change it and read it back to see if it 508 * Now write a value and read it back to see if it matches,
212 * matches, this is needed to detect certain hardware emulators 509 * this is needed to detect certain hardware emulators (qemu/kvm)
213 * (qemu/kvm) that don't trap on the MSR access and always return 0s. 510 * that don't trap on the MSR access and always return 0s.
214 */ 511 */
215 reg = x86_pmu_event_addr(0); 512 val = 0xabcdUL;
216 if (rdmsrl_safe(reg, &val)) 513 ret = checking_wrmsrl(x86_pmu_event_addr(0), val);
217 goto msr_fail; 514 ret |= rdmsrl_safe(x86_pmu_event_addr(0), &val_new);
218 val ^= 0xffffUL;
219 ret = wrmsrl_safe(reg, val);
220 ret |= rdmsrl_safe(reg, &val_new);
221 if (ret || val != val_new) 515 if (ret || val != val_new)
222 goto msr_fail; 516 goto msr_fail;
223 517
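Editor's note: the restored probe above writes 0xabcd into counter 0 and reads it back, since emulated PMUs (qemu/kvm at the time) silently ignore the write and keep returning 0. A rough userspace analogue using the /dev/cpu/N/msr interface is sketched below; it needs root and the msr module, recent kernels may block MSR writes entirely, and MSR 0xc1 (the first general-purpose counter on Intel-style PMUs) is an assumption about the CPU, so treat this as a probe sketch, not a faithful reimplementation.

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>

#define PMC0 0xc1       /* assumed: first general-purpose counter MSR */

int main(void)
{
        uint64_t val = 0xabcdULL, readback = 0;
        int fd = open("/dev/cpu/0/msr", O_RDWR);

        if (fd < 0) {
                perror("open /dev/cpu/0/msr");
                return 1;
        }
        /* If the counter is currently counting, the readback may legitimately
         * differ; this is only a coarse "does the MSR respond" check. */
        if (pwrite(fd, &val, sizeof(val), PMC0) != sizeof(val) ||
            pread(fd, &readback, sizeof(readback), PMC0) != sizeof(readback) ||
            readback != val)
                printf("counter MSR not usable (emulated or busy PMU?)\n");
        else
                printf("counter MSR responds: wrote %#llx, read back %#llx\n",
                       (unsigned long long)val, (unsigned long long)readback);
        close(fd);
        return 0;
}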
@@ -234,11 +528,13 @@ bios_fail:
234 528
235msr_fail: 529msr_fail:
236 printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n"); 530 printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n");
237 printk(KERN_ERR "Failed to access perfctr msr (MSR %x is %Lx)\n", reg, val_new);
238 531
239 return false; 532 return false;
240} 533}
241 534
535static void reserve_ds_buffers(void);
536static void release_ds_buffers(void);
537
242static void hw_perf_event_destroy(struct perf_event *event) 538static void hw_perf_event_destroy(struct perf_event *event)
243{ 539{
244 if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) { 540 if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
@@ -287,7 +583,7 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
287 return x86_pmu_extra_regs(val, event); 583 return x86_pmu_extra_regs(val, event);
288} 584}
289 585
290int x86_setup_perfctr(struct perf_event *event) 586static int x86_setup_perfctr(struct perf_event *event)
291{ 587{
292 struct perf_event_attr *attr = &event->attr; 588 struct perf_event_attr *attr = &event->attr;
293 struct hw_perf_event *hwc = &event->hw; 589 struct hw_perf_event *hwc = &event->hw;
@@ -308,8 +604,12 @@ int x86_setup_perfctr(struct perf_event *event)
308 return -EOPNOTSUPP; 604 return -EOPNOTSUPP;
309 } 605 }
310 606
607 /*
608 * Do not allow config1 (extended registers) to propagate,
609 * there's no sane user-space generalization yet:
610 */
311 if (attr->type == PERF_TYPE_RAW) 611 if (attr->type == PERF_TYPE_RAW)
312 return x86_pmu_extra_regs(event->attr.config, event); 612 return 0;
313 613
314 if (attr->type == PERF_TYPE_HW_CACHE) 614 if (attr->type == PERF_TYPE_HW_CACHE)
315 return set_ext_hw_attr(hwc, event); 615 return set_ext_hw_attr(hwc, event);
@@ -340,9 +640,6 @@ int x86_setup_perfctr(struct perf_event *event)
340 /* BTS is currently only allowed for user-mode. */ 640 /* BTS is currently only allowed for user-mode. */
341 if (!attr->exclude_kernel) 641 if (!attr->exclude_kernel)
342 return -EOPNOTSUPP; 642 return -EOPNOTSUPP;
343
344 if (!attr->exclude_guest)
345 return -EOPNOTSUPP;
346 } 643 }
347 644
348 hwc->config |= config; 645 hwc->config |= config;
@@ -350,46 +647,13 @@ int x86_setup_perfctr(struct perf_event *event)
350 return 0; 647 return 0;
351} 648}
352 649
353/* 650static int x86_pmu_hw_config(struct perf_event *event)
354 * check that branch_sample_type is compatible with
355 * settings needed for precise_ip > 1 which implies
356 * using the LBR to capture ALL taken branches at the
357 * priv levels of the measurement
358 */
359static inline int precise_br_compat(struct perf_event *event)
360{
361 u64 m = event->attr.branch_sample_type;
362 u64 b = 0;
363
364 /* must capture all branches */
365 if (!(m & PERF_SAMPLE_BRANCH_ANY))
366 return 0;
367
368 m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER;
369
370 if (!event->attr.exclude_user)
371 b |= PERF_SAMPLE_BRANCH_USER;
372
373 if (!event->attr.exclude_kernel)
374 b |= PERF_SAMPLE_BRANCH_KERNEL;
375
376 /*
377 * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86
378 */
379
380 return m == b;
381}
382
383int x86_pmu_hw_config(struct perf_event *event)
384{ 651{
385 if (event->attr.precise_ip) { 652 if (event->attr.precise_ip) {
386 int precise = 0; 653 int precise = 0;
387 654
388 if (!event->attr.exclude_guest)
389 return -EOPNOTSUPP;
390
391 /* Support for constant skid */ 655 /* Support for constant skid */
392 if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { 656 if (x86_pmu.pebs_active) {
393 precise++; 657 precise++;
394 658
395 /* Support for IP fixup */ 659 /* Support for IP fixup */
@@ -399,36 +663,6 @@ int x86_pmu_hw_config(struct perf_event *event)
399 663
400 if (event->attr.precise_ip > precise) 664 if (event->attr.precise_ip > precise)
401 return -EOPNOTSUPP; 665 return -EOPNOTSUPP;
402 /*
403 * check that PEBS LBR correction does not conflict with
404 * whatever the user is asking with attr->branch_sample_type
405 */
406 if (event->attr.precise_ip > 1) {
407 u64 *br_type = &event->attr.branch_sample_type;
408
409 if (has_branch_stack(event)) {
410 if (!precise_br_compat(event))
411 return -EOPNOTSUPP;
412
413 /* branch_sample_type is compatible */
414
415 } else {
416 /*
417 * user did not specify branch_sample_type
418 *
419 * For PEBS fixups, we capture all
420 * the branches at the priv level of the
421 * event.
422 */
423 *br_type = PERF_SAMPLE_BRANCH_ANY;
424
425 if (!event->attr.exclude_user)
426 *br_type |= PERF_SAMPLE_BRANCH_USER;
427
428 if (!event->attr.exclude_kernel)
429 *br_type |= PERF_SAMPLE_BRANCH_KERNEL;
430 }
431 }
432 } 666 }
433 667
434 /* 668 /*
@@ -485,12 +719,11 @@ static int __x86_pmu_event_init(struct perf_event *event)
485 719
486 /* mark unused */ 720 /* mark unused */
487 event->hw.extra_reg.idx = EXTRA_REG_NONE; 721 event->hw.extra_reg.idx = EXTRA_REG_NONE;
488 event->hw.branch_reg.idx = EXTRA_REG_NONE;
489 722
490 return x86_pmu.hw_config(event); 723 return x86_pmu.hw_config(event);
491} 724}
492 725
493void x86_pmu_disable_all(void) 726static void x86_pmu_disable_all(void)
494{ 727{
495 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 728 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
496 int idx; 729 int idx;
@@ -525,7 +758,15 @@ static void x86_pmu_disable(struct pmu *pmu)
525 x86_pmu.disable_all(); 758 x86_pmu.disable_all();
526} 759}
527 760
528void x86_pmu_enable_all(int added) 761static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
762 u64 enable_mask)
763{
764 if (hwc->extra_reg.reg)
765 wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config);
766 wrmsrl(hwc->config_base, hwc->config | enable_mask);
767}
768
769static void x86_pmu_enable_all(int added)
529{ 770{
530 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 771 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
531 int idx; 772 int idx;
@@ -547,195 +788,18 @@ static inline int is_x86_event(struct perf_event *event)
547 return event->pmu == &pmu; 788 return event->pmu == &pmu;
548} 789}
549 790
550/* 791static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
551 * Event scheduler state:
552 *
553 * Assign events iterating over all events and counters, beginning
554 * with events with least weights first. Keep the current iterator
555 * state in struct sched_state.
556 */
557struct sched_state {
558 int weight;
559 int event; /* event index */
560 int counter; /* counter index */
561 int unassigned; /* number of events to be assigned left */
562 unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
563};
564
565/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
566#define SCHED_STATES_MAX 2
567
568struct perf_sched {
569 int max_weight;
570 int max_events;
571 struct event_constraint **constraints;
572 struct sched_state state;
573 int saved_states;
574 struct sched_state saved[SCHED_STATES_MAX];
575};
576
577/*
578 * Initialize iterator that runs through all events and counters.
579 */
580static void perf_sched_init(struct perf_sched *sched, struct event_constraint **c,
581 int num, int wmin, int wmax)
582{
583 int idx;
584
585 memset(sched, 0, sizeof(*sched));
586 sched->max_events = num;
587 sched->max_weight = wmax;
588 sched->constraints = c;
589
590 for (idx = 0; idx < num; idx++) {
591 if (c[idx]->weight == wmin)
592 break;
593 }
594
595 sched->state.event = idx; /* start with min weight */
596 sched->state.weight = wmin;
597 sched->state.unassigned = num;
598}
599
600static void perf_sched_save_state(struct perf_sched *sched)
601{
602 if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX))
603 return;
604
605 sched->saved[sched->saved_states] = sched->state;
606 sched->saved_states++;
607}
608
609static bool perf_sched_restore_state(struct perf_sched *sched)
610{
611 if (!sched->saved_states)
612 return false;
613
614 sched->saved_states--;
615 sched->state = sched->saved[sched->saved_states];
616
617 /* continue with next counter: */
618 clear_bit(sched->state.counter++, sched->state.used);
619
620 return true;
621}
622
623/*
624 * Select a counter for the current event to schedule. Return true on
625 * success.
626 */
627static bool __perf_sched_find_counter(struct perf_sched *sched)
628{
629 struct event_constraint *c;
630 int idx;
631
632 if (!sched->state.unassigned)
633 return false;
634
635 if (sched->state.event >= sched->max_events)
636 return false;
637
638 c = sched->constraints[sched->state.event];
639
640 /* Prefer fixed purpose counters */
641 if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
642 idx = INTEL_PMC_IDX_FIXED;
643 for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
644 if (!__test_and_set_bit(idx, sched->state.used))
645 goto done;
646 }
647 }
648 /* Grab the first unused counter starting with idx */
649 idx = sched->state.counter;
650 for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
651 if (!__test_and_set_bit(idx, sched->state.used))
652 goto done;
653 }
654
655 return false;
656
657done:
658 sched->state.counter = idx;
659
660 if (c->overlap)
661 perf_sched_save_state(sched);
662
663 return true;
664}
665
666static bool perf_sched_find_counter(struct perf_sched *sched)
667{
668 while (!__perf_sched_find_counter(sched)) {
669 if (!perf_sched_restore_state(sched))
670 return false;
671 }
672
673 return true;
674}
675
676/*
677 * Go through all unassigned events and find the next one to schedule.
678 * Take events with the least weight first. Return true on success.
679 */
680static bool perf_sched_next_event(struct perf_sched *sched)
681{
682 struct event_constraint *c;
683
684 if (!sched->state.unassigned || !--sched->state.unassigned)
685 return false;
686
687 do {
688 /* next event */
689 sched->state.event++;
690 if (sched->state.event >= sched->max_events) {
691 /* next weight */
692 sched->state.event = 0;
693 sched->state.weight++;
694 if (sched->state.weight > sched->max_weight)
695 return false;
696 }
697 c = sched->constraints[sched->state.event];
698 } while (c->weight != sched->state.weight);
699
700 sched->state.counter = 0; /* start with first counter */
701
702 return true;
703}
704
705/*
706 * Assign a counter for each event.
707 */
708int perf_assign_events(struct event_constraint **constraints, int n,
709 int wmin, int wmax, int *assign)
710{
711 struct perf_sched sched;
712
713 perf_sched_init(&sched, constraints, n, wmin, wmax);
714
715 do {
716 if (!perf_sched_find_counter(&sched))
717 break; /* failed */
718 if (assign)
719 assign[sched.state.event] = sched.state.counter;
720 } while (perf_sched_next_event(&sched));
721
722 return sched.state.unassigned;
723}
724
725int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
726{ 792{
727 struct event_constraint *c, *constraints[X86_PMC_IDX_MAX]; 793 struct event_constraint *c, *constraints[X86_PMC_IDX_MAX];
728 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 794 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
729 int i, wmin, wmax, num = 0; 795 int i, j, w, wmax, num = 0;
730 struct hw_perf_event *hwc; 796 struct hw_perf_event *hwc;
731 797
732 bitmap_zero(used_mask, X86_PMC_IDX_MAX); 798 bitmap_zero(used_mask, X86_PMC_IDX_MAX);
733 799
734 for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { 800 for (i = 0; i < n; i++) {
735 c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]); 801 c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
736 constraints[i] = c; 802 constraints[i] = c;
737 wmin = min(wmin, c->weight);
738 wmax = max(wmax, c->weight);
739 } 803 }
740 804
741 /* 805 /*
@@ -761,11 +825,59 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
761 if (assign) 825 if (assign)
762 assign[i] = hwc->idx; 826 assign[i] = hwc->idx;
763 } 827 }
828 if (i == n)
829 goto done;
830
831 /*
832 * begin slow path
833 */
834
835 bitmap_zero(used_mask, X86_PMC_IDX_MAX);
836
837 /*
838 * weight = number of possible counters
839 *
840 * 1 = most constrained, only works on one counter
841 * wmax = least constrained, works on any counter
842 *
843 * assign events to counters starting with most
844 * constrained events.
845 */
846 wmax = x86_pmu.num_counters;
847
848 /*
849 * when fixed event counters are present,
850 * wmax is incremented by 1 to account
851 * for one more choice
852 */
853 if (x86_pmu.num_counters_fixed)
854 wmax++;
764 855
765 /* slow path */ 856 for (w = 1, num = n; num && w <= wmax; w++) {
766 if (i != n) 857 /* for each event */
767 num = perf_assign_events(constraints, n, wmin, wmax, assign); 858 for (i = 0; num && i < n; i++) {
859 c = constraints[i];
860 hwc = &cpuc->event_list[i]->hw;
768 861
862 if (c->weight != w)
863 continue;
864
865 for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) {
866 if (!test_bit(j, used_mask))
867 break;
868 }
869
870 if (j == X86_PMC_IDX_MAX)
871 break;
872
873 __set_bit(j, used_mask);
874
875 if (assign)
876 assign[i] = j;
877 num--;
878 }
879 }
880done:
769 /* 881 /*
770 * scheduling failed or is just a simulation, 882 * scheduling failed or is just a simulation,
771 * free resources if necessary 883 * free resources if necessary
@@ -776,7 +888,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
776 x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]); 888 x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]);
777 } 889 }
778 } 890 }
779 return num ? -EINVAL : 0; 891 return num ? -ENOSPC : 0;
780} 892}
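Editor's note: the restored slow path above walks constraint weights from 1 up to the number of counters and gives each event the first free counter its mask allows, so the most constrained events are placed first. A toy standalone version of that loop, with made-up event names and masks:

#include <stdio.h>
#include <stdint.h>

#define NUM_COUNTERS 4

/* Each event carries a mask of counters it may use (its "constraint"). */
struct ev {
        const char *name;
        uint32_t idxmsk;
};

int main(void)
{
        struct ev events[] = {
                { "cycles",    0xf },   /* any of the four counters */
                { "fixed-ish", 0x1 },   /* only counter 0 */
                { "pair-only", 0x3 },   /* counters 0 or 1 */
        };
        int n = sizeof(events) / sizeof(events[0]);
        int assign[3] = { -1, -1, -1 };
        uint32_t used = 0;
        int w, i, j, num = n;

        /* Weight = number of usable counters; place tight constraints first. */
        for (w = 1; num && w <= NUM_COUNTERS; w++) {
                for (i = 0; num && i < n; i++) {
                        if (__builtin_popcount(events[i].idxmsk) != w)
                                continue;
                        for (j = 0; j < NUM_COUNTERS; j++)
                                if ((events[i].idxmsk & (1u << j)) &&
                                    !(used & (1u << j)))
                                        break;
                        if (j == NUM_COUNTERS) {
                                printf("scheduling failed\n");
                                return 1;
                        }
                        used |= 1u << j;
                        assign[i] = j;
                        num--;
                }
        }
        for (i = 0; i < n; i++)
                printf("%-10s -> counter %d\n", events[i].name, assign[i]);
        return 0;
}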
781 893
782/* 894/*
@@ -795,7 +907,7 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader,
795 907
796 if (is_x86_event(leader)) { 908 if (is_x86_event(leader)) {
797 if (n >= max_count) 909 if (n >= max_count)
798 return -EINVAL; 910 return -ENOSPC;
799 cpuc->event_list[n] = leader; 911 cpuc->event_list[n] = leader;
800 n++; 912 n++;
801 } 913 }
@@ -808,7 +920,7 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader,
808 continue; 920 continue;
809 921
810 if (n >= max_count) 922 if (n >= max_count)
811 return -EINVAL; 923 return -ENOSPC;
812 924
813 cpuc->event_list[n] = event; 925 cpuc->event_list[n] = event;
814 n++; 926 n++;
@@ -825,17 +937,15 @@ static inline void x86_assign_hw_event(struct perf_event *event,
825 hwc->last_cpu = smp_processor_id(); 937 hwc->last_cpu = smp_processor_id();
826 hwc->last_tag = ++cpuc->tags[i]; 938 hwc->last_tag = ++cpuc->tags[i];
827 939
828 if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) { 940 if (hwc->idx == X86_PMC_IDX_FIXED_BTS) {
829 hwc->config_base = 0; 941 hwc->config_base = 0;
830 hwc->event_base = 0; 942 hwc->event_base = 0;
831 } else if (hwc->idx >= INTEL_PMC_IDX_FIXED) { 943 } else if (hwc->idx >= X86_PMC_IDX_FIXED) {
832 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; 944 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
833 hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED); 945 hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - X86_PMC_IDX_FIXED);
834 hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30;
835 } else { 946 } else {
836 hwc->config_base = x86_pmu_config_addr(hwc->idx); 947 hwc->config_base = x86_pmu_config_addr(hwc->idx);
837 hwc->event_base = x86_pmu_event_addr(hwc->idx); 948 hwc->event_base = x86_pmu_event_addr(hwc->idx);
838 hwc->event_base_rdpmc = hwc->idx;
839 } 949 }
840} 950}
841 951
@@ -849,6 +959,7 @@ static inline int match_prev_assignment(struct hw_perf_event *hwc,
849} 959}
850 960
851static void x86_pmu_start(struct perf_event *event, int flags); 961static void x86_pmu_start(struct perf_event *event, int flags);
962static void x86_pmu_stop(struct perf_event *event, int flags);
852 963
853static void x86_pmu_enable(struct pmu *pmu) 964static void x86_pmu_enable(struct pmu *pmu)
854{ 965{
@@ -920,20 +1031,28 @@ static void x86_pmu_enable(struct pmu *pmu)
920 x86_pmu.enable_all(added); 1031 x86_pmu.enable_all(added);
921} 1032}
922 1033
1034static inline void x86_pmu_disable_event(struct perf_event *event)
1035{
1036 struct hw_perf_event *hwc = &event->hw;
1037
1038 wrmsrl(hwc->config_base, hwc->config);
1039}
1040
923static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); 1041static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
924 1042
925/* 1043/*
926 * Set the next IRQ period, based on the hwc->period_left value. 1044 * Set the next IRQ period, based on the hwc->period_left value.
927 * To be called with the event disabled in hw: 1045 * To be called with the event disabled in hw:
928 */ 1046 */
929int x86_perf_event_set_period(struct perf_event *event) 1047static int
1048x86_perf_event_set_period(struct perf_event *event)
930{ 1049{
931 struct hw_perf_event *hwc = &event->hw; 1050 struct hw_perf_event *hwc = &event->hw;
932 s64 left = local64_read(&hwc->period_left); 1051 s64 left = local64_read(&hwc->period_left);
933 s64 period = hwc->sample_period; 1052 s64 period = hwc->sample_period;
934 int ret = 0, idx = hwc->idx; 1053 int ret = 0, idx = hwc->idx;
935 1054
936 if (idx == INTEL_PMC_IDX_FIXED_BTS) 1055 if (idx == X86_PMC_IDX_FIXED_BTS)
937 return 0; 1056 return 0;
938 1057
939 /* 1058 /*
@@ -986,7 +1105,7 @@ int x86_perf_event_set_period(struct perf_event *event)
986 return ret; 1105 return ret;
987} 1106}
988 1107
989void x86_pmu_enable_event(struct perf_event *event) 1108static void x86_pmu_enable_event(struct perf_event *event)
990{ 1109{
991 if (__this_cpu_read(cpu_hw_events.enabled)) 1110 if (__this_cpu_read(cpu_hw_events.enabled))
992 __x86_pmu_enable_event(&event->hw, 1111 __x86_pmu_enable_event(&event->hw,
@@ -1125,7 +1244,7 @@ void perf_event_print_debug(void)
1125 local_irq_restore(flags); 1244 local_irq_restore(flags);
1126} 1245}
1127 1246
1128void x86_pmu_stop(struct perf_event *event, int flags) 1247static void x86_pmu_stop(struct perf_event *event, int flags)
1129{ 1248{
1130 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1249 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1131 struct hw_perf_event *hwc = &event->hw; 1250 struct hw_perf_event *hwc = &event->hw;
@@ -1178,7 +1297,7 @@ static void x86_pmu_del(struct perf_event *event, int flags)
1178 perf_event_update_userpage(event); 1297 perf_event_update_userpage(event);
1179} 1298}
1180 1299
1181int x86_pmu_handle_irq(struct pt_regs *regs) 1300static int x86_pmu_handle_irq(struct pt_regs *regs)
1182{ 1301{
1183 struct perf_sample_data data; 1302 struct perf_sample_data data;
1184 struct cpu_hw_events *cpuc; 1303 struct cpu_hw_events *cpuc;
@@ -1186,6 +1305,8 @@ int x86_pmu_handle_irq(struct pt_regs *regs)
1186 int idx, handled = 0; 1305 int idx, handled = 0;
1187 u64 val; 1306 u64 val;
1188 1307
1308 perf_sample_data_init(&data, 0);
1309
1189 cpuc = &__get_cpu_var(cpu_hw_events); 1310 cpuc = &__get_cpu_var(cpu_hw_events);
1190 1311
1191 /* 1312 /*
@@ -1220,7 +1341,7 @@ int x86_pmu_handle_irq(struct pt_regs *regs)
1220 * event overflow 1341 * event overflow
1221 */ 1342 */
1222 handled++; 1343 handled++;
1223 perf_sample_data_init(&data, 0, event->hw.last_period); 1344 data.period = event->hw.last_period;
1224 1345
1225 if (!x86_perf_event_set_period(event)) 1346 if (!x86_perf_event_set_period(event))
1226 continue; 1347 continue;
@@ -1246,43 +1367,118 @@ void perf_events_lapic_init(void)
1246 apic_write(APIC_LVTPC, APIC_DM_NMI); 1367 apic_write(APIC_LVTPC, APIC_DM_NMI);
1247} 1368}
1248 1369
1370struct pmu_nmi_state {
1371 unsigned int marked;
1372 int handled;
1373};
1374
1375static DEFINE_PER_CPU(struct pmu_nmi_state, pmu_nmi);
1376
1249static int __kprobes 1377static int __kprobes
1250perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs) 1378perf_event_nmi_handler(struct notifier_block *self,
1379 unsigned long cmd, void *__args)
1251{ 1380{
1381 struct die_args *args = __args;
1382 unsigned int this_nmi;
1383 int handled;
1384
1252 if (!atomic_read(&active_events)) 1385 if (!atomic_read(&active_events))
1253 return NMI_DONE; 1386 return NOTIFY_DONE;
1387
1388 switch (cmd) {
1389 case DIE_NMI:
1390 break;
1391 case DIE_NMIUNKNOWN:
1392 this_nmi = percpu_read(irq_stat.__nmi_count);
1393 if (this_nmi != __this_cpu_read(pmu_nmi.marked))
1394 /* let the kernel handle the unknown nmi */
1395 return NOTIFY_DONE;
1396 /*
1397 * This one is a PMU back-to-back nmi. Two events
1398 * trigger 'simultaneously' raising two back-to-back
1399 * NMIs. If the first NMI handles both, the latter
1400 * will be empty and daze the CPU. So, we drop it to
1401 * avoid false-positive 'unknown nmi' messages.
1402 */
1403 return NOTIFY_STOP;
1404 default:
1405 return NOTIFY_DONE;
1406 }
1407
1408 handled = x86_pmu.handle_irq(args->regs);
1409 if (!handled)
1410 return NOTIFY_DONE;
1254 1411
1255 return x86_pmu.handle_irq(regs); 1412 this_nmi = percpu_read(irq_stat.__nmi_count);
1413 if ((handled > 1) ||
1414 /* the next nmi could be a back-to-back nmi */
1415 ((__this_cpu_read(pmu_nmi.marked) == this_nmi) &&
1416 (__this_cpu_read(pmu_nmi.handled) > 1))) {
1417 /*
1418 * We could have two subsequent back-to-back nmis: The
1419 * first handles more than one counter, the 2nd
1420 * handles only one counter and the 3rd handles no
1421 * counter.
1422 *
1423 * This is the 2nd nmi because the previous was
1424 * handling more than one counter. We will mark the
1425 * next (3rd) and then drop it if unhandled.
1426 */
1427 __this_cpu_write(pmu_nmi.marked, this_nmi + 1);
1428 __this_cpu_write(pmu_nmi.handled, handled);
1429 }
1430
1431 return NOTIFY_STOP;
1256} 1432}
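Editor's note: the notifier restored above suppresses at most one "unknown NMI" after a PMU NMI that handled more than one counter, because two counters overflowing together can raise two back-to-back NMIs of which the second finds nothing left to do. A toy model of that bookkeeping (single-threaded, so the per-CPU pmu_nmi state becomes plain globals):

#include <stdio.h>

static unsigned int nmi_count, marked, marked_handled;

static void pmu_nmi(int counters_handled)
{
        nmi_count++;
        if (counters_handled > 1 ||
            (marked == nmi_count && marked_handled > 1)) {
                /* Remember that the *next* NMI may be an empty back-to-back one. */
                marked = nmi_count + 1;
                marked_handled = counters_handled;
        }
        printf("NMI %u: handled %d counter(s)\n", nmi_count, counters_handled);
}

static void unknown_nmi(void)
{
        nmi_count++;
        if (nmi_count == marked)
                printf("NMI %u: swallowed as PMU back-to-back NMI\n", nmi_count);
        else
                printf("NMI %u: genuinely unknown, pass it on\n", nmi_count);
}

int main(void)
{
        pmu_nmi(2);     /* two counters overflowed together ...          */
        unknown_nmi();  /* ... so the empty follow-up NMI is dropped     */
        unknown_nmi();  /* but a later unknown NMI is reported normally  */
        return 0;
}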
1257 1433
1258struct event_constraint emptyconstraint; 1434static __read_mostly struct notifier_block perf_event_nmi_notifier = {
1259struct event_constraint unconstrained; 1435 .notifier_call = perf_event_nmi_handler,
1436 .next = NULL,
1437 .priority = NMI_LOCAL_LOW_PRIOR,
1438};
1439
1440static struct event_constraint unconstrained;
1441static struct event_constraint emptyconstraint;
1442
1443static struct event_constraint *
1444x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
1445{
1446 struct event_constraint *c;
1447
1448 if (x86_pmu.event_constraints) {
1449 for_each_event_constraint(c, x86_pmu.event_constraints) {
1450 if ((event->hw.config & c->cmask) == c->code)
1451 return c;
1452 }
1453 }
1454
1455 return &unconstrained;
1456}
1457
1458#include "perf_event_amd.c"
1459#include "perf_event_p6.c"
1460#include "perf_event_p4.c"
1461#include "perf_event_intel_lbr.c"
1462#include "perf_event_intel_ds.c"
1463#include "perf_event_intel.c"
1260 1464
1261static int __cpuinit 1465static int __cpuinit
1262x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) 1466x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
1263{ 1467{
1264 unsigned int cpu = (long)hcpu; 1468 unsigned int cpu = (long)hcpu;
1265 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1266 int ret = NOTIFY_OK; 1469 int ret = NOTIFY_OK;
1267 1470
1268 switch (action & ~CPU_TASKS_FROZEN) { 1471 switch (action & ~CPU_TASKS_FROZEN) {
1269 case CPU_UP_PREPARE: 1472 case CPU_UP_PREPARE:
1270 cpuc->kfree_on_online = NULL;
1271 if (x86_pmu.cpu_prepare) 1473 if (x86_pmu.cpu_prepare)
1272 ret = x86_pmu.cpu_prepare(cpu); 1474 ret = x86_pmu.cpu_prepare(cpu);
1273 break; 1475 break;
1274 1476
1275 case CPU_STARTING: 1477 case CPU_STARTING:
1276 if (x86_pmu.attr_rdpmc)
1277 set_in_cr4(X86_CR4_PCE);
1278 if (x86_pmu.cpu_starting) 1478 if (x86_pmu.cpu_starting)
1279 x86_pmu.cpu_starting(cpu); 1479 x86_pmu.cpu_starting(cpu);
1280 break; 1480 break;
1281 1481
1282 case CPU_ONLINE:
1283 kfree(cpuc->kfree_on_online);
1284 break;
1285
1286 case CPU_DYING: 1482 case CPU_DYING:
1287 if (x86_pmu.cpu_dying) 1483 if (x86_pmu.cpu_dying)
1288 x86_pmu.cpu_dying(cpu); 1484 x86_pmu.cpu_dying(cpu);
@@ -1311,129 +1507,9 @@ static void __init pmu_check_apic(void)
1311 pr_info("no hardware sampling interrupt available.\n"); 1507 pr_info("no hardware sampling interrupt available.\n");
1312} 1508}
1313 1509
1314static struct attribute_group x86_pmu_format_group = {
1315 .name = "format",
1316 .attrs = NULL,
1317};
1318
1319struct perf_pmu_events_attr {
1320 struct device_attribute attr;
1321 u64 id;
1322};
1323
1324/*
1325 * Remove all undefined events (x86_pmu.event_map(id) == 0)
1326 * out of events_attr attributes.
1327 */
1328static void __init filter_events(struct attribute **attrs)
1329{
1330 int i, j;
1331
1332 for (i = 0; attrs[i]; i++) {
1333 if (x86_pmu.event_map(i))
1334 continue;
1335
1336 for (j = i; attrs[j]; j++)
1337 attrs[j] = attrs[j + 1];
1338
1339 /* Check the shifted attr. */
1340 i--;
1341 }
1342}
1343
1344static ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
1345 char *page)
1346{
1347 struct perf_pmu_events_attr *pmu_attr = \
1348 container_of(attr, struct perf_pmu_events_attr, attr);
1349
1350 u64 config = x86_pmu.event_map(pmu_attr->id);
1351 return x86_pmu.events_sysfs_show(page, config);
1352}
1353
1354#define EVENT_VAR(_id) event_attr_##_id
1355#define EVENT_PTR(_id) &event_attr_##_id.attr.attr
1356
1357#define EVENT_ATTR(_name, _id) \
1358static struct perf_pmu_events_attr EVENT_VAR(_id) = { \
1359 .attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \
1360 .id = PERF_COUNT_HW_##_id, \
1361};
1362
1363EVENT_ATTR(cpu-cycles, CPU_CYCLES );
1364EVENT_ATTR(instructions, INSTRUCTIONS );
1365EVENT_ATTR(cache-references, CACHE_REFERENCES );
1366EVENT_ATTR(cache-misses, CACHE_MISSES );
1367EVENT_ATTR(branch-instructions, BRANCH_INSTRUCTIONS );
1368EVENT_ATTR(branch-misses, BRANCH_MISSES );
1369EVENT_ATTR(bus-cycles, BUS_CYCLES );
1370EVENT_ATTR(stalled-cycles-frontend, STALLED_CYCLES_FRONTEND );
1371EVENT_ATTR(stalled-cycles-backend, STALLED_CYCLES_BACKEND );
1372EVENT_ATTR(ref-cycles, REF_CPU_CYCLES );
1373
1374static struct attribute *empty_attrs;
1375
1376static struct attribute *events_attr[] = {
1377 EVENT_PTR(CPU_CYCLES),
1378 EVENT_PTR(INSTRUCTIONS),
1379 EVENT_PTR(CACHE_REFERENCES),
1380 EVENT_PTR(CACHE_MISSES),
1381 EVENT_PTR(BRANCH_INSTRUCTIONS),
1382 EVENT_PTR(BRANCH_MISSES),
1383 EVENT_PTR(BUS_CYCLES),
1384 EVENT_PTR(STALLED_CYCLES_FRONTEND),
1385 EVENT_PTR(STALLED_CYCLES_BACKEND),
1386 EVENT_PTR(REF_CPU_CYCLES),
1387 NULL,
1388};
1389
1390static struct attribute_group x86_pmu_events_group = {
1391 .name = "events",
1392 .attrs = events_attr,
1393};
1394
1395ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event)
1396{
1397 u64 umask = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
1398 u64 cmask = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24;
1399 bool edge = (config & ARCH_PERFMON_EVENTSEL_EDGE);
1400 bool pc = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL);
1401 bool any = (config & ARCH_PERFMON_EVENTSEL_ANY);
1402 bool inv = (config & ARCH_PERFMON_EVENTSEL_INV);
1403 ssize_t ret;
1404
1405 /*
1406 * We have a whole page to spend and just a little data
1407 * to write, so we can safely use sprintf.
1408 */
1409 ret = sprintf(page, "event=0x%02llx", event);
1410
1411 if (umask)
1412 ret += sprintf(page + ret, ",umask=0x%02llx", umask);
1413
1414 if (edge)
1415 ret += sprintf(page + ret, ",edge");
1416
1417 if (pc)
1418 ret += sprintf(page + ret, ",pc");
1419
1420 if (any)
1421 ret += sprintf(page + ret, ",any");
1422
1423 if (inv)
1424 ret += sprintf(page + ret, ",inv");
1425
1426 if (cmask)
1427 ret += sprintf(page + ret, ",cmask=0x%02llx", cmask);
1428
1429 ret += sprintf(page + ret, "\n");
1430
1431 return ret;
1432}
1433
1434static int __init init_hw_perf_events(void) 1510static int __init init_hw_perf_events(void)
1435{ 1511{
1436 struct x86_pmu_quirk *quirk; 1512 struct event_constraint *c;
1437 int err; 1513 int err;
1438 1514
1439 pr_info("Performance Events: "); 1515 pr_info("Performance Events: ");
@@ -1461,26 +1537,41 @@ static int __init init_hw_perf_events(void)
1461 1537
1462 pr_cont("%s PMU driver.\n", x86_pmu.name); 1538 pr_cont("%s PMU driver.\n", x86_pmu.name);
1463 1539
1464 for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next) 1540 if (x86_pmu.quirks)
1465 quirk->func(); 1541 x86_pmu.quirks();
1466 1542
1467 if (!x86_pmu.intel_ctrl) 1543 if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
1468 x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; 1544 WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
1545 x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
1546 x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
1547 }
1548 x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
1549
1550 if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
1551 WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
1552 x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
1553 x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
1554 }
1555
1556 x86_pmu.intel_ctrl |=
1557 ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
1469 1558
1470 perf_events_lapic_init(); 1559 perf_events_lapic_init();
1471 register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI"); 1560 register_die_notifier(&perf_event_nmi_notifier);
1472 1561
1473 unconstrained = (struct event_constraint) 1562 unconstrained = (struct event_constraint)
1474 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, 1563 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
1475 0, x86_pmu.num_counters, 0); 1564 0, x86_pmu.num_counters);
1476 1565
1477 x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ 1566 if (x86_pmu.event_constraints) {
1478 x86_pmu_format_group.attrs = x86_pmu.format_attrs; 1567 for_each_event_constraint(c, x86_pmu.event_constraints) {
1568 if (c->cmask != X86_RAW_EVENT_MASK)
1569 continue;
1479 1570
1480 if (!x86_pmu.events_sysfs_show) 1571 c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
1481 x86_pmu_events_group.attrs = &empty_attrs; 1572 c->weight += x86_pmu.num_counters;
1482 else 1573 }
1483 filter_events(x86_pmu_events_group.attrs); 1574 }
1484 1575
1485 pr_info("... version: %d\n", x86_pmu.version); 1576 pr_info("... version: %d\n", x86_pmu.version);
1486 pr_info("... bit width: %d\n", x86_pmu.cntval_bits); 1577 pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
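Editor's note: the clipping and mask-building restored in this hunk leaves intel_ctrl holding one enable bit per generic counter plus the fixed-counter bits starting at bit 32 (X86_PMC_IDX_FIXED). A worked example with a typical 4-generic / 3-fixed layout (the counter counts are illustrative):

#include <stdio.h>
#include <stdint.h>

#define PMC_IDX_FIXED 32        /* fixed counters start at bit 32 of the mask */

int main(void)
{
        int num_counters = 4, num_counters_fixed = 3;
        uint64_t intel_ctrl;

        intel_ctrl  = (1ULL << num_counters) - 1;
        intel_ctrl |= ((1ULL << num_counters_fixed) - 1) << PMC_IDX_FIXED;

        /* Prints 0x70000000f: bits 0-3 for PMC0-3, bits 32-34 for FIXED0-2. */
        printf("intel_ctrl = %#llx\n", (unsigned long long)intel_ctrl);
        return 0;
}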
@@ -1589,7 +1680,6 @@ static struct cpu_hw_events *allocate_fake_cpuc(void)
1589 if (!cpuc->shared_regs) 1680 if (!cpuc->shared_regs)
1590 goto error; 1681 goto error;
1591 } 1682 }
1592 cpuc->is_fake = 1;
1593 return cpuc; 1683 return cpuc;
1594error: 1684error:
1595 free_fake_cpuc(cpuc); 1685 free_fake_cpuc(cpuc);
@@ -1612,7 +1702,7 @@ static int validate_event(struct perf_event *event)
1612 c = x86_pmu.get_event_constraints(fake_cpuc, event); 1702 c = x86_pmu.get_event_constraints(fake_cpuc, event);
1613 1703
1614 if (!c || !c->weight) 1704 if (!c || !c->weight)
1615 ret = -EINVAL; 1705 ret = -ENOSPC;
1616 1706
1617 if (x86_pmu.put_event_constraints) 1707 if (x86_pmu.put_event_constraints)
1618 x86_pmu.put_event_constraints(fake_cpuc, event); 1708 x86_pmu.put_event_constraints(fake_cpuc, event);
@@ -1637,7 +1727,7 @@ static int validate_group(struct perf_event *event)
1637{ 1727{
1638 struct perf_event *leader = event->group_leader; 1728 struct perf_event *leader = event->group_leader;
1639 struct cpu_hw_events *fake_cpuc; 1729 struct cpu_hw_events *fake_cpuc;
1640 int ret = -EINVAL, n; 1730 int ret = -ENOSPC, n;
1641 1731
1642 fake_cpuc = allocate_fake_cpuc(); 1732 fake_cpuc = allocate_fake_cpuc();
1643 if (IS_ERR(fake_cpuc)) 1733 if (IS_ERR(fake_cpuc))
@@ -1706,128 +1796,23 @@ static int x86_pmu_event_init(struct perf_event *event)
1706 return err; 1796 return err;
1707} 1797}
1708 1798
1709static int x86_pmu_event_idx(struct perf_event *event)
1710{
1711 int idx = event->hw.idx;
1712
1713 if (!x86_pmu.attr_rdpmc)
1714 return 0;
1715
1716 if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) {
1717 idx -= INTEL_PMC_IDX_FIXED;
1718 idx |= 1 << 30;
1719 }
1720
1721 return idx + 1;
1722}
1723
1724static ssize_t get_attr_rdpmc(struct device *cdev,
1725 struct device_attribute *attr,
1726 char *buf)
1727{
1728 return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc);
1729}
1730
1731static void change_rdpmc(void *info)
1732{
1733 bool enable = !!(unsigned long)info;
1734
1735 if (enable)
1736 set_in_cr4(X86_CR4_PCE);
1737 else
1738 clear_in_cr4(X86_CR4_PCE);
1739}
1740
1741static ssize_t set_attr_rdpmc(struct device *cdev,
1742 struct device_attribute *attr,
1743 const char *buf, size_t count)
1744{
1745 unsigned long val;
1746 ssize_t ret;
1747
1748 ret = kstrtoul(buf, 0, &val);
1749 if (ret)
1750 return ret;
1751
1752 if (!!val != !!x86_pmu.attr_rdpmc) {
1753 x86_pmu.attr_rdpmc = !!val;
1754 smp_call_function(change_rdpmc, (void *)val, 1);
1755 }
1756
1757 return count;
1758}
1759
1760static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc);
1761
1762static struct attribute *x86_pmu_attrs[] = {
1763 &dev_attr_rdpmc.attr,
1764 NULL,
1765};
1766
1767static struct attribute_group x86_pmu_attr_group = {
1768 .attrs = x86_pmu_attrs,
1769};
1770
1771static const struct attribute_group *x86_pmu_attr_groups[] = {
1772 &x86_pmu_attr_group,
1773 &x86_pmu_format_group,
1774 &x86_pmu_events_group,
1775 NULL,
1776};
1777
1778static void x86_pmu_flush_branch_stack(void)
1779{
1780 if (x86_pmu.flush_branch_stack)
1781 x86_pmu.flush_branch_stack();
1782}
1783
1784void perf_check_microcode(void)
1785{
1786 if (x86_pmu.check_microcode)
1787 x86_pmu.check_microcode();
1788}
1789EXPORT_SYMBOL_GPL(perf_check_microcode);
1790
1791static struct pmu pmu = { 1799static struct pmu pmu = {
1792 .pmu_enable = x86_pmu_enable, 1800 .pmu_enable = x86_pmu_enable,
1793 .pmu_disable = x86_pmu_disable, 1801 .pmu_disable = x86_pmu_disable,
1794
1795 .attr_groups = x86_pmu_attr_groups,
1796
1797 .event_init = x86_pmu_event_init,
1798 1802
1799 .add = x86_pmu_add, 1803 .event_init = x86_pmu_event_init,
1800 .del = x86_pmu_del,
1801 .start = x86_pmu_start,
1802 .stop = x86_pmu_stop,
1803 .read = x86_pmu_read,
1804 1804
1805 .start_txn = x86_pmu_start_txn, 1805 .add = x86_pmu_add,
1806 .cancel_txn = x86_pmu_cancel_txn, 1806 .del = x86_pmu_del,
1807 .commit_txn = x86_pmu_commit_txn, 1807 .start = x86_pmu_start,
1808 .stop = x86_pmu_stop,
1809 .read = x86_pmu_read,
1808 1810
1809 .event_idx = x86_pmu_event_idx, 1811 .start_txn = x86_pmu_start_txn,
1810 .flush_branch_stack = x86_pmu_flush_branch_stack, 1812 .cancel_txn = x86_pmu_cancel_txn,
1813 .commit_txn = x86_pmu_commit_txn,
1811}; 1814};
1812 1815
1813void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
1814{
1815 userpg->cap_usr_time = 0;
1816 userpg->cap_usr_rdpmc = x86_pmu.attr_rdpmc;
1817 userpg->pmc_width = x86_pmu.cntval_bits;
1818
1819 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
1820 return;
1821
1822 if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
1823 return;
1824
1825 userpg->cap_usr_time = 1;
1826 userpg->time_mult = this_cpu_read(cyc2ns);
1827 userpg->time_shift = CYC2NS_SCALE_FACTOR;
1828 userpg->time_offset = this_cpu_read(cyc2ns_offset) - now;
1829}
1830
1831/* 1816/*
1832 * callchain support 1817 * callchain support
1833 */ 1818 */
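Editor's note: among the code removed above, x86_pmu_event_idx() and the event_base_rdpmc assignments encode which counter userspace should pass to RDPMC: generic counters use their index as-is, fixed counters use (index - X86_PMC_IDX_FIXED) with bit 30 set, and the value exposed through the mmap page is offset by one so that zero can mean "no RDPMC". A small sketch of that encoding; the sample indices are arbitrary.

#include <stdio.h>

#define PMC_IDX_FIXED 32        /* fixed counters start at index 32 */

/* Counter selector as RDPMC expects it: plain index for generic counters,
 * bit 30 plus the fixed-counter number for fixed counters. */
static unsigned int rdpmc_ecx(int hw_idx)
{
        if (hw_idx >= PMC_IDX_FIXED)
                return (unsigned int)(hw_idx - PMC_IDX_FIXED) | (1u << 30);
        return (unsigned int)hw_idx;
}

int main(void)
{
        printf("generic counter 2 -> rdpmc ecx %#x\n", rdpmc_ecx(2));
        printf("fixed counter 1   -> rdpmc ecx %#x\n",
               rdpmc_ecx(PMC_IDX_FIXED + 1));
        return 0;
}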
@@ -1863,54 +1848,18 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
1863 dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); 1848 dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
1864} 1849}
1865 1850
1866static inline int
1867valid_user_frame(const void __user *fp, unsigned long size)
1868{
1869 return (__range_not_ok(fp, size, TASK_SIZE) == 0);
1870}
1871
1872static unsigned long get_segment_base(unsigned int segment)
1873{
1874 struct desc_struct *desc;
1875 int idx = segment >> 3;
1876
1877 if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
1878 if (idx > LDT_ENTRIES)
1879 return 0;
1880
1881 if (idx > current->active_mm->context.size)
1882 return 0;
1883
1884 desc = current->active_mm->context.ldt;
1885 } else {
1886 if (idx > GDT_ENTRIES)
1887 return 0;
1888
1889 desc = __this_cpu_ptr(&gdt_page.gdt[0]);
1890 }
1891
1892 return get_desc_base(desc + idx);
1893}
1894
1895#ifdef CONFIG_COMPAT 1851#ifdef CONFIG_COMPAT
1896
1897#include <asm/compat.h>
1898
1899static inline int 1852static inline int
1900perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) 1853perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
1901{ 1854{
1902 /* 32-bit process in 64-bit kernel. */ 1855 /* 32-bit process in 64-bit kernel. */
1903 unsigned long ss_base, cs_base;
1904 struct stack_frame_ia32 frame; 1856 struct stack_frame_ia32 frame;
1905 const void __user *fp; 1857 const void __user *fp;
1906 1858
1907 if (!test_thread_flag(TIF_IA32)) 1859 if (!test_thread_flag(TIF_IA32))
1908 return 0; 1860 return 0;
1909 1861
1910 cs_base = get_segment_base(regs->cs); 1862 fp = compat_ptr(regs->bp);
1911 ss_base = get_segment_base(regs->ss);
1912
1913 fp = compat_ptr(ss_base + regs->bp);
1914 while (entry->nr < PERF_MAX_STACK_DEPTH) { 1863 while (entry->nr < PERF_MAX_STACK_DEPTH) {
1915 unsigned long bytes; 1864 unsigned long bytes;
1916 frame.next_frame = 0; 1865 frame.next_frame = 0;
@@ -1920,11 +1869,11 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
1920 if (bytes != sizeof(frame)) 1869 if (bytes != sizeof(frame))
1921 break; 1870 break;
1922 1871
1923 if (!valid_user_frame(fp, sizeof(frame))) 1872 if (fp < compat_ptr(regs->sp))
1924 break; 1873 break;
1925 1874
1926 perf_callchain_store(entry, cs_base + frame.return_address); 1875 perf_callchain_store(entry, frame.return_address);
1927 fp = compat_ptr(ss_base + frame.next_frame); 1876 fp = compat_ptr(frame.next_frame);
1928 } 1877 }
1929 return 1; 1878 return 1;
1930} 1879}
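Editor's note: the perf_callchain_user32() hunks above walk the 32-bit user stack by repeatedly copying a two-word frame (saved frame pointer, return address) from the frame pointer and rejecting frames that point below the stack pointer. The toy below walks the same structure over an in-memory buffer instead of real user memory, so copy_from_user() becomes a memcpy(); all addresses are fabricated offsets.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Same layout as struct stack_frame_ia32: saved frame pointer, then the
 * return address, both 32 bits wide. */
struct frame32 {
        uint32_t next_frame;
        uint32_t return_address;
};

int main(void)
{
        unsigned char stack[64] = { 0 };        /* fake compat stack */
        struct frame32 f1 = { .next_frame = 24, .return_address = 0x08048111 };
        struct frame32 f2 = { .next_frame = 0,  .return_address = 0x08048222 };
        struct frame32 f;
        uint32_t fp = 8, sp = 4;                /* fabricated register values */
        int depth = 0;

        memcpy(stack + 8,  &f1, sizeof(f1));
        memcpy(stack + 24, &f2, sizeof(f2));

        while (fp && depth < 16) {
                memcpy(&f, stack + fp, sizeof(f));  /* copy_from_user() stand-in */
                if (fp < sp)                        /* frame below sp: stop */
                        break;
                printf("return address %#x\n", (unsigned int)f.return_address);
                fp = f.next_frame;
                depth++;
        }
        return 0;
}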
@@ -1947,12 +1896,6 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
1947 return; 1896 return;
1948 } 1897 }
1949 1898
1950 /*
1951 * We don't know what to do with VM86 stacks.. ignore them for now.
1952 */
1953 if (regs->flags & (X86_VM_MASK | PERF_EFLAGS_VM))
1954 return;
1955
1956 fp = (void __user *)regs->bp; 1899 fp = (void __user *)regs->bp;
1957 1900
1958 perf_callchain_store(entry, regs->ip); 1901 perf_callchain_store(entry, regs->ip);
@@ -1972,7 +1915,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
1972 if (bytes != sizeof(frame)) 1915 if (bytes != sizeof(frame))
1973 break; 1916 break;
1974 1917
1975 if (!valid_user_frame(fp, sizeof(frame))) 1918 if ((unsigned long)fp < regs->sp)
1976 break; 1919 break;
1977 1920
1978 perf_callchain_store(entry, frame.return_address); 1921 perf_callchain_store(entry, frame.return_address);
@@ -1980,50 +1923,16 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
1980 } 1923 }
1981} 1924}
1982 1925
1983/*
1984 * Deal with code segment offsets for the various execution modes:
1985 *
1986 * VM86 - the good olde 16 bit days, where the linear address is
1987 * 20 bits and we use regs->ip + 0x10 * regs->cs.
1988 *
1989 * IA32 - Where we need to look at GDT/LDT segment descriptor tables
1990 * to figure out what the 32bit base address is.
1991 *
1992 * X32 - has TIF_X32 set, but is running in x86_64
1993 *
1994 * X86_64 - CS,DS,SS,ES are all zero based.
1995 */
1996static unsigned long code_segment_base(struct pt_regs *regs)
1997{
1998 /*
1999 * If we are in VM86 mode, add the segment offset to convert to a
2000 * linear address.
2001 */
2002 if (regs->flags & X86_VM_MASK)
2003 return 0x10 * regs->cs;
2004
2005 /*
2006 * For IA32 we look at the GDT/LDT segment base to convert the
2007 * effective IP to a linear address.
2008 */
2009#ifdef CONFIG_X86_32
2010 if (user_mode(regs) && regs->cs != __USER_CS)
2011 return get_segment_base(regs->cs);
2012#else
2013 if (test_thread_flag(TIF_IA32)) {
2014 if (user_mode(regs) && regs->cs != __USER32_CS)
2015 return get_segment_base(regs->cs);
2016 }
2017#endif
2018 return 0;
2019}
2020
2021unsigned long perf_instruction_pointer(struct pt_regs *regs) 1926unsigned long perf_instruction_pointer(struct pt_regs *regs)
2022{ 1927{
1928 unsigned long ip;
1929
2023 if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) 1930 if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
2024 return perf_guest_cbs->get_guest_ip(); 1931 ip = perf_guest_cbs->get_guest_ip();
1932 else
1933 ip = instruction_pointer(regs);
2025 1934
2026 return regs->ip + code_segment_base(regs); 1935 return ip;
2027} 1936}
2028 1937
2029unsigned long perf_misc_flags(struct pt_regs *regs) 1938unsigned long perf_misc_flags(struct pt_regs *regs)
@@ -2047,15 +1956,3 @@ unsigned long perf_misc_flags(struct pt_regs *regs)
2047 1956
2048 return misc; 1957 return misc;
2049} 1958}
2050
2051void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
2052{
2053 cap->version = x86_pmu.version;
2054 cap->num_counters_gp = x86_pmu.num_counters;
2055 cap->num_counters_fixed = x86_pmu.num_counters_fixed;
2056 cap->bit_width_gp = x86_pmu.cntval_bits;
2057 cap->bit_width_fixed = x86_pmu.cntval_bits;
2058 cap->events_mask = (unsigned int)x86_pmu.events_maskl;
2059 cap->events_mask_len = x86_pmu.events_mask_len;
2060}
2061EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability);
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
deleted file mode 100644
index 115c1ea9774..00000000000
--- a/arch/x86/kernel/cpu/perf_event.h
+++ /dev/null
@@ -1,656 +0,0 @@
1/*
2 * Performance events x86 architecture header
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2009 Jaswinder Singh Rajput
7 * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
8 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
9 * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
10 * Copyright (C) 2009 Google, Inc., Stephane Eranian
11 *
12 * For licencing details see kernel-base/COPYING
13 */
14
15#include <linux/perf_event.h>
16
17#if 0
18#undef wrmsrl
19#define wrmsrl(msr, val) \
20do { \
21 unsigned int _msr = (msr); \
22 u64 _val = (val); \
23 trace_printk("wrmsrl(%x, %Lx)\n", (unsigned int)(_msr), \
24 (unsigned long long)(_val)); \
25 native_write_msr((_msr), (u32)(_val), (u32)(_val >> 32)); \
26} while (0)
27#endif
28
29/*
30 * | NHM/WSM | SNB |
31 * register -------------------------------
32 * | HT | no HT | HT | no HT |
33 *-----------------------------------------
34 * offcore | core | core | cpu | core |
35 * lbr_sel | core | core | cpu | core |
36 * ld_lat | cpu | core | cpu | core |
37 *-----------------------------------------
38 *
39 * Given that there is a small number of shared regs,
40 * we can pre-allocate their slot in the per-cpu
41 * per-core reg tables.
42 */
43enum extra_reg_type {
44 EXTRA_REG_NONE = -1, /* not used */
45
46 EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */
47 EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */
48 EXTRA_REG_LBR = 2, /* lbr_select */
49
50 EXTRA_REG_MAX /* number of entries needed */
51};
52
53struct event_constraint {
54 union {
55 unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
56 u64 idxmsk64;
57 };
58 u64 code;
59 u64 cmask;
60 int weight;
61 int overlap;
62};
63
64struct amd_nb {
65 int nb_id; /* NorthBridge id */
66 int refcnt; /* reference count */
67 struct perf_event *owners[X86_PMC_IDX_MAX];
68 struct event_constraint event_constraints[X86_PMC_IDX_MAX];
69};
70
71/* The maximal number of PEBS events: */
72#define MAX_PEBS_EVENTS 8
73
74/*
75 * A debug store configuration.
76 *
77 * We only support architectures that use 64bit fields.
78 */
79struct debug_store {
80 u64 bts_buffer_base;
81 u64 bts_index;
82 u64 bts_absolute_maximum;
83 u64 bts_interrupt_threshold;
84 u64 pebs_buffer_base;
85 u64 pebs_index;
86 u64 pebs_absolute_maximum;
87 u64 pebs_interrupt_threshold;
88 u64 pebs_event_reset[MAX_PEBS_EVENTS];
89};
90
91/*
92 * Per register state.
93 */
94struct er_account {
95 raw_spinlock_t lock; /* per-core: protect structure */
96 u64 config; /* extra MSR config */
97 u64 reg; /* extra MSR number */
98 atomic_t ref; /* reference count */
99};
100
101/*
102 * Per core/cpu state
103 *
104 * Used to coordinate shared registers between HT threads or
105 * among events on a single PMU.
106 */
107struct intel_shared_regs {
108 struct er_account regs[EXTRA_REG_MAX];
109 int refcnt; /* per-core: #HT threads */
110 unsigned core_id; /* per-core: core id */
111};
112
113#define MAX_LBR_ENTRIES 16
114
115struct cpu_hw_events {
116 /*
117 * Generic x86 PMC bits
118 */
119 struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
120 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
121 unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
122 int enabled;
123
124 int n_events;
125 int n_added;
126 int n_txn;
127 int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
128 u64 tags[X86_PMC_IDX_MAX];
129 struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
130
131 unsigned int group_flag;
132 int is_fake;
133
134 /*
135 * Intel DebugStore bits
136 */
137 struct debug_store *ds;
138 u64 pebs_enabled;
139
140 /*
141 * Intel LBR bits
142 */
143 int lbr_users;
144 void *lbr_context;
145 struct perf_branch_stack lbr_stack;
146 struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
147 struct er_account *lbr_sel;
148 u64 br_sel;
149
150 /*
151 * Intel host/guest exclude bits
152 */
153 u64 intel_ctrl_guest_mask;
154 u64 intel_ctrl_host_mask;
155 struct perf_guest_switch_msr guest_switch_msrs[X86_PMC_IDX_MAX];
156
157 /*
158 * manage shared (per-core, per-cpu) registers
159 * used on Intel NHM/WSM/SNB
160 */
161 struct intel_shared_regs *shared_regs;
162
163 /*
164 * AMD specific bits
165 */
166 struct amd_nb *amd_nb;
167 /* Inverted mask of bits to clear in the perf_ctr ctrl registers */
168 u64 perf_ctr_virt_mask;
169
170 void *kfree_on_online;
171};
172
173#define __EVENT_CONSTRAINT(c, n, m, w, o) {\
174 { .idxmsk64 = (n) }, \
175 .code = (c), \
176 .cmask = (m), \
177 .weight = (w), \
178 .overlap = (o), \
179}
180
181#define EVENT_CONSTRAINT(c, n, m) \
182 __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0)
183
184/*
185 * The overlap flag marks event constraints with overlapping counter
186 * masks. This is the case if the counter mask of such an event is not
187 * a subset of any other counter mask of a constraint with an equal or
188 * higher weight, e.g.:
189 *
190 * c_overlaps = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0);
191 * c_another1 = EVENT_CONSTRAINT(0, 0x07, 0);
192 * c_another2 = EVENT_CONSTRAINT(0, 0x38, 0);
193 *
194 * The event scheduler may not select the correct counter in the first
195 * cycle because it needs to know which subsequent events will be
 196 * scheduled. It may then fail to schedule the events. So we set the
 197 * overlap flag for such constraints to give the scheduler a hint about
 198 * which events to select for counter rescheduling.
199 *
 200 * Care must be taken as the rescheduling algorithm is O(n!), which
 201 * will dramatically increase scheduling cycles on an over-committed
 202 * system. The number of such EVENT_CONSTRAINT_OVERLAP() macros
 203 * and their counter masks must be kept to a minimum.
204 */
205#define EVENT_CONSTRAINT_OVERLAP(c, n, m) \
206 __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 1)
207
208/*
209 * Constraint on the Event code.
210 */
211#define INTEL_EVENT_CONSTRAINT(c, n) \
212 EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)
213
214/*
215 * Constraint on the Event code + UMask + fixed-mask
216 *
217 * filter mask to validate fixed counter events.
218 * the following filters disqualify for fixed counters:
219 * - inv
220 * - edge
221 * - cnt-mask
222 * The other filters are supported by fixed counters.
223 * The any-thread option is supported starting with v3.
224 */
225#define FIXED_EVENT_CONSTRAINT(c, n) \
226 EVENT_CONSTRAINT(c, (1ULL << (32+n)), X86_RAW_EVENT_MASK)
227
228/*
229 * Constraint on the Event code + UMask
230 */
231#define INTEL_UEVENT_CONSTRAINT(c, n) \
232 EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
233
234#define EVENT_CONSTRAINT_END \
235 EVENT_CONSTRAINT(0, 0, 0)
236
237#define for_each_event_constraint(e, c) \
238 for ((e) = (c); (e)->weight; (e)++)
239
240/*
241 * Extra registers for specific events.
242 *
243 * Some events need large masks and require external MSRs.
244 * Those extra MSRs end up being shared for all events on
 245 * a PMU and sometimes between the PMUs of sibling HT threads.
 246 * In either case, the kernel needs to handle conflicting
 247 * accesses to those extra, shared regs. The data structure
248 * to manage those registers is stored in cpu_hw_event.
249 */
250struct extra_reg {
251 unsigned int event;
252 unsigned int msr;
253 u64 config_mask;
254 u64 valid_mask;
255 int idx; /* per_xxx->regs[] reg index */
256};
257
258#define EVENT_EXTRA_REG(e, ms, m, vm, i) { \
259 .event = (e), \
260 .msr = (ms), \
261 .config_mask = (m), \
262 .valid_mask = (vm), \
263 .idx = EXTRA_REG_##i \
264 }
265
266#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx) \
267 EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx)
268
269#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0)
270
271union perf_capabilities {
272 struct {
273 u64 lbr_format:6;
274 u64 pebs_trap:1;
275 u64 pebs_arch_reg:1;
276 u64 pebs_format:4;
277 u64 smm_freeze:1;
278 };
279 u64 capabilities;
280};
281
282struct x86_pmu_quirk {
283 struct x86_pmu_quirk *next;
284 void (*func)(void);
285};
286
287union x86_pmu_config {
288 struct {
289 u64 event:8,
290 umask:8,
291 usr:1,
292 os:1,
293 edge:1,
294 pc:1,
295 interrupt:1,
296 __reserved1:1,
297 en:1,
298 inv:1,
299 cmask:8,
300 event2:4,
301 __reserved2:4,
302 go:1,
303 ho:1;
304 } bits;
305 u64 value;
306};
307
308#define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value
309
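As a rough usage sketch (the event, umask and cmask values are illustrative only), X86_CONFIG() lets callers build a raw config value with designated initializers instead of open-coding the bit layout above:

static u64 example_alt_config(void)
{
	/* event 0xc0, umask 0x01, with inv=1 and cmask=16 */
	return X86_CONFIG(.event = 0xc0, .umask = 0x01,
			  .inv = 1, .cmask = 16);
}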
310/*
311 * struct x86_pmu - generic x86 pmu
312 */
313struct x86_pmu {
314 /*
315 * Generic x86 PMC bits
316 */
317 const char *name;
318 int version;
319 int (*handle_irq)(struct pt_regs *);
320 void (*disable_all)(void);
321 void (*enable_all)(int added);
322 void (*enable)(struct perf_event *);
323 void (*disable)(struct perf_event *);
324 int (*hw_config)(struct perf_event *event);
325 int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
326 unsigned eventsel;
327 unsigned perfctr;
328 u64 (*event_map)(int);
329 int max_events;
330 int num_counters;
331 int num_counters_fixed;
332 int cntval_bits;
333 u64 cntval_mask;
334 union {
335 unsigned long events_maskl;
336 unsigned long events_mask[BITS_TO_LONGS(ARCH_PERFMON_EVENTS_COUNT)];
337 };
338 int events_mask_len;
339 int apic;
340 u64 max_period;
341 struct event_constraint *
342 (*get_event_constraints)(struct cpu_hw_events *cpuc,
343 struct perf_event *event);
344
345 void (*put_event_constraints)(struct cpu_hw_events *cpuc,
346 struct perf_event *event);
347 struct event_constraint *event_constraints;
348 struct x86_pmu_quirk *quirks;
349 int perfctr_second_write;
350
351 /*
352 * sysfs attrs
353 */
354 int attr_rdpmc;
355 struct attribute **format_attrs;
356
357 ssize_t (*events_sysfs_show)(char *page, u64 config);
358
359 /*
360 * CPU Hotplug hooks
361 */
362 int (*cpu_prepare)(int cpu);
363 void (*cpu_starting)(int cpu);
364 void (*cpu_dying)(int cpu);
365 void (*cpu_dead)(int cpu);
366
367 void (*check_microcode)(void);
368 void (*flush_branch_stack)(void);
369
370 /*
371 * Intel Arch Perfmon v2+
372 */
373 u64 intel_ctrl;
374 union perf_capabilities intel_cap;
375
376 /*
377 * Intel DebugStore bits
378 */
379 unsigned int bts :1,
380 bts_active :1,
381 pebs :1,
382 pebs_active :1,
383 pebs_broken :1;
384 int pebs_record_size;
385 void (*drain_pebs)(struct pt_regs *regs);
386 struct event_constraint *pebs_constraints;
387 void (*pebs_aliases)(struct perf_event *event);
388 int max_pebs_events;
389
390 /*
391 * Intel LBR
392 */
393 unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */
394 int lbr_nr; /* hardware stack size */
395 u64 lbr_sel_mask; /* LBR_SELECT valid bits */
396 const int *lbr_sel_map; /* lbr_select mappings */
397
398 /*
399 * Extra registers for events
400 */
401 struct extra_reg *extra_regs;
402 unsigned int er_flags;
403
404 /*
405 * Intel host/guest support (KVM)
406 */
407 struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr);
408};
409
410#define x86_add_quirk(func_) \
411do { \
412 static struct x86_pmu_quirk __quirk __initdata = { \
413 .func = func_, \
414 }; \
415 __quirk.next = x86_pmu.quirks; \
416 x86_pmu.quirks = &__quirk; \
417} while (0)
418
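A hedged sketch of how a quirk might be queued with the macro above from an __init path; the function names are made up for illustration, the quirk body is a placeholder, and the queued entries are presumably walked later by the generic init code.

static __init void example_quirk(void)
{
	/* e.g. tweak event constraints for a particular model/stepping */
	pr_warn("example PMU quirk applied\n");
}

static __init int example_pmu_setup(void)
{
	x86_add_quirk(example_quirk);	/* linked onto x86_pmu.quirks */
	return 0;
}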
419#define ERF_NO_HT_SHARING 1
420#define ERF_HAS_RSP_1 2
421
422extern struct x86_pmu x86_pmu __read_mostly;
423
424DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
425
426int x86_perf_event_set_period(struct perf_event *event);
427
428/*
429 * Generalized hw caching related hw_event table, filled
430 * in on a per model basis. A value of 0 means
431 * 'not supported', -1 means 'hw_event makes no sense on
432 * this CPU', any other value means the raw hw_event
433 * ID.
434 */
435
436#define C(x) PERF_COUNT_HW_CACHE_##x
437
438extern u64 __read_mostly hw_cache_event_ids
439 [PERF_COUNT_HW_CACHE_MAX]
440 [PERF_COUNT_HW_CACHE_OP_MAX]
441 [PERF_COUNT_HW_CACHE_RESULT_MAX];
442extern u64 __read_mostly hw_cache_extra_regs
443 [PERF_COUNT_HW_CACHE_MAX]
444 [PERF_COUNT_HW_CACHE_OP_MAX]
445 [PERF_COUNT_HW_CACHE_RESULT_MAX];
446
447u64 x86_perf_event_update(struct perf_event *event);
448
449static inline int x86_pmu_addr_offset(int index)
450{
451 int offset;
452
453 /* offset = X86_FEATURE_PERFCTR_CORE ? index << 1 : index */
454 alternative_io(ASM_NOP2,
455 "shll $1, %%eax",
456 X86_FEATURE_PERFCTR_CORE,
457 "=a" (offset),
458 "a" (index));
459
460 return offset;
461}
462
463static inline unsigned int x86_pmu_config_addr(int index)
464{
465 return x86_pmu.eventsel + x86_pmu_addr_offset(index);
466}
467
468static inline unsigned int x86_pmu_event_addr(int index)
469{
470 return x86_pmu.perfctr + x86_pmu_addr_offset(index);
471}
472
473int x86_setup_perfctr(struct perf_event *event);
474
475int x86_pmu_hw_config(struct perf_event *event);
476
477void x86_pmu_disable_all(void);
478
479static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
480 u64 enable_mask)
481{
482 u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask);
483
484 if (hwc->extra_reg.reg)
485 wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config);
486 wrmsrl(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask);
487}
488
489void x86_pmu_enable_all(int added);
490
491int perf_assign_events(struct event_constraint **constraints, int n,
492 int wmin, int wmax, int *assign);
493int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign);
494
495void x86_pmu_stop(struct perf_event *event, int flags);
496
497static inline void x86_pmu_disable_event(struct perf_event *event)
498{
499 struct hw_perf_event *hwc = &event->hw;
500
501 wrmsrl(hwc->config_base, hwc->config);
502}
503
504void x86_pmu_enable_event(struct perf_event *event);
505
506int x86_pmu_handle_irq(struct pt_regs *regs);
507
508extern struct event_constraint emptyconstraint;
509
510extern struct event_constraint unconstrained;
511
512static inline bool kernel_ip(unsigned long ip)
513{
514#ifdef CONFIG_X86_32
515 return ip > PAGE_OFFSET;
516#else
517 return (long)ip < 0;
518#endif
519}
520
521/*
522 * Not all PMUs provide the right context information to place the reported IP
523 * into full context. Specifically segment registers are typically not
524 * supplied.
525 *
526 * Assuming the address is a linear address (it is for IBS), we fake the CS and
527 * vm86 mode using the known zero-based code segment and 'fix up' the registers
528 * to reflect this.
529 *
530 * Intel PEBS/LBR appear to typically provide the effective address, nothing
531 * much we can do about that but pray and treat it like a linear address.
532 */
533static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip)
534{
535 regs->cs = kernel_ip(ip) ? __KERNEL_CS : __USER_CS;
536 if (regs->flags & X86_VM_MASK)
537 regs->flags ^= (PERF_EFLAGS_VM | X86_VM_MASK);
538 regs->ip = ip;
539}
540
541ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event);
542ssize_t intel_event_sysfs_show(char *page, u64 config);
543
544#ifdef CONFIG_CPU_SUP_AMD
545
546int amd_pmu_init(void);
547
548#else /* CONFIG_CPU_SUP_AMD */
549
550static inline int amd_pmu_init(void)
551{
552 return 0;
553}
554
555#endif /* CONFIG_CPU_SUP_AMD */
556
557#ifdef CONFIG_CPU_SUP_INTEL
558
559int intel_pmu_save_and_restart(struct perf_event *event);
560
561struct event_constraint *
562x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event);
563
564struct intel_shared_regs *allocate_shared_regs(int cpu);
565
566int intel_pmu_init(void);
567
568void init_debug_store_on_cpu(int cpu);
569
570void fini_debug_store_on_cpu(int cpu);
571
572void release_ds_buffers(void);
573
574void reserve_ds_buffers(void);
575
576extern struct event_constraint bts_constraint;
577
578void intel_pmu_enable_bts(u64 config);
579
580void intel_pmu_disable_bts(void);
581
582int intel_pmu_drain_bts_buffer(void);
583
584extern struct event_constraint intel_core2_pebs_event_constraints[];
585
586extern struct event_constraint intel_atom_pebs_event_constraints[];
587
588extern struct event_constraint intel_nehalem_pebs_event_constraints[];
589
590extern struct event_constraint intel_westmere_pebs_event_constraints[];
591
592extern struct event_constraint intel_snb_pebs_event_constraints[];
593
594extern struct event_constraint intel_ivb_pebs_event_constraints[];
595
596struct event_constraint *intel_pebs_constraints(struct perf_event *event);
597
598void intel_pmu_pebs_enable(struct perf_event *event);
599
600void intel_pmu_pebs_disable(struct perf_event *event);
601
602void intel_pmu_pebs_enable_all(void);
603
604void intel_pmu_pebs_disable_all(void);
605
606void intel_ds_init(void);
607
608void intel_pmu_lbr_reset(void);
609
610void intel_pmu_lbr_enable(struct perf_event *event);
611
612void intel_pmu_lbr_disable(struct perf_event *event);
613
614void intel_pmu_lbr_enable_all(void);
615
616void intel_pmu_lbr_disable_all(void);
617
618void intel_pmu_lbr_read(void);
619
620void intel_pmu_lbr_init_core(void);
621
622void intel_pmu_lbr_init_nhm(void);
623
624void intel_pmu_lbr_init_atom(void);
625
626void intel_pmu_lbr_init_snb(void);
627
628int intel_pmu_setup_lbr_filter(struct perf_event *event);
629
630int p4_pmu_init(void);
631
632int p6_pmu_init(void);
633
634int knc_pmu_init(void);
635
636#else /* CONFIG_CPU_SUP_INTEL */
637
638static inline void reserve_ds_buffers(void)
639{
640}
641
642static inline void release_ds_buffers(void)
643{
644}
645
646static inline int intel_pmu_init(void)
647{
648 return 0;
649}
650
651static inline struct intel_shared_regs *allocate_shared_regs(int cpu)
652{
653 return NULL;
654}
655
656#endif /* CONFIG_CPU_SUP_INTEL */
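A small, hedged sketch of the constraint macros in use, assuming the definitions from the header above plus pr_info(); the chosen events are illustrative:

static struct event_constraint example_constraints[] = {
	INTEL_EVENT_CONSTRAINT(0x51, 0x3),	/* event 0x51 on counters 0-1 */
	FIXED_EVENT_CONSTRAINT(0x00c0, 0),	/* INST_RETIRED.ANY on fixed counter 0 */
	EVENT_CONSTRAINT_END
};

static void example_dump_constraints(void)
{
	struct event_constraint *c;

	for_each_event_constraint(c, example_constraints)
		pr_info("code=0x%llx idxmsk=0x%llx weight=%d\n",
			c->code, c->idxmsk64, c->weight);
}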
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index c93bc4e813a..941caa2e449 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -1,11 +1,4 @@
1#include <linux/perf_event.h> 1#ifdef CONFIG_CPU_SUP_AMD
2#include <linux/export.h>
3#include <linux/types.h>
4#include <linux/init.h>
5#include <linux/slab.h>
6#include <asm/apicdef.h>
7
8#include "perf_event.h"
9 2
10static __initconst const u64 amd_hw_cache_event_ids 3static __initconst const u64 amd_hw_cache_event_ids
11 [PERF_COUNT_HW_CACHE_MAX] 4 [PERF_COUNT_HW_CACHE_MAX]
@@ -134,32 +127,11 @@ static u64 amd_pmu_event_map(int hw_event)
134 127
135static int amd_pmu_hw_config(struct perf_event *event) 128static int amd_pmu_hw_config(struct perf_event *event)
136{ 129{
137 int ret; 130 int ret = x86_pmu_hw_config(event);
138
139 /* pass precise event sampling to ibs: */
140 if (event->attr.precise_ip && get_ibs_caps())
141 return -ENOENT;
142 131
143 ret = x86_pmu_hw_config(event);
144 if (ret) 132 if (ret)
145 return ret; 133 return ret;
146 134
147 if (has_branch_stack(event))
148 return -EOPNOTSUPP;
149
150 if (event->attr.exclude_host && event->attr.exclude_guest)
151 /*
152 * When HO == GO == 1 the hardware treats that as GO == HO == 0
153 * and will count in both modes. We don't want to count in that
154 * case so we emulate no-counting by setting US = OS = 0.
155 */
156 event->hw.config &= ~(ARCH_PERFMON_EVENTSEL_USR |
157 ARCH_PERFMON_EVENTSEL_OS);
158 else if (event->attr.exclude_host)
159 event->hw.config |= AMD_PERFMON_EVENTSEL_GUESTONLY;
160 else if (event->attr.exclude_guest)
161 event->hw.config |= AMD_PERFMON_EVENTSEL_HOSTONLY;
162
163 if (event->attr.type != PERF_TYPE_RAW) 135 if (event->attr.type != PERF_TYPE_RAW)
164 return 0; 136 return 0;
165 137
@@ -210,8 +182,10 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
210 * when we come here 182 * when we come here
211 */ 183 */
212 for (i = 0; i < x86_pmu.num_counters; i++) { 184 for (i = 0; i < x86_pmu.num_counters; i++) {
213 if (cmpxchg(nb->owners + i, event, NULL) == event) 185 if (nb->owners[i] == event) {
186 cmpxchg(nb->owners+i, event, NULL);
214 break; 187 break;
188 }
215 } 189 }
216} 190}
217 191
@@ -364,8 +338,6 @@ static void amd_pmu_cpu_starting(int cpu)
364 struct amd_nb *nb; 338 struct amd_nb *nb;
365 int i, nb_id; 339 int i, nb_id;
366 340
367 cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY;
368
369 if (boot_cpu_data.x86_max_cores < 2) 341 if (boot_cpu_data.x86_max_cores < 2)
370 return; 342 return;
371 343
@@ -378,7 +350,7 @@ static void amd_pmu_cpu_starting(int cpu)
378 continue; 350 continue;
379 351
380 if (nb->nb_id == nb_id) { 352 if (nb->nb_id == nb_id) {
381 cpuc->kfree_on_online = cpuc->amd_nb; 353 kfree(cpuc->amd_nb);
382 cpuc->amd_nb = nb; 354 cpuc->amd_nb = nb;
383 break; 355 break;
384 } 356 }
@@ -407,19 +379,31 @@ static void amd_pmu_cpu_dead(int cpu)
407 } 379 }
408} 380}
409 381
410PMU_FORMAT_ATTR(event, "config:0-7,32-35"); 382static __initconst const struct x86_pmu amd_pmu = {
411PMU_FORMAT_ATTR(umask, "config:8-15" ); 383 .name = "AMD",
412PMU_FORMAT_ATTR(edge, "config:18" ); 384 .handle_irq = x86_pmu_handle_irq,
413PMU_FORMAT_ATTR(inv, "config:23" ); 385 .disable_all = x86_pmu_disable_all,
414PMU_FORMAT_ATTR(cmask, "config:24-31" ); 386 .enable_all = x86_pmu_enable_all,
415 387 .enable = x86_pmu_enable_event,
416static struct attribute *amd_format_attr[] = { 388 .disable = x86_pmu_disable_event,
417 &format_attr_event.attr, 389 .hw_config = amd_pmu_hw_config,
418 &format_attr_umask.attr, 390 .schedule_events = x86_schedule_events,
419 &format_attr_edge.attr, 391 .eventsel = MSR_K7_EVNTSEL0,
420 &format_attr_inv.attr, 392 .perfctr = MSR_K7_PERFCTR0,
421 &format_attr_cmask.attr, 393 .event_map = amd_pmu_event_map,
422 NULL, 394 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
395 .num_counters = 4,
396 .cntval_bits = 48,
397 .cntval_mask = (1ULL << 48) - 1,
398 .apic = 1,
399 /* use highest bit to detect overflow */
400 .max_period = (1ULL << 47) - 1,
401 .get_event_constraints = amd_get_event_constraints,
402 .put_event_constraints = amd_put_event_constraints,
403
404 .cpu_prepare = amd_pmu_cpu_prepare,
405 .cpu_starting = amd_pmu_cpu_starting,
406 .cpu_dead = amd_pmu_cpu_dead,
423}; 407};
424 408
425/* AMD Family 15h */ 409/* AMD Family 15h */
@@ -467,7 +451,6 @@ static struct attribute *amd_format_attr[] = {
467 * 0x023 DE PERF_CTL[2:0] 451 * 0x023 DE PERF_CTL[2:0]
468 * 0x02D LS PERF_CTL[3] 452 * 0x02D LS PERF_CTL[3]
469 * 0x02E LS PERF_CTL[3,0] 453 * 0x02E LS PERF_CTL[3,0]
470 * 0x031 LS PERF_CTL[2:0] (**)
471 * 0x043 CU PERF_CTL[2:0] 454 * 0x043 CU PERF_CTL[2:0]
472 * 0x045 CU PERF_CTL[2:0] 455 * 0x045 CU PERF_CTL[2:0]
473 * 0x046 CU PERF_CTL[2:0] 456 * 0x046 CU PERF_CTL[2:0]
@@ -481,18 +464,16 @@ static struct attribute *amd_format_attr[] = {
481 * 0x0DD LS PERF_CTL[5:0] 464 * 0x0DD LS PERF_CTL[5:0]
482 * 0x0DE LS PERF_CTL[5:0] 465 * 0x0DE LS PERF_CTL[5:0]
483 * 0x0DF LS PERF_CTL[5:0] 466 * 0x0DF LS PERF_CTL[5:0]
484 * 0x1C0 EX PERF_CTL[5:3]
485 * 0x1D6 EX PERF_CTL[5:0] 467 * 0x1D6 EX PERF_CTL[5:0]
486 * 0x1D8 EX PERF_CTL[5:0] 468 * 0x1D8 EX PERF_CTL[5:0]
487 * 469 *
488 * (*) depending on the umask all FPU counters may be used 470 * (*) depending on the umask all FPU counters may be used
489 * (**) only one unitmask enabled at a time
490 */ 471 */
491 472
492static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0); 473static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0);
493static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0); 474static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0);
494static struct event_constraint amd_f15_PMC3 = EVENT_CONSTRAINT(0, 0x08, 0); 475static struct event_constraint amd_f15_PMC3 = EVENT_CONSTRAINT(0, 0x08, 0);
495static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0); 476static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT(0, 0x09, 0);
496static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0); 477static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0);
497static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0); 478static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0);
498 479
@@ -536,12 +517,6 @@ amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *ev
536 return &amd_f15_PMC3; 517 return &amd_f15_PMC3;
537 case 0x02E: 518 case 0x02E:
538 return &amd_f15_PMC30; 519 return &amd_f15_PMC30;
539 case 0x031:
540 if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1)
541 return &amd_f15_PMC20;
542 return &emptyconstraint;
543 case 0x1C0:
544 return &amd_f15_PMC53;
545 default: 520 default:
546 return &amd_f15_PMC50; 521 return &amd_f15_PMC50;
547 } 522 }
@@ -568,16 +543,8 @@ amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *ev
568 } 543 }
569} 544}
570 545
571static ssize_t amd_event_sysfs_show(char *page, u64 config) 546static __initconst const struct x86_pmu amd_pmu_f15h = {
572{ 547 .name = "AMD Family 15h",
573 u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) |
574 (config & AMD64_EVENTSEL_EVENT) >> 24;
575
576 return x86_event_sysfs_show(page, config, event);
577}
578
579static __initconst const struct x86_pmu amd_pmu = {
580 .name = "AMD",
581 .handle_irq = x86_pmu_handle_irq, 548 .handle_irq = x86_pmu_handle_irq,
582 .disable_all = x86_pmu_disable_all, 549 .disable_all = x86_pmu_disable_all,
583 .enable_all = x86_pmu_enable_all, 550 .enable_all = x86_pmu_enable_all,
@@ -585,69 +552,49 @@ static __initconst const struct x86_pmu amd_pmu = {
585 .disable = x86_pmu_disable_event, 552 .disable = x86_pmu_disable_event,
586 .hw_config = amd_pmu_hw_config, 553 .hw_config = amd_pmu_hw_config,
587 .schedule_events = x86_schedule_events, 554 .schedule_events = x86_schedule_events,
588 .eventsel = MSR_K7_EVNTSEL0, 555 .eventsel = MSR_F15H_PERF_CTL,
589 .perfctr = MSR_K7_PERFCTR0, 556 .perfctr = MSR_F15H_PERF_CTR,
590 .event_map = amd_pmu_event_map, 557 .event_map = amd_pmu_event_map,
591 .max_events = ARRAY_SIZE(amd_perfmon_event_map), 558 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
592 .num_counters = AMD64_NUM_COUNTERS, 559 .num_counters = 6,
593 .cntval_bits = 48, 560 .cntval_bits = 48,
594 .cntval_mask = (1ULL << 48) - 1, 561 .cntval_mask = (1ULL << 48) - 1,
595 .apic = 1, 562 .apic = 1,
596 /* use highest bit to detect overflow */ 563 /* use highest bit to detect overflow */
597 .max_period = (1ULL << 47) - 1, 564 .max_period = (1ULL << 47) - 1,
598 .get_event_constraints = amd_get_event_constraints, 565 .get_event_constraints = amd_get_event_constraints_f15h,
 566	/* northbridge counters not yet implemented: */
567#if 0
599 .put_event_constraints = amd_put_event_constraints, 568 .put_event_constraints = amd_put_event_constraints,
600 569
601 .format_attrs = amd_format_attr,
602 .events_sysfs_show = amd_event_sysfs_show,
603
604 .cpu_prepare = amd_pmu_cpu_prepare, 570 .cpu_prepare = amd_pmu_cpu_prepare,
605 .cpu_starting = amd_pmu_cpu_starting, 571 .cpu_starting = amd_pmu_cpu_starting,
606 .cpu_dead = amd_pmu_cpu_dead, 572 .cpu_dead = amd_pmu_cpu_dead,
573#endif
607}; 574};
608 575
609static int setup_event_constraints(void) 576static __init int amd_pmu_init(void)
610{
611 if (boot_cpu_data.x86 >= 0x15)
612 x86_pmu.get_event_constraints = amd_get_event_constraints_f15h;
613 return 0;
614}
615
616static int setup_perfctr_core(void)
617{
618 if (!cpu_has_perfctr_core) {
619 WARN(x86_pmu.get_event_constraints == amd_get_event_constraints_f15h,
620 KERN_ERR "Odd, counter constraints enabled but no core perfctrs detected!");
621 return -ENODEV;
622 }
623
624 WARN(x86_pmu.get_event_constraints == amd_get_event_constraints,
625 KERN_ERR "hw perf events core counters need constraints handler!");
626
627 /*
 628 * If core performance counter extensions exist, we must use
629 * MSR_F15H_PERF_CTL/MSR_F15H_PERF_CTR msrs. See also
630 * x86_pmu_addr_offset().
631 */
632 x86_pmu.eventsel = MSR_F15H_PERF_CTL;
633 x86_pmu.perfctr = MSR_F15H_PERF_CTR;
634 x86_pmu.num_counters = AMD64_NUM_COUNTERS_CORE;
635
636 printk(KERN_INFO "perf: AMD core performance counters detected\n");
637
638 return 0;
639}
640
641__init int amd_pmu_init(void)
642{ 577{
643 /* Performance-monitoring supported from K7 and later: */ 578 /* Performance-monitoring supported from K7 and later: */
644 if (boot_cpu_data.x86 < 6) 579 if (boot_cpu_data.x86 < 6)
645 return -ENODEV; 580 return -ENODEV;
646 581
647 x86_pmu = amd_pmu; 582 /*
 648 583 * If core performance counter extensions exist, it must be
649 setup_event_constraints(); 584 * family 15h, otherwise fail. See x86_pmu_addr_offset().
650 setup_perfctr_core(); 585 */
586 switch (boot_cpu_data.x86) {
587 case 0x15:
588 if (!cpu_has_perfctr_core)
589 return -ENODEV;
590 x86_pmu = amd_pmu_f15h;
591 break;
592 default:
593 if (cpu_has_perfctr_core)
594 return -ENODEV;
595 x86_pmu = amd_pmu;
596 break;
597 }
651 598
652 /* Events are common for all AMDs */ 599 /* Events are common for all AMDs */
653 memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, 600 memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
@@ -656,32 +603,11 @@ __init int amd_pmu_init(void)
656 return 0; 603 return 0;
657} 604}
658 605
659void amd_pmu_enable_virt(void) 606#else /* CONFIG_CPU_SUP_AMD */
660{
661 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
662
663 cpuc->perf_ctr_virt_mask = 0;
664
665 /* Reload all events */
666 x86_pmu_disable_all();
667 x86_pmu_enable_all(0);
668}
669EXPORT_SYMBOL_GPL(amd_pmu_enable_virt);
670 607
671void amd_pmu_disable_virt(void) 608static int amd_pmu_init(void)
672{ 609{
673 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 610 return 0;
674
675 /*
676 * We only mask out the Host-only bit so that host-only counting works
677 * when SVM is disabled. If someone sets up a guest-only counter when
678 * SVM is disabled the Guest-only bits still gets set and the counter
679 * will not count anything.
680 */
681 cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY;
682
683 /* Reload all events */
684 x86_pmu_disable_all();
685 x86_pmu_enable_all(0);
686} 611}
687EXPORT_SYMBOL_GPL(amd_pmu_disable_virt); 612
613#endif
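A sketch, under the assumption that a hypervisor pairs these calls around enabling and disabling SVM, of how the virtualization helpers deleted above would typically be used; the surrounding function names are illustrative:

static void example_svm_hardware_enable(void)
{
	/* ... enable SVM in EFER, set up the host save area ... */
	amd_pmu_enable_virt();		/* drop the Host-only mask, reload events */
}

static void example_svm_hardware_disable(void)
{
	amd_pmu_disable_virt();		/* mask the Host-only bit again */
	/* ... disable SVM in EFER ... */
}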
diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
deleted file mode 100644
index 6336bcbd061..00000000000
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ /dev/null
@@ -1,908 +0,0 @@
1/*
2 * Performance events - AMD IBS
3 *
4 * Copyright (C) 2011 Advanced Micro Devices, Inc., Robert Richter
5 *
6 * For licencing details see kernel-base/COPYING
7 */
8
9#include <linux/perf_event.h>
10#include <linux/module.h>
11#include <linux/pci.h>
12#include <linux/ptrace.h>
13
14#include <asm/apic.h>
15
16#include "perf_event.h"
17
18static u32 ibs_caps;
19
20#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD)
21
22#include <linux/kprobes.h>
23#include <linux/hardirq.h>
24
25#include <asm/nmi.h>
26
27#define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT)
28#define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT
29
30enum ibs_states {
31 IBS_ENABLED = 0,
32 IBS_STARTED = 1,
33 IBS_STOPPING = 2,
34
35 IBS_MAX_STATES,
36};
37
38struct cpu_perf_ibs {
39 struct perf_event *event;
40 unsigned long state[BITS_TO_LONGS(IBS_MAX_STATES)];
41};
42
43struct perf_ibs {
44 struct pmu pmu;
45 unsigned int msr;
46 u64 config_mask;
47 u64 cnt_mask;
48 u64 enable_mask;
49 u64 valid_mask;
50 u64 max_period;
51 unsigned long offset_mask[1];
52 int offset_max;
53 struct cpu_perf_ibs __percpu *pcpu;
54
55 struct attribute **format_attrs;
56 struct attribute_group format_group;
57 const struct attribute_group *attr_groups[2];
58
59 u64 (*get_count)(u64 config);
60};
61
62struct perf_ibs_data {
63 u32 size;
64 union {
65 u32 data[0]; /* data buffer starts here */
66 u32 caps;
67 };
68 u64 regs[MSR_AMD64_IBS_REG_COUNT_MAX];
69};
70
71static int
72perf_event_set_period(struct hw_perf_event *hwc, u64 min, u64 max, u64 *hw_period)
73{
74 s64 left = local64_read(&hwc->period_left);
75 s64 period = hwc->sample_period;
76 int overflow = 0;
77
78 /*
79 * If we are way outside a reasonable range then just skip forward:
80 */
81 if (unlikely(left <= -period)) {
82 left = period;
83 local64_set(&hwc->period_left, left);
84 hwc->last_period = period;
85 overflow = 1;
86 }
87
88 if (unlikely(left < (s64)min)) {
89 left += period;
90 local64_set(&hwc->period_left, left);
91 hwc->last_period = period;
92 overflow = 1;
93 }
94
95 /*
96 * If the hw period that triggers the sw overflow is too short
97 * we might hit the irq handler. This biases the results.
98 * Thus we shorten the next-to-last period and set the last
99 * period to the max period.
100 */
101 if (left > max) {
102 left -= max;
103 if (left > max)
104 left = max;
105 else if (left < min)
106 left = min;
107 }
108
109 *hw_period = (u64)left;
110
111 return overflow;
112}
113
114static int
115perf_event_try_update(struct perf_event *event, u64 new_raw_count, int width)
116{
117 struct hw_perf_event *hwc = &event->hw;
118 int shift = 64 - width;
119 u64 prev_raw_count;
120 u64 delta;
121
122 /*
123 * Careful: an NMI might modify the previous event value.
124 *
125 * Our tactic to handle this is to first atomically read and
126 * exchange a new raw count - then add that new-prev delta
127 * count to the generic event atomically:
128 */
129 prev_raw_count = local64_read(&hwc->prev_count);
130 if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
131 new_raw_count) != prev_raw_count)
132 return 0;
133
134 /*
135 * Now we have the new raw value and have updated the prev
136 * timestamp already. We can now calculate the elapsed delta
137 * (event-)time and add that to the generic event.
138 *
139 * Careful, not all hw sign-extends above the physical width
140 * of the count.
141 */
142 delta = (new_raw_count << shift) - (prev_raw_count << shift);
143 delta >>= shift;
144
145 local64_add(delta, &event->count);
146 local64_sub(delta, &hwc->period_left);
147
148 return 1;
149}
150
151static struct perf_ibs perf_ibs_fetch;
152static struct perf_ibs perf_ibs_op;
153
154static struct perf_ibs *get_ibs_pmu(int type)
155{
156 if (perf_ibs_fetch.pmu.type == type)
157 return &perf_ibs_fetch;
158 if (perf_ibs_op.pmu.type == type)
159 return &perf_ibs_op;
160 return NULL;
161}
162
163/*
164 * Use IBS for precise event sampling:
165 *
166 * perf record -a -e cpu-cycles:p ... # use ibs op counting cycle count
167 * perf record -a -e r076:p ... # same as -e cpu-cycles:p
168 * perf record -a -e r0C1:p ... # use ibs op counting micro-ops
169 *
170 * IbsOpCntCtl (bit 19) of IBS Execution Control Register (IbsOpCtl,
171 * MSRC001_1033) is used to select either cycle or micro-ops counting
172 * mode.
173 *
174 * The rip of IBS samples has skid 0. Thus, IBS supports precise
175 * levels 1 and 2 and the PERF_EFLAGS_EXACT is set. In rare cases the
176 * rip is invalid when IBS was not able to record the rip correctly.
 177 * We then clear PERF_EFLAGS_EXACT and take the rip from pt_regs.
178 *
179 */
180static int perf_ibs_precise_event(struct perf_event *event, u64 *config)
181{
182 switch (event->attr.precise_ip) {
183 case 0:
184 return -ENOENT;
185 case 1:
186 case 2:
187 break;
188 default:
189 return -EOPNOTSUPP;
190 }
191
192 switch (event->attr.type) {
193 case PERF_TYPE_HARDWARE:
194 switch (event->attr.config) {
195 case PERF_COUNT_HW_CPU_CYCLES:
196 *config = 0;
197 return 0;
198 }
199 break;
200 case PERF_TYPE_RAW:
201 switch (event->attr.config) {
202 case 0x0076:
203 *config = 0;
204 return 0;
205 case 0x00C1:
206 *config = IBS_OP_CNT_CTL;
207 return 0;
208 }
209 break;
210 default:
211 return -ENOENT;
212 }
213
214 return -EOPNOTSUPP;
215}
216
217static const struct perf_event_attr ibs_notsupp = {
218 .exclude_user = 1,
219 .exclude_kernel = 1,
220 .exclude_hv = 1,
221 .exclude_idle = 1,
222 .exclude_host = 1,
223 .exclude_guest = 1,
224};
225
226static int perf_ibs_init(struct perf_event *event)
227{
228 struct hw_perf_event *hwc = &event->hw;
229 struct perf_ibs *perf_ibs;
230 u64 max_cnt, config;
231 int ret;
232
233 perf_ibs = get_ibs_pmu(event->attr.type);
234 if (perf_ibs) {
235 config = event->attr.config;
236 } else {
237 perf_ibs = &perf_ibs_op;
238 ret = perf_ibs_precise_event(event, &config);
239 if (ret)
240 return ret;
241 }
242
243 if (event->pmu != &perf_ibs->pmu)
244 return -ENOENT;
245
246 if (perf_flags(&event->attr) & perf_flags(&ibs_notsupp))
247 return -EINVAL;
248
249 if (config & ~perf_ibs->config_mask)
250 return -EINVAL;
251
252 if (hwc->sample_period) {
253 if (config & perf_ibs->cnt_mask)
254 /* raw max_cnt may not be set */
255 return -EINVAL;
256 if (!event->attr.sample_freq && hwc->sample_period & 0x0f)
257 /*
 258 * lower 4 bits cannot be set in ibs max cnt,
 259 * but we allow it in case we adjust the
260 * sample period to set a frequency.
261 */
262 return -EINVAL;
263 hwc->sample_period &= ~0x0FULL;
264 if (!hwc->sample_period)
265 hwc->sample_period = 0x10;
266 } else {
267 max_cnt = config & perf_ibs->cnt_mask;
268 config &= ~perf_ibs->cnt_mask;
269 event->attr.sample_period = max_cnt << 4;
270 hwc->sample_period = event->attr.sample_period;
271 }
272
273 if (!hwc->sample_period)
274 return -EINVAL;
275
276 /*
277 * If we modify hwc->sample_period, we also need to update
278 * hwc->last_period and hwc->period_left.
279 */
280 hwc->last_period = hwc->sample_period;
281 local64_set(&hwc->period_left, hwc->sample_period);
282
283 hwc->config_base = perf_ibs->msr;
284 hwc->config = config;
285
286 return 0;
287}
288
289static int perf_ibs_set_period(struct perf_ibs *perf_ibs,
290 struct hw_perf_event *hwc, u64 *period)
291{
292 int overflow;
293
294 /* ignore lower 4 bits in min count: */
295 overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period);
296 local64_set(&hwc->prev_count, 0);
297
298 return overflow;
299}
300
301static u64 get_ibs_fetch_count(u64 config)
302{
303 return (config & IBS_FETCH_CNT) >> 12;
304}
305
306static u64 get_ibs_op_count(u64 config)
307{
308 u64 count = 0;
309
310 if (config & IBS_OP_VAL)
311 count += (config & IBS_OP_MAX_CNT) << 4; /* cnt rolled over */
312
313 if (ibs_caps & IBS_CAPS_RDWROPCNT)
314 count += (config & IBS_OP_CUR_CNT) >> 32;
315
316 return count;
317}
318
319static void
320perf_ibs_event_update(struct perf_ibs *perf_ibs, struct perf_event *event,
321 u64 *config)
322{
323 u64 count = perf_ibs->get_count(*config);
324
325 /*
326 * Set width to 64 since we do not overflow on max width but
327 * instead on max count. In perf_ibs_set_period() we clear
328 * prev count manually on overflow.
329 */
330 while (!perf_event_try_update(event, count, 64)) {
331 rdmsrl(event->hw.config_base, *config);
332 count = perf_ibs->get_count(*config);
333 }
334}
335
336static inline void perf_ibs_enable_event(struct perf_ibs *perf_ibs,
337 struct hw_perf_event *hwc, u64 config)
338{
339 wrmsrl(hwc->config_base, hwc->config | config | perf_ibs->enable_mask);
340}
341
342/*
343 * Erratum #420 Instruction-Based Sampling Engine May Generate
344 * Interrupt that Cannot Be Cleared:
345 *
346 * Must clear counter mask first, then clear the enable bit. See
347 * Revision Guide for AMD Family 10h Processors, Publication #41322.
348 */
349static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs,
350 struct hw_perf_event *hwc, u64 config)
351{
352 config &= ~perf_ibs->cnt_mask;
353 wrmsrl(hwc->config_base, config);
354 config &= ~perf_ibs->enable_mask;
355 wrmsrl(hwc->config_base, config);
356}
357
358/*
 359 * We cannot restore the ibs pmu state, so we always need to update
 360 * the event while stopping it and then reset the state when starting
 361 * again. Thus, we ignore the PERF_EF_RELOAD and PERF_EF_UPDATE flags in
 362 * perf_ibs_start()/perf_ibs_stop() and always do it.
363 */
364static void perf_ibs_start(struct perf_event *event, int flags)
365{
366 struct hw_perf_event *hwc = &event->hw;
367 struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
368 struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
369 u64 period;
370
371 if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
372 return;
373
374 WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
375 hwc->state = 0;
376
377 perf_ibs_set_period(perf_ibs, hwc, &period);
378 set_bit(IBS_STARTED, pcpu->state);
379 perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
380
381 perf_event_update_userpage(event);
382}
383
384static void perf_ibs_stop(struct perf_event *event, int flags)
385{
386 struct hw_perf_event *hwc = &event->hw;
387 struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
388 struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
389 u64 config;
390 int stopping;
391
392 stopping = test_and_clear_bit(IBS_STARTED, pcpu->state);
393
394 if (!stopping && (hwc->state & PERF_HES_UPTODATE))
395 return;
396
397 rdmsrl(hwc->config_base, config);
398
399 if (stopping) {
400 set_bit(IBS_STOPPING, pcpu->state);
401 perf_ibs_disable_event(perf_ibs, hwc, config);
402 WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
403 hwc->state |= PERF_HES_STOPPED;
404 }
405
406 if (hwc->state & PERF_HES_UPTODATE)
407 return;
408
409 /*
 410 * Clear the valid bit so that rollovers are not counted on update;
 411 * rollovers are only accounted for in the irq handler.
412 */
413 config &= ~perf_ibs->valid_mask;
414
415 perf_ibs_event_update(perf_ibs, event, &config);
416 hwc->state |= PERF_HES_UPTODATE;
417}
418
419static int perf_ibs_add(struct perf_event *event, int flags)
420{
421 struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
422 struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
423
424 if (test_and_set_bit(IBS_ENABLED, pcpu->state))
425 return -ENOSPC;
426
427 event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
428
429 pcpu->event = event;
430
431 if (flags & PERF_EF_START)
432 perf_ibs_start(event, PERF_EF_RELOAD);
433
434 return 0;
435}
436
437static void perf_ibs_del(struct perf_event *event, int flags)
438{
439 struct perf_ibs *perf_ibs = container_of(event->pmu, struct perf_ibs, pmu);
440 struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
441
442 if (!test_and_clear_bit(IBS_ENABLED, pcpu->state))
443 return;
444
445 perf_ibs_stop(event, PERF_EF_UPDATE);
446
447 pcpu->event = NULL;
448
449 perf_event_update_userpage(event);
450}
451
452static void perf_ibs_read(struct perf_event *event) { }
453
454PMU_FORMAT_ATTR(rand_en, "config:57");
455PMU_FORMAT_ATTR(cnt_ctl, "config:19");
456
457static struct attribute *ibs_fetch_format_attrs[] = {
458 &format_attr_rand_en.attr,
459 NULL,
460};
461
462static struct attribute *ibs_op_format_attrs[] = {
463 NULL, /* &format_attr_cnt_ctl.attr if IBS_CAPS_OPCNT */
464 NULL,
465};
466
467static struct perf_ibs perf_ibs_fetch = {
468 .pmu = {
469 .task_ctx_nr = perf_invalid_context,
470
471 .event_init = perf_ibs_init,
472 .add = perf_ibs_add,
473 .del = perf_ibs_del,
474 .start = perf_ibs_start,
475 .stop = perf_ibs_stop,
476 .read = perf_ibs_read,
477 },
478 .msr = MSR_AMD64_IBSFETCHCTL,
479 .config_mask = IBS_FETCH_CONFIG_MASK,
480 .cnt_mask = IBS_FETCH_MAX_CNT,
481 .enable_mask = IBS_FETCH_ENABLE,
482 .valid_mask = IBS_FETCH_VAL,
483 .max_period = IBS_FETCH_MAX_CNT << 4,
484 .offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK },
485 .offset_max = MSR_AMD64_IBSFETCH_REG_COUNT,
486 .format_attrs = ibs_fetch_format_attrs,
487
488 .get_count = get_ibs_fetch_count,
489};
490
491static struct perf_ibs perf_ibs_op = {
492 .pmu = {
493 .task_ctx_nr = perf_invalid_context,
494
495 .event_init = perf_ibs_init,
496 .add = perf_ibs_add,
497 .del = perf_ibs_del,
498 .start = perf_ibs_start,
499 .stop = perf_ibs_stop,
500 .read = perf_ibs_read,
501 },
502 .msr = MSR_AMD64_IBSOPCTL,
503 .config_mask = IBS_OP_CONFIG_MASK,
504 .cnt_mask = IBS_OP_MAX_CNT,
505 .enable_mask = IBS_OP_ENABLE,
506 .valid_mask = IBS_OP_VAL,
507 .max_period = IBS_OP_MAX_CNT << 4,
508 .offset_mask = { MSR_AMD64_IBSOP_REG_MASK },
509 .offset_max = MSR_AMD64_IBSOP_REG_COUNT,
510 .format_attrs = ibs_op_format_attrs,
511
512 .get_count = get_ibs_op_count,
513};
514
515static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs)
516{
517 struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu);
518 struct perf_event *event = pcpu->event;
519 struct hw_perf_event *hwc = &event->hw;
520 struct perf_sample_data data;
521 struct perf_raw_record raw;
522 struct pt_regs regs;
523 struct perf_ibs_data ibs_data;
524 int offset, size, check_rip, offset_max, throttle = 0;
525 unsigned int msr;
526 u64 *buf, *config, period;
527
528 if (!test_bit(IBS_STARTED, pcpu->state)) {
529 /*
530 * Catch spurious interrupts after stopping IBS: After
 531 * disabling IBS there could still be incoming NMIs
 532 * with samples that even have the valid bit cleared.
 533 * Mark all these NMIs as handled.
534 */
535 return test_and_clear_bit(IBS_STOPPING, pcpu->state) ? 1 : 0;
536 }
537
538 msr = hwc->config_base;
539 buf = ibs_data.regs;
540 rdmsrl(msr, *buf);
541 if (!(*buf++ & perf_ibs->valid_mask))
542 return 0;
543
544 config = &ibs_data.regs[0];
545 perf_ibs_event_update(perf_ibs, event, config);
546 perf_sample_data_init(&data, 0, hwc->last_period);
547 if (!perf_ibs_set_period(perf_ibs, hwc, &period))
548 goto out; /* no sw counter overflow */
549
550 ibs_data.caps = ibs_caps;
551 size = 1;
552 offset = 1;
553 check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK));
554 if (event->attr.sample_type & PERF_SAMPLE_RAW)
555 offset_max = perf_ibs->offset_max;
556 else if (check_rip)
557 offset_max = 2;
558 else
559 offset_max = 1;
560 do {
561 rdmsrl(msr + offset, *buf++);
562 size++;
563 offset = find_next_bit(perf_ibs->offset_mask,
564 perf_ibs->offset_max,
565 offset + 1);
566 } while (offset < offset_max);
567 ibs_data.size = sizeof(u64) * size;
568
569 regs = *iregs;
570 if (check_rip && (ibs_data.regs[2] & IBS_RIP_INVALID)) {
571 regs.flags &= ~PERF_EFLAGS_EXACT;
572 } else {
573 set_linear_ip(&regs, ibs_data.regs[1]);
574 regs.flags |= PERF_EFLAGS_EXACT;
575 }
576
577 if (event->attr.sample_type & PERF_SAMPLE_RAW) {
578 raw.size = sizeof(u32) + ibs_data.size;
579 raw.data = ibs_data.data;
580 data.raw = &raw;
581 }
582
583 throttle = perf_event_overflow(event, &data, &regs);
584out:
585 if (throttle)
586 perf_ibs_disable_event(perf_ibs, hwc, *config);
587 else
588 perf_ibs_enable_event(perf_ibs, hwc, period >> 4);
589
590 perf_event_update_userpage(event);
591
592 return 1;
593}
594
595static int __kprobes
596perf_ibs_nmi_handler(unsigned int cmd, struct pt_regs *regs)
597{
598 int handled = 0;
599
600 handled += perf_ibs_handle_irq(&perf_ibs_fetch, regs);
601 handled += perf_ibs_handle_irq(&perf_ibs_op, regs);
602
603 if (handled)
604 inc_irq_stat(apic_perf_irqs);
605
606 return handled;
607}
608
609static __init int perf_ibs_pmu_init(struct perf_ibs *perf_ibs, char *name)
610{
611 struct cpu_perf_ibs __percpu *pcpu;
612 int ret;
613
614 pcpu = alloc_percpu(struct cpu_perf_ibs);
615 if (!pcpu)
616 return -ENOMEM;
617
618 perf_ibs->pcpu = pcpu;
619
620 /* register attributes */
621 if (perf_ibs->format_attrs[0]) {
622 memset(&perf_ibs->format_group, 0, sizeof(perf_ibs->format_group));
623 perf_ibs->format_group.name = "format";
624 perf_ibs->format_group.attrs = perf_ibs->format_attrs;
625
626 memset(&perf_ibs->attr_groups, 0, sizeof(perf_ibs->attr_groups));
627 perf_ibs->attr_groups[0] = &perf_ibs->format_group;
628 perf_ibs->pmu.attr_groups = perf_ibs->attr_groups;
629 }
630
631 ret = perf_pmu_register(&perf_ibs->pmu, name, -1);
632 if (ret) {
633 perf_ibs->pcpu = NULL;
634 free_percpu(pcpu);
635 }
636
637 return ret;
638}
639
640static __init int perf_event_ibs_init(void)
641{
642 struct attribute **attr = ibs_op_format_attrs;
643
644 if (!ibs_caps)
645 return -ENODEV; /* ibs not supported by the cpu */
646
647 perf_ibs_pmu_init(&perf_ibs_fetch, "ibs_fetch");
648
649 if (ibs_caps & IBS_CAPS_OPCNT) {
650 perf_ibs_op.config_mask |= IBS_OP_CNT_CTL;
651 *attr++ = &format_attr_cnt_ctl.attr;
652 }
653 perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
654
655 register_nmi_handler(NMI_LOCAL, perf_ibs_nmi_handler, 0, "perf_ibs");
656 printk(KERN_INFO "perf: AMD IBS detected (0x%08x)\n", ibs_caps);
657
658 return 0;
659}
660
661#else /* defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) */
662
663static __init int perf_event_ibs_init(void) { return 0; }
664
665#endif
666
667/* IBS - apic initialization, for perf and oprofile */
668
669static __init u32 __get_ibs_caps(void)
670{
671 u32 caps;
672 unsigned int max_level;
673
674 if (!boot_cpu_has(X86_FEATURE_IBS))
675 return 0;
676
677 /* check IBS cpuid feature flags */
678 max_level = cpuid_eax(0x80000000);
679 if (max_level < IBS_CPUID_FEATURES)
680 return IBS_CAPS_DEFAULT;
681
682 caps = cpuid_eax(IBS_CPUID_FEATURES);
683 if (!(caps & IBS_CAPS_AVAIL))
684 /* cpuid flags not valid */
685 return IBS_CAPS_DEFAULT;
686
687 return caps;
688}
689
690u32 get_ibs_caps(void)
691{
692 return ibs_caps;
693}
694
695EXPORT_SYMBOL(get_ibs_caps);
696
697static inline int get_eilvt(int offset)
698{
699 return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1);
700}
701
702static inline int put_eilvt(int offset)
703{
704 return !setup_APIC_eilvt(offset, 0, 0, 1);
705}
706
707/*
708 * Check and reserve APIC extended interrupt LVT offset for IBS if available.
709 */
710static inline int ibs_eilvt_valid(void)
711{
712 int offset;
713 u64 val;
714 int valid = 0;
715
716 preempt_disable();
717
718 rdmsrl(MSR_AMD64_IBSCTL, val);
719 offset = val & IBSCTL_LVT_OFFSET_MASK;
720
721 if (!(val & IBSCTL_LVT_OFFSET_VALID)) {
722 pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n",
723 smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
724 goto out;
725 }
726
727 if (!get_eilvt(offset)) {
728 pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n",
729 smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
730 goto out;
731 }
732
733 valid = 1;
734out:
735 preempt_enable();
736
737 return valid;
738}
739
740static int setup_ibs_ctl(int ibs_eilvt_off)
741{
742 struct pci_dev *cpu_cfg;
743 int nodes;
744 u32 value = 0;
745
746 nodes = 0;
747 cpu_cfg = NULL;
748 do {
749 cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD,
750 PCI_DEVICE_ID_AMD_10H_NB_MISC,
751 cpu_cfg);
752 if (!cpu_cfg)
753 break;
754 ++nodes;
755 pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off
756 | IBSCTL_LVT_OFFSET_VALID);
757 pci_read_config_dword(cpu_cfg, IBSCTL, &value);
758 if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) {
759 pci_dev_put(cpu_cfg);
760 printk(KERN_DEBUG "Failed to setup IBS LVT offset, "
761 "IBSCTL = 0x%08x\n", value);
762 return -EINVAL;
763 }
764 } while (1);
765
766 if (!nodes) {
767 printk(KERN_DEBUG "No CPU node configured for IBS\n");
768 return -ENODEV;
769 }
770
771 return 0;
772}
773
774/*
775 * This runs only on the current cpu. We try to find an LVT offset and
 776 * set up the local APIC. For this we must disable preemption. On
 777 * success we initialize all nodes with this offset. This then updates
 778 * the offset in the IBS_CTL per-node msr. The per-core APIC setup of
 779 * the IBS interrupt vector is handled by perf_ibs_cpu_notifier, which
 780 * uses the new offset.
781 */
782static int force_ibs_eilvt_setup(void)
783{
784 int offset;
785 int ret;
786
787 preempt_disable();
788 /* find the next free available EILVT entry, skip offset 0 */
789 for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) {
790 if (get_eilvt(offset))
791 break;
792 }
793 preempt_enable();
794
795 if (offset == APIC_EILVT_NR_MAX) {
796 printk(KERN_DEBUG "No EILVT entry available\n");
797 return -EBUSY;
798 }
799
800 ret = setup_ibs_ctl(offset);
801 if (ret)
802 goto out;
803
804 if (!ibs_eilvt_valid()) {
805 ret = -EFAULT;
806 goto out;
807 }
808
809 pr_info("IBS: LVT offset %d assigned\n", offset);
810
811 return 0;
812out:
813 preempt_disable();
814 put_eilvt(offset);
815 preempt_enable();
816 return ret;
817}
818
819static inline int get_ibs_lvt_offset(void)
820{
821 u64 val;
822
823 rdmsrl(MSR_AMD64_IBSCTL, val);
824 if (!(val & IBSCTL_LVT_OFFSET_VALID))
825 return -EINVAL;
826
827 return val & IBSCTL_LVT_OFFSET_MASK;
828}
829
830static void setup_APIC_ibs(void *dummy)
831{
832 int offset;
833
834 offset = get_ibs_lvt_offset();
835 if (offset < 0)
836 goto failed;
837
838 if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0))
839 return;
840failed:
841 pr_warn("perf: IBS APIC setup failed on cpu #%d\n",
842 smp_processor_id());
843}
844
845static void clear_APIC_ibs(void *dummy)
846{
847 int offset;
848
849 offset = get_ibs_lvt_offset();
850 if (offset >= 0)
851 setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1);
852}
853
854static int __cpuinit
855perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
856{
857 switch (action & ~CPU_TASKS_FROZEN) {
858 case CPU_STARTING:
859 setup_APIC_ibs(NULL);
860 break;
861 case CPU_DYING:
862 clear_APIC_ibs(NULL);
863 break;
864 default:
865 break;
866 }
867
868 return NOTIFY_OK;
869}
870
871static __init int amd_ibs_init(void)
872{
873 u32 caps;
874 int ret = -EINVAL;
875
876 caps = __get_ibs_caps();
877 if (!caps)
878 return -ENODEV; /* ibs not supported by the cpu */
879
880 /*
881 * Force LVT offset assignment for family 10h: The offsets are
882 * not assigned by the BIOS for this family, so the OS is
883 * responsible for doing it. If the OS assignment fails, fall
 884 * back to the BIOS settings and try to set this up.
885 */
886 if (boot_cpu_data.x86 == 0x10)
887 force_ibs_eilvt_setup();
888
889 if (!ibs_eilvt_valid())
890 goto out;
891
892 get_online_cpus();
893 ibs_caps = caps;
894 /* make ibs_caps visible to other cpus: */
895 smp_mb();
896 perf_cpu_notifier(perf_ibs_cpu_notifier);
897 smp_call_function(setup_APIC_ibs, NULL, 1);
898 put_online_cpus();
899
900 ret = perf_event_ibs_init();
901out:
902 if (ret)
903 pr_err("Failed to setup IBS, %d\n", ret);
904 return ret;
905}
906
907/* Since we need the pci subsystem to init ibs we can't do this earlier: */
908device_initcall(amd_ibs_init);
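A brief sketch of how another subsystem could probe for IBS using the accessor that the deleted file exported; IBS_CAPS_OPCNT is assumed to come from asm/perf_event.h and the message text is illustrative:

static int example_check_ibs(void)
{
	u32 caps = get_ibs_caps();

	if (!caps)
		return -ENODEV;		/* IBS absent or not yet initialized */

	if (caps & IBS_CAPS_OPCNT)
		pr_info("IBS: micro-op count mode available\n");

	return 0;
}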
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 93b9e1181f8..f88af2c2a56 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1,36 +1,29 @@
1#ifdef CONFIG_CPU_SUP_INTEL
2
1/* 3/*
2 * Per core/cpu state 4 * Per core/cpu state
3 * 5 *
4 * Used to coordinate shared registers between HT threads or 6 * Used to coordinate shared registers between HT threads or
5 * among events on a single PMU. 7 * among events on a single PMU.
6 */ 8 */
7 9struct intel_shared_regs {
8#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 10 struct er_account regs[EXTRA_REG_MAX];
9 11 int refcnt; /* per-core: #HT threads */
10#include <linux/stddef.h> 12 unsigned core_id; /* per-core: core id */
11#include <linux/types.h> 13};
12#include <linux/init.h>
13#include <linux/slab.h>
14#include <linux/export.h>
15
16#include <asm/hardirq.h>
17#include <asm/apic.h>
18
19#include "perf_event.h"
20 14
21/* 15/*
22 * Intel PerfMon, used on Core and later. 16 * Intel PerfMon, used on Core and later.
23 */ 17 */
24static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = 18static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
25{ 19{
26 [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, 20 [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
27 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, 21 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
28 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, 22 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
29 [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, 23 [PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
30 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, 24 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
31 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, 25 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
32 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, 26 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
33 [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */
34}; 27};
35 28
36static struct event_constraint intel_core_event_constraints[] __read_mostly = 29static struct event_constraint intel_core_event_constraints[] __read_mostly =
@@ -48,7 +41,12 @@ static struct event_constraint intel_core2_event_constraints[] __read_mostly =
48{ 41{
49 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 42 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
50 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 43 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
51 FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ 44 /*
45 * Core2 has Fixed Counter 2 listed as CPU_CLK_UNHALTED.REF and event
46 * 0x013c as CPU_CLK_UNHALTED.BUS and specifies there is a fixed
47 * ratio between these counters.
48 */
49 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
52 INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ 50 INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
53 INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ 51 INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
54 INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ 52 INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
@@ -66,7 +64,7 @@ static struct event_constraint intel_nehalem_event_constraints[] __read_mostly =
66{ 64{
67 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 65 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
68 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 66 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
69 FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ 67 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
70 INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ 68 INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
71 INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ 69 INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
72 INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ 70 INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
@@ -88,7 +86,7 @@ static struct event_constraint intel_westmere_event_constraints[] __read_mostly
88{ 86{
89 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 87 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
90 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 88 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
91 FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ 89 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
92 INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ 90 INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
93 INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */ 91 INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */
94 INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ 92 INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
@@ -100,7 +98,7 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly =
100{ 98{
101 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 99 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
102 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 100 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
103 FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ 101 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
104 INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */ 102 INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */
105 INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ 103 INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
106 INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ 104 INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
@@ -123,7 +121,7 @@ static struct event_constraint intel_gen_event_constraints[] __read_mostly =
123{ 121{
124 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 122 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
125 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 123 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
126 FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ 124 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
127 EVENT_CONSTRAINT_END 125 EVENT_CONSTRAINT_END
128}; 126};
129 127
@@ -138,84 +136,6 @@ static u64 intel_pmu_event_map(int hw_event)
138 return intel_perfmon_event_map[hw_event]; 136 return intel_perfmon_event_map[hw_event];
139} 137}
140 138
141#define SNB_DMND_DATA_RD (1ULL << 0)
142#define SNB_DMND_RFO (1ULL << 1)
143#define SNB_DMND_IFETCH (1ULL << 2)
144#define SNB_DMND_WB (1ULL << 3)
145#define SNB_PF_DATA_RD (1ULL << 4)
146#define SNB_PF_RFO (1ULL << 5)
147#define SNB_PF_IFETCH (1ULL << 6)
148#define SNB_LLC_DATA_RD (1ULL << 7)
149#define SNB_LLC_RFO (1ULL << 8)
150#define SNB_LLC_IFETCH (1ULL << 9)
151#define SNB_BUS_LOCKS (1ULL << 10)
152#define SNB_STRM_ST (1ULL << 11)
153#define SNB_OTHER (1ULL << 15)
154#define SNB_RESP_ANY (1ULL << 16)
155#define SNB_NO_SUPP (1ULL << 17)
156#define SNB_LLC_HITM (1ULL << 18)
157#define SNB_LLC_HITE (1ULL << 19)
158#define SNB_LLC_HITS (1ULL << 20)
159#define SNB_LLC_HITF (1ULL << 21)
160#define SNB_LOCAL (1ULL << 22)
161#define SNB_REMOTE (0xffULL << 23)
162#define SNB_SNP_NONE (1ULL << 31)
163#define SNB_SNP_NOT_NEEDED (1ULL << 32)
164#define SNB_SNP_MISS (1ULL << 33)
165#define SNB_NO_FWD (1ULL << 34)
166#define SNB_SNP_FWD (1ULL << 35)
167#define SNB_HITM (1ULL << 36)
168#define SNB_NON_DRAM (1ULL << 37)
169
170#define SNB_DMND_READ (SNB_DMND_DATA_RD|SNB_LLC_DATA_RD)
171#define SNB_DMND_WRITE (SNB_DMND_RFO|SNB_LLC_RFO)
172#define SNB_DMND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO)
173
174#define SNB_SNP_ANY (SNB_SNP_NONE|SNB_SNP_NOT_NEEDED| \
175 SNB_SNP_MISS|SNB_NO_FWD|SNB_SNP_FWD| \
176 SNB_HITM)
177
178#define SNB_DRAM_ANY (SNB_LOCAL|SNB_REMOTE|SNB_SNP_ANY)
179#define SNB_DRAM_REMOTE (SNB_REMOTE|SNB_SNP_ANY)
180
181#define SNB_L3_ACCESS SNB_RESP_ANY
182#define SNB_L3_MISS (SNB_DRAM_ANY|SNB_NON_DRAM)
183
184static __initconst const u64 snb_hw_cache_extra_regs
185 [PERF_COUNT_HW_CACHE_MAX]
186 [PERF_COUNT_HW_CACHE_OP_MAX]
187 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
188{
189 [ C(LL ) ] = {
190 [ C(OP_READ) ] = {
191 [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_L3_ACCESS,
192 [ C(RESULT_MISS) ] = SNB_DMND_READ|SNB_L3_MISS,
193 },
194 [ C(OP_WRITE) ] = {
195 [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_L3_ACCESS,
196 [ C(RESULT_MISS) ] = SNB_DMND_WRITE|SNB_L3_MISS,
197 },
198 [ C(OP_PREFETCH) ] = {
199 [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_L3_ACCESS,
200 [ C(RESULT_MISS) ] = SNB_DMND_PREFETCH|SNB_L3_MISS,
201 },
202 },
203 [ C(NODE) ] = {
204 [ C(OP_READ) ] = {
205 [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_DRAM_ANY,
206 [ C(RESULT_MISS) ] = SNB_DMND_READ|SNB_DRAM_REMOTE,
207 },
208 [ C(OP_WRITE) ] = {
209 [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_DRAM_ANY,
210 [ C(RESULT_MISS) ] = SNB_DMND_WRITE|SNB_DRAM_REMOTE,
211 },
212 [ C(OP_PREFETCH) ] = {
213 [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_DRAM_ANY,
214 [ C(RESULT_MISS) ] = SNB_DMND_PREFETCH|SNB_DRAM_REMOTE,
215 },
216 },
217};
218
219static __initconst const u64 snb_hw_cache_event_ids 139static __initconst const u64 snb_hw_cache_event_ids
220 [PERF_COUNT_HW_CACHE_MAX] 140 [PERF_COUNT_HW_CACHE_MAX]
221 [PERF_COUNT_HW_CACHE_OP_MAX] 141 [PERF_COUNT_HW_CACHE_OP_MAX]
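The block removed above builds OFFCORE_RESPONSE masks for the LL and NODE cache events by OR-ing request-type bits (demand read, RFO, prefetch) with response and snoop bits. A minimal standalone sketch of the same composition, reusing only bit positions that appear in the removed #defines; the printed values are illustrative, not taken from hardware documentation:

    /* gcc -O2 snb_offcore.c && ./a.out -- illustrative only */
    #include <stdio.h>
    #include <stdint.h>

    #define SNB_DMND_DATA_RD (1ULL << 0)
    #define SNB_PF_DATA_RD   (1ULL << 4)
    #define SNB_PF_RFO       (1ULL << 5)
    #define SNB_LLC_DATA_RD  (1ULL << 7)
    #define SNB_RESP_ANY     (1ULL << 16)

    #define SNB_DMND_READ     (SNB_DMND_DATA_RD | SNB_LLC_DATA_RD)
    #define SNB_DMND_PREFETCH (SNB_PF_DATA_RD | SNB_PF_RFO)
    #define SNB_L3_ACCESS     SNB_RESP_ANY

    int main(void)
    {
        /* values that would end up in the OFFCORE_RSP extra register (config1) */
        printf("LL demand-read access   = %#llx\n",
               (unsigned long long)(SNB_DMND_READ | SNB_L3_ACCESS));
        printf("LL prefetch access      = %#llx\n",
               (unsigned long long)(SNB_DMND_PREFETCH | SNB_L3_ACCESS));
        return 0;
    }
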
@@ -313,16 +233,16 @@ static __initconst const u64 snb_hw_cache_event_ids
313 }, 233 },
314 [ C(NODE) ] = { 234 [ C(NODE) ] = {
315 [ C(OP_READ) ] = { 235 [ C(OP_READ) ] = {
316 [ C(RESULT_ACCESS) ] = 0x01b7, 236 [ C(RESULT_ACCESS) ] = -1,
317 [ C(RESULT_MISS) ] = 0x01b7, 237 [ C(RESULT_MISS) ] = -1,
318 }, 238 },
319 [ C(OP_WRITE) ] = { 239 [ C(OP_WRITE) ] = {
320 [ C(RESULT_ACCESS) ] = 0x01b7, 240 [ C(RESULT_ACCESS) ] = -1,
321 [ C(RESULT_MISS) ] = 0x01b7, 241 [ C(RESULT_MISS) ] = -1,
322 }, 242 },
323 [ C(OP_PREFETCH) ] = { 243 [ C(OP_PREFETCH) ] = {
324 [ C(RESULT_ACCESS) ] = 0x01b7, 244 [ C(RESULT_ACCESS) ] = -1,
325 [ C(RESULT_MISS) ] = 0x01b7, 245 [ C(RESULT_MISS) ] = -1,
326 }, 246 },
327 }, 247 },
328 248
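On the added (right-hand) side the NODE entries become -1, whereas the removed side programs them as 0x01b7, the OFFCORE_RESPONSE_0 event whose real selection lives in the extra-register table removed above. In perf's hw-cache tables, 0 conventionally means "not counted on this CPU" and -1 means "op/result combination not supported". A hedged sketch of that lookup convention; the function and variable names here are illustrative, not from this file:

    #include <errno.h>
    #include <stdint.h>

    /* Map one hw-cache table cell to a raw event config (illustrative). */
    static int cache_cell_to_config(int64_t cell, uint64_t *config)
    {
        if (cell == 0)
            return -ENOENT;  /* event exists in perf but not on this CPU */
        if (cell == -1)
            return -EINVAL;  /* combination is never supported here      */
        *config = (uint64_t)cell;
        return 0;
    }

    int main(void)
    {
        uint64_t cfg;
        return cache_cell_to_config(-1, &cfg) == -EINVAL ? 0 : 1;
    }
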
@@ -465,15 +385,14 @@ static __initconst const u64 westmere_hw_cache_event_ids
465#define NHM_LOCAL_DRAM (1 << 14) 385#define NHM_LOCAL_DRAM (1 << 14)
466#define NHM_NON_DRAM (1 << 15) 386#define NHM_NON_DRAM (1 << 15)
467 387
468#define NHM_LOCAL (NHM_LOCAL_DRAM|NHM_REMOTE_CACHE_FWD) 388#define NHM_ALL_DRAM (NHM_REMOTE_DRAM|NHM_LOCAL_DRAM)
469#define NHM_REMOTE (NHM_REMOTE_DRAM)
470 389
471#define NHM_DMND_READ (NHM_DMND_DATA_RD) 390#define NHM_DMND_READ (NHM_DMND_DATA_RD)
472#define NHM_DMND_WRITE (NHM_DMND_RFO|NHM_DMND_WB) 391#define NHM_DMND_WRITE (NHM_DMND_RFO|NHM_DMND_WB)
473#define NHM_DMND_PREFETCH (NHM_PF_DATA_RD|NHM_PF_DATA_RFO) 392#define NHM_DMND_PREFETCH (NHM_PF_DATA_RD|NHM_PF_DATA_RFO)
474 393
475#define NHM_L3_HIT (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM) 394#define NHM_L3_HIT (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM)
476#define NHM_L3_MISS (NHM_NON_DRAM|NHM_LOCAL_DRAM|NHM_REMOTE_DRAM|NHM_REMOTE_CACHE_FWD) 395#define NHM_L3_MISS (NHM_NON_DRAM|NHM_ALL_DRAM|NHM_REMOTE_CACHE_FWD)
477#define NHM_L3_ACCESS (NHM_L3_HIT|NHM_L3_MISS) 396#define NHM_L3_ACCESS (NHM_L3_HIT|NHM_L3_MISS)
478 397
479static __initconst const u64 nehalem_hw_cache_extra_regs 398static __initconst const u64 nehalem_hw_cache_extra_regs
@@ -497,16 +416,16 @@ static __initconst const u64 nehalem_hw_cache_extra_regs
497 }, 416 },
498 [ C(NODE) ] = { 417 [ C(NODE) ] = {
499 [ C(OP_READ) ] = { 418 [ C(OP_READ) ] = {
500 [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_LOCAL|NHM_REMOTE, 419 [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_ALL_DRAM,
501 [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_REMOTE, 420 [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_REMOTE_DRAM,
502 }, 421 },
503 [ C(OP_WRITE) ] = { 422 [ C(OP_WRITE) ] = {
504 [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_LOCAL|NHM_REMOTE, 423 [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_ALL_DRAM,
505 [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_REMOTE, 424 [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_REMOTE_DRAM,
506 }, 425 },
507 [ C(OP_PREFETCH) ] = { 426 [ C(OP_PREFETCH) ] = {
508 [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_LOCAL|NHM_REMOTE, 427 [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_ALL_DRAM,
509 [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_REMOTE, 428 [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_REMOTE_DRAM,
510 }, 429 },
511 }, 430 },
512}; 431};
@@ -808,26 +727,13 @@ static __initconst const u64 atom_hw_cache_event_ids
808 }, 727 },
809}; 728};
810 729
811static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
812{
813 /* user explicitly requested branch sampling */
814 if (has_branch_stack(event))
815 return true;
816
817 /* implicit branch sampling to correct PEBS skid */
818 if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
819 return true;
820
821 return false;
822}
823
824static void intel_pmu_disable_all(void) 730static void intel_pmu_disable_all(void)
825{ 731{
826 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 732 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
827 733
828 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); 734 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
829 735
830 if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) 736 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
831 intel_pmu_disable_bts(); 737 intel_pmu_disable_bts();
832 738
833 intel_pmu_pebs_disable_all(); 739 intel_pmu_pebs_disable_all();
@@ -840,12 +746,11 @@ static void intel_pmu_enable_all(int added)
840 746
841 intel_pmu_pebs_enable_all(); 747 intel_pmu_pebs_enable_all();
842 intel_pmu_lbr_enable_all(); 748 intel_pmu_lbr_enable_all();
843 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 749 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
844 x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask);
845 750
846 if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { 751 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
847 struct perf_event *event = 752 struct perf_event *event =
848 cpuc->events[INTEL_PMC_IDX_FIXED_BTS]; 753 cpuc->events[X86_PMC_IDX_FIXED_BTS];
849 754
850 if (WARN_ON_ONCE(!event)) 755 if (WARN_ON_ONCE(!event))
851 return; 756 return;
@@ -951,7 +856,7 @@ static inline void intel_pmu_ack_status(u64 ack)
951 856
952static void intel_pmu_disable_fixed(struct hw_perf_event *hwc) 857static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)
953{ 858{
954 int idx = hwc->idx - INTEL_PMC_IDX_FIXED; 859 int idx = hwc->idx - X86_PMC_IDX_FIXED;
955 u64 ctrl_val, mask; 860 u64 ctrl_val, mask;
956 861
957 mask = 0xfULL << (idx * 4); 862 mask = 0xfULL << (idx * 4);
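Both sides of this hunk compute the control field the same way: MSR_ARCH_PERFMON_FIXED_CTR_CTRL packs one 4-bit enable/PMI field per fixed counter, so counter idx owns bits idx*4..idx*4+3. A standalone sketch of clearing one counter's nibble, as intel_pmu_disable_fixed() does; the example ctrl value is made up:

    #include <stdio.h>
    #include <stdint.h>

    /* Each fixed counter owns a 4-bit field in FIXED_CTR_CTRL. */
    static uint64_t fixed_ctrl_disable(uint64_t ctrl_val, int idx)
    {
        uint64_t mask = 0xfULL << (idx * 4);
        return ctrl_val & ~mask;
    }

    int main(void)
    {
        uint64_t ctrl = 0xbbb;  /* made-up value: three fixed counters enabled */
        printf("before=%#llx after disabling idx 1=%#llx\n",
               (unsigned long long)ctrl,
               (unsigned long long)fixed_ctrl_disable(ctrl, 1));
        return 0;
    }
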
@@ -964,24 +869,13 @@ static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)
964static void intel_pmu_disable_event(struct perf_event *event) 869static void intel_pmu_disable_event(struct perf_event *event)
965{ 870{
966 struct hw_perf_event *hwc = &event->hw; 871 struct hw_perf_event *hwc = &event->hw;
967 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
968 872
969 if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) { 873 if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) {
970 intel_pmu_disable_bts(); 874 intel_pmu_disable_bts();
971 intel_pmu_drain_bts_buffer(); 875 intel_pmu_drain_bts_buffer();
972 return; 876 return;
973 } 877 }
974 878
975 cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx);
976 cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx);
977
978 /*
979 * must disable before any actual event
980 * because any event may be combined with LBR
981 */
982 if (intel_pmu_needs_lbr_smpl(event))
983 intel_pmu_lbr_disable(event);
984
985 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { 879 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
986 intel_pmu_disable_fixed(hwc); 880 intel_pmu_disable_fixed(hwc);
987 return; 881 return;
@@ -995,7 +889,7 @@ static void intel_pmu_disable_event(struct perf_event *event)
995 889
996static void intel_pmu_enable_fixed(struct hw_perf_event *hwc) 890static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
997{ 891{
998 int idx = hwc->idx - INTEL_PMC_IDX_FIXED; 892 int idx = hwc->idx - X86_PMC_IDX_FIXED;
999 u64 ctrl_val, bits, mask; 893 u64 ctrl_val, bits, mask;
1000 894
1001 /* 895 /*
@@ -1027,26 +921,14 @@ static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
1027static void intel_pmu_enable_event(struct perf_event *event) 921static void intel_pmu_enable_event(struct perf_event *event)
1028{ 922{
1029 struct hw_perf_event *hwc = &event->hw; 923 struct hw_perf_event *hwc = &event->hw;
1030 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1031 924
1032 if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) { 925 if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) {
1033 if (!__this_cpu_read(cpu_hw_events.enabled)) 926 if (!__this_cpu_read(cpu_hw_events.enabled))
1034 return; 927 return;
1035 928
1036 intel_pmu_enable_bts(hwc->config); 929 intel_pmu_enable_bts(hwc->config);
1037 return; 930 return;
1038 } 931 }
1039 /*
 1040 * must be enabled before any actual event
1041 * because any event may be combined with LBR
1042 */
1043 if (intel_pmu_needs_lbr_smpl(event))
1044 intel_pmu_lbr_enable(event);
1045
1046 if (event->attr.exclude_host)
1047 cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx);
1048 if (event->attr.exclude_guest)
1049 cpuc->intel_ctrl_host_mask |= (1ull << hwc->idx);
1050 932
1051 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { 933 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
1052 intel_pmu_enable_fixed(hwc); 934 intel_pmu_enable_fixed(hwc);
@@ -1063,7 +945,7 @@ static void intel_pmu_enable_event(struct perf_event *event)
1063 * Save and restart an expired event. Called by NMI contexts, 945 * Save and restart an expired event. Called by NMI contexts,
1064 * so it has to be careful about preempting normal event ops: 946 * so it has to be careful about preempting normal event ops:
1065 */ 947 */
1066int intel_pmu_save_and_restart(struct perf_event *event) 948static int intel_pmu_save_and_restart(struct perf_event *event)
1067{ 949{
1068 x86_perf_event_update(event); 950 x86_perf_event_update(event);
1069 return x86_perf_event_set_period(event); 951 return x86_perf_event_set_period(event);
@@ -1080,14 +962,14 @@ static void intel_pmu_reset(void)
1080 962
1081 local_irq_save(flags); 963 local_irq_save(flags);
1082 964
1083 pr_info("clearing PMU state on CPU#%d\n", smp_processor_id()); 965 printk("clearing PMU state on CPU#%d\n", smp_processor_id());
1084 966
1085 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 967 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1086 wrmsrl_safe(x86_pmu_config_addr(idx), 0ull); 968 checking_wrmsrl(x86_pmu_config_addr(idx), 0ull);
1087 wrmsrl_safe(x86_pmu_event_addr(idx), 0ull); 969 checking_wrmsrl(x86_pmu_event_addr(idx), 0ull);
1088 } 970 }
1089 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) 971 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
1090 wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); 972 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
1091 973
1092 if (ds) 974 if (ds)
1093 ds->bts_index = ds->bts_buffer_base; 975 ds->bts_index = ds->bts_buffer_base;
@@ -1107,6 +989,8 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
1107 u64 status; 989 u64 status;
1108 int handled; 990 int handled;
1109 991
992 perf_sample_data_init(&data, 0);
993
1110 cpuc = &__get_cpu_var(cpu_hw_events); 994 cpuc = &__get_cpu_var(cpu_hw_events);
1111 995
1112 /* 996 /*
@@ -1160,10 +1044,7 @@ again:
1160 if (!intel_pmu_save_and_restart(event)) 1044 if (!intel_pmu_save_and_restart(event))
1161 continue; 1045 continue;
1162 1046
1163 perf_sample_data_init(&data, 0, event->hw.last_period); 1047 data.period = event->hw.last_period;
1164
1165 if (has_branch_stack(event))
1166 data.br_stack = &cpuc->lbr_stack;
1167 1048
1168 if (perf_event_overflow(event, &data, regs)) 1049 if (perf_event_overflow(event, &data, regs))
1169 x86_pmu_stop(event, 0); 1050 x86_pmu_stop(event, 0);
@@ -1199,33 +1080,27 @@ intel_bts_constraints(struct perf_event *event)
1199 return NULL; 1080 return NULL;
1200} 1081}
1201 1082
1202static int intel_alt_er(int idx) 1083static bool intel_try_alt_er(struct perf_event *event, int orig_idx)
1203{ 1084{
1204 if (!(x86_pmu.er_flags & ERF_HAS_RSP_1)) 1085 if (!(x86_pmu.er_flags & ERF_HAS_RSP_1))
1205 return idx; 1086 return false;
1206
1207 if (idx == EXTRA_REG_RSP_0)
1208 return EXTRA_REG_RSP_1;
1209 1087
1210 if (idx == EXTRA_REG_RSP_1) 1088 if (event->hw.extra_reg.idx == EXTRA_REG_RSP_0) {
1211 return EXTRA_REG_RSP_0;
1212
1213 return idx;
1214}
1215
1216static void intel_fixup_er(struct perf_event *event, int idx)
1217{
1218 event->hw.extra_reg.idx = idx;
1219
1220 if (idx == EXTRA_REG_RSP_0) {
1221 event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
1222 event->hw.config |= 0x01b7;
1223 event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0;
1224 } else if (idx == EXTRA_REG_RSP_1) {
1225 event->hw.config &= ~INTEL_ARCH_EVENT_MASK; 1089 event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
1226 event->hw.config |= 0x01bb; 1090 event->hw.config |= 0x01bb;
1091 event->hw.extra_reg.idx = EXTRA_REG_RSP_1;
1227 event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1; 1092 event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
1093 } else if (event->hw.extra_reg.idx == EXTRA_REG_RSP_1) {
1094 event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
1095 event->hw.config |= 0x01b7;
1096 event->hw.extra_reg.idx = EXTRA_REG_RSP_0;
1097 event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0;
1228 } 1098 }
1099
1100 if (event->hw.extra_reg.idx == orig_idx)
1101 return false;
1102
1103 return true;
1229} 1104}
1230 1105
1231/* 1106/*
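The removed intel_alt_er()/intel_fixup_er() pair and the added intel_try_alt_er() implement the same fallback: the two OFFCORE_RESPONSE MSRs are interchangeable, so when RSP_0 is already claimed the event is rewritten to the RSP_1 event select (0x01bb) and pointed at MSR_OFFCORE_RSP_1, and vice versa. A standalone sketch of that rewrite; the struct and the 0xffff event/umask mask are simplified stand-ins for the kernel types:

    #include <stdio.h>
    #include <stdint.h>

    enum { EXTRA_REG_RSP_0, EXTRA_REG_RSP_1 };
    #define ARCH_EVENT_MASK 0xffffULL  /* event+umask bits, simplified stand-in */

    struct fake_event { uint64_t config; int extra_idx; };

    /* Rewrite an offcore-response event to use the other RSP MSR. */
    static void swap_rsp_encoding(struct fake_event *e)
    {
        e->config &= ~ARCH_EVENT_MASK;
        if (e->extra_idx == EXTRA_REG_RSP_0) {
            e->config |= 0x01bb;         /* OFFCORE_RESPONSE_1 */
            e->extra_idx = EXTRA_REG_RSP_1;
        } else {
            e->config |= 0x01b7;         /* OFFCORE_RESPONSE_0 */
            e->extra_idx = EXTRA_REG_RSP_0;
        }
    }

    int main(void)
    {
        struct fake_event e = { .config = 0x01b7, .extra_idx = EXTRA_REG_RSP_0 };
        swap_rsp_encoding(&e);
        printf("config=%#llx idx=%d\n", (unsigned long long)e.config, e.extra_idx);
        return 0;
    }
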
@@ -1237,24 +1112,20 @@ static void intel_fixup_er(struct perf_event *event, int idx)
1237 */ 1112 */
1238static struct event_constraint * 1113static struct event_constraint *
1239__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, 1114__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc,
1240 struct perf_event *event, 1115 struct perf_event *event)
1241 struct hw_perf_event_extra *reg)
1242{ 1116{
1243 struct event_constraint *c = &emptyconstraint; 1117 struct event_constraint *c = &emptyconstraint;
1118 struct hw_perf_event_extra *reg = &event->hw.extra_reg;
1244 struct er_account *era; 1119 struct er_account *era;
1245 unsigned long flags; 1120 unsigned long flags;
1246 int idx = reg->idx; 1121 int orig_idx = reg->idx;
1247 1122
1248 /* 1123 /* already allocated shared msr */
1249 * reg->alloc can be set due to existing state, so for fake cpuc we 1124 if (reg->alloc)
1250 * need to ignore this, otherwise we might fail to allocate proper fake 1125 return &unconstrained;
1251 * state for this extra reg constraint. Also see the comment below.
1252 */
1253 if (reg->alloc && !cpuc->is_fake)
1254 return NULL; /* call x86_get_event_constraint() */
1255 1126
1256again: 1127again:
1257 era = &cpuc->shared_regs->regs[idx]; 1128 era = &cpuc->shared_regs->regs[reg->idx];
1258 /* 1129 /*
1259 * we use spin_lock_irqsave() to avoid lockdep issues when 1130 * we use spin_lock_irqsave() to avoid lockdep issues when
1260 * passing a fake cpuc 1131 * passing a fake cpuc
@@ -1263,29 +1134,6 @@ again:
1263 1134
1264 if (!atomic_read(&era->ref) || era->config == reg->config) { 1135 if (!atomic_read(&era->ref) || era->config == reg->config) {
1265 1136
1266 /*
1267 * If its a fake cpuc -- as per validate_{group,event}() we
1268 * shouldn't touch event state and we can avoid doing so
1269 * since both will only call get_event_constraints() once
1270 * on each event, this avoids the need for reg->alloc.
1271 *
1272 * Not doing the ER fixup will only result in era->reg being
1273 * wrong, but since we won't actually try and program hardware
1274 * this isn't a problem either.
1275 */
1276 if (!cpuc->is_fake) {
1277 if (idx != reg->idx)
1278 intel_fixup_er(event, idx);
1279
1280 /*
1281 * x86_schedule_events() can call get_event_constraints()
1282 * multiple times on events in the case of incremental
1283 * scheduling(). reg->alloc ensures we only do the ER
1284 * allocation once.
1285 */
1286 reg->alloc = 1;
1287 }
1288
1289 /* lock in msr value */ 1137 /* lock in msr value */
1290 era->config = reg->config; 1138 era->config = reg->config;
1291 era->reg = reg->reg; 1139 era->reg = reg->reg;
@@ -1293,17 +1141,21 @@ again:
1293 /* one more user */ 1141 /* one more user */
1294 atomic_inc(&era->ref); 1142 atomic_inc(&era->ref);
1295 1143
1144 /* no need to reallocate during incremental event scheduling */
1145 reg->alloc = 1;
1146
1296 /* 1147 /*
1297 * need to call x86_get_event_constraint() 1148 * All events using extra_reg are unconstrained.
1298 * to check if associated event has constraints 1149 * Avoids calling x86_get_event_constraints()
1150 *
1151 * Must revisit if extra_reg controlling events
1152 * ever have constraints. Worst case we go through
1153 * the regular event constraint table.
1299 */ 1154 */
1300 c = NULL; 1155 c = &unconstrained;
1301 } else { 1156 } else if (intel_try_alt_er(event, orig_idx)) {
1302 idx = intel_alt_er(idx); 1157 raw_spin_unlock(&era->lock);
1303 if (idx != reg->idx) { 1158 goto again;
1304 raw_spin_unlock_irqrestore(&era->lock, flags);
1305 goto again;
1306 }
1307 } 1159 }
1308 raw_spin_unlock_irqrestore(&era->lock, flags); 1160 raw_spin_unlock_irqrestore(&era->lock, flags);
1309 1161
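Both versions follow the same claim protocol for the shared er_account: under the account's lock, the MSR may be taken if nobody holds it (ref == 0) or if it is already programmed with exactly the wanted config; otherwise the caller retries with the alternate MSR or falls back to the empty constraint. A compact sketch of that protocol with the kernel spinlock and atomic_t left out (illustrative only):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* simplified er_account: not the kernel layout */
    struct er_account {
        int      ref;     /* users currently sharing this MSR */
        uint64_t config;  /* value the MSR is programmed with */
    };

    /* Claim the shared MSR if it is free or already set to 'want'.
     * The kernel does this under era->lock; locking is omitted here. */
    static bool try_claim(struct er_account *era, uint64_t want)
    {
        if (era->ref == 0 || era->config == want) {
            era->config = want;
            era->ref++;
            return true;   /* caller may (re)program the MSR */
        }
        return false;      /* conflict: try the alternate MSR */
    }

    int main(void)
    {
        struct er_account era = { 0, 0 };
        printf("%d %d %d\n", try_claim(&era, 0x10), try_claim(&era, 0x10),
               try_claim(&era, 0x20));   /* prints: 1 1 0 */
        return 0;
    }
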
@@ -1317,14 +1169,11 @@ __intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc,
1317 struct er_account *era; 1169 struct er_account *era;
1318 1170
1319 /* 1171 /*
1320 * Only put constraint if extra reg was actually allocated. Also takes 1172 * only put constraint if extra reg was actually
1321 * care of event which do not use an extra shared reg. 1173 * allocated. Also takes care of event which do
1322 * 1174 * not use an extra shared reg
1323 * Also, if this is a fake cpuc we shouldn't touch any event state
1324 * (reg->alloc) and we don't care about leaving inconsistent cpuc state
1325 * either since it'll be thrown out.
1326 */ 1175 */
1327 if (!reg->alloc || cpuc->is_fake) 1176 if (!reg->alloc)
1328 return; 1177 return;
1329 1178
1330 era = &cpuc->shared_regs->regs[reg->idx]; 1179 era = &cpuc->shared_regs->regs[reg->idx];
@@ -1340,39 +1189,12 @@ static struct event_constraint *
1340intel_shared_regs_constraints(struct cpu_hw_events *cpuc, 1189intel_shared_regs_constraints(struct cpu_hw_events *cpuc,
1341 struct perf_event *event) 1190 struct perf_event *event)
1342{ 1191{
1343 struct event_constraint *c = NULL, *d; 1192 struct event_constraint *c = NULL;
1344 struct hw_perf_event_extra *xreg, *breg;
1345
1346 xreg = &event->hw.extra_reg;
1347 if (xreg->idx != EXTRA_REG_NONE) {
1348 c = __intel_shared_reg_get_constraints(cpuc, event, xreg);
1349 if (c == &emptyconstraint)
1350 return c;
1351 }
1352 breg = &event->hw.branch_reg;
1353 if (breg->idx != EXTRA_REG_NONE) {
1354 d = __intel_shared_reg_get_constraints(cpuc, event, breg);
1355 if (d == &emptyconstraint) {
1356 __intel_shared_reg_put_constraints(cpuc, xreg);
1357 c = d;
1358 }
1359 }
1360 return c;
1361}
1362 1193
1363struct event_constraint * 1194 if (event->hw.extra_reg.idx != EXTRA_REG_NONE)
1364x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) 1195 c = __intel_shared_reg_get_constraints(cpuc, event);
1365{
1366 struct event_constraint *c;
1367 1196
1368 if (x86_pmu.event_constraints) { 1197 return c;
1369 for_each_event_constraint(c, x86_pmu.event_constraints) {
1370 if ((event->hw.config & c->cmask) == c->code)
1371 return c;
1372 }
1373 }
1374
1375 return &unconstrained;
1376} 1198}
1377 1199
1378static struct event_constraint * 1200static struct event_constraint *
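The removed x86_get_event_constraints() is the generic fallback used when no special-case handler matched: walk the model's constraint table and return the first entry whose masked config equals its code, otherwise report the event as unconstrained. A standalone sketch of that matching loop; the struct layout is simplified and the loop stops at a zero cmask rather than the kernel's weight field, and the sample entry mirrors INTEL_EVENT_CONSTRAINT(0x48, 0x4) from the SNB table above:

    #include <stdint.h>
    #include <stdio.h>

    /* simplified constraint entry: illustrative only */
    struct constraint {
        uint64_t code;    /* event encoding to match          */
        uint64_t cmask;   /* which config bits participate    */
        uint64_t idxmsk;  /* counters the event may run on    */
    };

    static const struct constraint table[] = {
        { 0x0048, 0x00ff, 0x4 },   /* event 0x48 allowed only on counter 2 */
        { 0 }                      /* end marker, like EVENT_CONSTRAINT_END */
    };

    static const struct constraint unconstrained = { 0, 0, ~0ULL };

    static const struct constraint *lookup(uint64_t config)
    {
        for (const struct constraint *c = table; c->cmask; c++)
            if ((config & c->cmask) == c->code)
                return c;
        return &unconstrained;
    }

    int main(void)
    {
        printf("idxmsk for config 0x5148 = %#llx\n",
               (unsigned long long)lookup(0x5148)->idxmsk);
        return 0;
    }
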
@@ -1404,10 +1226,6 @@ intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
1404 reg = &event->hw.extra_reg; 1226 reg = &event->hw.extra_reg;
1405 if (reg->idx != EXTRA_REG_NONE) 1227 if (reg->idx != EXTRA_REG_NONE)
1406 __intel_shared_reg_put_constraints(cpuc, reg); 1228 __intel_shared_reg_put_constraints(cpuc, reg);
1407
1408 reg = &event->hw.branch_reg;
1409 if (reg->idx != EXTRA_REG_NONE)
1410 __intel_shared_reg_put_constraints(cpuc, reg);
1411} 1229}
1412 1230
1413static void intel_put_event_constraints(struct cpu_hw_events *cpuc, 1231static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
@@ -1416,9 +1234,15 @@ static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
1416 intel_put_shared_regs_event_constraints(cpuc, event); 1234 intel_put_shared_regs_event_constraints(cpuc, event);
1417} 1235}
1418 1236
1419static void intel_pebs_aliases_core2(struct perf_event *event) 1237static int intel_pmu_hw_config(struct perf_event *event)
1420{ 1238{
1421 if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { 1239 int ret = x86_pmu_hw_config(event);
1240
1241 if (ret)
1242 return ret;
1243
1244 if (event->attr.precise_ip &&
1245 (event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
1422 /* 1246 /*
1423 * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P 1247 * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
1424 * (0x003c) so that we can use it with PEBS. 1248 * (0x003c) so that we can use it with PEBS.
@@ -1437,56 +1261,11 @@ static void intel_pebs_aliases_core2(struct perf_event *event)
1437 * 1261 *
1438 * Thereby we gain a PEBS capable cycle counter. 1262 * Thereby we gain a PEBS capable cycle counter.
1439 */ 1263 */
1440 u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16); 1264 u64 alt_config = 0x108000c0; /* INST_RETIRED.TOTAL_CYCLES */
1441
1442 alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
1443 event->hw.config = alt_config;
1444 }
1445}
1446
1447static void intel_pebs_aliases_snb(struct perf_event *event)
1448{
1449 if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
1450 /*
1451 * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
1452 * (0x003c) so that we can use it with PEBS.
1453 *
1454 * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
1455 * PEBS capable. However we can use UOPS_RETIRED.ALL
1456 * (0x01c2), which is a PEBS capable event, to get the same
1457 * count.
1458 *
1459 * UOPS_RETIRED.ALL counts the number of cycles that retires
1460 * CNTMASK micro-ops. By setting CNTMASK to a value (16)
1461 * larger than the maximum number of micro-ops that can be
1462 * retired per cycle (4) and then inverting the condition, we
1463 * count all cycles that retire 16 or less micro-ops, which
1464 * is every cycle.
1465 *
1466 * Thereby we gain a PEBS capable cycle counter.
1467 */
1468 u64 alt_config = X86_CONFIG(.event=0xc2, .umask=0x01, .inv=1, .cmask=16);
1469 1265
1470 alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); 1266 alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
1471 event->hw.config = alt_config; 1267 event->hw.config = alt_config;
1472 } 1268 }
1473}
1474
1475static int intel_pmu_hw_config(struct perf_event *event)
1476{
1477 int ret = x86_pmu_hw_config(event);
1478
1479 if (ret)
1480 return ret;
1481
1482 if (event->attr.precise_ip && x86_pmu.pebs_aliases)
1483 x86_pmu.pebs_aliases(event);
1484
1485 if (intel_pmu_needs_lbr_smpl(event)) {
1486 ret = intel_pmu_setup_lbr_filter(event);
1487 if (ret)
1488 return ret;
1489 }
1490 1269
1491 if (event->attr.type != PERF_TYPE_RAW) 1270 if (event->attr.type != PERF_TYPE_RAW)
1492 return 0; 1271 return 0;
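The long comment spells out the trick both versions rely on: CPU_CLK_UNHALTED.THREAD_P (0x003c) is not PEBS-capable, but INST_RETIRED/UOPS_RETIRED with counter-mask 16 and the invert bit set fires on every cycle, so it can stand in as a PEBS-capable cycle counter. The removed side builds the encoding with X86_CONFIG(.event=0xc0, .inv=1, .cmask=16); the added side hard-codes the same value as 0x108000c0. A standalone sketch that composes that raw value from the architectural PERFEVTSEL layout (event bits 0-7, umask 8-15, inv bit 23, cmask 24-31, matching the format attributes removed later in this file):

    #include <stdio.h>
    #include <stdint.h>

    /* PERFEVTSEL field helpers (architectural bit layout) */
    #define EVSEL_EVENT(e)  ((uint64_t)(e) & 0xff)
    #define EVSEL_UMASK(u)  (((uint64_t)(u) & 0xff) << 8)
    #define EVSEL_INV       (1ULL << 23)
    #define EVSEL_CMASK(c)  (((uint64_t)(c) & 0xff) << 24)

    int main(void)
    {
        /* INST_RETIRED (0xc0), cmask=16, inverted: true on every cycle */
        uint64_t alt = EVSEL_EVENT(0xc0) | EVSEL_INV | EVSEL_CMASK(16);
        printf("alt_config = %#llx\n", (unsigned long long)alt); /* 0x108000c0 */
        return 0;
    }
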
@@ -1505,117 +1284,12 @@ static int intel_pmu_hw_config(struct perf_event *event)
1505 return 0; 1284 return 0;
1506} 1285}
1507 1286
1508struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
1509{
1510 if (x86_pmu.guest_get_msrs)
1511 return x86_pmu.guest_get_msrs(nr);
1512 *nr = 0;
1513 return NULL;
1514}
1515EXPORT_SYMBOL_GPL(perf_guest_get_msrs);
1516
1517static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr)
1518{
1519 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1520 struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
1521
1522 arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL;
1523 arr[0].host = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask;
1524 arr[0].guest = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_host_mask;
1525 /*
1526 * If PMU counter has PEBS enabled it is not enough to disable counter
1527 * on a guest entry since PEBS memory write can overshoot guest entry
1528 * and corrupt guest memory. Disabling PEBS solves the problem.
1529 */
1530 arr[1].msr = MSR_IA32_PEBS_ENABLE;
1531 arr[1].host = cpuc->pebs_enabled;
1532 arr[1].guest = 0;
1533
1534 *nr = 2;
1535 return arr;
1536}
1537
1538static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr)
1539{
1540 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1541 struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs;
1542 int idx;
1543
1544 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1545 struct perf_event *event = cpuc->events[idx];
1546
1547 arr[idx].msr = x86_pmu_config_addr(idx);
1548 arr[idx].host = arr[idx].guest = 0;
1549
1550 if (!test_bit(idx, cpuc->active_mask))
1551 continue;
1552
1553 arr[idx].host = arr[idx].guest =
1554 event->hw.config | ARCH_PERFMON_EVENTSEL_ENABLE;
1555
1556 if (event->attr.exclude_host)
1557 arr[idx].host &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
1558 else if (event->attr.exclude_guest)
1559 arr[idx].guest &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
1560 }
1561
1562 *nr = x86_pmu.num_counters;
1563 return arr;
1564}
1565
1566static void core_pmu_enable_event(struct perf_event *event)
1567{
1568 if (!event->attr.exclude_host)
1569 x86_pmu_enable_event(event);
1570}
1571
1572static void core_pmu_enable_all(int added)
1573{
1574 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1575 int idx;
1576
1577 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1578 struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
1579
1580 if (!test_bit(idx, cpuc->active_mask) ||
1581 cpuc->events[idx]->attr.exclude_host)
1582 continue;
1583
1584 __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
1585 }
1586}
1587
1588PMU_FORMAT_ATTR(event, "config:0-7" );
1589PMU_FORMAT_ATTR(umask, "config:8-15" );
1590PMU_FORMAT_ATTR(edge, "config:18" );
1591PMU_FORMAT_ATTR(pc, "config:19" );
1592PMU_FORMAT_ATTR(any, "config:21" ); /* v3 + */
1593PMU_FORMAT_ATTR(inv, "config:23" );
1594PMU_FORMAT_ATTR(cmask, "config:24-31" );
1595
1596static struct attribute *intel_arch_formats_attr[] = {
1597 &format_attr_event.attr,
1598 &format_attr_umask.attr,
1599 &format_attr_edge.attr,
1600 &format_attr_pc.attr,
1601 &format_attr_inv.attr,
1602 &format_attr_cmask.attr,
1603 NULL,
1604};
1605
1606ssize_t intel_event_sysfs_show(char *page, u64 config)
1607{
1608 u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT);
1609
1610 return x86_event_sysfs_show(page, config, event);
1611}
1612
1613static __initconst const struct x86_pmu core_pmu = { 1287static __initconst const struct x86_pmu core_pmu = {
1614 .name = "core", 1288 .name = "core",
1615 .handle_irq = x86_pmu_handle_irq, 1289 .handle_irq = x86_pmu_handle_irq,
1616 .disable_all = x86_pmu_disable_all, 1290 .disable_all = x86_pmu_disable_all,
1617 .enable_all = core_pmu_enable_all, 1291 .enable_all = x86_pmu_enable_all,
1618 .enable = core_pmu_enable_event, 1292 .enable = x86_pmu_enable_event,
1619 .disable = x86_pmu_disable_event, 1293 .disable = x86_pmu_disable_event,
1620 .hw_config = x86_pmu_hw_config, 1294 .hw_config = x86_pmu_hw_config,
1621 .schedule_events = x86_schedule_events, 1295 .schedule_events = x86_schedule_events,
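The removed perf_guest_get_msrs()/intel_guest_get_msrs() pair is what hypervisor code uses to switch PMU state atomically at VM entry/exit: entry 0 swaps MSR_CORE_PERF_GLOBAL_CTRL between a host mask and a guest mask built from the exclude_host/exclude_guest attributes, and entry 1 forces MSR_IA32_PEBS_ENABLE to 0 in the guest because an in-flight PEBS write can land after the world switch and corrupt guest memory. A hedged sketch of filling such a switch list; the struct mirrors the removed code, and the MSR numbers in main() are the architectural indices as assumed here:

    #include <stdint.h>

    /* mirrors the removed code's switch-list entries (illustrative) */
    struct guest_switch_msr {
        unsigned msr;
        uint64_t host, guest;
    };

    /* Build the two-entry list the removed intel_guest_get_msrs() returned. */
    static int fill_guest_switch_msrs(struct guest_switch_msr *arr,
                                      uint64_t intel_ctrl,
                                      uint64_t guest_mask, uint64_t host_mask,
                                      uint64_t pebs_enabled,
                                      unsigned msr_global_ctrl,
                                      unsigned msr_pebs_enable)
    {
        arr[0].msr   = msr_global_ctrl;
        arr[0].host  = intel_ctrl & ~guest_mask; /* drop guest-only counters on the host  */
        arr[0].guest = intel_ctrl & ~host_mask;  /* drop host-only counters in the guest  */

        arr[1].msr   = msr_pebs_enable;
        arr[1].host  = pebs_enabled;
        arr[1].guest = 0;                        /* never let PEBS write into guest memory */
        return 2;
    }

    int main(void)
    {
        struct guest_switch_msr arr[2];
        /* 0x38f = GLOBAL_CTRL, 0x3f1 = PEBS_ENABLE (assumed architectural indices) */
        return fill_guest_switch_msrs(arr, 0xf, 0x1, 0x2, 0x3, 0x38f, 0x3f1) == 2 ? 0 : 1;
    }
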
@@ -1633,12 +1307,9 @@ static __initconst const struct x86_pmu core_pmu = {
1633 .get_event_constraints = intel_get_event_constraints, 1307 .get_event_constraints = intel_get_event_constraints,
1634 .put_event_constraints = intel_put_event_constraints, 1308 .put_event_constraints = intel_put_event_constraints,
1635 .event_constraints = intel_core_event_constraints, 1309 .event_constraints = intel_core_event_constraints,
1636 .guest_get_msrs = core_guest_get_msrs,
1637 .format_attrs = intel_arch_formats_attr,
1638 .events_sysfs_show = intel_event_sysfs_show,
1639}; 1310};
1640 1311
1641struct intel_shared_regs *allocate_shared_regs(int cpu) 1312static struct intel_shared_regs *allocate_shared_regs(int cpu)
1642{ 1313{
1643 struct intel_shared_regs *regs; 1314 struct intel_shared_regs *regs;
1644 int i; 1315 int i;
@@ -1661,7 +1332,7 @@ static int intel_pmu_cpu_prepare(int cpu)
1661{ 1332{
1662 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); 1333 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1663 1334
1664 if (!(x86_pmu.extra_regs || x86_pmu.lbr_sel_map)) 1335 if (!x86_pmu.extra_regs)
1665 return NOTIFY_OK; 1336 return NOTIFY_OK;
1666 1337
1667 cpuc->shared_regs = allocate_shared_regs(cpu); 1338 cpuc->shared_regs = allocate_shared_regs(cpu);
@@ -1683,28 +1354,22 @@ static void intel_pmu_cpu_starting(int cpu)
1683 */ 1354 */
1684 intel_pmu_lbr_reset(); 1355 intel_pmu_lbr_reset();
1685 1356
1686 cpuc->lbr_sel = NULL; 1357 if (!cpuc->shared_regs || (x86_pmu.er_flags & ERF_NO_HT_SHARING))
1687
1688 if (!cpuc->shared_regs)
1689 return; 1358 return;
1690 1359
1691 if (!(x86_pmu.er_flags & ERF_NO_HT_SHARING)) { 1360 for_each_cpu(i, topology_thread_cpumask(cpu)) {
1692 for_each_cpu(i, topology_thread_cpumask(cpu)) { 1361 struct intel_shared_regs *pc;
1693 struct intel_shared_regs *pc;
1694 1362
1695 pc = per_cpu(cpu_hw_events, i).shared_regs; 1363 pc = per_cpu(cpu_hw_events, i).shared_regs;
1696 if (pc && pc->core_id == core_id) { 1364 if (pc && pc->core_id == core_id) {
1697 cpuc->kfree_on_online = cpuc->shared_regs; 1365 kfree(cpuc->shared_regs);
1698 cpuc->shared_regs = pc; 1366 cpuc->shared_regs = pc;
1699 break; 1367 break;
1700 }
1701 } 1368 }
1702 cpuc->shared_regs->core_id = core_id;
1703 cpuc->shared_regs->refcnt++;
1704 } 1369 }
1705 1370
1706 if (x86_pmu.lbr_sel_map) 1371 cpuc->shared_regs->core_id = core_id;
1707 cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR]; 1372 cpuc->shared_regs->refcnt++;
1708} 1373}
1709 1374
1710static void intel_pmu_cpu_dying(int cpu) 1375static void intel_pmu_cpu_dying(int cpu)
@@ -1722,33 +1387,6 @@ static void intel_pmu_cpu_dying(int cpu)
1722 fini_debug_store_on_cpu(cpu); 1387 fini_debug_store_on_cpu(cpu);
1723} 1388}
1724 1389
1725static void intel_pmu_flush_branch_stack(void)
1726{
1727 /*
1728 * Intel LBR does not tag entries with the
1729 * PID of the current task, then we need to
1730 * flush it on ctxsw
1731 * For now, we simply reset it
1732 */
1733 if (x86_pmu.lbr_nr)
1734 intel_pmu_lbr_reset();
1735}
1736
1737PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
1738
1739static struct attribute *intel_arch3_formats_attr[] = {
1740 &format_attr_event.attr,
1741 &format_attr_umask.attr,
1742 &format_attr_edge.attr,
1743 &format_attr_pc.attr,
1744 &format_attr_any.attr,
1745 &format_attr_inv.attr,
1746 &format_attr_cmask.attr,
1747
1748 &format_attr_offcore_rsp.attr, /* XXX do NHM/WSM + SNB breakout */
1749 NULL,
1750};
1751
1752static __initconst const struct x86_pmu intel_pmu = { 1390static __initconst const struct x86_pmu intel_pmu = {
1753 .name = "Intel", 1391 .name = "Intel",
1754 .handle_irq = intel_pmu_handle_irq, 1392 .handle_irq = intel_pmu_handle_irq,
@@ -1771,19 +1409,13 @@ static __initconst const struct x86_pmu intel_pmu = {
1771 .max_period = (1ULL << 31) - 1, 1409 .max_period = (1ULL << 31) - 1,
1772 .get_event_constraints = intel_get_event_constraints, 1410 .get_event_constraints = intel_get_event_constraints,
1773 .put_event_constraints = intel_put_event_constraints, 1411 .put_event_constraints = intel_put_event_constraints,
1774 .pebs_aliases = intel_pebs_aliases_core2,
1775
1776 .format_attrs = intel_arch3_formats_attr,
1777 .events_sysfs_show = intel_event_sysfs_show,
1778 1412
1779 .cpu_prepare = intel_pmu_cpu_prepare, 1413 .cpu_prepare = intel_pmu_cpu_prepare,
1780 .cpu_starting = intel_pmu_cpu_starting, 1414 .cpu_starting = intel_pmu_cpu_starting,
1781 .cpu_dying = intel_pmu_cpu_dying, 1415 .cpu_dying = intel_pmu_cpu_dying,
1782 .guest_get_msrs = intel_guest_get_msrs,
1783 .flush_branch_stack = intel_pmu_flush_branch_stack,
1784}; 1416};
1785 1417
1786static __init void intel_clovertown_quirk(void) 1418static void intel_clovertown_quirks(void)
1787{ 1419{
1788 /* 1420 /*
1789 * PEBS is unreliable due to: 1421 * PEBS is unreliable due to:
@@ -1804,119 +1436,23 @@ static __init void intel_clovertown_quirk(void)
1804 * But taken together it might just make sense to not enable PEBS on 1436 * But taken together it might just make sense to not enable PEBS on
1805 * these chips. 1437 * these chips.
1806 */ 1438 */
1807 pr_warn("PEBS disabled due to CPU errata\n"); 1439 printk(KERN_WARNING "PEBS disabled due to CPU errata.\n");
1808 x86_pmu.pebs = 0; 1440 x86_pmu.pebs = 0;
1809 x86_pmu.pebs_constraints = NULL; 1441 x86_pmu.pebs_constraints = NULL;
1810} 1442}
1811 1443
1812static int intel_snb_pebs_broken(int cpu) 1444static __init int intel_pmu_init(void)
1813{
1814 u32 rev = UINT_MAX; /* default to broken for unknown models */
1815
1816 switch (cpu_data(cpu).x86_model) {
1817 case 42: /* SNB */
1818 rev = 0x28;
1819 break;
1820
1821 case 45: /* SNB-EP */
1822 switch (cpu_data(cpu).x86_mask) {
1823 case 6: rev = 0x618; break;
1824 case 7: rev = 0x70c; break;
1825 }
1826 }
1827
1828 return (cpu_data(cpu).microcode < rev);
1829}
1830
1831static void intel_snb_check_microcode(void)
1832{
1833 int pebs_broken = 0;
1834 int cpu;
1835
1836 get_online_cpus();
1837 for_each_online_cpu(cpu) {
1838 if ((pebs_broken = intel_snb_pebs_broken(cpu)))
1839 break;
1840 }
1841 put_online_cpus();
1842
1843 if (pebs_broken == x86_pmu.pebs_broken)
1844 return;
1845
1846 /*
1847 * Serialized by the microcode lock..
1848 */
1849 if (x86_pmu.pebs_broken) {
1850 pr_info("PEBS enabled due to microcode update\n");
1851 x86_pmu.pebs_broken = 0;
1852 } else {
1853 pr_info("PEBS disabled due to CPU errata, please upgrade microcode\n");
1854 x86_pmu.pebs_broken = 1;
1855 }
1856}
1857
1858static __init void intel_sandybridge_quirk(void)
1859{
1860 x86_pmu.check_microcode = intel_snb_check_microcode;
1861 intel_snb_check_microcode();
1862}
1863
1864static const struct { int id; char *name; } intel_arch_events_map[] __initconst = {
1865 { PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" },
1866 { PERF_COUNT_HW_INSTRUCTIONS, "instructions" },
1867 { PERF_COUNT_HW_BUS_CYCLES, "bus cycles" },
1868 { PERF_COUNT_HW_CACHE_REFERENCES, "cache references" },
1869 { PERF_COUNT_HW_CACHE_MISSES, "cache misses" },
1870 { PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" },
1871 { PERF_COUNT_HW_BRANCH_MISSES, "branch misses" },
1872};
1873
1874static __init void intel_arch_events_quirk(void)
1875{
1876 int bit;
1877
 1878 /* disable events reported as not present by cpuid */
1879 for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) {
1880 intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0;
1881 pr_warn("CPUID marked event: \'%s\' unavailable\n",
1882 intel_arch_events_map[bit].name);
1883 }
1884}
1885
1886static __init void intel_nehalem_quirk(void)
1887{
1888 union cpuid10_ebx ebx;
1889
1890 ebx.full = x86_pmu.events_maskl;
1891 if (ebx.split.no_branch_misses_retired) {
1892 /*
1893 * Erratum AAJ80 detected, we work it around by using
1894 * the BR_MISP_EXEC.ANY event. This will over-count
1895 * branch-misses, but it's still much better than the
1896 * architectural event which is often completely bogus:
1897 */
1898 intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
1899 ebx.split.no_branch_misses_retired = 0;
1900 x86_pmu.events_maskl = ebx.full;
1901 pr_info("CPU erratum AAJ80 worked around\n");
1902 }
1903}
1904
1905__init int intel_pmu_init(void)
1906{ 1445{
1907 union cpuid10_edx edx; 1446 union cpuid10_edx edx;
1908 union cpuid10_eax eax; 1447 union cpuid10_eax eax;
1909 union cpuid10_ebx ebx;
1910 struct event_constraint *c;
1911 unsigned int unused; 1448 unsigned int unused;
1449 unsigned int ebx;
1912 int version; 1450 int version;
1913 1451
1914 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { 1452 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
1915 switch (boot_cpu_data.x86) { 1453 switch (boot_cpu_data.x86) {
1916 case 0x6: 1454 case 0x6:
1917 return p6_pmu_init(); 1455 return p6_pmu_init();
1918 case 0xb:
1919 return knc_pmu_init();
1920 case 0xf: 1456 case 0xf:
1921 return p4_pmu_init(); 1457 return p4_pmu_init();
1922 } 1458 }
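Both versions size the PMU from CPUID leaf 0xA: EAX carries the version ID, the number of general-purpose counters, their bit width and the length of the EBX bit vector; EBX flags architectural events the CPU does not implement (the removed side decodes it through union cpuid10_ebx, the added side tests bit 6 directly, i.e. ebx & 0x40, for the Branch Misses Retired / erratum AAJ80 case). A standalone x86-only sketch of reading the leaf with the GCC/Clang helper; the decode is illustrative:

    /* x86 only; build with: gcc -O2 cpuid10.c */
    #include <stdio.h>
    #include <cpuid.h>

    int main(void)
    {
        unsigned eax, ebx, ecx, edx;

        if (!__get_cpuid(0x0a, &eax, &ebx, &ecx, &edx))
            return 1;                       /* leaf 0xA not available */

        printf("version           : %u\n", eax & 0xff);
        printf("gp counters       : %u\n", (eax >> 8) & 0xff);
        printf("counter bit width : %u\n", (eax >> 16) & 0xff);
        printf("ebx vector length : %u\n", (eax >> 24) & 0xff);
        printf("branch-misses-retired unavailable: %u\n", (ebx >> 6) & 1);
        return 0;
    }
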
@@ -1927,8 +1463,8 @@ __init int intel_pmu_init(void)
1927 * Check whether the Architectural PerfMon supports 1463 * Check whether the Architectural PerfMon supports
1928 * Branch Misses Retired hw_event or not. 1464 * Branch Misses Retired hw_event or not.
1929 */ 1465 */
1930 cpuid(10, &eax.full, &ebx.full, &unused, &edx.full); 1466 cpuid(10, &eax.full, &ebx, &unused, &edx.full);
1931 if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT) 1467 if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
1932 return -ENODEV; 1468 return -ENODEV;
1933 1469
1934 version = eax.split.version_id; 1470 version = eax.split.version_id;
@@ -1942,11 +1478,6 @@ __init int intel_pmu_init(void)
1942 x86_pmu.cntval_bits = eax.split.bit_width; 1478 x86_pmu.cntval_bits = eax.split.bit_width;
1943 x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1; 1479 x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1;
1944 1480
1945 x86_pmu.events_maskl = ebx.full;
1946 x86_pmu.events_mask_len = eax.split.mask_length;
1947
1948 x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters);
1949
1950 /* 1481 /*
1951 * Quirk: v2 perfmon does not report fixed-purpose events, so 1482 * Quirk: v2 perfmon does not report fixed-purpose events, so
1952 * assume at least 3 events: 1483 * assume at least 3 events:
@@ -1966,8 +1497,6 @@ __init int intel_pmu_init(void)
1966 1497
1967 intel_ds_init(); 1498 intel_ds_init();
1968 1499
1969 x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */
1970
1971 /* 1500 /*
1972 * Install the hw-cache-events table: 1501 * Install the hw-cache-events table:
1973 */ 1502 */
@@ -1977,7 +1506,7 @@ __init int intel_pmu_init(void)
1977 break; 1506 break;
1978 1507
1979 case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ 1508 case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
1980 x86_add_quirk(intel_clovertown_quirk); 1509 x86_pmu.quirks = intel_clovertown_quirks;
1981 case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ 1510 case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
1982 case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ 1511 case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
1983 case 29: /* six-core 45 nm xeon "Dunnington" */ 1512 case 29: /* six-core 45 nm xeon "Dunnington" */
@@ -2007,19 +1536,25 @@ __init int intel_pmu_init(void)
2007 x86_pmu.extra_regs = intel_nehalem_extra_regs; 1536 x86_pmu.extra_regs = intel_nehalem_extra_regs;
2008 1537
2009 /* UOPS_ISSUED.STALLED_CYCLES */ 1538 /* UOPS_ISSUED.STALLED_CYCLES */
2010 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 1539 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
2011 X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
2012 /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ 1540 /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
2013 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 1541 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1;
2014 X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
2015 1542
2016 x86_add_quirk(intel_nehalem_quirk); 1543 if (ebx & 0x40) {
1544 /*
1545 * Erratum AAJ80 detected, we work it around by using
1546 * the BR_MISP_EXEC.ANY event. This will over-count
1547 * branch-misses, but it's still much better than the
1548 * architectural event which is often completely bogus:
1549 */
1550 intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
2017 1551
1552 pr_cont("erratum AAJ80 worked around, ");
1553 }
2018 pr_cont("Nehalem events, "); 1554 pr_cont("Nehalem events, ");
2019 break; 1555 break;
2020 1556
2021 case 28: /* Atom */ 1557 case 28: /* Atom */
 2022 case 54: /* Cedarview */
2023 memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, 1558 memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
2024 sizeof(hw_cache_event_ids)); 1559 sizeof(hw_cache_event_ids));
2025 1560
@@ -2047,65 +1582,34 @@ __init int intel_pmu_init(void)
2047 x86_pmu.er_flags |= ERF_HAS_RSP_1; 1582 x86_pmu.er_flags |= ERF_HAS_RSP_1;
2048 1583
2049 /* UOPS_ISSUED.STALLED_CYCLES */ 1584 /* UOPS_ISSUED.STALLED_CYCLES */
2050 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 1585 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
2051 X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
2052 /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ 1586 /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
2053 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 1587 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1;
2054 X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1);
2055 1588
2056 pr_cont("Westmere events, "); 1589 pr_cont("Westmere events, ");
2057 break; 1590 break;
2058 1591
2059 case 42: /* SandyBridge */ 1592 case 42: /* SandyBridge */
 2060 case 45: /* SandyBridge, "Romley-EP" */ 1593 case 45: /* SandyBridge, "Romley-EP" */
2061 x86_add_quirk(intel_sandybridge_quirk);
2062 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, 1594 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
2063 sizeof(hw_cache_event_ids)); 1595 sizeof(hw_cache_event_ids));
2064 memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
2065 sizeof(hw_cache_extra_regs));
2066 1596
2067 intel_pmu_lbr_init_snb(); 1597 intel_pmu_lbr_init_nhm();
2068 1598
2069 x86_pmu.event_constraints = intel_snb_event_constraints; 1599 x86_pmu.event_constraints = intel_snb_event_constraints;
2070 x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints; 1600 x86_pmu.pebs_constraints = intel_snb_pebs_events;
2071 x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
2072 x86_pmu.extra_regs = intel_snb_extra_regs; 1601 x86_pmu.extra_regs = intel_snb_extra_regs;
2073 /* all extra regs are per-cpu when HT is on */ 1602 /* all extra regs are per-cpu when HT is on */
2074 x86_pmu.er_flags |= ERF_HAS_RSP_1; 1603 x86_pmu.er_flags |= ERF_HAS_RSP_1;
2075 x86_pmu.er_flags |= ERF_NO_HT_SHARING; 1604 x86_pmu.er_flags |= ERF_NO_HT_SHARING;
2076 1605
2077 /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ 1606 /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
2078 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 1607 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
2079 X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
2080 /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/ 1608 /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/
2081 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 1609 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x18001b1;
2082 X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1);
2083 1610
2084 pr_cont("SandyBridge events, "); 1611 pr_cont("SandyBridge events, ");
2085 break; 1612 break;
2086 case 58: /* IvyBridge */
2087 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
2088 sizeof(hw_cache_event_ids));
2089 memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
2090 sizeof(hw_cache_extra_regs));
2091
2092 intel_pmu_lbr_init_snb();
2093
2094 x86_pmu.event_constraints = intel_snb_event_constraints;
2095 x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints;
2096 x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
2097 x86_pmu.extra_regs = intel_snb_extra_regs;
2098 /* all extra regs are per-cpu when HT is on */
2099 x86_pmu.er_flags |= ERF_HAS_RSP_1;
2100 x86_pmu.er_flags |= ERF_NO_HT_SHARING;
2101
2102 /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
2103 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =
2104 X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1);
2105
2106 pr_cont("IvyBridge events, ");
2107 break;
2108
2109 1613
2110 default: 1614 default:
2111 switch (x86_pmu.version) { 1615 switch (x86_pmu.version) {
@@ -2122,38 +1626,18 @@ __init int intel_pmu_init(void)
2122 break; 1626 break;
2123 } 1627 }
2124 } 1628 }
1629 return 0;
1630}
2125 1631
2126 if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) { 1632#else /* CONFIG_CPU_SUP_INTEL */
2127 WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
2128 x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC);
2129 x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC;
2130 }
2131 x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
2132
2133 if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) {
2134 WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
2135 x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED);
2136 x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED;
2137 }
2138
2139 x86_pmu.intel_ctrl |=
2140 ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED;
2141
2142 if (x86_pmu.event_constraints) {
2143 /*
2144 * event on fixed counter2 (REF_CYCLES) only works on this
2145 * counter, so do not extend mask to generic counters
2146 */
2147 for_each_event_constraint(c, x86_pmu.event_constraints) {
2148 if (c->cmask != X86_RAW_EVENT_MASK
2149 || c->idxmsk64 == INTEL_PMC_MSK_FIXED_REF_CYCLES) {
2150 continue;
2151 }
2152
2153 c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
2154 c->weight += x86_pmu.num_counters;
2155 }
2156 }
2157 1633
1634static int intel_pmu_init(void)
1635{
2158 return 0; 1636 return 0;
2159} 1637}
1638
1639static struct intel_shared_regs *allocate_shared_regs(int cpu)
1640{
1641 return NULL;
1642}
1643#endif /* CONFIG_CPU_SUP_INTEL */
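The block removed in this final hunk clips the counter counts to the INTEL_PMC_MAX_* limits, widens plain event constraints to all generic counters (except the REF_CYCLES constraint tied to fixed counter 2), and builds x86_pmu.intel_ctrl, the value later written to MSR_CORE_PERF_GLOBAL_CTRL: one enable bit per generic counter in the low bits plus one bit per fixed counter starting at the fixed-counter index. A standalone sketch of that mask composition only, assuming the fixed counters start at bit 32 as on Intel PMUs:

    #include <stdio.h>
    #include <stdint.h>

    #define PMC_IDX_FIXED 32  /* fixed counters assumed to start at bit 32 of GLOBAL_CTRL */

    static uint64_t build_global_ctrl(int num_counters, int num_fixed)
    {
        uint64_t ctrl = (1ULL << num_counters) - 1;             /* generic enable bits */
        ctrl |= ((1ULL << num_fixed) - 1) << PMC_IDX_FIXED;     /* fixed enable bits   */
        return ctrl;
    }

    int main(void)
    {
        /* e.g. 4 generic + 3 fixed counters -> 0x70000000f */
        printf("intel_ctrl = %#llx\n",
               (unsigned long long)build_global_ctrl(4, 3));
        return 0;
    }
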
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 826054a4f2e..3213c52db76 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -1,11 +1,7 @@
1#include <linux/bitops.h> 1#ifdef CONFIG_CPU_SUP_INTEL
2#include <linux/types.h>
3#include <linux/slab.h>
4 2
5#include <asm/perf_event.h> 3/* The maximal number of PEBS events: */
6#include <asm/insn.h> 4#define MAX_PEBS_EVENTS 4
7
8#include "perf_event.h"
9 5
10/* The size of a BTS record in bytes: */ 6/* The size of a BTS record in bytes: */
11#define BTS_RECORD_SIZE 24 7#define BTS_RECORD_SIZE 24
@@ -41,7 +37,24 @@ struct pebs_record_nhm {
41 u64 status, dla, dse, lat; 37 u64 status, dla, dse, lat;
42}; 38};
43 39
44void init_debug_store_on_cpu(int cpu) 40/*
41 * A debug store configuration.
42 *
43 * We only support architectures that use 64bit fields.
44 */
45struct debug_store {
46 u64 bts_buffer_base;
47 u64 bts_index;
48 u64 bts_absolute_maximum;
49 u64 bts_interrupt_threshold;
50 u64 pebs_buffer_base;
51 u64 pebs_index;
52 u64 pebs_absolute_maximum;
53 u64 pebs_interrupt_threshold;
54 u64 pebs_event_reset[MAX_PEBS_EVENTS];
55};
56
57static void init_debug_store_on_cpu(int cpu)
45{ 58{
46 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; 59 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
47 60
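The struct added on the right-hand side describes the per-CPU DS save area shared by BTS and PEBS: each facility has a buffer base, a write index the hardware advances, an absolute maximum, and an interrupt threshold that raises a PMI when crossed so software can drain the buffer. A small standalone sketch of the corresponding book-keeping, using the 24-byte BTS_RECORD_SIZE defined just above; the example addresses are made up:

    #include <stdint.h>
    #include <stdio.h>

    #define BTS_RECORD_SIZE 24  /* from/to/flags, 8 bytes each, as defined above */

    /* Branch records the hardware has written since the last drain. */
    static uint64_t bts_pending_records(uint64_t bts_buffer_base, uint64_t bts_index)
    {
        return (bts_index - bts_buffer_base) / BTS_RECORD_SIZE;
    }

    int main(void)
    {
        /* example: index 480 bytes past the base -> 20 records */
        printf("%llu records pending\n",
               (unsigned long long)bts_pending_records(0x1000, 0x1000 + 480));
        return 0;
    }
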
@@ -53,7 +66,7 @@ void init_debug_store_on_cpu(int cpu)
53 (u32)((u64)(unsigned long)ds >> 32)); 66 (u32)((u64)(unsigned long)ds >> 32));
54} 67}
55 68
56void fini_debug_store_on_cpu(int cpu) 69static void fini_debug_store_on_cpu(int cpu)
57{ 70{
58 if (!per_cpu(cpu_hw_events, cpu).ds) 71 if (!per_cpu(cpu_hw_events, cpu).ds)
59 return; 72 return;
@@ -162,7 +175,7 @@ static void release_ds_buffer(int cpu)
162 kfree(ds); 175 kfree(ds);
163} 176}
164 177
165void release_ds_buffers(void) 178static void release_ds_buffers(void)
166{ 179{
167 int cpu; 180 int cpu;
168 181
@@ -181,7 +194,7 @@ void release_ds_buffers(void)
181 put_online_cpus(); 194 put_online_cpus();
182} 195}
183 196
184void reserve_ds_buffers(void) 197static void reserve_ds_buffers(void)
185{ 198{
186 int bts_err = 0, pebs_err = 0; 199 int bts_err = 0, pebs_err = 0;
187 int cpu; 200 int cpu;
@@ -247,10 +260,10 @@ void reserve_ds_buffers(void)
247 * BTS 260 * BTS
248 */ 261 */
249 262
250struct event_constraint bts_constraint = 263static struct event_constraint bts_constraint =
251 EVENT_CONSTRAINT(0, 1ULL << INTEL_PMC_IDX_FIXED_BTS, 0); 264 EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0);
252 265
253void intel_pmu_enable_bts(u64 config) 266static void intel_pmu_enable_bts(u64 config)
254{ 267{
255 unsigned long debugctlmsr; 268 unsigned long debugctlmsr;
256 269
@@ -269,7 +282,7 @@ void intel_pmu_enable_bts(u64 config)
269 update_debugctlmsr(debugctlmsr); 282 update_debugctlmsr(debugctlmsr);
270} 283}
271 284
272void intel_pmu_disable_bts(void) 285static void intel_pmu_disable_bts(void)
273{ 286{
274 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 287 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
275 unsigned long debugctlmsr; 288 unsigned long debugctlmsr;
@@ -286,7 +299,7 @@ void intel_pmu_disable_bts(void)
286 update_debugctlmsr(debugctlmsr); 299 update_debugctlmsr(debugctlmsr);
287} 300}
288 301
289int intel_pmu_drain_bts_buffer(void) 302static int intel_pmu_drain_bts_buffer(void)
290{ 303{
291 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 304 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
292 struct debug_store *ds = cpuc->ds; 305 struct debug_store *ds = cpuc->ds;
@@ -295,7 +308,7 @@ int intel_pmu_drain_bts_buffer(void)
295 u64 to; 308 u64 to;
296 u64 flags; 309 u64 flags;
297 }; 310 };
298 struct perf_event *event = cpuc->events[INTEL_PMC_IDX_FIXED_BTS]; 311 struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
299 struct bts_record *at, *top; 312 struct bts_record *at, *top;
300 struct perf_output_handle handle; 313 struct perf_output_handle handle;
301 struct perf_event_header header; 314 struct perf_event_header header;
@@ -316,7 +329,8 @@ int intel_pmu_drain_bts_buffer(void)
316 329
317 ds->bts_index = ds->bts_buffer_base; 330 ds->bts_index = ds->bts_buffer_base;
318 331
319 perf_sample_data_init(&data, 0, event->hw.last_period); 332 perf_sample_data_init(&data, 0);
333 data.period = event->hw.last_period;
320 regs.ip = 0; 334 regs.ip = 0;
321 335
322 /* 336 /*
@@ -347,7 +361,7 @@ int intel_pmu_drain_bts_buffer(void)
347/* 361/*
348 * PEBS 362 * PEBS
349 */ 363 */
350struct event_constraint intel_core2_pebs_event_constraints[] = { 364static struct event_constraint intel_core2_pebs_event_constraints[] = {
351 INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ 365 INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
352 INTEL_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */ 366 INTEL_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */
353 INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ 367 INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
@@ -356,14 +370,14 @@ struct event_constraint intel_core2_pebs_event_constraints[] = {
356 EVENT_CONSTRAINT_END 370 EVENT_CONSTRAINT_END
357}; 371};
358 372
359struct event_constraint intel_atom_pebs_event_constraints[] = { 373static struct event_constraint intel_atom_pebs_event_constraints[] = {
360 INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ 374 INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
361 INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */ 375 INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */
362 INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ 376 INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */
363 EVENT_CONSTRAINT_END 377 EVENT_CONSTRAINT_END
364}; 378};
365 379
366struct event_constraint intel_nehalem_pebs_event_constraints[] = { 380static struct event_constraint intel_nehalem_pebs_event_constraints[] = {
367 INTEL_EVENT_CONSTRAINT(0x0b, 0xf), /* MEM_INST_RETIRED.* */ 381 INTEL_EVENT_CONSTRAINT(0x0b, 0xf), /* MEM_INST_RETIRED.* */
368 INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ 382 INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */
369 INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ 383 INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
@@ -378,7 +392,7 @@ struct event_constraint intel_nehalem_pebs_event_constraints[] = {
378 EVENT_CONSTRAINT_END 392 EVENT_CONSTRAINT_END
379}; 393};
380 394
381struct event_constraint intel_westmere_pebs_event_constraints[] = { 395static struct event_constraint intel_westmere_pebs_event_constraints[] = {
382 INTEL_EVENT_CONSTRAINT(0x0b, 0xf), /* MEM_INST_RETIRED.* */ 396 INTEL_EVENT_CONSTRAINT(0x0b, 0xf), /* MEM_INST_RETIRED.* */
383 INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ 397 INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */
384 INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ 398 INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
@@ -393,35 +407,29 @@ struct event_constraint intel_westmere_pebs_event_constraints[] = {
393 EVENT_CONSTRAINT_END 407 EVENT_CONSTRAINT_END
394}; 408};
395 409
396struct event_constraint intel_snb_pebs_event_constraints[] = { 410static struct event_constraint intel_snb_pebs_events[] = {
397 INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ 411 INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
398 INTEL_UEVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ 412 INTEL_UEVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
399 INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */ 413 INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */
400 INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ 414 INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
401 INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ 415 INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */
402 INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.* */ 416 INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.* */
403 INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ 417 INTEL_UEVENT_CONSTRAINT(0x11d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_LOADS */
418 INTEL_UEVENT_CONSTRAINT(0x12d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_STORES */
419 INTEL_UEVENT_CONSTRAINT(0x21d0, 0xf), /* MEM_UOP_RETIRED.LOCK_LOADS */
420 INTEL_UEVENT_CONSTRAINT(0x22d0, 0xf), /* MEM_UOP_RETIRED.LOCK_STORES */
421 INTEL_UEVENT_CONSTRAINT(0x41d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_LOADS */
422 INTEL_UEVENT_CONSTRAINT(0x42d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_STORES */
423 INTEL_UEVENT_CONSTRAINT(0x81d0, 0xf), /* MEM_UOP_RETIRED.ANY_LOADS */
424 INTEL_UEVENT_CONSTRAINT(0x82d0, 0xf), /* MEM_UOP_RETIRED.ANY_STORES */
404 INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ 425 INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
405 INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ 426 INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
406 INTEL_UEVENT_CONSTRAINT(0x02d4, 0xf), /* MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS */ 427 INTEL_UEVENT_CONSTRAINT(0x02d4, 0xf), /* MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS */
407 EVENT_CONSTRAINT_END 428 EVENT_CONSTRAINT_END
408}; 429};
409 430
410struct event_constraint intel_ivb_pebs_event_constraints[] = { 431static struct event_constraint *
411 INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ 432intel_pebs_constraints(struct perf_event *event)
412 INTEL_UEVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
413 INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */
414 INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
415 INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */
416 INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.* */
417 INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */
418 INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
419 INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
420 INTEL_EVENT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
421 EVENT_CONSTRAINT_END
422};
423
424struct event_constraint *intel_pebs_constraints(struct perf_event *event)
425{ 433{
426 struct event_constraint *c; 434 struct event_constraint *c;
427 435
@@ -438,7 +446,7 @@ struct event_constraint *intel_pebs_constraints(struct perf_event *event)
438 return &emptyconstraint; 446 return &emptyconstraint;
439} 447}
440 448
441void intel_pmu_pebs_enable(struct perf_event *event) 449static void intel_pmu_pebs_enable(struct perf_event *event)
442{ 450{
443 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 451 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
444 struct hw_perf_event *hwc = &event->hw; 452 struct hw_perf_event *hwc = &event->hw;
@@ -446,9 +454,13 @@ void intel_pmu_pebs_enable(struct perf_event *event)
446 hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; 454 hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
447 455
448 cpuc->pebs_enabled |= 1ULL << hwc->idx; 456 cpuc->pebs_enabled |= 1ULL << hwc->idx;
457 WARN_ON_ONCE(cpuc->enabled);
458
459 if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
460 intel_pmu_lbr_enable(event);
449} 461}
450 462
451void intel_pmu_pebs_disable(struct perf_event *event) 463static void intel_pmu_pebs_disable(struct perf_event *event)
452{ 464{
453 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 465 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
454 struct hw_perf_event *hwc = &event->hw; 466 struct hw_perf_event *hwc = &event->hw;
@@ -458,9 +470,12 @@ void intel_pmu_pebs_disable(struct perf_event *event)
458 wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); 470 wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
459 471
460 hwc->config |= ARCH_PERFMON_EVENTSEL_INT; 472 hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
473
474 if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
475 intel_pmu_lbr_disable(event);
461} 476}
462 477
463void intel_pmu_pebs_enable_all(void) 478static void intel_pmu_pebs_enable_all(void)
464{ 479{
465 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 480 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
466 481
@@ -468,7 +483,7 @@ void intel_pmu_pebs_enable_all(void)
468 wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); 483 wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
469} 484}
470 485
471void intel_pmu_pebs_disable_all(void) 486static void intel_pmu_pebs_disable_all(void)
472{ 487{
473 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 488 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
474 489
@@ -476,6 +491,17 @@ void intel_pmu_pebs_disable_all(void)
476 wrmsrl(MSR_IA32_PEBS_ENABLE, 0); 491 wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
477} 492}
478 493
494#include <asm/insn.h>
495
496static inline bool kernel_ip(unsigned long ip)
497{
498#ifdef CONFIG_X86_32
499 return ip > PAGE_OFFSET;
500#else
501 return (long)ip < 0;
502#endif
503}
504
479static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) 505static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
480{ 506{
481 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 507 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -513,7 +539,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
513 * We sampled a branch insn, rewind using the LBR stack 539 * We sampled a branch insn, rewind using the LBR stack
514 */ 540 */
515 if (ip == to) { 541 if (ip == to) {
516 set_linear_ip(regs, from); 542 regs->ip = from;
517 return 1; 543 return 1;
518 } 544 }
519 545
@@ -543,7 +569,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
543 } while (to < ip); 569 } while (to < ip);
544 570
545 if (to == ip) { 571 if (to == ip) {
546 set_linear_ip(regs, old_to); 572 regs->ip = old_to;
547 return 1; 573 return 1;
548 } 574 }
549 575
@@ -554,6 +580,8 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
554 return 0; 580 return 0;
555} 581}
556 582
583static int intel_pmu_save_and_restart(struct perf_event *event);
584
557static void __intel_pmu_pebs_event(struct perf_event *event, 585static void __intel_pmu_pebs_event(struct perf_event *event,
558 struct pt_regs *iregs, void *__pebs) 586 struct pt_regs *iregs, void *__pebs)
559{ 587{
@@ -562,7 +590,6 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
562 * both formats and we don't use the other fields in this 590 * both formats and we don't use the other fields in this
563 * routine. 591 * routine.
564 */ 592 */
565 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
566 struct pebs_record_core *pebs = __pebs; 593 struct pebs_record_core *pebs = __pebs;
567 struct perf_sample_data data; 594 struct perf_sample_data data;
568 struct pt_regs regs; 595 struct pt_regs regs;
@@ -570,7 +597,8 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
570 if (!intel_pmu_save_and_restart(event)) 597 if (!intel_pmu_save_and_restart(event))
571 return; 598 return;
572 599
573 perf_sample_data_init(&data, 0, event->hw.last_period); 600 perf_sample_data_init(&data, 0);
601 data.period = event->hw.last_period;
574 602
575 /* 603 /*
576 * We use the interrupt regs as a base because the PEBS record 604 * We use the interrupt regs as a base because the PEBS record
@@ -583,8 +611,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
583 * A possible PERF_SAMPLE_REGS will have to transfer all regs. 611 * A possible PERF_SAMPLE_REGS will have to transfer all regs.
584 */ 612 */
585 regs = *iregs; 613 regs = *iregs;
586 regs.flags = pebs->flags; 614 regs.ip = pebs->ip;
587 set_linear_ip(&regs, pebs->ip);
588 regs.bp = pebs->bp; 615 regs.bp = pebs->bp;
589 regs.sp = pebs->sp; 616 regs.sp = pebs->sp;
590 617
@@ -593,9 +620,6 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
593 else 620 else
594 regs.flags &= ~PERF_EFLAGS_EXACT; 621 regs.flags &= ~PERF_EFLAGS_EXACT;
595 622
596 if (has_branch_stack(event))
597 data.br_stack = &cpuc->lbr_stack;
598
599 if (perf_event_overflow(event, &data, &regs)) 623 if (perf_event_overflow(event, &data, &regs))
600 x86_pmu_stop(event, 0); 624 x86_pmu_stop(event, 0);
601} 625}
@@ -635,7 +659,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
635 * Should not happen, we program the threshold at 1 and do not 659 * Should not happen, we program the threshold at 1 and do not
636 * set a reset value. 660 * set a reset value.
637 */ 661 */
638 WARN_ONCE(n > 1, "bad leftover pebs %d\n", n); 662 WARN_ON_ONCE(n > 1);
639 at += n - 1; 663 at += n - 1;
640 664
641 __intel_pmu_pebs_event(event, iregs, at); 665 __intel_pmu_pebs_event(event, iregs, at);
@@ -666,10 +690,10 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
666 * Should not happen, we program the threshold at 1 and do not 690 * Should not happen, we program the threshold at 1 and do not
667 * set a reset value. 691 * set a reset value.
668 */ 692 */
669 WARN_ONCE(n > x86_pmu.max_pebs_events, "Unexpected number of pebs records %d\n", n); 693 WARN_ON_ONCE(n > MAX_PEBS_EVENTS);
670 694
671 for ( ; at < top; at++) { 695 for ( ; at < top; at++) {
672 for_each_set_bit(bit, (unsigned long *)&at->status, x86_pmu.max_pebs_events) { 696 for_each_set_bit(bit, (unsigned long *)&at->status, MAX_PEBS_EVENTS) {
673 event = cpuc->events[bit]; 697 event = cpuc->events[bit];
674 if (!test_bit(bit, cpuc->active_mask)) 698 if (!test_bit(bit, cpuc->active_mask))
675 continue; 699 continue;
@@ -685,7 +709,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
685 break; 709 break;
686 } 710 }
687 711
688 if (!event || bit >= x86_pmu.max_pebs_events) 712 if (!event || bit >= MAX_PEBS_EVENTS)
689 continue; 713 continue;
690 714
691 __intel_pmu_pebs_event(event, iregs, at); 715 __intel_pmu_pebs_event(event, iregs, at);
@@ -696,7 +720,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
696 * BTS, PEBS probe and setup 720 * BTS, PEBS probe and setup
697 */ 721 */
698 722
699void intel_ds_init(void) 723static void intel_ds_init(void)
700{ 724{
701 /* 725 /*
702 * No support for 32bit formats 726 * No support for 32bit formats
@@ -729,3 +753,15 @@ void intel_ds_init(void)
729 } 753 }
730 } 754 }
731} 755}
756
757#else /* CONFIG_CPU_SUP_INTEL */
758
759static void reserve_ds_buffers(void)
760{
761}
762
763static void release_ds_buffers(void)
764{
765}
766
767#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index da02e9cc375..d202c1bece1 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -1,11 +1,4 @@
1#include <linux/perf_event.h> 1#ifdef CONFIG_CPU_SUP_INTEL
2#include <linux/types.h>
3
4#include <asm/perf_event.h>
5#include <asm/msr.h>
6#include <asm/insn.h>
7
8#include "perf_event.h"
9 2
10enum { 3enum {
11 LBR_FORMAT_32 = 0x00, 4 LBR_FORMAT_32 = 0x00,
@@ -15,100 +8,6 @@ enum {
15}; 8};
16 9
17/* 10/*
18 * Intel LBR_SELECT bits
19 * Intel Vol3a, April 2011, Section 16.7 Table 16-10
20 *
21 * Hardware branch filter (not available on all CPUs)
22 */
23#define LBR_KERNEL_BIT 0 /* do not capture at ring0 */
24#define LBR_USER_BIT 1 /* do not capture at ring > 0 */
25#define LBR_JCC_BIT 2 /* do not capture conditional branches */
26#define LBR_REL_CALL_BIT 3 /* do not capture relative calls */
27#define LBR_IND_CALL_BIT 4 /* do not capture indirect calls */
28#define LBR_RETURN_BIT 5 /* do not capture near returns */
29#define LBR_IND_JMP_BIT 6 /* do not capture indirect jumps */
30#define LBR_REL_JMP_BIT 7 /* do not capture relative jumps */
31#define LBR_FAR_BIT 8 /* do not capture far branches */
32
33#define LBR_KERNEL (1 << LBR_KERNEL_BIT)
34#define LBR_USER (1 << LBR_USER_BIT)
35#define LBR_JCC (1 << LBR_JCC_BIT)
36#define LBR_REL_CALL (1 << LBR_REL_CALL_BIT)
37#define LBR_IND_CALL (1 << LBR_IND_CALL_BIT)
38#define LBR_RETURN (1 << LBR_RETURN_BIT)
39#define LBR_REL_JMP (1 << LBR_REL_JMP_BIT)
40#define LBR_IND_JMP (1 << LBR_IND_JMP_BIT)
41#define LBR_FAR (1 << LBR_FAR_BIT)
42
43#define LBR_PLM (LBR_KERNEL | LBR_USER)
44
45#define LBR_SEL_MASK 0x1ff /* valid bits in LBR_SELECT */
46#define LBR_NOT_SUPP -1 /* LBR filter not supported */
47#define LBR_IGN 0 /* ignored */
48
49#define LBR_ANY \
50 (LBR_JCC |\
51 LBR_REL_CALL |\
52 LBR_IND_CALL |\
53 LBR_RETURN |\
54 LBR_REL_JMP |\
55 LBR_IND_JMP |\
56 LBR_FAR)
57
58#define LBR_FROM_FLAG_MISPRED (1ULL << 63)
59
60#define for_each_branch_sample_type(x) \
61 for ((x) = PERF_SAMPLE_BRANCH_USER; \
62 (x) < PERF_SAMPLE_BRANCH_MAX; (x) <<= 1)
63
64/*
 65 * x86 control flow change classification
 66 * x86 control flow changes include branches, interrupts, traps, faults
67 */
68enum {
69 X86_BR_NONE = 0, /* unknown */
70
71 X86_BR_USER = 1 << 0, /* branch target is user */
72 X86_BR_KERNEL = 1 << 1, /* branch target is kernel */
73
74 X86_BR_CALL = 1 << 2, /* call */
75 X86_BR_RET = 1 << 3, /* return */
76 X86_BR_SYSCALL = 1 << 4, /* syscall */
77 X86_BR_SYSRET = 1 << 5, /* syscall return */
78 X86_BR_INT = 1 << 6, /* sw interrupt */
79 X86_BR_IRET = 1 << 7, /* return from interrupt */
80 X86_BR_JCC = 1 << 8, /* conditional */
81 X86_BR_JMP = 1 << 9, /* jump */
82 X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */
83 X86_BR_IND_CALL = 1 << 11,/* indirect calls */
84};
85
86#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
87
88#define X86_BR_ANY \
89 (X86_BR_CALL |\
90 X86_BR_RET |\
91 X86_BR_SYSCALL |\
92 X86_BR_SYSRET |\
93 X86_BR_INT |\
94 X86_BR_IRET |\
95 X86_BR_JCC |\
96 X86_BR_JMP |\
97 X86_BR_IRQ |\
98 X86_BR_IND_CALL)
99
100#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY)
101
102#define X86_BR_ANY_CALL \
103 (X86_BR_CALL |\
104 X86_BR_IND_CALL |\
105 X86_BR_SYSCALL |\
106 X86_BR_IRQ |\
107 X86_BR_INT)
108
109static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
110
111/*
112 * We only support LBR implementations that have FREEZE_LBRS_ON_PMI 11 * We only support LBR implementations that have FREEZE_LBRS_ON_PMI
113 * otherwise it becomes near impossible to get a reliable stack. 12 * otherwise it becomes near impossible to get a reliable stack.
114 */ 13 */
@@ -116,10 +15,6 @@ static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
116static void __intel_pmu_lbr_enable(void) 15static void __intel_pmu_lbr_enable(void)
117{ 16{
118 u64 debugctl; 17 u64 debugctl;
119 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
120
121 if (cpuc->lbr_sel)
122 wrmsrl(MSR_LBR_SELECT, cpuc->lbr_sel->config);
123 18
124 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); 19 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
125 debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); 20 debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
@@ -153,7 +48,7 @@ static void intel_pmu_lbr_reset_64(void)
153 } 48 }
154} 49}
155 50
156void intel_pmu_lbr_reset(void) 51static void intel_pmu_lbr_reset(void)
157{ 52{
158 if (!x86_pmu.lbr_nr) 53 if (!x86_pmu.lbr_nr)
159 return; 54 return;
@@ -164,27 +59,29 @@ void intel_pmu_lbr_reset(void)
164 intel_pmu_lbr_reset_64(); 59 intel_pmu_lbr_reset_64();
165} 60}
166 61
167void intel_pmu_lbr_enable(struct perf_event *event) 62static void intel_pmu_lbr_enable(struct perf_event *event)
168{ 63{
169 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 64 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
170 65
171 if (!x86_pmu.lbr_nr) 66 if (!x86_pmu.lbr_nr)
172 return; 67 return;
173 68
69 WARN_ON_ONCE(cpuc->enabled);
70
174 /* 71 /*
175 * Reset the LBR stack if we changed task context to 72 * Reset the LBR stack if we changed task context to
176 * avoid data leaks. 73 * avoid data leaks.
177 */ 74 */
75
178 if (event->ctx->task && cpuc->lbr_context != event->ctx) { 76 if (event->ctx->task && cpuc->lbr_context != event->ctx) {
179 intel_pmu_lbr_reset(); 77 intel_pmu_lbr_reset();
180 cpuc->lbr_context = event->ctx; 78 cpuc->lbr_context = event->ctx;
181 } 79 }
182 cpuc->br_sel = event->hw.branch_reg.reg;
183 80
184 cpuc->lbr_users++; 81 cpuc->lbr_users++;
185} 82}
186 83
187void intel_pmu_lbr_disable(struct perf_event *event) 84static void intel_pmu_lbr_disable(struct perf_event *event)
188{ 85{
189 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 86 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
190 87
@@ -194,14 +91,11 @@ void intel_pmu_lbr_disable(struct perf_event *event)
194 cpuc->lbr_users--; 91 cpuc->lbr_users--;
195 WARN_ON_ONCE(cpuc->lbr_users < 0); 92 WARN_ON_ONCE(cpuc->lbr_users < 0);
196 93
197 if (cpuc->enabled && !cpuc->lbr_users) { 94 if (cpuc->enabled && !cpuc->lbr_users)
198 __intel_pmu_lbr_disable(); 95 __intel_pmu_lbr_disable();
199 /* avoid stale pointer */
200 cpuc->lbr_context = NULL;
201 }
202} 96}
203 97
204void intel_pmu_lbr_enable_all(void) 98static void intel_pmu_lbr_enable_all(void)
205{ 99{
206 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 100 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
207 101
@@ -209,7 +103,7 @@ void intel_pmu_lbr_enable_all(void)
209 __intel_pmu_lbr_enable(); 103 __intel_pmu_lbr_enable();
210} 104}
211 105
212void intel_pmu_lbr_disable_all(void) 106static void intel_pmu_lbr_disable_all(void)
213{ 107{
214 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 108 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
215 109
@@ -217,9 +111,6 @@ void intel_pmu_lbr_disable_all(void)
217 __intel_pmu_lbr_disable(); 111 __intel_pmu_lbr_disable();
218} 112}
219 113
220/*
221 * TOS = most recently recorded branch
222 */
223static inline u64 intel_pmu_lbr_tos(void) 114static inline u64 intel_pmu_lbr_tos(void)
224{ 115{
225 u64 tos; 116 u64 tos;
@@ -247,15 +138,15 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
247 138
248 rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr); 139 rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);
249 140
250 cpuc->lbr_entries[i].from = msr_lastbranch.from; 141 cpuc->lbr_entries[i].from = msr_lastbranch.from;
251 cpuc->lbr_entries[i].to = msr_lastbranch.to; 142 cpuc->lbr_entries[i].to = msr_lastbranch.to;
252 cpuc->lbr_entries[i].mispred = 0; 143 cpuc->lbr_entries[i].flags = 0;
253 cpuc->lbr_entries[i].predicted = 0;
254 cpuc->lbr_entries[i].reserved = 0;
255 } 144 }
256 cpuc->lbr_stack.nr = i; 145 cpuc->lbr_stack.nr = i;
257} 146}
258 147
148#define LBR_FROM_FLAG_MISPRED (1ULL << 63)
149
259/* 150/*
260 * Due to lack of segmentation in Linux the effective address (offset) 151 * Due to lack of segmentation in Linux the effective address (offset)
261 * is the same as the linear address, allowing us to merge the LIP and EIP 152 * is the same as the linear address, allowing us to merge the LIP and EIP
@@ -270,27 +161,24 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
270 161
271 for (i = 0; i < x86_pmu.lbr_nr; i++) { 162 for (i = 0; i < x86_pmu.lbr_nr; i++) {
272 unsigned long lbr_idx = (tos - i) & mask; 163 unsigned long lbr_idx = (tos - i) & mask;
273 u64 from, to, mis = 0, pred = 0; 164 u64 from, to, flags = 0;
274 165
275 rdmsrl(x86_pmu.lbr_from + lbr_idx, from); 166 rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
276 rdmsrl(x86_pmu.lbr_to + lbr_idx, to); 167 rdmsrl(x86_pmu.lbr_to + lbr_idx, to);
277 168
278 if (lbr_format == LBR_FORMAT_EIP_FLAGS) { 169 if (lbr_format == LBR_FORMAT_EIP_FLAGS) {
279 mis = !!(from & LBR_FROM_FLAG_MISPRED); 170 flags = !!(from & LBR_FROM_FLAG_MISPRED);
280 pred = !mis;
281 from = (u64)((((s64)from) << 1) >> 1); 171 from = (u64)((((s64)from) << 1) >> 1);
282 } 172 }
283 173
284 cpuc->lbr_entries[i].from = from; 174 cpuc->lbr_entries[i].from = from;
285 cpuc->lbr_entries[i].to = to; 175 cpuc->lbr_entries[i].to = to;
286 cpuc->lbr_entries[i].mispred = mis; 176 cpuc->lbr_entries[i].flags = flags;
287 cpuc->lbr_entries[i].predicted = pred;
288 cpuc->lbr_entries[i].reserved = 0;
289 } 177 }
290 cpuc->lbr_stack.nr = i; 178 cpuc->lbr_stack.nr = i;
291} 179}
292 180
293void intel_pmu_lbr_read(void) 181static void intel_pmu_lbr_read(void)
294{ 182{
295 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 183 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
296 184
@@ -301,405 +189,30 @@ void intel_pmu_lbr_read(void)
301 intel_pmu_lbr_read_32(cpuc); 189 intel_pmu_lbr_read_32(cpuc);
302 else 190 else
303 intel_pmu_lbr_read_64(cpuc); 191 intel_pmu_lbr_read_64(cpuc);
304
305 intel_pmu_lbr_filter(cpuc);
306}
307
308/*
309 * SW filter is used:
310 * - in case there is no HW filter
311 * - in case the HW filter has errata or limitations
312 */
313static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
314{
315 u64 br_type = event->attr.branch_sample_type;
316 int mask = 0;
317
318 if (br_type & PERF_SAMPLE_BRANCH_USER)
319 mask |= X86_BR_USER;
320
321 if (br_type & PERF_SAMPLE_BRANCH_KERNEL)
322 mask |= X86_BR_KERNEL;
323
324 /* we ignore BRANCH_HV here */
325
326 if (br_type & PERF_SAMPLE_BRANCH_ANY)
327 mask |= X86_BR_ANY;
328
329 if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL)
330 mask |= X86_BR_ANY_CALL;
331
332 if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN)
333 mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET;
334
335 if (br_type & PERF_SAMPLE_BRANCH_IND_CALL)
336 mask |= X86_BR_IND_CALL;
337 /*
338 * stash actual user request into reg, it may
339 * be used by fixup code for some CPU
340 */
341 event->hw.branch_reg.reg = mask;
342}
343
344/*
345 * setup the HW LBR filter
346 * Used only when available, may not be enough to disambiguate
347 * all branches, may need the help of the SW filter
348 */
349static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
350{
351 struct hw_perf_event_extra *reg;
352 u64 br_type = event->attr.branch_sample_type;
353 u64 mask = 0, m;
354 u64 v;
355
356 for_each_branch_sample_type(m) {
357 if (!(br_type & m))
358 continue;
359
360 v = x86_pmu.lbr_sel_map[m];
361 if (v == LBR_NOT_SUPP)
362 return -EOPNOTSUPP;
363
364 if (v != LBR_IGN)
365 mask |= v;
366 }
367 reg = &event->hw.branch_reg;
368 reg->idx = EXTRA_REG_LBR;
369
370 /* LBR_SELECT operates in suppress mode so invert mask */
371 reg->config = ~mask & x86_pmu.lbr_sel_mask;
372
373 return 0;
374} 192}
375 193
376int intel_pmu_setup_lbr_filter(struct perf_event *event) 194static void intel_pmu_lbr_init_core(void)
377{
378 int ret = 0;
379
380 /*
381 * no LBR on this PMU
382 */
383 if (!x86_pmu.lbr_nr)
384 return -EOPNOTSUPP;
385
386 /*
387 * setup SW LBR filter
388 */
389 intel_pmu_setup_sw_lbr_filter(event);
390
391 /*
392 * setup HW LBR filter, if any
393 */
394 if (x86_pmu.lbr_sel_map)
395 ret = intel_pmu_setup_hw_lbr_filter(event);
396
397 return ret;
398}
399
400/*
401 * return the type of control flow change at address "from"
402 * instruction is not necessarily a branch (in case of interrupt).
403 *
404 * The branch type returned also includes the priv level of the
405 * target of the control flow change (X86_BR_USER, X86_BR_KERNEL).
406 *
407 * If a branch type is unknown OR the instruction cannot be
408 * decoded (e.g., text page not present), then X86_BR_NONE is
409 * returned.
410 */
411static int branch_type(unsigned long from, unsigned long to)
412{
413 struct insn insn;
414 void *addr;
415 int bytes, size = MAX_INSN_SIZE;
416 int ret = X86_BR_NONE;
417 int ext, to_plm, from_plm;
418 u8 buf[MAX_INSN_SIZE];
419 int is64 = 0;
420
421 to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER;
422 from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER;
423
424 /*
425 * may be zero if lbr did not fill up after a reset by the time
426 * we get a PMU interrupt
427 */
428 if (from == 0 || to == 0)
429 return X86_BR_NONE;
430
431 if (from_plm == X86_BR_USER) {
432 /*
433 * can happen if measuring at the user level only
434 * and we interrupt in a kernel thread, e.g., idle.
435 */
436 if (!current->mm)
437 return X86_BR_NONE;
438
439 /* may fail if text not present */
440 bytes = copy_from_user_nmi(buf, (void __user *)from, size);
441 if (bytes != size)
442 return X86_BR_NONE;
443
444 addr = buf;
445 } else
446 addr = (void *)from;
447
448 /*
449 * decoder needs to know the ABI especially
450 * on 64-bit systems running 32-bit apps
451 */
452#ifdef CONFIG_X86_64
453 is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32);
454#endif
455 insn_init(&insn, addr, is64);
456 insn_get_opcode(&insn);
457
458 switch (insn.opcode.bytes[0]) {
459 case 0xf:
460 switch (insn.opcode.bytes[1]) {
461 case 0x05: /* syscall */
462 case 0x34: /* sysenter */
463 ret = X86_BR_SYSCALL;
464 break;
465 case 0x07: /* sysret */
466 case 0x35: /* sysexit */
467 ret = X86_BR_SYSRET;
468 break;
469 case 0x80 ... 0x8f: /* conditional */
470 ret = X86_BR_JCC;
471 break;
472 default:
473 ret = X86_BR_NONE;
474 }
475 break;
476 case 0x70 ... 0x7f: /* conditional */
477 ret = X86_BR_JCC;
478 break;
479 case 0xc2: /* near ret */
480 case 0xc3: /* near ret */
481 case 0xca: /* far ret */
482 case 0xcb: /* far ret */
483 ret = X86_BR_RET;
484 break;
485 case 0xcf: /* iret */
486 ret = X86_BR_IRET;
487 break;
488 case 0xcc ... 0xce: /* int */
489 ret = X86_BR_INT;
490 break;
491 case 0xe8: /* call near rel */
492 case 0x9a: /* call far absolute */
493 ret = X86_BR_CALL;
494 break;
495 case 0xe0 ... 0xe3: /* loop jmp */
496 ret = X86_BR_JCC;
497 break;
498 case 0xe9 ... 0xeb: /* jmp */
499 ret = X86_BR_JMP;
500 break;
501 case 0xff: /* call near absolute, call far absolute ind */
502 insn_get_modrm(&insn);
503 ext = (insn.modrm.bytes[0] >> 3) & 0x7;
504 switch (ext) {
505 case 2: /* near ind call */
506 case 3: /* far ind call */
507 ret = X86_BR_IND_CALL;
508 break;
509 case 4:
510 case 5:
511 ret = X86_BR_JMP;
512 break;
513 }
514 break;
515 default:
516 ret = X86_BR_NONE;
517 }
518 /*
519 * interrupts, traps, faults (and thus ring transition) may
520 * occur on any instruction. Thus, to classify them correctly,
521 * we need to first look at the from and to priv levels. If they
522 * are different and to is in the kernel, then it indicates
523 * a ring transition. If the from instruction is not a ring
524 * transition instr (syscall, sysenter, int), then it means
525 * it was an irq, trap or fault.
526 *
527 * we have no way of detecting kernel to kernel faults.
528 */
529 if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL
530 && ret != X86_BR_SYSCALL && ret != X86_BR_INT)
531 ret = X86_BR_IRQ;
532
533 /*
534 * branch priv level determined by target as
535 * is done by HW when LBR_SELECT is implemented
536 */
537 if (ret != X86_BR_NONE)
538 ret |= to_plm;
539
540 return ret;
541}
542
543/*
544 * implement actual branch filter based on user demand.
545 * Hardware may not exactly satisfy that request, thus
546 * we need to inspect opcodes. Mismatched branches are
547 * discarded. Therefore, the number of branches returned
548 * in PERF_SAMPLE_BRANCH_STACK sample may vary.
549 */
550static void
551intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
552{
553 u64 from, to;
554 int br_sel = cpuc->br_sel;
555 int i, j, type;
556 bool compress = false;
557
558 /* if sampling all branches, then nothing to filter */
559 if ((br_sel & X86_BR_ALL) == X86_BR_ALL)
560 return;
561
562 for (i = 0; i < cpuc->lbr_stack.nr; i++) {
563
564 from = cpuc->lbr_entries[i].from;
565 to = cpuc->lbr_entries[i].to;
566
567 type = branch_type(from, to);
568
569 /* if type does not correspond, then discard */
570 if (type == X86_BR_NONE || (br_sel & type) != type) {
571 cpuc->lbr_entries[i].from = 0;
572 compress = true;
573 }
574 }
575
576 if (!compress)
577 return;
578
579 /* remove all entries with from=0 */
580 for (i = 0; i < cpuc->lbr_stack.nr; ) {
581 if (!cpuc->lbr_entries[i].from) {
582 j = i;
583 while (++j < cpuc->lbr_stack.nr)
584 cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j];
585 cpuc->lbr_stack.nr--;
586 if (!cpuc->lbr_entries[i].from)
587 continue;
588 }
589 i++;
590 }
591}
592
593/*
594 * Map interface branch filters onto LBR filters
595 */
596static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
597 [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY,
598 [PERF_SAMPLE_BRANCH_USER] = LBR_USER,
599 [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL,
600 [PERF_SAMPLE_BRANCH_HV] = LBR_IGN,
601 [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_REL_JMP
602 | LBR_IND_JMP | LBR_FAR,
603 /*
604 * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches
605 */
606 [PERF_SAMPLE_BRANCH_ANY_CALL] =
607 LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR,
608 /*
609 * NHM/WSM erratum: must include IND_JMP to capture IND_CALL
610 */
611 [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL | LBR_IND_JMP,
612};
613
614static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
615 [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY,
616 [PERF_SAMPLE_BRANCH_USER] = LBR_USER,
617 [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL,
618 [PERF_SAMPLE_BRANCH_HV] = LBR_IGN,
619 [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_FAR,
620 [PERF_SAMPLE_BRANCH_ANY_CALL] = LBR_REL_CALL | LBR_IND_CALL
621 | LBR_FAR,
622 [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL,
623};
624
625/* core */
626void intel_pmu_lbr_init_core(void)
627{ 195{
628 x86_pmu.lbr_nr = 4; 196 x86_pmu.lbr_nr = 4;
629 x86_pmu.lbr_tos = MSR_LBR_TOS; 197 x86_pmu.lbr_tos = 0x01c9;
630 x86_pmu.lbr_from = MSR_LBR_CORE_FROM; 198 x86_pmu.lbr_from = 0x40;
631 x86_pmu.lbr_to = MSR_LBR_CORE_TO; 199 x86_pmu.lbr_to = 0x60;
632
633 /*
634 * SW branch filter usage:
635 * - compensate for lack of HW filter
636 */
637 pr_cont("4-deep LBR, ");
638} 200}
639 201
640/* nehalem/westmere */ 202static void intel_pmu_lbr_init_nhm(void)
641void intel_pmu_lbr_init_nhm(void)
642{ 203{
643 x86_pmu.lbr_nr = 16; 204 x86_pmu.lbr_nr = 16;
644 x86_pmu.lbr_tos = MSR_LBR_TOS; 205 x86_pmu.lbr_tos = 0x01c9;
645 x86_pmu.lbr_from = MSR_LBR_NHM_FROM; 206 x86_pmu.lbr_from = 0x680;
646 x86_pmu.lbr_to = MSR_LBR_NHM_TO; 207 x86_pmu.lbr_to = 0x6c0;
647
648 x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
649 x86_pmu.lbr_sel_map = nhm_lbr_sel_map;
650
651 /*
652 * SW branch filter usage:
653 * - workaround LBR_SEL errata (see above)
654 * - support syscall, sysret capture.
655 * That requires LBR_FAR but that means far
656 * jmps need to be filtered out
657 */
658 pr_cont("16-deep LBR, ");
659} 208}
660 209
661/* sandy bridge */ 210static void intel_pmu_lbr_init_atom(void)
662void intel_pmu_lbr_init_snb(void)
663{ 211{
664 x86_pmu.lbr_nr = 16;
665 x86_pmu.lbr_tos = MSR_LBR_TOS;
666 x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
667 x86_pmu.lbr_to = MSR_LBR_NHM_TO;
668
669 x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
670 x86_pmu.lbr_sel_map = snb_lbr_sel_map;
671
672 /*
673 * SW branch filter usage:
674 * - support syscall, sysret capture.
675 * That requires LBR_FAR but that means far
676 * jmps need to be filtered out
677 */
678 pr_cont("16-deep LBR, ");
679}
680
681/* atom */
682void intel_pmu_lbr_init_atom(void)
683{
684 /*
685 * only models starting at stepping 10 seem
686 * to have an operational LBR which can freeze
687 * on PMU interrupt
688 */
689 if (boot_cpu_data.x86_model == 28
690 && boot_cpu_data.x86_mask < 10) {
691 pr_cont("LBR disabled due to erratum");
692 return;
693 }
694
695 x86_pmu.lbr_nr = 8; 212 x86_pmu.lbr_nr = 8;
696 x86_pmu.lbr_tos = MSR_LBR_TOS; 213 x86_pmu.lbr_tos = 0x01c9;
697 x86_pmu.lbr_from = MSR_LBR_CORE_FROM; 214 x86_pmu.lbr_from = 0x40;
698 x86_pmu.lbr_to = MSR_LBR_CORE_TO; 215 x86_pmu.lbr_to = 0x60;
699
700 /*
701 * SW branch filter usage:
702 * - compensate for lack of HW filter
703 */
704 pr_cont("8-deep LBR, ");
705} 216}
217
218#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
deleted file mode 100644
index b43200dbfe7..00000000000
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ /dev/null
@@ -1,2957 +0,0 @@
1#include "perf_event_intel_uncore.h"
2
3static struct intel_uncore_type *empty_uncore[] = { NULL, };
4static struct intel_uncore_type **msr_uncores = empty_uncore;
5static struct intel_uncore_type **pci_uncores = empty_uncore;
6/* pci bus to socket mapping */
7static int pcibus_to_physid[256] = { [0 ... 255] = -1, };
8
9static DEFINE_RAW_SPINLOCK(uncore_box_lock);
10
11/* mask of cpus that collect uncore events */
12static cpumask_t uncore_cpu_mask;
13
14/* constraint for the fixed counter */
15static struct event_constraint constraint_fixed =
16 EVENT_CONSTRAINT(~0ULL, 1 << UNCORE_PMC_IDX_FIXED, ~0ULL);
17static struct event_constraint constraint_empty =
18 EVENT_CONSTRAINT(0, 0, 0);
19
20DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
21DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21");
22DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
23DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
24DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19");
25DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
26DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28");
27DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31");
28DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31");
29DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28");
30DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15");
31DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30");
32DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51");
33DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4");
34DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17");
35DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22");
36DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31");
37DEFINE_UNCORE_FORMAT_ATTR(filter_band0, filter_band0, "config1:0-7");
38DEFINE_UNCORE_FORMAT_ATTR(filter_band1, filter_band1, "config1:8-15");
39DEFINE_UNCORE_FORMAT_ATTR(filter_band2, filter_band2, "config1:16-23");
40DEFINE_UNCORE_FORMAT_ATTR(filter_band3, filter_band3, "config1:24-31");
41
42static u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event)
43{
44 u64 count;
45
46 rdmsrl(event->hw.event_base, count);
47
48 return count;
49}
50
51/*
52 * generic get constraint function for shared match/mask registers.
53 */
54static struct event_constraint *
55uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
56{
57 struct intel_uncore_extra_reg *er;
58 struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
59 struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
60 unsigned long flags;
61 bool ok = false;
62
63 /*
64 * reg->alloc can be set due to existing state, so for fake box we
65 * need to ignore this, otherwise we might fail to allocate proper
66 * fake state for this extra reg constraint.
67 */
68 if (reg1->idx == EXTRA_REG_NONE ||
69 (!uncore_box_is_fake(box) && reg1->alloc))
70 return NULL;
71
72 er = &box->shared_regs[reg1->idx];
73 raw_spin_lock_irqsave(&er->lock, flags);
74 if (!atomic_read(&er->ref) ||
75 (er->config1 == reg1->config && er->config2 == reg2->config)) {
76 atomic_inc(&er->ref);
77 er->config1 = reg1->config;
78 er->config2 = reg2->config;
79 ok = true;
80 }
81 raw_spin_unlock_irqrestore(&er->lock, flags);
82
83 if (ok) {
84 if (!uncore_box_is_fake(box))
85 reg1->alloc = 1;
86 return NULL;
87 }
88
89 return &constraint_empty;
90}
91
92static void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
93{
94 struct intel_uncore_extra_reg *er;
95 struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
96
97 /*
98 * Only put constraint if extra reg was actually allocated. Also
99 * takes care of events which do not use an extra shared reg.
100 *
101 * Also, if this is a fake box we shouldn't touch any event state
102 * (reg->alloc) and we don't care about leaving inconsistent box
103 * state either since it will be thrown out.
104 */
105 if (uncore_box_is_fake(box) || !reg1->alloc)
106 return;
107
108 er = &box->shared_regs[reg1->idx];
109 atomic_dec(&er->ref);
110 reg1->alloc = 0;
111}
112
113/* Sandy Bridge-EP uncore support */
114static struct intel_uncore_type snbep_uncore_cbox;
115static struct intel_uncore_type snbep_uncore_pcu;
116
117static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box)
118{
119 struct pci_dev *pdev = box->pci_dev;
120 int box_ctl = uncore_pci_box_ctl(box);
121 u32 config = 0;
122
123 if (!pci_read_config_dword(pdev, box_ctl, &config)) {
124 config |= SNBEP_PMON_BOX_CTL_FRZ;
125 pci_write_config_dword(pdev, box_ctl, config);
126 }
127}
128
129static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box)
130{
131 struct pci_dev *pdev = box->pci_dev;
132 int box_ctl = uncore_pci_box_ctl(box);
133 u32 config = 0;
134
135 if (!pci_read_config_dword(pdev, box_ctl, &config)) {
136 config &= ~SNBEP_PMON_BOX_CTL_FRZ;
137 pci_write_config_dword(pdev, box_ctl, config);
138 }
139}
140
141static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, struct perf_event *event)
142{
143 struct pci_dev *pdev = box->pci_dev;
144 struct hw_perf_event *hwc = &event->hw;
145
146 pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
147}
148
149static void snbep_uncore_pci_disable_event(struct intel_uncore_box *box, struct perf_event *event)
150{
151 struct pci_dev *pdev = box->pci_dev;
152 struct hw_perf_event *hwc = &event->hw;
153
154 pci_write_config_dword(pdev, hwc->config_base, hwc->config);
155}
156
157static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct perf_event *event)
158{
159 struct pci_dev *pdev = box->pci_dev;
160 struct hw_perf_event *hwc = &event->hw;
161 u64 count = 0;
162
163 pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count);
164 pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1);
165
166 return count;
167}
168
169static void snbep_uncore_pci_init_box(struct intel_uncore_box *box)
170{
171 struct pci_dev *pdev = box->pci_dev;
172
173 pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, SNBEP_PMON_BOX_CTL_INT);
174}
175
176static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box)
177{
178 u64 config;
179 unsigned msr;
180
181 msr = uncore_msr_box_ctl(box);
182 if (msr) {
183 rdmsrl(msr, config);
184 config |= SNBEP_PMON_BOX_CTL_FRZ;
185 wrmsrl(msr, config);
186 }
187}
188
189static void snbep_uncore_msr_enable_box(struct intel_uncore_box *box)
190{
191 u64 config;
192 unsigned msr;
193
194 msr = uncore_msr_box_ctl(box);
195 if (msr) {
196 rdmsrl(msr, config);
197 config &= ~SNBEP_PMON_BOX_CTL_FRZ;
198 wrmsrl(msr, config);
199 }
200}
201
202static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
203{
204 struct hw_perf_event *hwc = &event->hw;
205 struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
206
207 if (reg1->idx != EXTRA_REG_NONE)
208 wrmsrl(reg1->reg, reg1->config);
209
210 wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN);
211}
212
213static void snbep_uncore_msr_disable_event(struct intel_uncore_box *box,
214 struct perf_event *event)
215{
216 struct hw_perf_event *hwc = &event->hw;
217
218 wrmsrl(hwc->config_base, hwc->config);
219}
220
221static void snbep_uncore_msr_init_box(struct intel_uncore_box *box)
222{
223 unsigned msr = uncore_msr_box_ctl(box);
224
225 if (msr)
226 wrmsrl(msr, SNBEP_PMON_BOX_CTL_INT);
227}
228
229static int snbep_uncore_hw_config(struct intel_uncore_box *box, struct perf_event *event)
230{
231 struct hw_perf_event *hwc = &event->hw;
232 struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
233
234 if (box->pmu->type == &snbep_uncore_cbox) {
235 reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER +
236 SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx;
237 reg1->config = event->attr.config1 &
238 SNBEP_CB0_MSR_PMON_BOX_FILTER_MASK;
239 } else {
240 if (box->pmu->type == &snbep_uncore_pcu) {
241 reg1->reg = SNBEP_PCU_MSR_PMON_BOX_FILTER;
242 reg1->config = event->attr.config1 & SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK;
243 } else {
244 return 0;
245 }
246 }
247 reg1->idx = 0;
248
249 return 0;
250}
251
252static struct attribute *snbep_uncore_formats_attr[] = {
253 &format_attr_event.attr,
254 &format_attr_umask.attr,
255 &format_attr_edge.attr,
256 &format_attr_inv.attr,
257 &format_attr_thresh8.attr,
258 NULL,
259};
260
261static struct attribute *snbep_uncore_ubox_formats_attr[] = {
262 &format_attr_event.attr,
263 &format_attr_umask.attr,
264 &format_attr_edge.attr,
265 &format_attr_inv.attr,
266 &format_attr_thresh5.attr,
267 NULL,
268};
269
270static struct attribute *snbep_uncore_cbox_formats_attr[] = {
271 &format_attr_event.attr,
272 &format_attr_umask.attr,
273 &format_attr_edge.attr,
274 &format_attr_tid_en.attr,
275 &format_attr_inv.attr,
276 &format_attr_thresh8.attr,
277 &format_attr_filter_tid.attr,
278 &format_attr_filter_nid.attr,
279 &format_attr_filter_state.attr,
280 &format_attr_filter_opc.attr,
281 NULL,
282};
283
284static struct attribute *snbep_uncore_pcu_formats_attr[] = {
285 &format_attr_event.attr,
286 &format_attr_occ_sel.attr,
287 &format_attr_edge.attr,
288 &format_attr_inv.attr,
289 &format_attr_thresh5.attr,
290 &format_attr_occ_invert.attr,
291 &format_attr_occ_edge.attr,
292 &format_attr_filter_band0.attr,
293 &format_attr_filter_band1.attr,
294 &format_attr_filter_band2.attr,
295 &format_attr_filter_band3.attr,
296 NULL,
297};
298
299static struct attribute *snbep_uncore_qpi_formats_attr[] = {
300 &format_attr_event_ext.attr,
301 &format_attr_umask.attr,
302 &format_attr_edge.attr,
303 &format_attr_inv.attr,
304 &format_attr_thresh8.attr,
305 NULL,
306};
307
308static struct uncore_event_desc snbep_uncore_imc_events[] = {
309 INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
310 INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"),
311 INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"),
312 { /* end: all zeroes */ },
313};
314
315static struct uncore_event_desc snbep_uncore_qpi_events[] = {
316 INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x14"),
317 INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"),
318 INTEL_UNCORE_EVENT_DESC(drs_data, "event=0x02,umask=0x08"),
319 INTEL_UNCORE_EVENT_DESC(ncb_data, "event=0x03,umask=0x04"),
320 { /* end: all zeroes */ },
321};
322
323static struct attribute_group snbep_uncore_format_group = {
324 .name = "format",
325 .attrs = snbep_uncore_formats_attr,
326};
327
328static struct attribute_group snbep_uncore_ubox_format_group = {
329 .name = "format",
330 .attrs = snbep_uncore_ubox_formats_attr,
331};
332
333static struct attribute_group snbep_uncore_cbox_format_group = {
334 .name = "format",
335 .attrs = snbep_uncore_cbox_formats_attr,
336};
337
338static struct attribute_group snbep_uncore_pcu_format_group = {
339 .name = "format",
340 .attrs = snbep_uncore_pcu_formats_attr,
341};
342
343static struct attribute_group snbep_uncore_qpi_format_group = {
344 .name = "format",
345 .attrs = snbep_uncore_qpi_formats_attr,
346};
347
348static struct intel_uncore_ops snbep_uncore_msr_ops = {
349 .init_box = snbep_uncore_msr_init_box,
350 .disable_box = snbep_uncore_msr_disable_box,
351 .enable_box = snbep_uncore_msr_enable_box,
352 .disable_event = snbep_uncore_msr_disable_event,
353 .enable_event = snbep_uncore_msr_enable_event,
354 .read_counter = uncore_msr_read_counter,
355 .get_constraint = uncore_get_constraint,
356 .put_constraint = uncore_put_constraint,
357 .hw_config = snbep_uncore_hw_config,
358};
359
360static struct intel_uncore_ops snbep_uncore_pci_ops = {
361 .init_box = snbep_uncore_pci_init_box,
362 .disable_box = snbep_uncore_pci_disable_box,
363 .enable_box = snbep_uncore_pci_enable_box,
364 .disable_event = snbep_uncore_pci_disable_event,
365 .enable_event = snbep_uncore_pci_enable_event,
366 .read_counter = snbep_uncore_pci_read_counter,
367};
368
369static struct event_constraint snbep_uncore_cbox_constraints[] = {
370 UNCORE_EVENT_CONSTRAINT(0x01, 0x1),
371 UNCORE_EVENT_CONSTRAINT(0x02, 0x3),
372 UNCORE_EVENT_CONSTRAINT(0x04, 0x3),
373 UNCORE_EVENT_CONSTRAINT(0x05, 0x3),
374 UNCORE_EVENT_CONSTRAINT(0x07, 0x3),
375 UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
376 UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
377 UNCORE_EVENT_CONSTRAINT(0x13, 0x3),
378 UNCORE_EVENT_CONSTRAINT(0x1b, 0xc),
379 UNCORE_EVENT_CONSTRAINT(0x1c, 0xc),
380 UNCORE_EVENT_CONSTRAINT(0x1d, 0xc),
381 UNCORE_EVENT_CONSTRAINT(0x1e, 0xc),
382 EVENT_CONSTRAINT_OVERLAP(0x1f, 0xe, 0xff),
383 UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
384 UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
385 UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
386 UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
387 UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
388 UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
389 UNCORE_EVENT_CONSTRAINT(0x35, 0x3),
390 UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
391 UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
392 UNCORE_EVENT_CONSTRAINT(0x38, 0x3),
393 UNCORE_EVENT_CONSTRAINT(0x39, 0x3),
394 UNCORE_EVENT_CONSTRAINT(0x3b, 0x1),
395 EVENT_CONSTRAINT_END
396};
397
398static struct event_constraint snbep_uncore_r2pcie_constraints[] = {
399 UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
400 UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
401 UNCORE_EVENT_CONSTRAINT(0x12, 0x1),
402 UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
403 UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
404 UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
405 UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
406 UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
407 UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
408 UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
409 EVENT_CONSTRAINT_END
410};
411
412static struct event_constraint snbep_uncore_r3qpi_constraints[] = {
413 UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
414 UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
415 UNCORE_EVENT_CONSTRAINT(0x12, 0x3),
416 UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
417 UNCORE_EVENT_CONSTRAINT(0x20, 0x3),
418 UNCORE_EVENT_CONSTRAINT(0x21, 0x3),
419 UNCORE_EVENT_CONSTRAINT(0x22, 0x3),
420 UNCORE_EVENT_CONSTRAINT(0x23, 0x3),
421 UNCORE_EVENT_CONSTRAINT(0x24, 0x3),
422 UNCORE_EVENT_CONSTRAINT(0x25, 0x3),
423 UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
424 UNCORE_EVENT_CONSTRAINT(0x30, 0x3),
425 UNCORE_EVENT_CONSTRAINT(0x31, 0x3),
426 UNCORE_EVENT_CONSTRAINT(0x32, 0x3),
427 UNCORE_EVENT_CONSTRAINT(0x33, 0x3),
428 UNCORE_EVENT_CONSTRAINT(0x34, 0x3),
429 UNCORE_EVENT_CONSTRAINT(0x36, 0x3),
430 UNCORE_EVENT_CONSTRAINT(0x37, 0x3),
431 EVENT_CONSTRAINT_END
432};
433
434static struct intel_uncore_type snbep_uncore_ubox = {
435 .name = "ubox",
436 .num_counters = 2,
437 .num_boxes = 1,
438 .perf_ctr_bits = 44,
439 .fixed_ctr_bits = 48,
440 .perf_ctr = SNBEP_U_MSR_PMON_CTR0,
441 .event_ctl = SNBEP_U_MSR_PMON_CTL0,
442 .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
443 .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR,
444 .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL,
445 .ops = &snbep_uncore_msr_ops,
446 .format_group = &snbep_uncore_ubox_format_group,
447};
448
449static struct intel_uncore_type snbep_uncore_cbox = {
450 .name = "cbox",
451 .num_counters = 4,
452 .num_boxes = 8,
453 .perf_ctr_bits = 44,
454 .event_ctl = SNBEP_C0_MSR_PMON_CTL0,
455 .perf_ctr = SNBEP_C0_MSR_PMON_CTR0,
456 .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
457 .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL,
458 .msr_offset = SNBEP_CBO_MSR_OFFSET,
459 .num_shared_regs = 1,
460 .constraints = snbep_uncore_cbox_constraints,
461 .ops = &snbep_uncore_msr_ops,
462 .format_group = &snbep_uncore_cbox_format_group,
463};
464
465static struct intel_uncore_type snbep_uncore_pcu = {
466 .name = "pcu",
467 .num_counters = 4,
468 .num_boxes = 1,
469 .perf_ctr_bits = 48,
470 .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0,
471 .event_ctl = SNBEP_PCU_MSR_PMON_CTL0,
472 .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK,
473 .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL,
474 .num_shared_regs = 1,
475 .ops = &snbep_uncore_msr_ops,
476 .format_group = &snbep_uncore_pcu_format_group,
477};
478
479static struct intel_uncore_type *snbep_msr_uncores[] = {
480 &snbep_uncore_ubox,
481 &snbep_uncore_cbox,
482 &snbep_uncore_pcu,
483 NULL,
484};
485
486#define SNBEP_UNCORE_PCI_COMMON_INIT() \
487 .perf_ctr = SNBEP_PCI_PMON_CTR0, \
488 .event_ctl = SNBEP_PCI_PMON_CTL0, \
489 .event_mask = SNBEP_PMON_RAW_EVENT_MASK, \
490 .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \
491 .ops = &snbep_uncore_pci_ops, \
492 .format_group = &snbep_uncore_format_group
493
494static struct intel_uncore_type snbep_uncore_ha = {
495 .name = "ha",
496 .num_counters = 4,
497 .num_boxes = 1,
498 .perf_ctr_bits = 48,
499 SNBEP_UNCORE_PCI_COMMON_INIT(),
500};
501
502static struct intel_uncore_type snbep_uncore_imc = {
503 .name = "imc",
504 .num_counters = 4,
505 .num_boxes = 4,
506 .perf_ctr_bits = 48,
507 .fixed_ctr_bits = 48,
508 .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
509 .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
510 .event_descs = snbep_uncore_imc_events,
511 SNBEP_UNCORE_PCI_COMMON_INIT(),
512};
513
514static struct intel_uncore_type snbep_uncore_qpi = {
515 .name = "qpi",
516 .num_counters = 4,
517 .num_boxes = 2,
518 .perf_ctr_bits = 48,
519 .perf_ctr = SNBEP_PCI_PMON_CTR0,
520 .event_ctl = SNBEP_PCI_PMON_CTL0,
521 .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK,
522 .box_ctl = SNBEP_PCI_PMON_BOX_CTL,
523 .ops = &snbep_uncore_pci_ops,
524 .event_descs = snbep_uncore_qpi_events,
525 .format_group = &snbep_uncore_qpi_format_group,
526};
527
528
529static struct intel_uncore_type snbep_uncore_r2pcie = {
530 .name = "r2pcie",
531 .num_counters = 4,
532 .num_boxes = 1,
533 .perf_ctr_bits = 44,
534 .constraints = snbep_uncore_r2pcie_constraints,
535 SNBEP_UNCORE_PCI_COMMON_INIT(),
536};
537
538static struct intel_uncore_type snbep_uncore_r3qpi = {
539 .name = "r3qpi",
540 .num_counters = 3,
541 .num_boxes = 2,
542 .perf_ctr_bits = 44,
543 .constraints = snbep_uncore_r3qpi_constraints,
544 SNBEP_UNCORE_PCI_COMMON_INIT(),
545};
546
547static struct intel_uncore_type *snbep_pci_uncores[] = {
548 &snbep_uncore_ha,
549 &snbep_uncore_imc,
550 &snbep_uncore_qpi,
551 &snbep_uncore_r2pcie,
552 &snbep_uncore_r3qpi,
553 NULL,
554};
555
556static DEFINE_PCI_DEVICE_TABLE(snbep_uncore_pci_ids) = {
557 { /* Home Agent */
558 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA),
559 .driver_data = (unsigned long)&snbep_uncore_ha,
560 },
561 { /* MC Channel 0 */
562 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0),
563 .driver_data = (unsigned long)&snbep_uncore_imc,
564 },
565 { /* MC Channel 1 */
566 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1),
567 .driver_data = (unsigned long)&snbep_uncore_imc,
568 },
569 { /* MC Channel 2 */
570 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2),
571 .driver_data = (unsigned long)&snbep_uncore_imc,
572 },
573 { /* MC Channel 3 */
574 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3),
575 .driver_data = (unsigned long)&snbep_uncore_imc,
576 },
577 { /* QPI Port 0 */
578 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0),
579 .driver_data = (unsigned long)&snbep_uncore_qpi,
580 },
581 { /* QPI Port 1 */
582 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1),
583 .driver_data = (unsigned long)&snbep_uncore_qpi,
584 },
585 { /* P2PCIe */
586 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE),
587 .driver_data = (unsigned long)&snbep_uncore_r2pcie,
588 },
589 { /* R3QPI Link 0 */
590 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0),
591 .driver_data = (unsigned long)&snbep_uncore_r3qpi,
592 },
593 { /* R3QPI Link 1 */
594 PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1),
595 .driver_data = (unsigned long)&snbep_uncore_r3qpi,
596 },
597 { /* end: all zeroes */ }
598};
599
600static struct pci_driver snbep_uncore_pci_driver = {
601 .name = "snbep_uncore",
602 .id_table = snbep_uncore_pci_ids,
603};
604
605/*
606 * build pci bus to socket mapping
607 */
608static int snbep_pci2phy_map_init(void)
609{
610 struct pci_dev *ubox_dev = NULL;
611 int i, bus, nodeid;
612 int err = 0;
613 u32 config = 0;
614
615 while (1) {
616 /* find the UBOX device */
617 ubox_dev = pci_get_device(PCI_VENDOR_ID_INTEL,
618 PCI_DEVICE_ID_INTEL_JAKETOWN_UBOX,
619 ubox_dev);
620 if (!ubox_dev)
621 break;
622 bus = ubox_dev->bus->number;
623 /* get the Node ID of the local register */
624 err = pci_read_config_dword(ubox_dev, 0x40, &config);
625 if (err)
626 break;
627 nodeid = config;
628 /* get the Node ID mapping */
629 err = pci_read_config_dword(ubox_dev, 0x54, &config);
630 if (err)
631 break;
632 /*
633 * every three bits in the Node ID mapping register maps
634 * to a particular node.
635 */
636 for (i = 0; i < 8; i++) {
637 if (nodeid == ((config >> (3 * i)) & 0x7)) {
638 pcibus_to_physid[bus] = i;
639 break;
640 }
641 }
642 };
643
644 if (ubox_dev)
645 pci_dev_put(ubox_dev);
646
647 return err ? pcibios_err_to_errno(err) : 0;
648}
649/* end of Sandy Bridge-EP uncore support */
650
651/* Sandy Bridge uncore support */
652static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
653{
654 struct hw_perf_event *hwc = &event->hw;
655
656 if (hwc->idx < UNCORE_PMC_IDX_FIXED)
657 wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
658 else
659 wrmsrl(hwc->config_base, SNB_UNC_CTL_EN);
660}
661
662static void snb_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event)
663{
664 wrmsrl(event->hw.config_base, 0);
665}
666
667static void snb_uncore_msr_init_box(struct intel_uncore_box *box)
668{
669 if (box->pmu->pmu_idx == 0) {
670 wrmsrl(SNB_UNC_PERF_GLOBAL_CTL,
671 SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL);
672 }
673}
674
675static struct uncore_event_desc snb_uncore_events[] = {
676 INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
677 { /* end: all zeroes */ },
678};
679
680static struct attribute *snb_uncore_formats_attr[] = {
681 &format_attr_event.attr,
682 &format_attr_umask.attr,
683 &format_attr_edge.attr,
684 &format_attr_inv.attr,
685 &format_attr_cmask5.attr,
686 NULL,
687};
688
689static struct attribute_group snb_uncore_format_group = {
690 .name = "format",
691 .attrs = snb_uncore_formats_attr,
692};
693
694static struct intel_uncore_ops snb_uncore_msr_ops = {
695 .init_box = snb_uncore_msr_init_box,
696 .disable_event = snb_uncore_msr_disable_event,
697 .enable_event = snb_uncore_msr_enable_event,
698 .read_counter = uncore_msr_read_counter,
699};
700
701static struct event_constraint snb_uncore_cbox_constraints[] = {
702 UNCORE_EVENT_CONSTRAINT(0x80, 0x1),
703 UNCORE_EVENT_CONSTRAINT(0x83, 0x1),
704 EVENT_CONSTRAINT_END
705};
706
707static struct intel_uncore_type snb_uncore_cbox = {
708 .name = "cbox",
709 .num_counters = 2,
710 .num_boxes = 4,
711 .perf_ctr_bits = 44,
712 .fixed_ctr_bits = 48,
713 .perf_ctr = SNB_UNC_CBO_0_PER_CTR0,
714 .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0,
715 .fixed_ctr = SNB_UNC_FIXED_CTR,
716 .fixed_ctl = SNB_UNC_FIXED_CTR_CTRL,
717 .single_fixed = 1,
718 .event_mask = SNB_UNC_RAW_EVENT_MASK,
719 .msr_offset = SNB_UNC_CBO_MSR_OFFSET,
720 .constraints = snb_uncore_cbox_constraints,
721 .ops = &snb_uncore_msr_ops,
722 .format_group = &snb_uncore_format_group,
723 .event_descs = snb_uncore_events,
724};
725
726static struct intel_uncore_type *snb_msr_uncores[] = {
727 &snb_uncore_cbox,
728 NULL,
729};
730/* end of Sandy Bridge uncore support */
731
732/* Nehalem uncore support */
733static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box)
734{
735 wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, 0);
736}
737
738static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box)
739{
740 wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC);
741}
742
743static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
744{
745 struct hw_perf_event *hwc = &event->hw;
746
747 if (hwc->idx < UNCORE_PMC_IDX_FIXED)
748 wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN);
749 else
750 wrmsrl(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN);
751}
752
753static struct attribute *nhm_uncore_formats_attr[] = {
754 &format_attr_event.attr,
755 &format_attr_umask.attr,
756 &format_attr_edge.attr,
757 &format_attr_inv.attr,
758 &format_attr_cmask8.attr,
759 NULL,
760};
761
762static struct attribute_group nhm_uncore_format_group = {
763 .name = "format",
764 .attrs = nhm_uncore_formats_attr,
765};
766
767static struct uncore_event_desc nhm_uncore_events[] = {
768 INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"),
769 INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any, "event=0x2f,umask=0x0f"),
770 INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any, "event=0x2c,umask=0x0f"),
771 INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads, "event=0x20,umask=0x01"),
772 INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_writes, "event=0x20,umask=0x02"),
773 INTEL_UNCORE_EVENT_DESC(qhl_request_remote_reads, "event=0x20,umask=0x04"),
774 INTEL_UNCORE_EVENT_DESC(qhl_request_remote_writes, "event=0x20,umask=0x08"),
775 INTEL_UNCORE_EVENT_DESC(qhl_request_local_reads, "event=0x20,umask=0x10"),
776 INTEL_UNCORE_EVENT_DESC(qhl_request_local_writes, "event=0x20,umask=0x20"),
777 { /* end: all zeroes */ },
778};
779
780static struct intel_uncore_ops nhm_uncore_msr_ops = {
781 .disable_box = nhm_uncore_msr_disable_box,
782 .enable_box = nhm_uncore_msr_enable_box,
783 .disable_event = snb_uncore_msr_disable_event,
784 .enable_event = nhm_uncore_msr_enable_event,
785 .read_counter = uncore_msr_read_counter,
786};
787
788static struct intel_uncore_type nhm_uncore = {
789 .name = "",
790 .num_counters = 8,
791 .num_boxes = 1,
792 .perf_ctr_bits = 48,
793 .fixed_ctr_bits = 48,
794 .event_ctl = NHM_UNC_PERFEVTSEL0,
795 .perf_ctr = NHM_UNC_UNCORE_PMC0,
796 .fixed_ctr = NHM_UNC_FIXED_CTR,
797 .fixed_ctl = NHM_UNC_FIXED_CTR_CTRL,
798 .event_mask = NHM_UNC_RAW_EVENT_MASK,
799 .event_descs = nhm_uncore_events,
800 .ops = &nhm_uncore_msr_ops,
801 .format_group = &nhm_uncore_format_group,
802};
803
804static struct intel_uncore_type *nhm_msr_uncores[] = {
805 &nhm_uncore,
806 NULL,
807};
808/* end of Nehalem uncore support */
809
810/* Nehalem-EX uncore support */
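/*
 * __BITS_VALUE(x, i, n) extracts the i-th n-bit wide field of x, e.g.
 * __BITS_VALUE(reg1->idx, 1, 8) yields bits 8-15 of reg1->idx.
 */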
811#define __BITS_VALUE(x, i, n) ((typeof(x))(((x) >> ((i) * (n))) & \
812 ((1ULL << (n)) - 1)))
813
814DEFINE_UNCORE_FORMAT_ATTR(event5, event, "config:1-5");
815DEFINE_UNCORE_FORMAT_ATTR(counter, counter, "config:6-7");
816DEFINE_UNCORE_FORMAT_ATTR(match, match, "config1:0-63");
817DEFINE_UNCORE_FORMAT_ATTR(mask, mask, "config2:0-63");
818
819static void nhmex_uncore_msr_init_box(struct intel_uncore_box *box)
820{
821 wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, NHMEX_U_PMON_GLOBAL_EN_ALL);
822}
823
824static void nhmex_uncore_msr_disable_box(struct intel_uncore_box *box)
825{
826 unsigned msr = uncore_msr_box_ctl(box);
827 u64 config;
828
829 if (msr) {
830 rdmsrl(msr, config);
831 config &= ~((1ULL << uncore_num_counters(box)) - 1);
832 /* WBox has a fixed counter */
833 if (uncore_msr_fixed_ctl(box))
834 config &= ~NHMEX_W_PMON_GLOBAL_FIXED_EN;
835 wrmsrl(msr, config);
836 }
837}
838
839static void nhmex_uncore_msr_enable_box(struct intel_uncore_box *box)
840{
841 unsigned msr = uncore_msr_box_ctl(box);
842 u64 config;
843
844 if (msr) {
845 rdmsrl(msr, config);
846 config |= (1ULL << uncore_num_counters(box)) - 1;
847 /* WBox has a fixed counter */
848 if (uncore_msr_fixed_ctl(box))
849 config |= NHMEX_W_PMON_GLOBAL_FIXED_EN;
850 wrmsrl(msr, config);
851 }
852}
853
854static void nhmex_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event)
855{
856 wrmsrl(event->hw.config_base, 0);
857}
858
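/*
 * Enable an NHM-EX event: the fixed counter only needs its enable bit,
 * boxes whose event mask already claims bit 0 for the event select use
 * bit 22 as the enable bit, and all remaining boxes use bit 0.
 */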
859static void nhmex_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
860{
861 struct hw_perf_event *hwc = &event->hw;
862
863 if (hwc->idx >= UNCORE_PMC_IDX_FIXED)
864 wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0);
865 else if (box->pmu->type->event_mask & NHMEX_PMON_CTL_EN_BIT0)
866 wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22);
867 else
868 wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0);
869}
870
871#define NHMEX_UNCORE_OPS_COMMON_INIT() \
872 .init_box = nhmex_uncore_msr_init_box, \
873 .disable_box = nhmex_uncore_msr_disable_box, \
874 .enable_box = nhmex_uncore_msr_enable_box, \
875 .disable_event = nhmex_uncore_msr_disable_event, \
876 .read_counter = uncore_msr_read_counter
877
878static struct intel_uncore_ops nhmex_uncore_ops = {
879 NHMEX_UNCORE_OPS_COMMON_INIT(),
880 .enable_event = nhmex_uncore_msr_enable_event,
881};
882
883static struct attribute *nhmex_uncore_ubox_formats_attr[] = {
884 &format_attr_event.attr,
885 &format_attr_edge.attr,
886 NULL,
887};
888
889static struct attribute_group nhmex_uncore_ubox_format_group = {
890 .name = "format",
891 .attrs = nhmex_uncore_ubox_formats_attr,
892};
893
894static struct intel_uncore_type nhmex_uncore_ubox = {
895 .name = "ubox",
896 .num_counters = 1,
897 .num_boxes = 1,
898 .perf_ctr_bits = 48,
899 .event_ctl = NHMEX_U_MSR_PMON_EV_SEL,
900 .perf_ctr = NHMEX_U_MSR_PMON_CTR,
901 .event_mask = NHMEX_U_PMON_RAW_EVENT_MASK,
902 .box_ctl = NHMEX_U_MSR_PMON_GLOBAL_CTL,
903 .ops = &nhmex_uncore_ops,
904 .format_group = &nhmex_uncore_ubox_format_group
905};
906
907static struct attribute *nhmex_uncore_cbox_formats_attr[] = {
908 &format_attr_event.attr,
909 &format_attr_umask.attr,
910 &format_attr_edge.attr,
911 &format_attr_inv.attr,
912 &format_attr_thresh8.attr,
913 NULL,
914};
915
916static struct attribute_group nhmex_uncore_cbox_format_group = {
917 .name = "format",
918 .attrs = nhmex_uncore_cbox_formats_attr,
919};
920
921/* msr offset for each instance of cbox */
922static unsigned nhmex_cbox_msr_offsets[] = {
923 0x0, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x240, 0x2c0,
924};
925
926static struct intel_uncore_type nhmex_uncore_cbox = {
927 .name = "cbox",
928 .num_counters = 6,
929 .num_boxes = 10,
930 .perf_ctr_bits = 48,
931 .event_ctl = NHMEX_C0_MSR_PMON_EV_SEL0,
932 .perf_ctr = NHMEX_C0_MSR_PMON_CTR0,
933 .event_mask = NHMEX_PMON_RAW_EVENT_MASK,
934 .box_ctl = NHMEX_C0_MSR_PMON_GLOBAL_CTL,
935 .msr_offsets = nhmex_cbox_msr_offsets,
936 .pair_ctr_ctl = 1,
937 .ops = &nhmex_uncore_ops,
938 .format_group = &nhmex_uncore_cbox_format_group
939};
940
941static struct uncore_event_desc nhmex_uncore_wbox_events[] = {
942 INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0"),
943 { /* end: all zeroes */ },
944};
945
946static struct intel_uncore_type nhmex_uncore_wbox = {
947 .name = "wbox",
948 .num_counters = 4,
949 .num_boxes = 1,
950 .perf_ctr_bits = 48,
951 .event_ctl = NHMEX_W_MSR_PMON_CNT0,
952 .perf_ctr = NHMEX_W_MSR_PMON_EVT_SEL0,
953 .fixed_ctr = NHMEX_W_MSR_PMON_FIXED_CTR,
954 .fixed_ctl = NHMEX_W_MSR_PMON_FIXED_CTL,
955 .event_mask = NHMEX_PMON_RAW_EVENT_MASK,
956 .box_ctl = NHMEX_W_MSR_GLOBAL_CTL,
957 .pair_ctr_ctl = 1,
958 .event_descs = nhmex_uncore_wbox_events,
959 .ops = &nhmex_uncore_ops,
960 .format_group = &nhmex_uncore_cbox_format_group
961};
962
963static int nhmex_bbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
964{
965 struct hw_perf_event *hwc = &event->hw;
966 struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
967 struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
968 int ctr, ev_sel;
969
970 ctr = (hwc->config & NHMEX_B_PMON_CTR_MASK) >>
971 NHMEX_B_PMON_CTR_SHIFT;
972 ev_sel = (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK) >>
973 NHMEX_B_PMON_CTL_EV_SEL_SHIFT;
974
975 /* events that do not use the match/mask registers */
976 if ((ctr == 0 && ev_sel > 0x3) || (ctr == 1 && ev_sel > 0x6) ||
977 (ctr == 2 && ev_sel != 0x4) || ctr == 3)
978 return 0;
979
980 if (box->pmu->pmu_idx == 0)
981 reg1->reg = NHMEX_B0_MSR_MATCH;
982 else
983 reg1->reg = NHMEX_B1_MSR_MATCH;
984 reg1->idx = 0;
985 reg1->config = event->attr.config1;
986 reg2->config = event->attr.config2;
987 return 0;
988}
989
990static void nhmex_bbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
991{
992 struct hw_perf_event *hwc = &event->hw;
993 struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
994 struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
995
996 if (reg1->idx != EXTRA_REG_NONE) {
997 wrmsrl(reg1->reg, reg1->config);
998 wrmsrl(reg1->reg + 1, reg2->config);
999 }
1000 wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 |
1001 (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK));
1002}
1003
1004/*
1005 * The Bbox has 4 counters, but each counter monitors different events.
1006 * Use bits 6-7 in the event config to select the counter.
1007 */
1008static struct event_constraint nhmex_uncore_bbox_constraints[] = {
1009 EVENT_CONSTRAINT(0 , 1, 0xc0),
1010 EVENT_CONSTRAINT(0x40, 2, 0xc0),
1011 EVENT_CONSTRAINT(0x80, 4, 0xc0),
1012 EVENT_CONSTRAINT(0xc0, 8, 0xc0),
1013 EVENT_CONSTRAINT_END,
1014};
1015
1016static struct attribute *nhmex_uncore_bbox_formats_attr[] = {
1017 &format_attr_event5.attr,
1018 &format_attr_counter.attr,
1019 &format_attr_match.attr,
1020 &format_attr_mask.attr,
1021 NULL,
1022};
1023
1024static struct attribute_group nhmex_uncore_bbox_format_group = {
1025 .name = "format",
1026 .attrs = nhmex_uncore_bbox_formats_attr,
1027};
1028
1029static struct intel_uncore_ops nhmex_uncore_bbox_ops = {
1030 NHMEX_UNCORE_OPS_COMMON_INIT(),
1031 .enable_event = nhmex_bbox_msr_enable_event,
1032 .hw_config = nhmex_bbox_hw_config,
1033 .get_constraint = uncore_get_constraint,
1034 .put_constraint = uncore_put_constraint,
1035};
1036
1037static struct intel_uncore_type nhmex_uncore_bbox = {
1038 .name = "bbox",
1039 .num_counters = 4,
1040 .num_boxes = 2,
1041 .perf_ctr_bits = 48,
1042 .event_ctl = NHMEX_B0_MSR_PMON_CTL0,
1043 .perf_ctr = NHMEX_B0_MSR_PMON_CTR0,
1044 .event_mask = NHMEX_B_PMON_RAW_EVENT_MASK,
1045 .box_ctl = NHMEX_B0_MSR_PMON_GLOBAL_CTL,
1046 .msr_offset = NHMEX_B_MSR_OFFSET,
1047 .pair_ctr_ctl = 1,
1048 .num_shared_regs = 1,
1049 .constraints = nhmex_uncore_bbox_constraints,
1050 .ops = &nhmex_uncore_bbox_ops,
1051 .format_group = &nhmex_uncore_bbox_format_group
1052};
1053
1054static int nhmex_sbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
1055{
1056 struct hw_perf_event *hwc = &event->hw;
1057 struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
1058 struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
1059
1060	/* only the TO_R_PROG_EV event uses the match/mask register */
1061 if ((hwc->config & NHMEX_PMON_CTL_EV_SEL_MASK) !=
1062 NHMEX_S_EVENT_TO_R_PROG_EV)
1063 return 0;
1064
1065 if (box->pmu->pmu_idx == 0)
1066 reg1->reg = NHMEX_S0_MSR_MM_CFG;
1067 else
1068 reg1->reg = NHMEX_S1_MSR_MM_CFG;
1069 reg1->idx = 0;
1070 reg1->config = event->attr.config1;
1071 reg2->config = event->attr.config2;
1072 return 0;
1073}
1074
1075static void nhmex_sbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
1076{
1077 struct hw_perf_event *hwc = &event->hw;
1078 struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
1079 struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
1080
1081 if (reg1->idx != EXTRA_REG_NONE) {
1082 wrmsrl(reg1->reg, 0);
1083 wrmsrl(reg1->reg + 1, reg1->config);
1084 wrmsrl(reg1->reg + 2, reg2->config);
1085 wrmsrl(reg1->reg, NHMEX_S_PMON_MM_CFG_EN);
1086 }
1087 wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22);
1088}
1089
1090static struct attribute *nhmex_uncore_sbox_formats_attr[] = {
1091 &format_attr_event.attr,
1092 &format_attr_umask.attr,
1093 &format_attr_edge.attr,
1094 &format_attr_inv.attr,
1095 &format_attr_thresh8.attr,
1096 &format_attr_match.attr,
1097 &format_attr_mask.attr,
1098 NULL,
1099};
1100
1101static struct attribute_group nhmex_uncore_sbox_format_group = {
1102 .name = "format",
1103 .attrs = nhmex_uncore_sbox_formats_attr,
1104};
1105
1106static struct intel_uncore_ops nhmex_uncore_sbox_ops = {
1107 NHMEX_UNCORE_OPS_COMMON_INIT(),
1108 .enable_event = nhmex_sbox_msr_enable_event,
1109 .hw_config = nhmex_sbox_hw_config,
1110 .get_constraint = uncore_get_constraint,
1111 .put_constraint = uncore_put_constraint,
1112};
1113
1114static struct intel_uncore_type nhmex_uncore_sbox = {
1115 .name = "sbox",
1116 .num_counters = 4,
1117 .num_boxes = 2,
1118 .perf_ctr_bits = 48,
1119 .event_ctl = NHMEX_S0_MSR_PMON_CTL0,
1120 .perf_ctr = NHMEX_S0_MSR_PMON_CTR0,
1121 .event_mask = NHMEX_PMON_RAW_EVENT_MASK,
1122 .box_ctl = NHMEX_S0_MSR_PMON_GLOBAL_CTL,
1123 .msr_offset = NHMEX_S_MSR_OFFSET,
1124 .pair_ctr_ctl = 1,
1125 .num_shared_regs = 1,
1126 .ops = &nhmex_uncore_sbox_ops,
1127 .format_group = &nhmex_uncore_sbox_format_group
1128};
1129
1130enum {
1131 EXTRA_REG_NHMEX_M_FILTER,
1132 EXTRA_REG_NHMEX_M_DSP,
1133 EXTRA_REG_NHMEX_M_ISS,
1134 EXTRA_REG_NHMEX_M_MAP,
1135 EXTRA_REG_NHMEX_M_MSC_THR,
1136 EXTRA_REG_NHMEX_M_PGT,
1137 EXTRA_REG_NHMEX_M_PLD,
1138 EXTRA_REG_NHMEX_M_ZDP_CTL_FVC,
1139};
1140
1141static struct extra_reg nhmex_uncore_mbox_extra_regs[] = {
1142 MBOX_INC_SEL_EXTAR_REG(0x0, DSP),
1143 MBOX_INC_SEL_EXTAR_REG(0x4, MSC_THR),
1144 MBOX_INC_SEL_EXTAR_REG(0x5, MSC_THR),
1145 MBOX_INC_SEL_EXTAR_REG(0x9, ISS),
1146 /* event 0xa uses two extra registers */
1147 MBOX_INC_SEL_EXTAR_REG(0xa, ISS),
1148 MBOX_INC_SEL_EXTAR_REG(0xa, PLD),
1149 MBOX_INC_SEL_EXTAR_REG(0xb, PLD),
1150 /* events 0xd ~ 0x10 use the same extra register */
1151 MBOX_INC_SEL_EXTAR_REG(0xd, ZDP_CTL_FVC),
1152 MBOX_INC_SEL_EXTAR_REG(0xe, ZDP_CTL_FVC),
1153 MBOX_INC_SEL_EXTAR_REG(0xf, ZDP_CTL_FVC),
1154 MBOX_INC_SEL_EXTAR_REG(0x10, ZDP_CTL_FVC),
1155 MBOX_INC_SEL_EXTAR_REG(0x16, PGT),
1156 MBOX_SET_FLAG_SEL_EXTRA_REG(0x0, DSP),
1157 MBOX_SET_FLAG_SEL_EXTRA_REG(0x1, ISS),
1158 MBOX_SET_FLAG_SEL_EXTRA_REG(0x5, PGT),
1159 MBOX_SET_FLAG_SEL_EXTRA_REG(0x6, MAP),
1160 EVENT_EXTRA_END
1161};
1162
1163/* Nehalem-EX or Westmere-EX? */
1164bool uncore_nhmex;
1165
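/*
 * Reserve a shared extra register for an mbox event.  Registers below
 * ZDP_CTL_FVC are claimed whole (reference counted, identical config
 * required), while the ZDP_CTL_FVC register keeps a separate 8-bit
 * reference count per FVC field so that events 0xd ~ 0x10 can share it
 * as long as the shared bits agree.
 */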
1166static bool nhmex_mbox_get_shared_reg(struct intel_uncore_box *box, int idx, u64 config)
1167{
1168 struct intel_uncore_extra_reg *er;
1169 unsigned long flags;
1170 bool ret = false;
1171 u64 mask;
1172
1173 if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
1174 er = &box->shared_regs[idx];
1175 raw_spin_lock_irqsave(&er->lock, flags);
1176 if (!atomic_read(&er->ref) || er->config == config) {
1177 atomic_inc(&er->ref);
1178 er->config = config;
1179 ret = true;
1180 }
1181 raw_spin_unlock_irqrestore(&er->lock, flags);
1182
1183 return ret;
1184 }
1185 /*
1186 * The ZDP_CTL_FVC MSR has 4 fields which are used to control
1187 * events 0xd ~ 0x10. Besides these 4 fields, there are additional
1188 * fields which are shared.
1189 */
1190 idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
1191 if (WARN_ON_ONCE(idx >= 4))
1192 return false;
1193
1194 /* mask of the shared fields */
1195 if (uncore_nhmex)
1196 mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK;
1197 else
1198 mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK;
1199 er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
1200
1201 raw_spin_lock_irqsave(&er->lock, flags);
1202 /* add mask of the non-shared field if it's in use */
1203 if (__BITS_VALUE(atomic_read(&er->ref), idx, 8)) {
1204 if (uncore_nhmex)
1205 mask |= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
1206 else
1207 mask |= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
1208 }
1209
1210 if (!atomic_read(&er->ref) || !((er->config ^ config) & mask)) {
1211 atomic_add(1 << (idx * 8), &er->ref);
1212 if (uncore_nhmex)
1213 mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK |
1214 NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
1215 else
1216 mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK |
1217 WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
1218 er->config &= ~mask;
1219 er->config |= (config & mask);
1220 ret = true;
1221 }
1222 raw_spin_unlock_irqrestore(&er->lock, flags);
1223
1224 return ret;
1225}
1226
1227static void nhmex_mbox_put_shared_reg(struct intel_uncore_box *box, int idx)
1228{
1229 struct intel_uncore_extra_reg *er;
1230
1231 if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
1232 er = &box->shared_regs[idx];
1233 atomic_dec(&er->ref);
1234 return;
1235 }
1236
1237 idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
1238 er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
1239 atomic_sub(1 << (idx * 8), &er->ref);
1240}
1241
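/*
 * Move an mbox event to another ZDP_CTL_FVC field: shift its per-event
 * control bits (3 bits per step) to the new position and, if 'modify' is
 * set, also adjust inc_sel in the main event select and record the new
 * index.
 */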
1242u64 nhmex_mbox_alter_er(struct perf_event *event, int new_idx, bool modify)
1243{
1244 struct hw_perf_event *hwc = &event->hw;
1245 struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
1246 int idx, orig_idx = __BITS_VALUE(reg1->idx, 0, 8);
1247 u64 config = reg1->config;
1248
1249 /* get the non-shared control bits and shift them */
1250 idx = orig_idx - EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
1251 if (uncore_nhmex)
1252 config &= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
1253 else
1254 config &= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx);
1255 if (new_idx > orig_idx) {
1256 idx = new_idx - orig_idx;
1257 config <<= 3 * idx;
1258 } else {
1259 idx = orig_idx - new_idx;
1260 config >>= 3 * idx;
1261 }
1262
1263 /* add the shared control bits back */
1264 if (uncore_nhmex)
1265 config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
1266 else
1267 config |= WSMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
1268 config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config;
1269 if (modify) {
1270 /* adjust the main event selector */
1271 if (new_idx > orig_idx)
1272 hwc->config += idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT;
1273 else
1274 hwc->config -= idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT;
1275 reg1->config = config;
1276 reg1->idx = ~0xff | new_idx;
1277 }
1278 return config;
1279}
1280
1281static struct event_constraint *
1282nhmex_mbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
1283{
1284 struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
1285 struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
1286 int i, idx[2], alloc = 0;
1287 u64 config1 = reg1->config;
1288
1289 idx[0] = __BITS_VALUE(reg1->idx, 0, 8);
1290 idx[1] = __BITS_VALUE(reg1->idx, 1, 8);
1291again:
1292 for (i = 0; i < 2; i++) {
1293 if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i)))
1294 idx[i] = 0xff;
1295
1296 if (idx[i] == 0xff)
1297 continue;
1298
1299 if (!nhmex_mbox_get_shared_reg(box, idx[i],
1300 __BITS_VALUE(config1, i, 32)))
1301 goto fail;
1302 alloc |= (0x1 << i);
1303 }
1304
1305 /* for the match/mask registers */
1306 if (reg2->idx != EXTRA_REG_NONE &&
1307 (uncore_box_is_fake(box) || !reg2->alloc) &&
1308 !nhmex_mbox_get_shared_reg(box, reg2->idx, reg2->config))
1309 goto fail;
1310
1311 /*
1312 * If it's a fake box -- as per validate_{group,event}() we
1313 * shouldn't touch event state and we can avoid doing so
1314 * since both will only call get_event_constraints() once
1315 * on each event, this avoids the need for reg->alloc.
1316 */
1317 if (!uncore_box_is_fake(box)) {
1318 if (idx[0] != 0xff && idx[0] != __BITS_VALUE(reg1->idx, 0, 8))
1319 nhmex_mbox_alter_er(event, idx[0], true);
1320 reg1->alloc |= alloc;
1321 if (reg2->idx != EXTRA_REG_NONE)
1322 reg2->alloc = 1;
1323 }
1324 return NULL;
1325fail:
1326 if (idx[0] != 0xff && !(alloc & 0x1) &&
1327 idx[0] >= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) {
1328 /*
1329		 * events 0xd ~ 0x10 are functionally identical, but are
1330		 * controlled by different fields in the ZDP_CTL_FVC
1331		 * register. If we fail to take one field, try the
1332		 * remaining 3 choices.
1333 */
1334 BUG_ON(__BITS_VALUE(reg1->idx, 1, 8) != 0xff);
1335 idx[0] -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
1336 idx[0] = (idx[0] + 1) % 4;
1337 idx[0] += EXTRA_REG_NHMEX_M_ZDP_CTL_FVC;
1338 if (idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) {
1339 config1 = nhmex_mbox_alter_er(event, idx[0], false);
1340 goto again;
1341 }
1342 }
1343
1344 if (alloc & 0x1)
1345 nhmex_mbox_put_shared_reg(box, idx[0]);
1346 if (alloc & 0x2)
1347 nhmex_mbox_put_shared_reg(box, idx[1]);
1348 return &constraint_empty;
1349}
1350
1351static void nhmex_mbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
1352{
1353 struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
1354 struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
1355
1356 if (uncore_box_is_fake(box))
1357 return;
1358
1359 if (reg1->alloc & 0x1)
1360 nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 0, 8));
1361 if (reg1->alloc & 0x2)
1362 nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 1, 8));
1363 reg1->alloc = 0;
1364
1365 if (reg2->alloc) {
1366 nhmex_mbox_put_shared_reg(box, reg2->idx);
1367 reg2->alloc = 0;
1368 }
1369}
1370
1371static int nhmex_mbox_extra_reg_idx(struct extra_reg *er)
1372{
1373 if (er->idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC)
1374 return er->idx;
1375 return er->idx + (er->event >> NHMEX_M_PMON_CTL_INC_SEL_SHIFT) - 0xd;
1376}
1377
1378static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
1379{
1380 struct intel_uncore_type *type = box->pmu->type;
1381 struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
1382 struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
1383 struct extra_reg *er;
1384 unsigned msr;
1385 int reg_idx = 0;
1386 /*
1387	 * The mbox events may require at most 2 extra MSRs. But only
1388 * the lower 32 bits in these MSRs are significant, so we can use
1389 * config1 to pass two MSRs' config.
1390 */
1391 for (er = nhmex_uncore_mbox_extra_regs; er->msr; er++) {
1392 if (er->event != (event->hw.config & er->config_mask))
1393 continue;
1394 if (event->attr.config1 & ~er->valid_mask)
1395 return -EINVAL;
1396
1397 msr = er->msr + type->msr_offset * box->pmu->pmu_idx;
1398 if (WARN_ON_ONCE(msr >= 0xffff || er->idx >= 0xff))
1399 return -EINVAL;
1400
1401		/* always use bits 32~63 to pass the PLD config */
1402 if (er->idx == EXTRA_REG_NHMEX_M_PLD)
1403 reg_idx = 1;
1404 else if (WARN_ON_ONCE(reg_idx > 0))
1405 return -EINVAL;
1406
1407 reg1->idx &= ~(0xff << (reg_idx * 8));
1408 reg1->reg &= ~(0xffff << (reg_idx * 16));
1409 reg1->idx |= nhmex_mbox_extra_reg_idx(er) << (reg_idx * 8);
1410 reg1->reg |= msr << (reg_idx * 16);
1411 reg1->config = event->attr.config1;
1412 reg_idx++;
1413 }
1414 /*
1415	 * The mbox only provides the ability to perform address matching
1416 * for the PLD events.
1417 */
1418 if (reg_idx == 2) {
1419 reg2->idx = EXTRA_REG_NHMEX_M_FILTER;
1420 if (event->attr.config2 & NHMEX_M_PMON_MM_CFG_EN)
1421 reg2->config = event->attr.config2;
1422 else
1423 reg2->config = ~0ULL;
1424 if (box->pmu->pmu_idx == 0)
1425 reg2->reg = NHMEX_M0_MSR_PMU_MM_CFG;
1426 else
1427 reg2->reg = NHMEX_M1_MSR_PMU_MM_CFG;
1428 }
1429 return 0;
1430}
1431
1432static u64 nhmex_mbox_shared_reg_config(struct intel_uncore_box *box, int idx)
1433{
1434 struct intel_uncore_extra_reg *er;
1435 unsigned long flags;
1436 u64 config;
1437
1438 if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC)
1439 return box->shared_regs[idx].config;
1440
1441 er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC];
1442 raw_spin_lock_irqsave(&er->lock, flags);
1443 config = er->config;
1444 raw_spin_unlock_irqrestore(&er->lock, flags);
1445 return config;
1446}
1447
1448static void nhmex_mbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
1449{
1450 struct hw_perf_event *hwc = &event->hw;
1451 struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
1452 struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
1453 int idx;
1454
1455 idx = __BITS_VALUE(reg1->idx, 0, 8);
1456 if (idx != 0xff)
1457 wrmsrl(__BITS_VALUE(reg1->reg, 0, 16),
1458 nhmex_mbox_shared_reg_config(box, idx));
1459 idx = __BITS_VALUE(reg1->idx, 1, 8);
1460 if (idx != 0xff)
1461 wrmsrl(__BITS_VALUE(reg1->reg, 1, 16),
1462 nhmex_mbox_shared_reg_config(box, idx));
1463
1464 if (reg2->idx != EXTRA_REG_NONE) {
1465 wrmsrl(reg2->reg, 0);
1466 if (reg2->config != ~0ULL) {
1467 wrmsrl(reg2->reg + 1,
1468 reg2->config & NHMEX_M_PMON_ADDR_MATCH_MASK);
1469 wrmsrl(reg2->reg + 2, NHMEX_M_PMON_ADDR_MASK_MASK &
1470 (reg2->config >> NHMEX_M_PMON_ADDR_MASK_SHIFT));
1471 wrmsrl(reg2->reg, NHMEX_M_PMON_MM_CFG_EN);
1472 }
1473 }
1474
1475 wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0);
1476}
1477
1478DEFINE_UNCORE_FORMAT_ATTR(count_mode, count_mode, "config:2-3");
1479DEFINE_UNCORE_FORMAT_ATTR(storage_mode, storage_mode, "config:4-5");
1480DEFINE_UNCORE_FORMAT_ATTR(wrap_mode, wrap_mode, "config:6");
1481DEFINE_UNCORE_FORMAT_ATTR(flag_mode, flag_mode, "config:7");
1482DEFINE_UNCORE_FORMAT_ATTR(inc_sel, inc_sel, "config:9-13");
1483DEFINE_UNCORE_FORMAT_ATTR(set_flag_sel, set_flag_sel, "config:19-21");
1484DEFINE_UNCORE_FORMAT_ATTR(filter_cfg_en, filter_cfg_en, "config2:63");
1485DEFINE_UNCORE_FORMAT_ATTR(filter_match, filter_match, "config2:0-33");
1486DEFINE_UNCORE_FORMAT_ATTR(filter_mask, filter_mask, "config2:34-61");
1487DEFINE_UNCORE_FORMAT_ATTR(dsp, dsp, "config1:0-31");
1488DEFINE_UNCORE_FORMAT_ATTR(thr, thr, "config1:0-31");
1489DEFINE_UNCORE_FORMAT_ATTR(fvc, fvc, "config1:0-31");
1490DEFINE_UNCORE_FORMAT_ATTR(pgt, pgt, "config1:0-31");
1491DEFINE_UNCORE_FORMAT_ATTR(map, map, "config1:0-31");
1492DEFINE_UNCORE_FORMAT_ATTR(iss, iss, "config1:0-31");
1493DEFINE_UNCORE_FORMAT_ATTR(pld, pld, "config1:32-63");
1494
1495static struct attribute *nhmex_uncore_mbox_formats_attr[] = {
1496 &format_attr_count_mode.attr,
1497 &format_attr_storage_mode.attr,
1498 &format_attr_wrap_mode.attr,
1499 &format_attr_flag_mode.attr,
1500 &format_attr_inc_sel.attr,
1501 &format_attr_set_flag_sel.attr,
1502 &format_attr_filter_cfg_en.attr,
1503 &format_attr_filter_match.attr,
1504 &format_attr_filter_mask.attr,
1505 &format_attr_dsp.attr,
1506 &format_attr_thr.attr,
1507 &format_attr_fvc.attr,
1508 &format_attr_pgt.attr,
1509 &format_attr_map.attr,
1510 &format_attr_iss.attr,
1511 &format_attr_pld.attr,
1512 NULL,
1513};
1514
1515static struct attribute_group nhmex_uncore_mbox_format_group = {
1516 .name = "format",
1517 .attrs = nhmex_uncore_mbox_formats_attr,
1518};
1519
1520static struct uncore_event_desc nhmex_uncore_mbox_events[] = {
1521 INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x2800"),
1522 INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x2820"),
1523 { /* end: all zeroes */ },
1524};
1525
1526static struct uncore_event_desc wsmex_uncore_mbox_events[] = {
1527 INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x5000"),
1528 INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x5040"),
1529 { /* end: all zeroes */ },
1530};
1531
1532static struct intel_uncore_ops nhmex_uncore_mbox_ops = {
1533 NHMEX_UNCORE_OPS_COMMON_INIT(),
1534 .enable_event = nhmex_mbox_msr_enable_event,
1535 .hw_config = nhmex_mbox_hw_config,
1536 .get_constraint = nhmex_mbox_get_constraint,
1537 .put_constraint = nhmex_mbox_put_constraint,
1538};
1539
1540static struct intel_uncore_type nhmex_uncore_mbox = {
1541 .name = "mbox",
1542 .num_counters = 6,
1543 .num_boxes = 2,
1544 .perf_ctr_bits = 48,
1545 .event_ctl = NHMEX_M0_MSR_PMU_CTL0,
1546 .perf_ctr = NHMEX_M0_MSR_PMU_CNT0,
1547 .event_mask = NHMEX_M_PMON_RAW_EVENT_MASK,
1548 .box_ctl = NHMEX_M0_MSR_GLOBAL_CTL,
1549 .msr_offset = NHMEX_M_MSR_OFFSET,
1550 .pair_ctr_ctl = 1,
1551 .num_shared_regs = 8,
1552 .event_descs = nhmex_uncore_mbox_events,
1553 .ops = &nhmex_uncore_mbox_ops,
1554 .format_group = &nhmex_uncore_mbox_format_group,
1555};
1556
1557void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event)
1558{
1559 struct hw_perf_event *hwc = &event->hw;
1560 struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
1561
1562 /* adjust the main event selector and extra register index */
1563 if (reg1->idx % 2) {
1564 reg1->idx--;
1565 hwc->config -= 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
1566 } else {
1567 reg1->idx++;
1568 hwc->config += 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
1569 }
1570
1571 /* adjust extra register config */
1572 switch (reg1->idx % 6) {
1573 case 2:
1574		/* shift bits 8~15 to bits 0~7 */
1575 reg1->config >>= 8;
1576 break;
1577 case 3:
1578		/* shift bits 0~7 to bits 8~15 */
1579 reg1->config <<= 8;
1580 break;
1581	}
1582}
1583
1584/*
1585 * Each rbox has 4 event sets which monitor PQI ports 0~3 or 4~7.
1586 * An event set consists of 6 events; the 3rd and 4th events in
1587 * an event set use the same extra register, so an event set uses
1588 * 5 extra registers.
1589 */
1590static struct event_constraint *
1591nhmex_rbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event)
1592{
1593 struct hw_perf_event *hwc = &event->hw;
1594 struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
1595 struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
1596 struct intel_uncore_extra_reg *er;
1597 unsigned long flags;
1598 int idx, er_idx;
1599 u64 config1;
1600 bool ok = false;
1601
1602 if (!uncore_box_is_fake(box) && reg1->alloc)
1603 return NULL;
1604
1605 idx = reg1->idx % 6;
1606 config1 = reg1->config;
1607again:
1608 er_idx = idx;
1609 /* the 3rd and 4th events use the same extra register */
1610 if (er_idx > 2)
1611 er_idx--;
1612 er_idx += (reg1->idx / 6) * 5;
1613
1614 er = &box->shared_regs[er_idx];
1615 raw_spin_lock_irqsave(&er->lock, flags);
1616 if (idx < 2) {
1617 if (!atomic_read(&er->ref) || er->config == reg1->config) {
1618 atomic_inc(&er->ref);
1619 er->config = reg1->config;
1620 ok = true;
1621 }
1622 } else if (idx == 2 || idx == 3) {
1623 /*
1624		 * these two events use different fields in an extra register,
1625		 * bits 0~7 and bits 8~15 respectively.
1626 */
1627 u64 mask = 0xff << ((idx - 2) * 8);
1628 if (!__BITS_VALUE(atomic_read(&er->ref), idx - 2, 8) ||
1629 !((er->config ^ config1) & mask)) {
1630 atomic_add(1 << ((idx - 2) * 8), &er->ref);
1631 er->config &= ~mask;
1632 er->config |= config1 & mask;
1633 ok = true;
1634 }
1635 } else {
1636 if (!atomic_read(&er->ref) ||
1637 (er->config == (hwc->config >> 32) &&
1638 er->config1 == reg1->config &&
1639 er->config2 == reg2->config)) {
1640 atomic_inc(&er->ref);
1641 er->config = (hwc->config >> 32);
1642 er->config1 = reg1->config;
1643 er->config2 = reg2->config;
1644 ok = true;
1645 }
1646 }
1647 raw_spin_unlock_irqrestore(&er->lock, flags);
1648
1649 if (!ok) {
1650 /*
1651 * The Rbox events are always in pairs. The paired
1652		 * events are functionally identical, but use different
1653		 * extra registers. If we fail to take an extra
1654 * register, try the alternative.
1655 */
1656 if (idx % 2)
1657 idx--;
1658 else
1659 idx++;
1660 if (idx != reg1->idx % 6) {
1661 if (idx == 2)
1662 config1 >>= 8;
1663 else if (idx == 3)
1664 config1 <<= 8;
1665 goto again;
1666 }
1667 } else {
1668 if (!uncore_box_is_fake(box)) {
1669 if (idx != reg1->idx % 6)
1670 nhmex_rbox_alter_er(box, event);
1671 reg1->alloc = 1;
1672 }
1673 return NULL;
1674 }
1675 return &constraint_empty;
1676}
1677
1678static void nhmex_rbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event)
1679{
1680 struct intel_uncore_extra_reg *er;
1681 struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
1682 int idx, er_idx;
1683
1684 if (uncore_box_is_fake(box) || !reg1->alloc)
1685 return;
1686
1687 idx = reg1->idx % 6;
1688 er_idx = idx;
1689 if (er_idx > 2)
1690 er_idx--;
1691 er_idx += (reg1->idx / 6) * 5;
1692
1693 er = &box->shared_regs[er_idx];
1694 if (idx == 2 || idx == 3)
1695 atomic_sub(1 << ((idx - 2) * 8), &er->ref);
1696 else
1697 atomic_dec(&er->ref);
1698
1699 reg1->alloc = 0;
1700}
1701
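/*
 * The rbox event select doubles as the extra register index; events 4
 * and 5 of each set additionally carry an XBR match/mask configuration
 * in the upper config bits and config2.
 */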
1702static int nhmex_rbox_hw_config(struct intel_uncore_box *box, struct perf_event *event)
1703{
1704 struct hw_perf_event *hwc = &event->hw;
1705 struct hw_perf_event_extra *reg1 = &event->hw.extra_reg;
1706 struct hw_perf_event_extra *reg2 = &event->hw.branch_reg;
1707 int idx;
1708
1709 idx = (event->hw.config & NHMEX_R_PMON_CTL_EV_SEL_MASK) >>
1710 NHMEX_R_PMON_CTL_EV_SEL_SHIFT;
1711 if (idx >= 0x18)
1712 return -EINVAL;
1713
1714 reg1->idx = idx;
1715 reg1->config = event->attr.config1;
1716
1717 switch (idx % 6) {
1718 case 4:
1719 case 5:
1720 hwc->config |= event->attr.config & (~0ULL << 32);
1721 reg2->config = event->attr.config2;
1722 break;
1723	}
1724 return 0;
1725}
1726
1727static u64 nhmex_rbox_shared_reg_config(struct intel_uncore_box *box, int idx)
1728{
1729 struct intel_uncore_extra_reg *er;
1730 unsigned long flags;
1731 u64 config;
1732
1733 er = &box->shared_regs[idx];
1734
1735 raw_spin_lock_irqsave(&er->lock, flags);
1736 config = er->config;
1737 raw_spin_unlock_irqrestore(&er->lock, flags);
1738
1739 return config;
1740}
1741
1742static void nhmex_rbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
1743{
1744 struct hw_perf_event *hwc = &event->hw;
1745 struct hw_perf_event_extra *reg1 = &hwc->extra_reg;
1746 struct hw_perf_event_extra *reg2 = &hwc->branch_reg;
1747 int idx, port;
1748
1749 idx = reg1->idx;
1750 port = idx / 6 + box->pmu->pmu_idx * 4;
1751
1752 switch (idx % 6) {
1753 case 0:
1754 wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG0(port), reg1->config);
1755 break;
1756 case 1:
1757 wrmsrl(NHMEX_R_MSR_PORTN_IPERF_CFG1(port), reg1->config);
1758 break;
1759 case 2:
1760 case 3:
1761 wrmsrl(NHMEX_R_MSR_PORTN_QLX_CFG(port),
1762 nhmex_rbox_shared_reg_config(box, 2 + (idx / 6) * 5));
1763 break;
1764 case 4:
1765 wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port),
1766 hwc->config >> 32);
1767 wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(port), reg1->config);
1768 wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET1_MASK(port), reg2->config);
1769 break;
1770 case 5:
1771 wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port),
1772 hwc->config >> 32);
1773 wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(port), reg1->config);
1774 wrmsrl(NHMEX_R_MSR_PORTN_XBR_SET2_MASK(port), reg2->config);
1775 break;
1776	}
1777
1778 wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 |
1779 (hwc->config & NHMEX_R_PMON_CTL_EV_SEL_MASK));
1780}
1781
1782DEFINE_UNCORE_FORMAT_ATTR(xbr_mm_cfg, xbr_mm_cfg, "config:32-63");
1783DEFINE_UNCORE_FORMAT_ATTR(xbr_match, xbr_match, "config1:0-63");
1784DEFINE_UNCORE_FORMAT_ATTR(xbr_mask, xbr_mask, "config2:0-63");
1785DEFINE_UNCORE_FORMAT_ATTR(qlx_cfg, qlx_cfg, "config1:0-15");
1786DEFINE_UNCORE_FORMAT_ATTR(iperf_cfg, iperf_cfg, "config1:0-31");
1787
1788static struct attribute *nhmex_uncore_rbox_formats_attr[] = {
1789 &format_attr_event5.attr,
1790 &format_attr_xbr_mm_cfg.attr,
1791 &format_attr_xbr_match.attr,
1792 &format_attr_xbr_mask.attr,
1793 &format_attr_qlx_cfg.attr,
1794 &format_attr_iperf_cfg.attr,
1795 NULL,
1796};
1797
1798static struct attribute_group nhmex_uncore_rbox_format_group = {
1799 .name = "format",
1800 .attrs = nhmex_uncore_rbox_formats_attr,
1801};
1802
1803static struct uncore_event_desc nhmex_uncore_rbox_events[] = {
1804 INTEL_UNCORE_EVENT_DESC(qpi0_flit_send, "event=0x0,iperf_cfg=0x80000000"),
1805 INTEL_UNCORE_EVENT_DESC(qpi1_filt_send, "event=0x6,iperf_cfg=0x80000000"),
1806 INTEL_UNCORE_EVENT_DESC(qpi0_idle_filt, "event=0x0,iperf_cfg=0x40000000"),
1807 INTEL_UNCORE_EVENT_DESC(qpi1_idle_filt, "event=0x6,iperf_cfg=0x40000000"),
1808 INTEL_UNCORE_EVENT_DESC(qpi0_date_response, "event=0x0,iperf_cfg=0xc4"),
1809 INTEL_UNCORE_EVENT_DESC(qpi1_date_response, "event=0x6,iperf_cfg=0xc4"),
1810 { /* end: all zeroes */ },
1811};
1812
1813static struct intel_uncore_ops nhmex_uncore_rbox_ops = {
1814 NHMEX_UNCORE_OPS_COMMON_INIT(),
1815 .enable_event = nhmex_rbox_msr_enable_event,
1816 .hw_config = nhmex_rbox_hw_config,
1817 .get_constraint = nhmex_rbox_get_constraint,
1818 .put_constraint = nhmex_rbox_put_constraint,
1819};
1820
1821static struct intel_uncore_type nhmex_uncore_rbox = {
1822 .name = "rbox",
1823 .num_counters = 8,
1824 .num_boxes = 2,
1825 .perf_ctr_bits = 48,
1826 .event_ctl = NHMEX_R_MSR_PMON_CTL0,
1827 .perf_ctr = NHMEX_R_MSR_PMON_CNT0,
1828 .event_mask = NHMEX_R_PMON_RAW_EVENT_MASK,
1829 .box_ctl = NHMEX_R_MSR_GLOBAL_CTL,
1830 .msr_offset = NHMEX_R_MSR_OFFSET,
1831 .pair_ctr_ctl = 1,
1832 .num_shared_regs = 20,
1833 .event_descs = nhmex_uncore_rbox_events,
1834 .ops = &nhmex_uncore_rbox_ops,
1835 .format_group = &nhmex_uncore_rbox_format_group
1836};
1837
1838static struct intel_uncore_type *nhmex_msr_uncores[] = {
1839 &nhmex_uncore_ubox,
1840 &nhmex_uncore_cbox,
1841 &nhmex_uncore_bbox,
1842 &nhmex_uncore_sbox,
1843 &nhmex_uncore_mbox,
1844 &nhmex_uncore_rbox,
1845 &nhmex_uncore_wbox,
1846 NULL,
1847};
1848/* end of Nehalem-EX uncore support */
1849
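/*
 * Bind an event to the counter chosen by the scheduler: the fixed
 * counter uses the box's fixed control/counter MSRs, all others use the
 * per-index event select and counter MSRs.
 */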
1850static void uncore_assign_hw_event(struct intel_uncore_box *box, struct perf_event *event, int idx)
1851{
1852 struct hw_perf_event *hwc = &event->hw;
1853
1854 hwc->idx = idx;
1855 hwc->last_tag = ++box->tags[idx];
1856
1857 if (hwc->idx == UNCORE_PMC_IDX_FIXED) {
1858 hwc->event_base = uncore_fixed_ctr(box);
1859 hwc->config_base = uncore_fixed_ctl(box);
1860 return;
1861 }
1862
1863 hwc->config_base = uncore_event_ctl(box, hwc->idx);
1864 hwc->event_base = uncore_perf_ctr(box, hwc->idx);
1865}
1866
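/*
 * Counters are narrower than 64 bit, so compute the delta in counter
 * width: shift both samples up by (64 - width) before subtracting so a
 * wrap-around still yields the right difference.  E.g. for a 48-bit
 * counter, prev = 0xffffffffffff and new = 0x2 gives delta = 3.
 */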
1867static void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event)
1868{
1869 u64 prev_count, new_count, delta;
1870 int shift;
1871
1872 if (event->hw.idx >= UNCORE_PMC_IDX_FIXED)
1873 shift = 64 - uncore_fixed_ctr_bits(box);
1874 else
1875 shift = 64 - uncore_perf_ctr_bits(box);
1876
1877 /* the hrtimer might modify the previous event value */
1878again:
1879 prev_count = local64_read(&event->hw.prev_count);
1880 new_count = uncore_read_counter(box, event);
1881 if (local64_xchg(&event->hw.prev_count, new_count) != prev_count)
1882 goto again;
1883
1884 delta = (new_count << shift) - (prev_count << shift);
1885 delta >>= shift;
1886
1887 local64_add(delta, &event->count);
1888}
1889
1890/*
1891 * The overflow interrupt is unavailable for SandyBridge-EP and broken
1892 * for SandyBridge, so we use a hrtimer to periodically poll the
1893 * counters to avoid overflow.
1894 */
1895static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer)
1896{
1897 struct intel_uncore_box *box;
1898 unsigned long flags;
1899 int bit;
1900
1901 box = container_of(hrtimer, struct intel_uncore_box, hrtimer);
1902 if (!box->n_active || box->cpu != smp_processor_id())
1903 return HRTIMER_NORESTART;
1904 /*
1905	 * disable local interrupts to prevent uncore_pmu_event_start/stop
1906	 * from interrupting the update process
1907 */
1908 local_irq_save(flags);
1909
1910 for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX)
1911 uncore_perf_event_update(box, box->events[bit]);
1912
1913 local_irq_restore(flags);
1914
1915 hrtimer_forward_now(hrtimer, ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL));
1916 return HRTIMER_RESTART;
1917}
1918
1919static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box)
1920{
1921 __hrtimer_start_range_ns(&box->hrtimer,
1922 ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL), 0,
1923 HRTIMER_MODE_REL_PINNED, 0);
1924}
1925
1926static void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box)
1927{
1928 hrtimer_cancel(&box->hrtimer);
1929}
1930
1931static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box)
1932{
1933 hrtimer_init(&box->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1934 box->hrtimer.function = uncore_pmu_hrtimer;
1935}
1936
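/*
 * Allocate a box together with its trailing array of shared extra
 * registers on the NUMA node of 'cpu'.  The box starts with a reference
 * count of 1 and is not yet bound to any cpu or physical package.
 */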
1937struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int cpu)
1938{
1939 struct intel_uncore_box *box;
1940 int i, size;
1941
1942 size = sizeof(*box) + type->num_shared_regs * sizeof(struct intel_uncore_extra_reg);
1943
1944 box = kmalloc_node(size, GFP_KERNEL | __GFP_ZERO, cpu_to_node(cpu));
1945 if (!box)
1946 return NULL;
1947
1948 for (i = 0; i < type->num_shared_regs; i++)
1949 raw_spin_lock_init(&box->shared_regs[i].lock);
1950
1951 uncore_pmu_init_hrtimer(box);
1952 atomic_set(&box->refcnt, 1);
1953 box->cpu = -1;
1954 box->phys_id = -1;
1955
1956 return box;
1957}
1958
1959static struct intel_uncore_box *
1960uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu)
1961{
1962 struct intel_uncore_box *box;
1963
1964 box = *per_cpu_ptr(pmu->box, cpu);
1965 if (box)
1966 return box;
1967
1968 raw_spin_lock(&uncore_box_lock);
1969 list_for_each_entry(box, &pmu->box_list, list) {
1970 if (box->phys_id == topology_physical_package_id(cpu)) {
1971 atomic_inc(&box->refcnt);
1972 *per_cpu_ptr(pmu->box, cpu) = box;
1973 break;
1974 }
1975 }
1976 raw_spin_unlock(&uncore_box_lock);
1977
1978 return *per_cpu_ptr(pmu->box, cpu);
1979}
1980
1981static struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event)
1982{
1983 return container_of(event->pmu, struct intel_uncore_pmu, pmu);
1984}
1985
1986static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event)
1987{
1988 /*
1989	 * perf core schedules events on a per-cpu basis; uncore events are
1990	 * collected by one of the cpus inside a physical package.
1991 */
1992 return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id());
1993}
1994
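/*
 * Append 'leader' (and, when dogrp is true, its siblings that are not in
 * the OFF state) to the box's event list.  The capacity is num_counters
 * plus one slot for an optional fixed counter; returns the new event
 * count or -EINVAL.
 */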
1995static int
1996uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader, bool dogrp)
1997{
1998 struct perf_event *event;
1999 int n, max_count;
2000
2001 max_count = box->pmu->type->num_counters;
2002 if (box->pmu->type->fixed_ctl)
2003 max_count++;
2004
2005 if (box->n_events >= max_count)
2006 return -EINVAL;
2007
2008 n = box->n_events;
2009 box->event_list[n] = leader;
2010 n++;
2011 if (!dogrp)
2012 return n;
2013
2014 list_for_each_entry(event, &leader->sibling_list, group_entry) {
2015 if (event->state <= PERF_EVENT_STATE_OFF)
2016 continue;
2017
2018 if (n >= max_count)
2019 return -EINVAL;
2020
2021 box->event_list[n] = event;
2022 n++;
2023 }
2024 return n;
2025}
2026
2027static struct event_constraint *
2028uncore_get_event_constraint(struct intel_uncore_box *box, struct perf_event *event)
2029{
2030 struct intel_uncore_type *type = box->pmu->type;
2031 struct event_constraint *c;
2032
2033 if (type->ops->get_constraint) {
2034 c = type->ops->get_constraint(box, event);
2035 if (c)
2036 return c;
2037 }
2038
2039 if (event->hw.config == ~0ULL)
2040 return &constraint_fixed;
2041
2042 if (type->constraints) {
2043 for_each_event_constraint(c, type->constraints) {
2044 if ((event->hw.config & c->cmask) == c->code)
2045 return c;
2046 }
2047 }
2048
2049 return &type->unconstrainted;
2050}
2051
2052static void uncore_put_event_constraint(struct intel_uncore_box *box, struct perf_event *event)
2053{
2054 if (box->pmu->type->ops->put_constraint)
2055 box->pmu->type->ops->put_constraint(box, event);
2056}
2057
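/*
 * Assign counters to the collected events based on their constraint
 * weights: first try the fast path of keeping every event on its
 * previous counter, then fall back to perf_assign_events() for a full
 * reschedule.  Constraints are released again when the assignment fails
 * or when no assignment array was passed in (group validation).
 */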
2058static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int n)
2059{
2060 unsigned long used_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
2061 struct event_constraint *c, *constraints[UNCORE_PMC_IDX_MAX];
2062 int i, wmin, wmax, ret = 0;
2063 struct hw_perf_event *hwc;
2064
2065 bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX);
2066
2067 for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) {
2068 c = uncore_get_event_constraint(box, box->event_list[i]);
2069 constraints[i] = c;
2070 wmin = min(wmin, c->weight);
2071 wmax = max(wmax, c->weight);
2072 }
2073
2074 /* fastpath, try to reuse previous register */
2075 for (i = 0; i < n; i++) {
2076 hwc = &box->event_list[i]->hw;
2077 c = constraints[i];
2078
2079 /* never assigned */
2080 if (hwc->idx == -1)
2081 break;
2082
2083 /* constraint still honored */
2084 if (!test_bit(hwc->idx, c->idxmsk))
2085 break;
2086
2087 /* not already used */
2088 if (test_bit(hwc->idx, used_mask))
2089 break;
2090
2091 __set_bit(hwc->idx, used_mask);
2092 if (assign)
2093 assign[i] = hwc->idx;
2094 }
2095 /* slow path */
2096 if (i != n)
2097 ret = perf_assign_events(constraints, n, wmin, wmax, assign);
2098
2099 if (!assign || ret) {
2100 for (i = 0; i < n; i++)
2101 uncore_put_event_constraint(box, box->event_list[i]);
2102 }
2103 return ret ? -EINVAL : 0;
2104}
2105
2106static void uncore_pmu_event_start(struct perf_event *event, int flags)
2107{
2108 struct intel_uncore_box *box = uncore_event_to_box(event);
2109 int idx = event->hw.idx;
2110
2111 if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
2112 return;
2113
2114 if (WARN_ON_ONCE(idx == -1 || idx >= UNCORE_PMC_IDX_MAX))
2115 return;
2116
2117 event->hw.state = 0;
2118 box->events[idx] = event;
2119 box->n_active++;
2120 __set_bit(idx, box->active_mask);
2121
2122 local64_set(&event->hw.prev_count, uncore_read_counter(box, event));
2123 uncore_enable_event(box, event);
2124
2125 if (box->n_active == 1) {
2126 uncore_enable_box(box);
2127 uncore_pmu_start_hrtimer(box);
2128 }
2129}
2130
2131static void uncore_pmu_event_stop(struct perf_event *event, int flags)
2132{
2133 struct intel_uncore_box *box = uncore_event_to_box(event);
2134 struct hw_perf_event *hwc = &event->hw;
2135
2136 if (__test_and_clear_bit(hwc->idx, box->active_mask)) {
2137 uncore_disable_event(box, event);
2138 box->n_active--;
2139 box->events[hwc->idx] = NULL;
2140 WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
2141 hwc->state |= PERF_HES_STOPPED;
2142
2143 if (box->n_active == 0) {
2144 uncore_disable_box(box);
2145 uncore_pmu_cancel_hrtimer(box);
2146 }
2147 }
2148
2149 if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
2150 /*
2151		 * Drain the remaining delta count out of an event
2152 * that we are disabling:
2153 */
2154 uncore_perf_event_update(box, event);
2155 hwc->state |= PERF_HES_UPTODATE;
2156 }
2157}
2158
2159static int uncore_pmu_event_add(struct perf_event *event, int flags)
2160{
2161 struct intel_uncore_box *box = uncore_event_to_box(event);
2162 struct hw_perf_event *hwc = &event->hw;
2163 int assign[UNCORE_PMC_IDX_MAX];
2164 int i, n, ret;
2165
2166 if (!box)
2167 return -ENODEV;
2168
2169 ret = n = uncore_collect_events(box, event, false);
2170 if (ret < 0)
2171 return ret;
2172
2173 hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
2174 if (!(flags & PERF_EF_START))
2175 hwc->state |= PERF_HES_ARCH;
2176
2177 ret = uncore_assign_events(box, assign, n);
2178 if (ret)
2179 return ret;
2180
2181 /* save events moving to new counters */
2182 for (i = 0; i < box->n_events; i++) {
2183 event = box->event_list[i];
2184 hwc = &event->hw;
2185
2186 if (hwc->idx == assign[i] &&
2187 hwc->last_tag == box->tags[assign[i]])
2188 continue;
2189 /*
2190 * Ensure we don't accidentally enable a stopped
2191 * counter simply because we rescheduled.
2192 */
2193 if (hwc->state & PERF_HES_STOPPED)
2194 hwc->state |= PERF_HES_ARCH;
2195
2196 uncore_pmu_event_stop(event, PERF_EF_UPDATE);
2197 }
2198
2199 /* reprogram moved events into new counters */
2200 for (i = 0; i < n; i++) {
2201 event = box->event_list[i];
2202 hwc = &event->hw;
2203
2204 if (hwc->idx != assign[i] ||
2205 hwc->last_tag != box->tags[assign[i]])
2206 uncore_assign_hw_event(box, event, assign[i]);
2207 else if (i < box->n_events)
2208 continue;
2209
2210 if (hwc->state & PERF_HES_ARCH)
2211 continue;
2212
2213 uncore_pmu_event_start(event, 0);
2214 }
2215 box->n_events = n;
2216
2217 return 0;
2218}
2219
2220static void uncore_pmu_event_del(struct perf_event *event, int flags)
2221{
2222 struct intel_uncore_box *box = uncore_event_to_box(event);
2223 int i;
2224
2225 uncore_pmu_event_stop(event, PERF_EF_UPDATE);
2226
2227 for (i = 0; i < box->n_events; i++) {
2228 if (event == box->event_list[i]) {
2229 uncore_put_event_constraint(box, event);
2230
2231 while (++i < box->n_events)
2232 box->event_list[i - 1] = box->event_list[i];
2233
2234 --box->n_events;
2235 break;
2236 }
2237 }
2238
2239 event->hw.idx = -1;
2240 event->hw.last_tag = ~0ULL;
2241}
2242
2243static void uncore_pmu_event_read(struct perf_event *event)
2244{
2245 struct intel_uncore_box *box = uncore_event_to_box(event);
2246 uncore_perf_event_update(box, event);
2247}
2248
2249/*
2250 * validation ensures the group can be loaded onto the
2251 * PMU if it were the only group available.
2252 */
2253static int uncore_validate_group(struct intel_uncore_pmu *pmu,
2254 struct perf_event *event)
2255{
2256 struct perf_event *leader = event->group_leader;
2257 struct intel_uncore_box *fake_box;
2258 int ret = -EINVAL, n;
2259
2260 fake_box = uncore_alloc_box(pmu->type, smp_processor_id());
2261 if (!fake_box)
2262 return -ENOMEM;
2263
2264 fake_box->pmu = pmu;
2265 /*
2266	 * the event is not yet connected with its
2267	 * siblings; therefore we must first collect the
2268	 * existing siblings, then add the new event
2269	 * before we can simulate the scheduling.
2270 */
2271 n = uncore_collect_events(fake_box, leader, true);
2272 if (n < 0)
2273 goto out;
2274
2275 fake_box->n_events = n;
2276 n = uncore_collect_events(fake_box, event, false);
2277 if (n < 0)
2278 goto out;
2279
2280 fake_box->n_events = n;
2281
2282 ret = uncore_assign_events(fake_box, NULL, n);
2283out:
2284 kfree(fake_box);
2285 return ret;
2286}
2287
2288int uncore_pmu_event_init(struct perf_event *event)
2289{
2290 struct intel_uncore_pmu *pmu;
2291 struct intel_uncore_box *box;
2292 struct hw_perf_event *hwc = &event->hw;
2293 int ret;
2294
2295 if (event->attr.type != event->pmu->type)
2296 return -ENOENT;
2297
2298 pmu = uncore_event_to_pmu(event);
2299 /* no device found for this pmu */
2300 if (pmu->func_id < 0)
2301 return -ENOENT;
2302
2303 /*
2304	 * The uncore PMU always measures at all privilege levels,
2305	 * so it doesn't make sense to specify any exclude bits.
2306 */
2307 if (event->attr.exclude_user || event->attr.exclude_kernel ||
2308 event->attr.exclude_hv || event->attr.exclude_idle)
2309 return -EINVAL;
2310
2311 /* Sampling not supported yet */
2312 if (hwc->sample_period)
2313 return -EINVAL;
2314
2315 /*
2316 * Place all uncore events for a particular physical package
2317 * onto a single cpu
2318 */
2319 if (event->cpu < 0)
2320 return -EINVAL;
2321 box = uncore_pmu_to_box(pmu, event->cpu);
2322 if (!box || box->cpu < 0)
2323 return -EINVAL;
2324 event->cpu = box->cpu;
2325
2326 event->hw.idx = -1;
2327 event->hw.last_tag = ~0ULL;
2328 event->hw.extra_reg.idx = EXTRA_REG_NONE;
2329 event->hw.branch_reg.idx = EXTRA_REG_NONE;
2330
2331 if (event->attr.config == UNCORE_FIXED_EVENT) {
2332 /* no fixed counter */
2333 if (!pmu->type->fixed_ctl)
2334 return -EINVAL;
2335 /*
2336 * if there is only one fixed counter, only the first pmu
2337 * can access the fixed counter
2338 */
2339 if (pmu->type->single_fixed && pmu->pmu_idx > 0)
2340 return -EINVAL;
2341 hwc->config = ~0ULL;
2342 } else {
2343 hwc->config = event->attr.config & pmu->type->event_mask;
2344 if (pmu->type->ops->hw_config) {
2345 ret = pmu->type->ops->hw_config(box, event);
2346 if (ret)
2347 return ret;
2348 }
2349 }
2350
2351 if (event->group_leader != event)
2352 ret = uncore_validate_group(pmu, event);
2353 else
2354 ret = 0;
2355
2356 return ret;
2357}
2358
2359static ssize_t uncore_get_attr_cpumask(struct device *dev,
2360 struct device_attribute *attr, char *buf)
2361{
2362 int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &uncore_cpu_mask);
2363
2364 buf[n++] = '\n';
2365 buf[n] = '\0';
2366 return n;
2367}
2368
2369static DEVICE_ATTR(cpumask, S_IRUGO, uncore_get_attr_cpumask, NULL);
2370
2371static struct attribute *uncore_pmu_attrs[] = {
2372 &dev_attr_cpumask.attr,
2373 NULL,
2374};
2375
2376static struct attribute_group uncore_pmu_attr_group = {
2377 .attrs = uncore_pmu_attrs,
2378};
2379
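/*
 * Register one struct pmu per box index.  A single-box type shows up in
 * sysfs as "uncore" (empty type name) or "uncore_<type>"; multi-box
 * types register as "uncore_<type>_<idx>".  A cbox event could then be
 * requested, purely as an illustration, with something like
 * 'perf stat -a -e uncore_cbox_0/event=0x80/'.
 */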
2380static int __init uncore_pmu_register(struct intel_uncore_pmu *pmu)
2381{
2382 int ret;
2383
2384 pmu->pmu = (struct pmu) {
2385 .attr_groups = pmu->type->attr_groups,
2386 .task_ctx_nr = perf_invalid_context,
2387 .event_init = uncore_pmu_event_init,
2388 .add = uncore_pmu_event_add,
2389 .del = uncore_pmu_event_del,
2390 .start = uncore_pmu_event_start,
2391 .stop = uncore_pmu_event_stop,
2392 .read = uncore_pmu_event_read,
2393 };
2394
2395 if (pmu->type->num_boxes == 1) {
2396 if (strlen(pmu->type->name) > 0)
2397 sprintf(pmu->name, "uncore_%s", pmu->type->name);
2398 else
2399 sprintf(pmu->name, "uncore");
2400 } else {
2401 sprintf(pmu->name, "uncore_%s_%d", pmu->type->name,
2402 pmu->pmu_idx);
2403 }
2404
2405 ret = perf_pmu_register(&pmu->pmu, pmu->name, -1);
2406 return ret;
2407}
2408
2409static void __init uncore_type_exit(struct intel_uncore_type *type)
2410{
2411 int i;
2412
2413 for (i = 0; i < type->num_boxes; i++)
2414 free_percpu(type->pmus[i].box);
2415 kfree(type->pmus);
2416 type->pmus = NULL;
2417 kfree(type->events_group);
2418 type->events_group = NULL;
2419}
2420
2421static void __init uncore_types_exit(struct intel_uncore_type **types)
2422{
2423 int i;
2424 for (i = 0; types[i]; i++)
2425 uncore_type_exit(types[i]);
2426}
2427
2428static int __init uncore_type_init(struct intel_uncore_type *type)
2429{
2430 struct intel_uncore_pmu *pmus;
2431 struct attribute_group *events_group;
2432 struct attribute **attrs;
2433 int i, j;
2434
2435 pmus = kzalloc(sizeof(*pmus) * type->num_boxes, GFP_KERNEL);
2436 if (!pmus)
2437 return -ENOMEM;
2438
2439 type->unconstrainted = (struct event_constraint)
2440 __EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1,
2441 0, type->num_counters, 0);
2442
2443 for (i = 0; i < type->num_boxes; i++) {
2444 pmus[i].func_id = -1;
2445 pmus[i].pmu_idx = i;
2446 pmus[i].type = type;
2447 INIT_LIST_HEAD(&pmus[i].box_list);
2448 pmus[i].box = alloc_percpu(struct intel_uncore_box *);
2449 if (!pmus[i].box)
2450 goto fail;
2451 }
2452
2453 if (type->event_descs) {
2454 i = 0;
2455 while (type->event_descs[i].attr.attr.name)
2456 i++;
2457
2458 events_group = kzalloc(sizeof(struct attribute *) * (i + 1) +
2459 sizeof(*events_group), GFP_KERNEL);
2460 if (!events_group)
2461 goto fail;
2462
2463 attrs = (struct attribute **)(events_group + 1);
2464 events_group->name = "events";
2465 events_group->attrs = attrs;
2466
2467 for (j = 0; j < i; j++)
2468 attrs[j] = &type->event_descs[j].attr.attr;
2469
2470 type->events_group = events_group;
2471 }
2472
2473 type->pmu_group = &uncore_pmu_attr_group;
2474 type->pmus = pmus;
2475 return 0;
2476fail:
2477 uncore_type_exit(type);
2478 return -ENOMEM;
2479}
2480
2481static int __init uncore_types_init(struct intel_uncore_type **types)
2482{
2483 int i, ret;
2484
2485 for (i = 0; types[i]; i++) {
2486 ret = uncore_type_init(types[i]);
2487 if (ret)
2488 goto fail;
2489 }
2490 return 0;
2491fail:
2492 while (--i >= 0)
2493 uncore_type_exit(types[i]);
2494 return ret;
2495}
2496
2497static struct pci_driver *uncore_pci_driver;
2498static bool pcidrv_registered;
2499
2500/*
2501 * add a pci uncore device
2502 */
2503static int uncore_pci_add(struct intel_uncore_type *type, struct pci_dev *pdev)
2504{
2505 struct intel_uncore_pmu *pmu;
2506 struct intel_uncore_box *box;
2507 int i, phys_id;
2508
2509 phys_id = pcibus_to_physid[pdev->bus->number];
2510 if (phys_id < 0)
2511 return -ENODEV;
2512
2513 box = uncore_alloc_box(type, 0);
2514 if (!box)
2515 return -ENOMEM;
2516
2517 /*
2518	 * for a performance monitoring unit with multiple boxes,
2519 * each box has a different function id.
2520 */
2521 for (i = 0; i < type->num_boxes; i++) {
2522 pmu = &type->pmus[i];
2523 if (pmu->func_id == pdev->devfn)
2524 break;
2525 if (pmu->func_id < 0) {
2526 pmu->func_id = pdev->devfn;
2527 break;
2528 }
2529 pmu = NULL;
2530 }
2531
2532 if (!pmu) {
2533 kfree(box);
2534 return -EINVAL;
2535 }
2536
2537 box->phys_id = phys_id;
2538 box->pci_dev = pdev;
2539 box->pmu = pmu;
2540 uncore_box_init(box);
2541 pci_set_drvdata(pdev, box);
2542
2543 raw_spin_lock(&uncore_box_lock);
2544 list_add_tail(&box->list, &pmu->box_list);
2545 raw_spin_unlock(&uncore_box_lock);
2546
2547 return 0;
2548}
2549
2550static void uncore_pci_remove(struct pci_dev *pdev)
2551{
2552 struct intel_uncore_box *box = pci_get_drvdata(pdev);
2553 struct intel_uncore_pmu *pmu = box->pmu;
2554 int cpu, phys_id = pcibus_to_physid[pdev->bus->number];
2555
2556 if (WARN_ON_ONCE(phys_id != box->phys_id))
2557 return;
2558
2559 raw_spin_lock(&uncore_box_lock);
2560 list_del(&box->list);
2561 raw_spin_unlock(&uncore_box_lock);
2562
2563 for_each_possible_cpu(cpu) {
2564 if (*per_cpu_ptr(pmu->box, cpu) == box) {
2565 *per_cpu_ptr(pmu->box, cpu) = NULL;
2566 atomic_dec(&box->refcnt);
2567 }
2568 }
2569
2570 WARN_ON_ONCE(atomic_read(&box->refcnt) != 1);
2571 kfree(box);
2572}
2573
2574static int uncore_pci_probe(struct pci_dev *pdev,
2575 const struct pci_device_id *id)
2576{
2577 struct intel_uncore_type *type;
2578
2579 type = (struct intel_uncore_type *)id->driver_data;
2580
2581 return uncore_pci_add(type, pdev);
2582}
2583
2584static int __init uncore_pci_init(void)
2585{
2586 int ret;
2587
2588 switch (boot_cpu_data.x86_model) {
2589 case 45: /* Sandy Bridge-EP */
2590 ret = snbep_pci2phy_map_init();
2591 if (ret)
2592 return ret;
2593 pci_uncores = snbep_pci_uncores;
2594 uncore_pci_driver = &snbep_uncore_pci_driver;
2595 break;
2596 default:
2597 return 0;
2598 }
2599
2600 ret = uncore_types_init(pci_uncores);
2601 if (ret)
2602 return ret;
2603
2604 uncore_pci_driver->probe = uncore_pci_probe;
2605 uncore_pci_driver->remove = uncore_pci_remove;
2606
2607 ret = pci_register_driver(uncore_pci_driver);
2608 if (ret == 0)
2609 pcidrv_registered = true;
2610 else
2611 uncore_types_exit(pci_uncores);
2612
2613 return ret;
2614}
2615
2616static void __init uncore_pci_exit(void)
2617{
2618 if (pcidrv_registered) {
2619 pcidrv_registered = false;
2620 pci_unregister_driver(uncore_pci_driver);
2621 uncore_types_exit(pci_uncores);
2622 }
2623}
2624
2625static void __cpuinit uncore_cpu_dying(int cpu)
2626{
2627 struct intel_uncore_type *type;
2628 struct intel_uncore_pmu *pmu;
2629 struct intel_uncore_box *box;
2630 int i, j;
2631
2632 for (i = 0; msr_uncores[i]; i++) {
2633 type = msr_uncores[i];
2634 for (j = 0; j < type->num_boxes; j++) {
2635 pmu = &type->pmus[j];
2636 box = *per_cpu_ptr(pmu->box, cpu);
2637 *per_cpu_ptr(pmu->box, cpu) = NULL;
2638 if (box && atomic_dec_and_test(&box->refcnt))
2639 kfree(box);
2640 }
2641 }
2642}
2643
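/*
 * CPU_STARTING: initialize the box allocated in the prepare phase, or,
 * if another online cpu in the same physical package already owns one,
 * share that box and free the freshly allocated copy.
 */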
2644static int __cpuinit uncore_cpu_starting(int cpu)
2645{
2646 struct intel_uncore_type *type;
2647 struct intel_uncore_pmu *pmu;
2648 struct intel_uncore_box *box, *exist;
2649 int i, j, k, phys_id;
2650
2651 phys_id = topology_physical_package_id(cpu);
2652
2653 for (i = 0; msr_uncores[i]; i++) {
2654 type = msr_uncores[i];
2655 for (j = 0; j < type->num_boxes; j++) {
2656 pmu = &type->pmus[j];
2657 box = *per_cpu_ptr(pmu->box, cpu);
2658 /* called by uncore_cpu_init? */
2659 if (box && box->phys_id >= 0) {
2660 uncore_box_init(box);
2661 continue;
2662 }
2663
2664 for_each_online_cpu(k) {
2665 exist = *per_cpu_ptr(pmu->box, k);
2666 if (exist && exist->phys_id == phys_id) {
2667 atomic_inc(&exist->refcnt);
2668 *per_cpu_ptr(pmu->box, cpu) = exist;
2669 kfree(box);
2670 box = NULL;
2671 break;
2672 }
2673 }
2674
2675 if (box) {
2676 box->phys_id = phys_id;
2677 uncore_box_init(box);
2678 }
2679 }
2680 }
2681 return 0;
2682}
2683
2684static int __cpuinit uncore_cpu_prepare(int cpu, int phys_id)
2685{
2686 struct intel_uncore_type *type;
2687 struct intel_uncore_pmu *pmu;
2688 struct intel_uncore_box *box;
2689 int i, j;
2690
2691 for (i = 0; msr_uncores[i]; i++) {
2692 type = msr_uncores[i];
2693 for (j = 0; j < type->num_boxes; j++) {
2694 pmu = &type->pmus[j];
2695 if (pmu->func_id < 0)
2696 pmu->func_id = j;
2697
2698 box = uncore_alloc_box(type, cpu);
2699 if (!box)
2700 return -ENOMEM;
2701
2702 box->pmu = pmu;
2703 box->phys_id = phys_id;
2704 *per_cpu_ptr(pmu->box, cpu) = box;
2705 }
2706 }
2707 return 0;
2708}
2709
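/*
 * Hand all boxes of the given types over from old_cpu to new_cpu and
 * migrate the perf context so running events keep counting; old_cpu < 0
 * means the boxes were not owned by any cpu yet.
 */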
2710static void __cpuinit
2711uncore_change_context(struct intel_uncore_type **uncores, int old_cpu, int new_cpu)
2712{
2713 struct intel_uncore_type *type;
2714 struct intel_uncore_pmu *pmu;
2715 struct intel_uncore_box *box;
2716 int i, j;
2717
2718 for (i = 0; uncores[i]; i++) {
2719 type = uncores[i];
2720 for (j = 0; j < type->num_boxes; j++) {
2721 pmu = &type->pmus[j];
2722 if (old_cpu < 0)
2723 box = uncore_pmu_to_box(pmu, new_cpu);
2724 else
2725 box = uncore_pmu_to_box(pmu, old_cpu);
2726 if (!box)
2727 continue;
2728
2729 if (old_cpu < 0) {
2730 WARN_ON_ONCE(box->cpu != -1);
2731 box->cpu = new_cpu;
2732 continue;
2733 }
2734
2735 WARN_ON_ONCE(box->cpu != old_cpu);
2736 if (new_cpu >= 0) {
2737 uncore_pmu_cancel_hrtimer(box);
2738 perf_pmu_migrate_context(&pmu->pmu,
2739 old_cpu, new_cpu);
2740 box->cpu = new_cpu;
2741 } else {
2742 box->cpu = -1;
2743 }
2744 }
2745 }
2746}
2747
2748static void __cpuinit uncore_event_exit_cpu(int cpu)
2749{
2750 int i, phys_id, target;
2751
2752 /* if exiting cpu is used for collecting uncore events */
2753 if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask))
2754 return;
2755
2756 /* find a new cpu to collect uncore events */
2757 phys_id = topology_physical_package_id(cpu);
2758 target = -1;
2759 for_each_online_cpu(i) {
2760 if (i == cpu)
2761 continue;
2762 if (phys_id == topology_physical_package_id(i)) {
2763 target = i;
2764 break;
2765 }
2766 }
2767
2768 /* migrate uncore events to the new cpu */
2769 if (target >= 0)
2770 cpumask_set_cpu(target, &uncore_cpu_mask);
2771
2772 uncore_change_context(msr_uncores, cpu, target);
2773 uncore_change_context(pci_uncores, cpu, target);
2774}
2775
2776static void __cpuinit uncore_event_init_cpu(int cpu)
2777{
2778 int i, phys_id;
2779
2780 phys_id = topology_physical_package_id(cpu);
2781 for_each_cpu(i, &uncore_cpu_mask) {
2782 if (phys_id == topology_physical_package_id(i))
2783 return;
2784 }
2785
2786 cpumask_set_cpu(cpu, &uncore_cpu_mask);
2787
2788 uncore_change_context(msr_uncores, -1, cpu);
2789 uncore_change_context(pci_uncores, -1, cpu);
2790}
2791
2792static int
2793 __cpuinit uncore_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
2794{
2795 unsigned int cpu = (long)hcpu;
2796
2797 /* allocate/free data structure for uncore box */
2798 switch (action & ~CPU_TASKS_FROZEN) {
2799 case CPU_UP_PREPARE:
2800 uncore_cpu_prepare(cpu, -1);
2801 break;
2802 case CPU_STARTING:
2803 uncore_cpu_starting(cpu);
2804 break;
2805 case CPU_UP_CANCELED:
2806 case CPU_DYING:
2807 uncore_cpu_dying(cpu);
2808 break;
2809 default:
2810 break;
2811 }
2812
2813 /* select the cpu that collects uncore events */
2814 switch (action & ~CPU_TASKS_FROZEN) {
2815 case CPU_DOWN_FAILED:
2816 case CPU_STARTING:
2817 uncore_event_init_cpu(cpu);
2818 break;
2819 case CPU_DOWN_PREPARE:
2820 uncore_event_exit_cpu(cpu);
2821 break;
2822 default:
2823 break;
2824 }
2825
2826 return NOTIFY_OK;
2827}
2828
2829static struct notifier_block uncore_cpu_nb __cpuinitdata = {
2830 .notifier_call = uncore_cpu_notifier,
2831 /*
2832 * to migrate uncore events, our notifier should be executed
2833 * before perf core's notifier.
2834 */
2835 .priority = CPU_PRI_PERF + 1,
2836};
2837
2838static void __init uncore_cpu_setup(void *dummy)
2839{
2840 uncore_cpu_starting(smp_processor_id());
2841}
2842
2843static int __init uncore_cpu_init(void)
2844{
2845 int ret, cpu, max_cores;
2846
2847 max_cores = boot_cpu_data.x86_max_cores;
2848 switch (boot_cpu_data.x86_model) {
2849 case 26: /* Nehalem */
2850 case 30:
2851 case 37: /* Westmere */
2852 case 44:
2853 msr_uncores = nhm_msr_uncores;
2854 break;
2855 case 42: /* Sandy Bridge */
2856 if (snb_uncore_cbox.num_boxes > max_cores)
2857 snb_uncore_cbox.num_boxes = max_cores;
2858 msr_uncores = snb_msr_uncores;
2859 break;
2860	case 45: /* Sandy Bridge-EP */
2861 if (snbep_uncore_cbox.num_boxes > max_cores)
2862 snbep_uncore_cbox.num_boxes = max_cores;
2863 msr_uncores = snbep_msr_uncores;
2864 break;
2865 case 46: /* Nehalem-EX */
2866 uncore_nhmex = true;
2867 case 47: /* Westmere-EX aka. Xeon E7 */
2868 if (!uncore_nhmex)
2869 nhmex_uncore_mbox.event_descs = wsmex_uncore_mbox_events;
2870 if (nhmex_uncore_cbox.num_boxes > max_cores)
2871 nhmex_uncore_cbox.num_boxes = max_cores;
2872 msr_uncores = nhmex_msr_uncores;
2873 break;
2874 default:
2875 return 0;
2876 }
2877
2878 ret = uncore_types_init(msr_uncores);
2879 if (ret)
2880 return ret;
2881
2882 get_online_cpus();
2883
2884 for_each_online_cpu(cpu) {
2885 int i, phys_id = topology_physical_package_id(cpu);
2886
2887 for_each_cpu(i, &uncore_cpu_mask) {
2888 if (phys_id == topology_physical_package_id(i)) {
2889 phys_id = -1;
2890 break;
2891 }
2892 }
2893 if (phys_id < 0)
2894 continue;
2895
2896 uncore_cpu_prepare(cpu, phys_id);
2897 uncore_event_init_cpu(cpu);
2898 }
2899 on_each_cpu(uncore_cpu_setup, NULL, 1);
2900
2901 register_cpu_notifier(&uncore_cpu_nb);
2902
2903 put_online_cpus();
2904
2905 return 0;
2906}
2907
2908static int __init uncore_pmus_register(void)
2909{
2910 struct intel_uncore_pmu *pmu;
2911 struct intel_uncore_type *type;
2912 int i, j;
2913
2914 for (i = 0; msr_uncores[i]; i++) {
2915 type = msr_uncores[i];
2916 for (j = 0; j < type->num_boxes; j++) {
2917 pmu = &type->pmus[j];
2918 uncore_pmu_register(pmu);
2919 }
2920 }
2921
2922 for (i = 0; pci_uncores[i]; i++) {
2923 type = pci_uncores[i];
2924 for (j = 0; j < type->num_boxes; j++) {
2925 pmu = &type->pmus[j];
2926 uncore_pmu_register(pmu);
2927 }
2928 }
2929
2930 return 0;
2931}
2932
2933static int __init intel_uncore_init(void)
2934{
2935 int ret;
2936
2937 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
2938 return -ENODEV;
2939
2940 if (cpu_has_hypervisor)
2941 return -ENODEV;
2942
2943 ret = uncore_pci_init();
2944 if (ret)
2945 goto fail;
2946 ret = uncore_cpu_init();
2947 if (ret) {
2948 uncore_pci_exit();
2949 goto fail;
2950 }
2951
2952 uncore_pmus_register();
2953 return 0;
2954fail:
2955 return ret;
2956}
2957device_initcall(intel_uncore_init);
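
The hotplug callbacks above keep exactly one CPU per physical package in uncore_cpu_mask and hand that role to a sibling when the collector CPU goes offline. As a hedged illustration of that selection rule only, here is a small user-space C sketch; NR_CPUS, pkg_of(), the arrays and main() are invented stand-ins for the kernel's cpumask and topology helpers, not kernel code.

/*
 * Stand-alone sketch of the collector-CPU bookkeeping done by
 * uncore_event_init_cpu()/uncore_event_exit_cpu() above. All names,
 * sizes and the 4-cores-per-package topology are illustrative assumptions.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 8

static bool online[NR_CPUS] = { true, true, true, true, true, true, true, true };
static bool collector[NR_CPUS];			/* mirrors uncore_cpu_mask */

static int pkg_of(int cpu)			/* assumed topology helper */
{
	return cpu / 4;
}

static void event_init_cpu(int cpu)
{
	for (int i = 0; i < NR_CPUS; i++)
		if (collector[i] && pkg_of(i) == pkg_of(cpu))
			return;			/* package already covered */
	collector[cpu] = true;
}

static void event_exit_cpu(int cpu)
{
	if (!collector[cpu])
		return;
	collector[cpu] = false;
	for (int i = 0; i < NR_CPUS; i++)	/* find an online sibling */
		if (i != cpu && online[i] && pkg_of(i) == pkg_of(cpu)) {
			collector[i] = true;	/* events would migrate here */
			break;
		}
}

int main(void)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		event_init_cpu(cpu);
	online[0] = false;
	event_exit_cpu(0);			/* cpu 0 goes offline */
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		if (collector[cpu])
			printf("package %d collected by cpu %d\n", pkg_of(cpu), cpu);
	return 0;
}

In the kernel, handing the role to a sibling is additionally paired with perf_pmu_migrate_context(), as shown in uncore_change_context() above; this sketch only tracks which CPU owns each package.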
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
deleted file mode 100644
index e68a4550e95..00000000000
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ /dev/null
@@ -1,621 +0,0 @@
1#include <linux/module.h>
2#include <linux/slab.h>
3#include <linux/pci.h>
4#include <linux/perf_event.h>
5#include "perf_event.h"
6
7#define UNCORE_PMU_NAME_LEN 32
8#define UNCORE_PMU_HRTIMER_INTERVAL (60LL * NSEC_PER_SEC)
9
10#define UNCORE_FIXED_EVENT 0xff
11#define UNCORE_PMC_IDX_MAX_GENERIC 8
12#define UNCORE_PMC_IDX_FIXED UNCORE_PMC_IDX_MAX_GENERIC
13#define UNCORE_PMC_IDX_MAX (UNCORE_PMC_IDX_FIXED + 1)
14
15#define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff)
16
17/* SNB event control */
18#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff
19#define SNB_UNC_CTL_UMASK_MASK 0x0000ff00
20#define SNB_UNC_CTL_EDGE_DET (1 << 18)
21#define SNB_UNC_CTL_EN (1 << 22)
22#define SNB_UNC_CTL_INVERT (1 << 23)
23#define SNB_UNC_CTL_CMASK_MASK 0x1f000000
24#define NHM_UNC_CTL_CMASK_MASK 0xff000000
25#define NHM_UNC_FIXED_CTR_CTL_EN (1 << 0)
26
27#define SNB_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
28 SNB_UNC_CTL_UMASK_MASK | \
29 SNB_UNC_CTL_EDGE_DET | \
30 SNB_UNC_CTL_INVERT | \
31 SNB_UNC_CTL_CMASK_MASK)
32
33#define NHM_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
34 SNB_UNC_CTL_UMASK_MASK | \
35 SNB_UNC_CTL_EDGE_DET | \
36 SNB_UNC_CTL_INVERT | \
37 NHM_UNC_CTL_CMASK_MASK)
38
39/* SNB global control register */
40#define SNB_UNC_PERF_GLOBAL_CTL 0x391
41#define SNB_UNC_FIXED_CTR_CTRL 0x394
42#define SNB_UNC_FIXED_CTR 0x395
43
44/* SNB uncore global control */
45#define SNB_UNC_GLOBAL_CTL_CORE_ALL ((1 << 4) - 1)
46#define SNB_UNC_GLOBAL_CTL_EN (1 << 29)
47
48/* SNB Cbo register */
49#define SNB_UNC_CBO_0_PERFEVTSEL0 0x700
50#define SNB_UNC_CBO_0_PER_CTR0 0x706
51#define SNB_UNC_CBO_MSR_OFFSET 0x10
52
53/* NHM global control register */
54#define NHM_UNC_PERF_GLOBAL_CTL 0x391
55#define NHM_UNC_FIXED_CTR 0x394
56#define NHM_UNC_FIXED_CTR_CTRL 0x395
57
58/* NHM uncore global control */
59#define NHM_UNC_GLOBAL_CTL_EN_PC_ALL ((1ULL << 8) - 1)
60#define NHM_UNC_GLOBAL_CTL_EN_FC (1ULL << 32)
61
62/* NHM uncore register */
63#define NHM_UNC_PERFEVTSEL0 0x3c0
64#define NHM_UNC_UNCORE_PMC0 0x3b0
65
66/* SNB-EP Box level control */
67#define SNBEP_PMON_BOX_CTL_RST_CTRL (1 << 0)
68#define SNBEP_PMON_BOX_CTL_RST_CTRS (1 << 1)
69#define SNBEP_PMON_BOX_CTL_FRZ (1 << 8)
70#define SNBEP_PMON_BOX_CTL_FRZ_EN (1 << 16)
71#define SNBEP_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \
72 SNBEP_PMON_BOX_CTL_RST_CTRS | \
73 SNBEP_PMON_BOX_CTL_FRZ_EN)
74/* SNB-EP event control */
75#define SNBEP_PMON_CTL_EV_SEL_MASK 0x000000ff
76#define SNBEP_PMON_CTL_UMASK_MASK 0x0000ff00
77#define SNBEP_PMON_CTL_RST (1 << 17)
78#define SNBEP_PMON_CTL_EDGE_DET (1 << 18)
79#define SNBEP_PMON_CTL_EV_SEL_EXT (1 << 21) /* only for QPI */
80#define SNBEP_PMON_CTL_EN (1 << 22)
81#define SNBEP_PMON_CTL_INVERT (1 << 23)
82#define SNBEP_PMON_CTL_TRESH_MASK 0xff000000
83#define SNBEP_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \
84 SNBEP_PMON_CTL_UMASK_MASK | \
85 SNBEP_PMON_CTL_EDGE_DET | \
86 SNBEP_PMON_CTL_INVERT | \
87 SNBEP_PMON_CTL_TRESH_MASK)
88
89/* SNB-EP Ubox event control */
90#define SNBEP_U_MSR_PMON_CTL_TRESH_MASK 0x1f000000
91#define SNBEP_U_MSR_PMON_RAW_EVENT_MASK \
92 (SNBEP_PMON_CTL_EV_SEL_MASK | \
93 SNBEP_PMON_CTL_UMASK_MASK | \
94 SNBEP_PMON_CTL_EDGE_DET | \
95 SNBEP_PMON_CTL_INVERT | \
96 SNBEP_U_MSR_PMON_CTL_TRESH_MASK)
97
98#define SNBEP_CBO_PMON_CTL_TID_EN (1 << 19)
99#define SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \
100 SNBEP_CBO_PMON_CTL_TID_EN)
101
102/* SNB-EP PCU event control */
103#define SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK 0x0000c000
104#define SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK 0x1f000000
105#define SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT (1 << 30)
106#define SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET (1 << 31)
107#define SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK \
108 (SNBEP_PMON_CTL_EV_SEL_MASK | \
109 SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \
110 SNBEP_PMON_CTL_EDGE_DET | \
111 SNBEP_PMON_CTL_INVERT | \
112 SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \
113 SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \
114 SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET)
115
116#define SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK \
117 (SNBEP_PMON_RAW_EVENT_MASK | \
118 SNBEP_PMON_CTL_EV_SEL_EXT)
119
120/* SNB-EP pci control register */
121#define SNBEP_PCI_PMON_BOX_CTL 0xf4
122#define SNBEP_PCI_PMON_CTL0 0xd8
123/* SNB-EP pci counter register */
124#define SNBEP_PCI_PMON_CTR0 0xa0
125
126/* SNB-EP home agent register */
127#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH0 0x40
128#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH1 0x44
129#define SNBEP_HA_PCI_PMON_BOX_OPCODEMATCH 0x48
130/* SNB-EP memory controller register */
131#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTL 0xf0
132#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTR 0xd0
133/* SNB-EP QPI register */
134#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH0 0x228
135#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH1 0x22c
136#define SNBEP_Q_Py_PCI_PMON_PKT_MASK0 0x238
137#define SNBEP_Q_Py_PCI_PMON_PKT_MASK1 0x23c
138
139/* SNB-EP Ubox register */
140#define SNBEP_U_MSR_PMON_CTR0 0xc16
141#define SNBEP_U_MSR_PMON_CTL0 0xc10
142
143#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTL 0xc08
144#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTR 0xc09
145
146/* SNB-EP Cbo register */
147#define SNBEP_C0_MSR_PMON_CTR0 0xd16
148#define SNBEP_C0_MSR_PMON_CTL0 0xd10
149#define SNBEP_C0_MSR_PMON_BOX_CTL 0xd04
150#define SNBEP_C0_MSR_PMON_BOX_FILTER 0xd14
151#define SNBEP_CB0_MSR_PMON_BOX_FILTER_MASK 0xfffffc1f
152#define SNBEP_CBO_MSR_OFFSET 0x20
153
154/* SNB-EP PCU register */
155#define SNBEP_PCU_MSR_PMON_CTR0 0xc36
156#define SNBEP_PCU_MSR_PMON_CTL0 0xc30
157#define SNBEP_PCU_MSR_PMON_BOX_CTL 0xc24
158#define SNBEP_PCU_MSR_PMON_BOX_FILTER 0xc34
159#define SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK 0xffffffff
160#define SNBEP_PCU_MSR_CORE_C3_CTR 0x3fc
161#define SNBEP_PCU_MSR_CORE_C6_CTR 0x3fd
162
163/* NHM-EX event control */
164#define NHMEX_PMON_CTL_EV_SEL_MASK 0x000000ff
165#define NHMEX_PMON_CTL_UMASK_MASK 0x0000ff00
166#define NHMEX_PMON_CTL_EN_BIT0 (1 << 0)
167#define NHMEX_PMON_CTL_EDGE_DET (1 << 18)
168#define NHMEX_PMON_CTL_PMI_EN (1 << 20)
169#define NHMEX_PMON_CTL_EN_BIT22 (1 << 22)
170#define NHMEX_PMON_CTL_INVERT (1 << 23)
171#define NHMEX_PMON_CTL_TRESH_MASK 0xff000000
172#define NHMEX_PMON_RAW_EVENT_MASK (NHMEX_PMON_CTL_EV_SEL_MASK | \
173 NHMEX_PMON_CTL_UMASK_MASK | \
174 NHMEX_PMON_CTL_EDGE_DET | \
175 NHMEX_PMON_CTL_INVERT | \
176 NHMEX_PMON_CTL_TRESH_MASK)
177
178/* NHM-EX Ubox */
179#define NHMEX_U_MSR_PMON_GLOBAL_CTL 0xc00
180#define NHMEX_U_MSR_PMON_CTR 0xc11
181#define NHMEX_U_MSR_PMON_EV_SEL 0xc10
182
183#define NHMEX_U_PMON_GLOBAL_EN (1 << 0)
184#define NHMEX_U_PMON_GLOBAL_PMI_CORE_SEL 0x0000001e
185#define NHMEX_U_PMON_GLOBAL_EN_ALL (1 << 28)
186#define NHMEX_U_PMON_GLOBAL_RST_ALL (1 << 29)
187#define NHMEX_U_PMON_GLOBAL_FRZ_ALL (1 << 31)
188
189#define NHMEX_U_PMON_RAW_EVENT_MASK \
190 (NHMEX_PMON_CTL_EV_SEL_MASK | \
191 NHMEX_PMON_CTL_EDGE_DET)
192
193/* NHM-EX Cbox */
194#define NHMEX_C0_MSR_PMON_GLOBAL_CTL 0xd00
195#define NHMEX_C0_MSR_PMON_CTR0 0xd11
196#define NHMEX_C0_MSR_PMON_EV_SEL0 0xd10
197#define NHMEX_C_MSR_OFFSET 0x20
198
199/* NHM-EX Bbox */
200#define NHMEX_B0_MSR_PMON_GLOBAL_CTL 0xc20
201#define NHMEX_B0_MSR_PMON_CTR0 0xc31
202#define NHMEX_B0_MSR_PMON_CTL0 0xc30
203#define NHMEX_B_MSR_OFFSET 0x40
204#define NHMEX_B0_MSR_MATCH 0xe45
205#define NHMEX_B0_MSR_MASK 0xe46
206#define NHMEX_B1_MSR_MATCH 0xe4d
207#define NHMEX_B1_MSR_MASK 0xe4e
208
209#define NHMEX_B_PMON_CTL_EN (1 << 0)
210#define NHMEX_B_PMON_CTL_EV_SEL_SHIFT 1
211#define NHMEX_B_PMON_CTL_EV_SEL_MASK \
212 (0x1f << NHMEX_B_PMON_CTL_EV_SEL_SHIFT)
213#define NHMEX_B_PMON_CTR_SHIFT 6
214#define NHMEX_B_PMON_CTR_MASK \
215 (0x3 << NHMEX_B_PMON_CTR_SHIFT)
216#define NHMEX_B_PMON_RAW_EVENT_MASK \
217 (NHMEX_B_PMON_CTL_EV_SEL_MASK | \
218 NHMEX_B_PMON_CTR_MASK)
219
220/* NHM-EX Sbox */
221#define NHMEX_S0_MSR_PMON_GLOBAL_CTL 0xc40
222#define NHMEX_S0_MSR_PMON_CTR0 0xc51
223#define NHMEX_S0_MSR_PMON_CTL0 0xc50
224#define NHMEX_S_MSR_OFFSET 0x80
225#define NHMEX_S0_MSR_MM_CFG 0xe48
226#define NHMEX_S0_MSR_MATCH 0xe49
227#define NHMEX_S0_MSR_MASK 0xe4a
228#define NHMEX_S1_MSR_MM_CFG 0xe58
229#define NHMEX_S1_MSR_MATCH 0xe59
230#define NHMEX_S1_MSR_MASK 0xe5a
231
232#define NHMEX_S_PMON_MM_CFG_EN (0x1ULL << 63)
233#define NHMEX_S_EVENT_TO_R_PROG_EV 0
234
235/* NHM-EX Mbox */
236#define NHMEX_M0_MSR_GLOBAL_CTL 0xca0
237#define NHMEX_M0_MSR_PMU_DSP 0xca5
238#define NHMEX_M0_MSR_PMU_ISS 0xca6
239#define NHMEX_M0_MSR_PMU_MAP 0xca7
240#define NHMEX_M0_MSR_PMU_MSC_THR 0xca8
241#define NHMEX_M0_MSR_PMU_PGT 0xca9
242#define NHMEX_M0_MSR_PMU_PLD 0xcaa
243#define NHMEX_M0_MSR_PMU_ZDP_CTL_FVC 0xcab
244#define NHMEX_M0_MSR_PMU_CTL0 0xcb0
245#define NHMEX_M0_MSR_PMU_CNT0 0xcb1
246#define NHMEX_M_MSR_OFFSET 0x40
247#define NHMEX_M0_MSR_PMU_MM_CFG 0xe54
248#define NHMEX_M1_MSR_PMU_MM_CFG 0xe5c
249
250#define NHMEX_M_PMON_MM_CFG_EN (1ULL << 63)
251#define NHMEX_M_PMON_ADDR_MATCH_MASK 0x3ffffffffULL
252#define NHMEX_M_PMON_ADDR_MASK_MASK 0x7ffffffULL
253#define NHMEX_M_PMON_ADDR_MASK_SHIFT 34
254
255#define NHMEX_M_PMON_CTL_EN (1 << 0)
256#define NHMEX_M_PMON_CTL_PMI_EN (1 << 1)
257#define NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT 2
258#define NHMEX_M_PMON_CTL_COUNT_MODE_MASK \
259 (0x3 << NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT)
260#define NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT 4
261#define NHMEX_M_PMON_CTL_STORAGE_MODE_MASK \
262 (0x3 << NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT)
263#define NHMEX_M_PMON_CTL_WRAP_MODE (1 << 6)
264#define NHMEX_M_PMON_CTL_FLAG_MODE (1 << 7)
265#define NHMEX_M_PMON_CTL_INC_SEL_SHIFT 9
266#define NHMEX_M_PMON_CTL_INC_SEL_MASK \
267 (0x1f << NHMEX_M_PMON_CTL_INC_SEL_SHIFT)
268#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT 19
269#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK \
270 (0x7 << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT)
271#define NHMEX_M_PMON_RAW_EVENT_MASK \
272 (NHMEX_M_PMON_CTL_COUNT_MODE_MASK | \
273 NHMEX_M_PMON_CTL_STORAGE_MODE_MASK | \
274 NHMEX_M_PMON_CTL_WRAP_MODE | \
275 NHMEX_M_PMON_CTL_FLAG_MODE | \
276 NHMEX_M_PMON_CTL_INC_SEL_MASK | \
277 NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK)
278
279#define NHMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 11) - 1) | (1 << 23))
280#define NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7 << (11 + 3 * (n)))
281
282#define WSMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 12) - 1) | (1 << 24))
283#define WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7 << (12 + 3 * (n)))
284
285/*
286 * If the 7th bit is not set, use bits 9~13 to select the event;
287 * otherwise, use bits 19~21 to select the event.
288 */
289#define MBOX_INC_SEL(x) ((x) << NHMEX_M_PMON_CTL_INC_SEL_SHIFT)
290#define MBOX_SET_FLAG_SEL(x) (((x) << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) | \
291 NHMEX_M_PMON_CTL_FLAG_MODE)
292#define MBOX_INC_SEL_MASK (NHMEX_M_PMON_CTL_INC_SEL_MASK | \
293 NHMEX_M_PMON_CTL_FLAG_MODE)
294#define MBOX_SET_FLAG_SEL_MASK (NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK | \
295 NHMEX_M_PMON_CTL_FLAG_MODE)
296#define MBOX_INC_SEL_EXTAR_REG(c, r) \
297 EVENT_EXTRA_REG(MBOX_INC_SEL(c), NHMEX_M0_MSR_PMU_##r, \
298 MBOX_INC_SEL_MASK, (u64)-1, NHMEX_M_##r)
299#define MBOX_SET_FLAG_SEL_EXTRA_REG(c, r) \
300 EVENT_EXTRA_REG(MBOX_SET_FLAG_SEL(c), NHMEX_M0_MSR_PMU_##r, \
301 MBOX_SET_FLAG_SEL_MASK, \
302 (u64)-1, NHMEX_M_##r)
303
304/* NHM-EX Rbox */
305#define NHMEX_R_MSR_GLOBAL_CTL 0xe00
306#define NHMEX_R_MSR_PMON_CTL0 0xe10
307#define NHMEX_R_MSR_PMON_CNT0 0xe11
308#define NHMEX_R_MSR_OFFSET 0x20
309
310#define NHMEX_R_MSR_PORTN_QLX_CFG(n) \
311 ((n) < 4 ? (0xe0c + (n)) : (0xe2c + (n) - 4))
312#define NHMEX_R_MSR_PORTN_IPERF_CFG0(n) (0xe04 + (n))
313#define NHMEX_R_MSR_PORTN_IPERF_CFG1(n) (0xe24 + (n))
314#define NHMEX_R_MSR_PORTN_XBR_OFFSET(n) \
315 (((n) < 4 ? 0 : 0x10) + (n) * 4)
316#define NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) \
317 (0xe60 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n))
318#define NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(n) \
319 (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 1)
320#define NHMEX_R_MSR_PORTN_XBR_SET1_MASK(n) \
321 (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 2)
322#define NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) \
323 (0xe70 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n))
324#define NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(n) \
325 (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 1)
326#define NHMEX_R_MSR_PORTN_XBR_SET2_MASK(n) \
327 (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 2)
328
329#define NHMEX_R_PMON_CTL_EN (1 << 0)
330#define NHMEX_R_PMON_CTL_EV_SEL_SHIFT 1
331#define NHMEX_R_PMON_CTL_EV_SEL_MASK \
332 (0x1f << NHMEX_R_PMON_CTL_EV_SEL_SHIFT)
333#define NHMEX_R_PMON_CTL_PMI_EN (1 << 6)
334#define NHMEX_R_PMON_RAW_EVENT_MASK NHMEX_R_PMON_CTL_EV_SEL_MASK
335
336/* NHM-EX Wbox */
337#define NHMEX_W_MSR_GLOBAL_CTL 0xc80
338#define NHMEX_W_MSR_PMON_CNT0 0xc90
339#define NHMEX_W_MSR_PMON_EVT_SEL0 0xc91
340#define NHMEX_W_MSR_PMON_FIXED_CTR 0x394
341#define NHMEX_W_MSR_PMON_FIXED_CTL 0x395
342
343#define NHMEX_W_PMON_GLOBAL_FIXED_EN (1ULL << 31)
344
345struct intel_uncore_ops;
346struct intel_uncore_pmu;
347struct intel_uncore_box;
348struct uncore_event_desc;
349
350struct intel_uncore_type {
351 const char *name;
352 int num_counters;
353 int num_boxes;
354 int perf_ctr_bits;
355 int fixed_ctr_bits;
356 unsigned perf_ctr;
357 unsigned event_ctl;
358 unsigned event_mask;
359 unsigned fixed_ctr;
360 unsigned fixed_ctl;
361 unsigned box_ctl;
362 unsigned msr_offset;
363 unsigned num_shared_regs:8;
364 unsigned single_fixed:1;
365 unsigned pair_ctr_ctl:1;
366 unsigned *msr_offsets;
367 struct event_constraint unconstrainted;
368 struct event_constraint *constraints;
369 struct intel_uncore_pmu *pmus;
370 struct intel_uncore_ops *ops;
371 struct uncore_event_desc *event_descs;
372 const struct attribute_group *attr_groups[4];
373};
374
375#define pmu_group attr_groups[0]
376#define format_group attr_groups[1]
377#define events_group attr_groups[2]
378
379struct intel_uncore_ops {
380 void (*init_box)(struct intel_uncore_box *);
381 void (*disable_box)(struct intel_uncore_box *);
382 void (*enable_box)(struct intel_uncore_box *);
383 void (*disable_event)(struct intel_uncore_box *, struct perf_event *);
384 void (*enable_event)(struct intel_uncore_box *, struct perf_event *);
385 u64 (*read_counter)(struct intel_uncore_box *, struct perf_event *);
386 int (*hw_config)(struct intel_uncore_box *, struct perf_event *);
387 struct event_constraint *(*get_constraint)(struct intel_uncore_box *,
388 struct perf_event *);
389 void (*put_constraint)(struct intel_uncore_box *, struct perf_event *);
390};
391
392struct intel_uncore_pmu {
393 struct pmu pmu;
394 char name[UNCORE_PMU_NAME_LEN];
395 int pmu_idx;
396 int func_id;
397 struct intel_uncore_type *type;
398 struct intel_uncore_box ** __percpu box;
399 struct list_head box_list;
400};
401
402struct intel_uncore_extra_reg {
403 raw_spinlock_t lock;
404 u64 config, config1, config2;
405 atomic_t ref;
406};
407
408struct intel_uncore_box {
409 int phys_id;
410 int n_active; /* number of active events */
411 int n_events;
412 int cpu; /* cpu to collect events */
413 unsigned long flags;
414 atomic_t refcnt;
415 struct perf_event *events[UNCORE_PMC_IDX_MAX];
416 struct perf_event *event_list[UNCORE_PMC_IDX_MAX];
417 unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)];
418 u64 tags[UNCORE_PMC_IDX_MAX];
419 struct pci_dev *pci_dev;
420 struct intel_uncore_pmu *pmu;
421 struct hrtimer hrtimer;
422 struct list_head list;
423 struct intel_uncore_extra_reg shared_regs[0];
424};
425
426#define UNCORE_BOX_FLAG_INITIATED 0
427
428struct uncore_event_desc {
429 struct kobj_attribute attr;
430 const char *config;
431};
432
433#define INTEL_UNCORE_EVENT_DESC(_name, _config) \
434{ \
435 .attr = __ATTR(_name, 0444, uncore_event_show, NULL), \
436 .config = _config, \
437}
438
439#define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format) \
440static ssize_t __uncore_##_var##_show(struct kobject *kobj, \
441 struct kobj_attribute *attr, \
442 char *page) \
443{ \
444 BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \
445 return sprintf(page, _format "\n"); \
446} \
447static struct kobj_attribute format_attr_##_var = \
448 __ATTR(_name, 0444, __uncore_##_var##_show, NULL)
449
450
451static ssize_t uncore_event_show(struct kobject *kobj,
452 struct kobj_attribute *attr, char *buf)
453{
454 struct uncore_event_desc *event =
455 container_of(attr, struct uncore_event_desc, attr);
456 return sprintf(buf, "%s", event->config);
457}
458
459static inline unsigned uncore_pci_box_ctl(struct intel_uncore_box *box)
460{
461 return box->pmu->type->box_ctl;
462}
463
464static inline unsigned uncore_pci_fixed_ctl(struct intel_uncore_box *box)
465{
466 return box->pmu->type->fixed_ctl;
467}
468
469static inline unsigned uncore_pci_fixed_ctr(struct intel_uncore_box *box)
470{
471 return box->pmu->type->fixed_ctr;
472}
473
474static inline
475unsigned uncore_pci_event_ctl(struct intel_uncore_box *box, int idx)
476{
477 return idx * 4 + box->pmu->type->event_ctl;
478}
479
480static inline
481unsigned uncore_pci_perf_ctr(struct intel_uncore_box *box, int idx)
482{
483 return idx * 8 + box->pmu->type->perf_ctr;
484}
485
486static inline unsigned uncore_msr_box_offset(struct intel_uncore_box *box)
487{
488 struct intel_uncore_pmu *pmu = box->pmu;
489 return pmu->type->msr_offsets ?
490 pmu->type->msr_offsets[pmu->pmu_idx] :
491 pmu->type->msr_offset * pmu->pmu_idx;
492}
493
494static inline unsigned uncore_msr_box_ctl(struct intel_uncore_box *box)
495{
496 if (!box->pmu->type->box_ctl)
497 return 0;
498 return box->pmu->type->box_ctl + uncore_msr_box_offset(box);
499}
500
501static inline unsigned uncore_msr_fixed_ctl(struct intel_uncore_box *box)
502{
503 if (!box->pmu->type->fixed_ctl)
504 return 0;
505 return box->pmu->type->fixed_ctl + uncore_msr_box_offset(box);
506}
507
508static inline unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box)
509{
510 return box->pmu->type->fixed_ctr + uncore_msr_box_offset(box);
511}
512
513static inline
514unsigned uncore_msr_event_ctl(struct intel_uncore_box *box, int idx)
515{
516 return box->pmu->type->event_ctl +
517 (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) +
518 uncore_msr_box_offset(box);
519}
520
521static inline
522unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx)
523{
524 return box->pmu->type->perf_ctr +
525 (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) +
526 uncore_msr_box_offset(box);
527}
528
529static inline
530unsigned uncore_fixed_ctl(struct intel_uncore_box *box)
531{
532 if (box->pci_dev)
533 return uncore_pci_fixed_ctl(box);
534 else
535 return uncore_msr_fixed_ctl(box);
536}
537
538static inline
539unsigned uncore_fixed_ctr(struct intel_uncore_box *box)
540{
541 if (box->pci_dev)
542 return uncore_pci_fixed_ctr(box);
543 else
544 return uncore_msr_fixed_ctr(box);
545}
546
547static inline
548unsigned uncore_event_ctl(struct intel_uncore_box *box, int idx)
549{
550 if (box->pci_dev)
551 return uncore_pci_event_ctl(box, idx);
552 else
553 return uncore_msr_event_ctl(box, idx);
554}
555
556static inline
557unsigned uncore_perf_ctr(struct intel_uncore_box *box, int idx)
558{
559 if (box->pci_dev)
560 return uncore_pci_perf_ctr(box, idx);
561 else
562 return uncore_msr_perf_ctr(box, idx);
563}
564
565static inline int uncore_perf_ctr_bits(struct intel_uncore_box *box)
566{
567 return box->pmu->type->perf_ctr_bits;
568}
569
570static inline int uncore_fixed_ctr_bits(struct intel_uncore_box *box)
571{
572 return box->pmu->type->fixed_ctr_bits;
573}
574
575static inline int uncore_num_counters(struct intel_uncore_box *box)
576{
577 return box->pmu->type->num_counters;
578}
579
580static inline void uncore_disable_box(struct intel_uncore_box *box)
581{
582 if (box->pmu->type->ops->disable_box)
583 box->pmu->type->ops->disable_box(box);
584}
585
586static inline void uncore_enable_box(struct intel_uncore_box *box)
587{
588 if (box->pmu->type->ops->enable_box)
589 box->pmu->type->ops->enable_box(box);
590}
591
592static inline void uncore_disable_event(struct intel_uncore_box *box,
593 struct perf_event *event)
594{
595 box->pmu->type->ops->disable_event(box, event);
596}
597
598static inline void uncore_enable_event(struct intel_uncore_box *box,
599 struct perf_event *event)
600{
601 box->pmu->type->ops->enable_event(box, event);
602}
603
604static inline u64 uncore_read_counter(struct intel_uncore_box *box,
605 struct perf_event *event)
606{
607 return box->pmu->type->ops->read_counter(box, event);
608}
609
610static inline void uncore_box_init(struct intel_uncore_box *box)
611{
612 if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) {
613 if (box->pmu->type->ops->init_box)
614 box->pmu->type->ops->init_box(box);
615 }
616}
617
618static inline bool uncore_box_is_fake(struct intel_uncore_box *box)
619{
620 return (box->phys_id < 0);
621}
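
The address helpers in this header compose a counter's MSR address from a per-type base register, an optional control/counter interleave and a per-box offset. The following stand-alone C sketch, offered as an assumption-laden illustration rather than the kernel implementation, redoes that arithmetic with a cut-down struct; box_layout, the function names and the reuse of the SNB-EP Cbox numbers from the #defines above are illustrative only.

/* Sketch of the math behind uncore_msr_event_ctl()/uncore_msr_perf_ctr();
 * box_layout is an invented stand-in for intel_uncore_type/pmu. */
#include <stdio.h>

struct box_layout {
	unsigned event_ctl;	/* base control MSR of box 0 */
	unsigned perf_ctr;	/* base counter MSR of box 0 */
	unsigned msr_offset;	/* distance between boxes */
	int pair_ctr_ctl;	/* ctl/ctr registers interleaved? */
};

static unsigned event_ctl_msr(const struct box_layout *t, int box_idx, int idx)
{
	return t->event_ctl + (t->pair_ctr_ctl ? 2 * idx : idx) +
	       t->msr_offset * box_idx;
}

static unsigned perf_ctr_msr(const struct box_layout *t, int box_idx, int idx)
{
	return t->perf_ctr + (t->pair_ctr_ctl ? 2 * idx : idx) +
	       t->msr_offset * box_idx;
}

int main(void)
{
	/* sample values borrowed from the SNB-EP Cbox #defines above */
	struct box_layout cbox = {
		.event_ctl	= 0xd10,	/* SNBEP_C0_MSR_PMON_CTL0 */
		.perf_ctr	= 0xd16,	/* SNBEP_C0_MSR_PMON_CTR0 */
		.msr_offset	= 0x20,		/* SNBEP_CBO_MSR_OFFSET */
		.pair_ctr_ctl	= 0,
	};

	printf("Cbox 2, counter 1: ctl=0x%x ctr=0x%x\n",
	       event_ctl_msr(&cbox, 2, 1), perf_ctr_msr(&cbox, 2, 1));
	return 0;
}

The pair_ctr_ctl flag models layouts where each counter's control and data registers alternate with a stride of two, which matches the NHM-EX register map listed above (EV_SEL0 at 0xd10, CTR0 at 0xd11).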
diff --git a/arch/x86/kernel/cpu/perf_event_knc.c b/arch/x86/kernel/cpu/perf_event_knc.c
deleted file mode 100644
index 4b7731bf23a..00000000000
--- a/arch/x86/kernel/cpu/perf_event_knc.c
+++ /dev/null
@@ -1,319 +0,0 @@
1/* Driver for Intel Xeon Phi "Knights Corner" PMU */
2
3#include <linux/perf_event.h>
4#include <linux/types.h>
5
6#include <asm/hardirq.h>
7
8#include "perf_event.h"
9
10static const u64 knc_perfmon_event_map[] =
11{
12 [PERF_COUNT_HW_CPU_CYCLES] = 0x002a,
13 [PERF_COUNT_HW_INSTRUCTIONS] = 0x0016,
14 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0028,
15 [PERF_COUNT_HW_CACHE_MISSES] = 0x0029,
16 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x0012,
17 [PERF_COUNT_HW_BRANCH_MISSES] = 0x002b,
18};
19
20static __initconst u64 knc_hw_cache_event_ids
21 [PERF_COUNT_HW_CACHE_MAX]
22 [PERF_COUNT_HW_CACHE_OP_MAX]
23 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
24{
25 [ C(L1D) ] = {
26 [ C(OP_READ) ] = {
27 /* On Xeon Phi event "0" is a valid DATA_READ */
28 /* (L1 Data Cache Reads) Instruction. */
29 /* We code this as ARCH_PERFMON_EVENTSEL_INT as this */
30 /* bit will always be set in x86_pmu_hw_config(). */
31 [ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT,
32 /* DATA_READ */
33 [ C(RESULT_MISS) ] = 0x0003, /* DATA_READ_MISS */
34 },
35 [ C(OP_WRITE) ] = {
36 [ C(RESULT_ACCESS) ] = 0x0001, /* DATA_WRITE */
37 [ C(RESULT_MISS) ] = 0x0004, /* DATA_WRITE_MISS */
38 },
39 [ C(OP_PREFETCH) ] = {
40 [ C(RESULT_ACCESS) ] = 0x0011, /* L1_DATA_PF1 */
41 [ C(RESULT_MISS) ] = 0x001c, /* L1_DATA_PF1_MISS */
42 },
43 },
44 [ C(L1I ) ] = {
45 [ C(OP_READ) ] = {
46 [ C(RESULT_ACCESS) ] = 0x000c, /* CODE_READ */
47 [ C(RESULT_MISS) ] = 0x000e, /* CODE_CACHE_MISS */
48 },
49 [ C(OP_WRITE) ] = {
50 [ C(RESULT_ACCESS) ] = -1,
51 [ C(RESULT_MISS) ] = -1,
52 },
53 [ C(OP_PREFETCH) ] = {
54 [ C(RESULT_ACCESS) ] = 0x0,
55 [ C(RESULT_MISS) ] = 0x0,
56 },
57 },
58 [ C(LL ) ] = {
59 [ C(OP_READ) ] = {
60 [ C(RESULT_ACCESS) ] = 0,
61 [ C(RESULT_MISS) ] = 0x10cb, /* L2_READ_MISS */
62 },
63 [ C(OP_WRITE) ] = {
64 [ C(RESULT_ACCESS) ] = 0x10cc, /* L2_WRITE_HIT */
65 [ C(RESULT_MISS) ] = 0,
66 },
67 [ C(OP_PREFETCH) ] = {
68 [ C(RESULT_ACCESS) ] = 0x10fc, /* L2_DATA_PF2 */
69 [ C(RESULT_MISS) ] = 0x10fe, /* L2_DATA_PF2_MISS */
70 },
71 },
72 [ C(DTLB) ] = {
73 [ C(OP_READ) ] = {
74 [ C(RESULT_ACCESS) ] = ARCH_PERFMON_EVENTSEL_INT,
75 /* DATA_READ */
76 /* see note on L1 OP_READ */
77 [ C(RESULT_MISS) ] = 0x0002, /* DATA_PAGE_WALK */
78 },
79 [ C(OP_WRITE) ] = {
80 [ C(RESULT_ACCESS) ] = 0x0001, /* DATA_WRITE */
81 [ C(RESULT_MISS) ] = 0x0002, /* DATA_PAGE_WALK */
82 },
83 [ C(OP_PREFETCH) ] = {
84 [ C(RESULT_ACCESS) ] = 0x0,
85 [ C(RESULT_MISS) ] = 0x0,
86 },
87 },
88 [ C(ITLB) ] = {
89 [ C(OP_READ) ] = {
90 [ C(RESULT_ACCESS) ] = 0x000c, /* CODE_READ */
91 [ C(RESULT_MISS) ] = 0x000d, /* CODE_PAGE_WALK */
92 },
93 [ C(OP_WRITE) ] = {
94 [ C(RESULT_ACCESS) ] = -1,
95 [ C(RESULT_MISS) ] = -1,
96 },
97 [ C(OP_PREFETCH) ] = {
98 [ C(RESULT_ACCESS) ] = -1,
99 [ C(RESULT_MISS) ] = -1,
100 },
101 },
102 [ C(BPU ) ] = {
103 [ C(OP_READ) ] = {
104 [ C(RESULT_ACCESS) ] = 0x0012, /* BRANCHES */
105 [ C(RESULT_MISS) ] = 0x002b, /* BRANCHES_MISPREDICTED */
106 },
107 [ C(OP_WRITE) ] = {
108 [ C(RESULT_ACCESS) ] = -1,
109 [ C(RESULT_MISS) ] = -1,
110 },
111 [ C(OP_PREFETCH) ] = {
112 [ C(RESULT_ACCESS) ] = -1,
113 [ C(RESULT_MISS) ] = -1,
114 },
115 },
116};
117
118
119static u64 knc_pmu_event_map(int hw_event)
120{
121 return knc_perfmon_event_map[hw_event];
122}
123
124static struct event_constraint knc_event_constraints[] =
125{
126 INTEL_EVENT_CONSTRAINT(0xc3, 0x1), /* HWP_L2HIT */
127 INTEL_EVENT_CONSTRAINT(0xc4, 0x1), /* HWP_L2MISS */
128 INTEL_EVENT_CONSTRAINT(0xc8, 0x1), /* L2_READ_HIT_E */
129 INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* L2_READ_HIT_M */
130 INTEL_EVENT_CONSTRAINT(0xca, 0x1), /* L2_READ_HIT_S */
131 INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* L2_READ_MISS */
132 INTEL_EVENT_CONSTRAINT(0xcc, 0x1), /* L2_WRITE_HIT */
133 INTEL_EVENT_CONSTRAINT(0xce, 0x1), /* L2_STRONGLY_ORDERED_STREAMING_VSTORES_MISS */
134 INTEL_EVENT_CONSTRAINT(0xcf, 0x1), /* L2_WEAKLY_ORDERED_STREAMING_VSTORE_MISS */
135 INTEL_EVENT_CONSTRAINT(0xd7, 0x1), /* L2_VICTIM_REQ_WITH_DATA */
136 INTEL_EVENT_CONSTRAINT(0xe3, 0x1), /* SNP_HITM_BUNIT */
137 INTEL_EVENT_CONSTRAINT(0xe6, 0x1), /* SNP_HIT_L2 */
138 INTEL_EVENT_CONSTRAINT(0xe7, 0x1), /* SNP_HITM_L2 */
139 INTEL_EVENT_CONSTRAINT(0xf1, 0x1), /* L2_DATA_READ_MISS_CACHE_FILL */
140 INTEL_EVENT_CONSTRAINT(0xf2, 0x1), /* L2_DATA_WRITE_MISS_CACHE_FILL */
141 INTEL_EVENT_CONSTRAINT(0xf6, 0x1), /* L2_DATA_READ_MISS_MEM_FILL */
142 INTEL_EVENT_CONSTRAINT(0xf7, 0x1), /* L2_DATA_WRITE_MISS_MEM_FILL */
143 INTEL_EVENT_CONSTRAINT(0xfc, 0x1), /* L2_DATA_PF2 */
144 INTEL_EVENT_CONSTRAINT(0xfd, 0x1), /* L2_DATA_PF2_DROP */
145 INTEL_EVENT_CONSTRAINT(0xfe, 0x1), /* L2_DATA_PF2_MISS */
146 INTEL_EVENT_CONSTRAINT(0xff, 0x1), /* L2_DATA_HIT_INFLIGHT_PF2 */
147 EVENT_CONSTRAINT_END
148};
149
150#define MSR_KNC_IA32_PERF_GLOBAL_STATUS 0x0000002d
151#define MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL 0x0000002e
152#define MSR_KNC_IA32_PERF_GLOBAL_CTRL 0x0000002f
153
154#define KNC_ENABLE_COUNTER0 0x00000001
155#define KNC_ENABLE_COUNTER1 0x00000002
156
157static void knc_pmu_disable_all(void)
158{
159 u64 val;
160
161 rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
162 val &= ~(KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1);
163 wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
164}
165
166static void knc_pmu_enable_all(int added)
167{
168 u64 val;
169
170 rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
171 val |= (KNC_ENABLE_COUNTER0|KNC_ENABLE_COUNTER1);
172 wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_CTRL, val);
173}
174
175static inline void
176knc_pmu_disable_event(struct perf_event *event)
177{
178 struct hw_perf_event *hwc = &event->hw;
179 u64 val;
180
181 val = hwc->config;
182 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
183
184 (void)wrmsrl_safe(hwc->config_base + hwc->idx, val);
185}
186
187static void knc_pmu_enable_event(struct perf_event *event)
188{
189 struct hw_perf_event *hwc = &event->hw;
190 u64 val;
191
192 val = hwc->config;
193 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
194
195 (void)wrmsrl_safe(hwc->config_base + hwc->idx, val);
196}
197
198static inline u64 knc_pmu_get_status(void)
199{
200 u64 status;
201
202 rdmsrl(MSR_KNC_IA32_PERF_GLOBAL_STATUS, status);
203
204 return status;
205}
206
207static inline void knc_pmu_ack_status(u64 ack)
208{
209 wrmsrl(MSR_KNC_IA32_PERF_GLOBAL_OVF_CONTROL, ack);
210}
211
212static int knc_pmu_handle_irq(struct pt_regs *regs)
213{
214 struct perf_sample_data data;
215 struct cpu_hw_events *cpuc;
216 int handled = 0;
217 int bit, loops;
218 u64 status;
219
220 cpuc = &__get_cpu_var(cpu_hw_events);
221
222 knc_pmu_disable_all();
223
224 status = knc_pmu_get_status();
225 if (!status) {
226 knc_pmu_enable_all(0);
227 return handled;
228 }
229
230 loops = 0;
231again:
232 knc_pmu_ack_status(status);
233 if (++loops > 100) {
234 WARN_ONCE(1, "perf: irq loop stuck!\n");
235 perf_event_print_debug();
236 goto done;
237 }
238
239 inc_irq_stat(apic_perf_irqs);
240
241 for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
242 struct perf_event *event = cpuc->events[bit];
243
244 handled++;
245
246 if (!test_bit(bit, cpuc->active_mask))
247 continue;
248
249 if (!intel_pmu_save_and_restart(event))
250 continue;
251
252 perf_sample_data_init(&data, 0, event->hw.last_period);
253
254 if (perf_event_overflow(event, &data, regs))
255 x86_pmu_stop(event, 0);
256 }
257
258 /*
259 * Repeat if there is more work to be done:
260 */
261 status = knc_pmu_get_status();
262 if (status)
263 goto again;
264
265done:
266 knc_pmu_enable_all(0);
267
268 return handled;
269}
270
271
272PMU_FORMAT_ATTR(event, "config:0-7" );
273PMU_FORMAT_ATTR(umask, "config:8-15" );
274PMU_FORMAT_ATTR(edge, "config:18" );
275PMU_FORMAT_ATTR(inv, "config:23" );
276PMU_FORMAT_ATTR(cmask, "config:24-31" );
277
278static struct attribute *intel_knc_formats_attr[] = {
279 &format_attr_event.attr,
280 &format_attr_umask.attr,
281 &format_attr_edge.attr,
282 &format_attr_inv.attr,
283 &format_attr_cmask.attr,
284 NULL,
285};
286
287static __initconst struct x86_pmu knc_pmu = {
288 .name = "knc",
289 .handle_irq = knc_pmu_handle_irq,
290 .disable_all = knc_pmu_disable_all,
291 .enable_all = knc_pmu_enable_all,
292 .enable = knc_pmu_enable_event,
293 .disable = knc_pmu_disable_event,
294 .hw_config = x86_pmu_hw_config,
295 .schedule_events = x86_schedule_events,
296 .eventsel = MSR_KNC_EVNTSEL0,
297 .perfctr = MSR_KNC_PERFCTR0,
298 .event_map = knc_pmu_event_map,
299 .max_events = ARRAY_SIZE(knc_perfmon_event_map),
300 .apic = 1,
301 .max_period = (1ULL << 39) - 1,
302 .version = 0,
303 .num_counters = 2,
304 .cntval_bits = 40,
305 .cntval_mask = (1ULL << 40) - 1,
306 .get_event_constraints = x86_get_event_constraints,
307 .event_constraints = knc_event_constraints,
308 .format_attrs = intel_knc_formats_attr,
309};
310
311__init int knc_pmu_init(void)
312{
313 x86_pmu = knc_pmu;
314
315 memcpy(hw_cache_event_ids, knc_hw_cache_event_ids,
316 sizeof(hw_cache_event_ids));
317
318 return 0;
319}
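
knc_pmu_enable_all() and knc_pmu_disable_all() above are plain read-modify-write updates of two enable bits in the KNC global control MSR. Purely as an illustration of that pattern, here is a user-space sketch against a simulated register; fake_rdmsrl()/fake_wrmsrl() and the variable backing them are invented stand-ins, not kernel or hardware interfaces.

/* Illustration only: the rdmsrl/wrmsrl pair is simulated with a variable. */
#include <stdint.h>
#include <stdio.h>

#define KNC_ENABLE_COUNTER0 0x00000001ULL
#define KNC_ENABLE_COUNTER1 0x00000002ULL

static uint64_t fake_global_ctrl;	/* stands in for the global-control MSR */

static void fake_rdmsrl(uint64_t *val) { *val = fake_global_ctrl; }
static void fake_wrmsrl(uint64_t val)  { fake_global_ctrl = val; }

static void knc_like_disable_all(void)
{
	uint64_t val;

	fake_rdmsrl(&val);
	val &= ~(KNC_ENABLE_COUNTER0 | KNC_ENABLE_COUNTER1);
	fake_wrmsrl(val);
}

static void knc_like_enable_all(void)
{
	uint64_t val;

	fake_rdmsrl(&val);
	val |= KNC_ENABLE_COUNTER0 | KNC_ENABLE_COUNTER1;
	fake_wrmsrl(val);
}

int main(void)
{
	knc_like_enable_all();
	printf("after enable : %#llx\n", (unsigned long long)fake_global_ctrl);
	knc_like_disable_all();
	printf("after disable: %#llx\n", (unsigned long long)fake_global_ctrl);
	return 0;
}

In the driver above the same read-modify-write brackets the overflow loop in knc_pmu_handle_irq(): all counters are disabled before the status MSR is read and re-enabled once the pending overflows have been handled.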
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 92c7e39a079..7809d2bcb20 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -7,13 +7,9 @@
7 * For licencing details see kernel-base/COPYING 7 * For licencing details see kernel-base/COPYING
8 */ 8 */
9 9
10#include <linux/perf_event.h> 10#ifdef CONFIG_CPU_SUP_INTEL
11 11
12#include <asm/perf_event_p4.h> 12#include <asm/perf_event_p4.h>
13#include <asm/hardirq.h>
14#include <asm/apic.h>
15
16#include "perf_event.h"
17 13
18#define P4_CNTR_LIMIT 3 14#define P4_CNTR_LIMIT 3
19/* 15/*
@@ -895,8 +891,8 @@ static void p4_pmu_disable_pebs(void)
895 * So for the moment let's leave metrics turned on forever -- it's 891 * So for the moment let's leave metrics turned on forever -- it's
896 * ok for now but this needs to be revisited! 892 * ok for now but this needs to be revisited!
897 * 893 *
898 * (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, (u64)0); 894 * (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)0);
899 * (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, (u64)0); 895 * (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)0);
900 */ 896 */
901} 897}
902 898
@@ -909,7 +905,7 @@ static inline void p4_pmu_disable_event(struct perf_event *event)
909 * state we need to clear P4_CCCR_OVF, otherwise interrupt get 905 * state we need to clear P4_CCCR_OVF, otherwise interrupt get
910 * asserted again and again 906 * asserted again and again
911 */ 907 */
912 (void)wrmsrl_safe(hwc->config_base, 908 (void)checking_wrmsrl(hwc->config_base,
913 (u64)(p4_config_unpack_cccr(hwc->config)) & 909 (u64)(p4_config_unpack_cccr(hwc->config)) &
914 ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED); 910 ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED);
915} 911}
@@ -943,8 +939,8 @@ static void p4_pmu_enable_pebs(u64 config)
943 939
944 bind = &p4_pebs_bind_map[idx]; 940 bind = &p4_pebs_bind_map[idx];
945 941
946 (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs); 942 (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs);
947 (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert); 943 (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert);
948} 944}
949 945
950static void p4_pmu_enable_event(struct perf_event *event) 946static void p4_pmu_enable_event(struct perf_event *event)
@@ -978,8 +974,8 @@ static void p4_pmu_enable_event(struct perf_event *event)
978 */ 974 */
979 p4_pmu_enable_pebs(hwc->config); 975 p4_pmu_enable_pebs(hwc->config);
980 976
981 (void)wrmsrl_safe(escr_addr, escr_conf); 977 (void)checking_wrmsrl(escr_addr, escr_conf);
982 (void)wrmsrl_safe(hwc->config_base, 978 (void)checking_wrmsrl(hwc->config_base,
983 (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE); 979 (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
984} 980}
985 981
@@ -1005,6 +1001,8 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
1005 int idx, handled = 0; 1001 int idx, handled = 0;
1006 u64 val; 1002 u64 val;
1007 1003
1004 perf_sample_data_init(&data, 0);
1005
1008 cpuc = &__get_cpu_var(cpu_hw_events); 1006 cpuc = &__get_cpu_var(cpu_hw_events);
1009 1007
1010 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1008 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
@@ -1032,12 +1030,10 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
1032 handled += overflow; 1030 handled += overflow;
1033 1031
1034 /* event overflow for sure */ 1032 /* event overflow for sure */
1035 perf_sample_data_init(&data, 0, hwc->last_period); 1033 data.period = event->hw.last_period;
1036 1034
1037 if (!x86_perf_event_set_period(event)) 1035 if (!x86_perf_event_set_period(event))
1038 continue; 1036 continue;
1039
1040
1041 if (perf_event_overflow(event, &data, regs)) 1037 if (perf_event_overflow(event, &data, regs))
1042 x86_pmu_stop(event, 0); 1038 x86_pmu_stop(event, 0);
1043 } 1039 }
@@ -1268,20 +1264,9 @@ reserve:
1268 } 1264 }
1269 1265
1270done: 1266done:
1271 return num ? -EINVAL : 0; 1267 return num ? -ENOSPC : 0;
1272} 1268}
1273 1269
1274PMU_FORMAT_ATTR(cccr, "config:0-31" );
1275PMU_FORMAT_ATTR(escr, "config:32-62");
1276PMU_FORMAT_ATTR(ht, "config:63" );
1277
1278static struct attribute *intel_p4_formats_attr[] = {
1279 &format_attr_cccr.attr,
1280 &format_attr_escr.attr,
1281 &format_attr_ht.attr,
1282 NULL,
1283};
1284
1285static __initconst const struct x86_pmu p4_pmu = { 1270static __initconst const struct x86_pmu p4_pmu = {
1286 .name = "Netburst P4/Xeon", 1271 .name = "Netburst P4/Xeon",
1287 .handle_irq = p4_pmu_handle_irq, 1272 .handle_irq = p4_pmu_handle_irq,
@@ -1316,16 +1301,14 @@ static __initconst const struct x86_pmu p4_pmu = {
1316 * the former idea is taken from OProfile code 1301 * the former idea is taken from OProfile code
1317 */ 1302 */
1318 .perfctr_second_write = 1, 1303 .perfctr_second_write = 1,
1319
1320 .format_attrs = intel_p4_formats_attr,
1321}; 1304};
1322 1305
1323__init int p4_pmu_init(void) 1306static __init int p4_pmu_init(void)
1324{ 1307{
1325 unsigned int low, high; 1308 unsigned int low, high;
1326 1309
1327 /* If we get stripped -- indexing fails */ 1310 /* If we get stripped -- indexing fails */
1328 BUILD_BUG_ON(ARCH_P4_MAX_CCCR > INTEL_PMC_MAX_GENERIC); 1311 BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC);
1329 1312
1330 rdmsr(MSR_IA32_MISC_ENABLE, low, high); 1313 rdmsr(MSR_IA32_MISC_ENABLE, low, high);
1331 if (!(low & (1 << 7))) { 1314 if (!(low & (1 << 7))) {
@@ -1343,3 +1326,5 @@ __init int p4_pmu_init(void)
1343 1326
1344 return 0; 1327 return 0;
1345} 1328}
1329
1330#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index f2af39f5dc3..20c097e3386 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -1,113 +1,17 @@
1#include <linux/perf_event.h> 1#ifdef CONFIG_CPU_SUP_INTEL
2#include <linux/types.h>
3
4#include "perf_event.h"
5 2
6/* 3/*
7 * Not sure about some of these 4 * Not sure about some of these
8 */ 5 */
9static const u64 p6_perfmon_event_map[] = 6static const u64 p6_perfmon_event_map[] =
10{ 7{
11 [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, /* CPU_CLK_UNHALTED */ 8 [PERF_COUNT_HW_CPU_CYCLES] = 0x0079,
12 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, /* INST_RETIRED */ 9 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
13 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, /* L2_RQSTS:M:E:S:I */ 10 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e,
14 [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, /* L2_RQSTS:I */ 11 [PERF_COUNT_HW_CACHE_MISSES] = 0x012e,
15 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, /* BR_INST_RETIRED */ 12 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
16 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, /* BR_MISS_PRED_RETIRED */ 13 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
17 [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, /* BUS_DRDY_CLOCKS */ 14 [PERF_COUNT_HW_BUS_CYCLES] = 0x0062,
18 [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00a2, /* RESOURCE_STALLS */
19
20};
21
22static __initconst u64 p6_hw_cache_event_ids
23 [PERF_COUNT_HW_CACHE_MAX]
24 [PERF_COUNT_HW_CACHE_OP_MAX]
25 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
26{
27 [ C(L1D) ] = {
28 [ C(OP_READ) ] = {
29 [ C(RESULT_ACCESS) ] = 0x0043, /* DATA_MEM_REFS */
30 [ C(RESULT_MISS) ] = 0x0045, /* DCU_LINES_IN */
31 },
32 [ C(OP_WRITE) ] = {
33 [ C(RESULT_ACCESS) ] = 0,
34 [ C(RESULT_MISS) ] = 0x0f29, /* L2_LD:M:E:S:I */
35 },
36 [ C(OP_PREFETCH) ] = {
37 [ C(RESULT_ACCESS) ] = 0,
38 [ C(RESULT_MISS) ] = 0,
39 },
40 },
41 [ C(L1I ) ] = {
42 [ C(OP_READ) ] = {
43 [ C(RESULT_ACCESS) ] = 0x0080, /* IFU_IFETCH */
44 [ C(RESULT_MISS) ] = 0x0f28, /* L2_IFETCH:M:E:S:I */
45 },
46 [ C(OP_WRITE) ] = {
47 [ C(RESULT_ACCESS) ] = -1,
48 [ C(RESULT_MISS) ] = -1,
49 },
50 [ C(OP_PREFETCH) ] = {
51 [ C(RESULT_ACCESS) ] = 0,
52 [ C(RESULT_MISS) ] = 0,
53 },
54 },
55 [ C(LL ) ] = {
56 [ C(OP_READ) ] = {
57 [ C(RESULT_ACCESS) ] = 0,
58 [ C(RESULT_MISS) ] = 0,
59 },
60 [ C(OP_WRITE) ] = {
61 [ C(RESULT_ACCESS) ] = 0,
62 [ C(RESULT_MISS) ] = 0x0025, /* L2_M_LINES_INM */
63 },
64 [ C(OP_PREFETCH) ] = {
65 [ C(RESULT_ACCESS) ] = 0,
66 [ C(RESULT_MISS) ] = 0,
67 },
68 },
69 [ C(DTLB) ] = {
70 [ C(OP_READ) ] = {
71 [ C(RESULT_ACCESS) ] = 0x0043, /* DATA_MEM_REFS */
72 [ C(RESULT_MISS) ] = 0,
73 },
74 [ C(OP_WRITE) ] = {
75 [ C(RESULT_ACCESS) ] = 0,
76 [ C(RESULT_MISS) ] = 0,
77 },
78 [ C(OP_PREFETCH) ] = {
79 [ C(RESULT_ACCESS) ] = 0,
80 [ C(RESULT_MISS) ] = 0,
81 },
82 },
83 [ C(ITLB) ] = {
84 [ C(OP_READ) ] = {
85 [ C(RESULT_ACCESS) ] = 0x0080, /* IFU_IFETCH */
86 [ C(RESULT_MISS) ] = 0x0085, /* ITLB_MISS */
87 },
88 [ C(OP_WRITE) ] = {
89 [ C(RESULT_ACCESS) ] = -1,
90 [ C(RESULT_MISS) ] = -1,
91 },
92 [ C(OP_PREFETCH) ] = {
93 [ C(RESULT_ACCESS) ] = -1,
94 [ C(RESULT_MISS) ] = -1,
95 },
96 },
97 [ C(BPU ) ] = {
98 [ C(OP_READ) ] = {
99 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED */
100 [ C(RESULT_MISS) ] = 0x00c5, /* BR_MISS_PRED_RETIRED */
101 },
102 [ C(OP_WRITE) ] = {
103 [ C(RESULT_ACCESS) ] = -1,
104 [ C(RESULT_MISS) ] = -1,
105 },
106 [ C(OP_PREFETCH) ] = {
107 [ C(RESULT_ACCESS) ] = -1,
108 [ C(RESULT_MISS) ] = -1,
109 },
110 },
111}; 15};
112 16
113static u64 p6_pmu_event_map(int hw_event) 17static u64 p6_pmu_event_map(int hw_event)
@@ -127,7 +31,7 @@ static struct event_constraint p6_event_constraints[] =
127{ 31{
128 INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ 32 INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
129 INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ 33 INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
130 INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ 34 INTEL_EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */
131 INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ 35 INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
132 INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ 36 INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
133 INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ 37 INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
@@ -157,46 +61,29 @@ static void p6_pmu_enable_all(int added)
157static inline void 61static inline void
158p6_pmu_disable_event(struct perf_event *event) 62p6_pmu_disable_event(struct perf_event *event)
159{ 63{
64 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
160 struct hw_perf_event *hwc = &event->hw; 65 struct hw_perf_event *hwc = &event->hw;
161 u64 val = P6_NOP_EVENT; 66 u64 val = P6_NOP_EVENT;
162 67
163 (void)wrmsrl_safe(hwc->config_base, val); 68 if (cpuc->enabled)
69 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
70
71 (void)checking_wrmsrl(hwc->config_base, val);
164} 72}
165 73
166static void p6_pmu_enable_event(struct perf_event *event) 74static void p6_pmu_enable_event(struct perf_event *event)
167{ 75{
76 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
168 struct hw_perf_event *hwc = &event->hw; 77 struct hw_perf_event *hwc = &event->hw;
169 u64 val; 78 u64 val;
170 79
171 val = hwc->config; 80 val = hwc->config;
81 if (cpuc->enabled)
82 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
172 83
173 /* 84 (void)checking_wrmsrl(hwc->config_base, val);
174 * p6 only has a global event enable, set on PerfEvtSel0
175 * We "disable" events by programming P6_NOP_EVENT
176 * and we rely on p6_pmu_enable_all() being called
177 * to actually enable the events.
178 */
179
180 (void)wrmsrl_safe(hwc->config_base, val);
181} 85}
182 86
183PMU_FORMAT_ATTR(event, "config:0-7" );
184PMU_FORMAT_ATTR(umask, "config:8-15" );
185PMU_FORMAT_ATTR(edge, "config:18" );
186PMU_FORMAT_ATTR(pc, "config:19" );
187PMU_FORMAT_ATTR(inv, "config:23" );
188PMU_FORMAT_ATTR(cmask, "config:24-31" );
189
190static struct attribute *intel_p6_formats_attr[] = {
191 &format_attr_event.attr,
192 &format_attr_umask.attr,
193 &format_attr_edge.attr,
194 &format_attr_pc.attr,
195 &format_attr_inv.attr,
196 &format_attr_cmask.attr,
197 NULL,
198};
199
200static __initconst const struct x86_pmu p6_pmu = { 87static __initconst const struct x86_pmu p6_pmu = {
201 .name = "p6", 88 .name = "p6",
202 .handle_irq = x86_pmu_handle_irq, 89 .handle_irq = x86_pmu_handle_irq,
@@ -225,13 +112,9 @@ static __initconst const struct x86_pmu p6_pmu = {
225 .cntval_mask = (1ULL << 32) - 1, 112 .cntval_mask = (1ULL << 32) - 1,
226 .get_event_constraints = x86_get_event_constraints, 113 .get_event_constraints = x86_get_event_constraints,
227 .event_constraints = p6_event_constraints, 114 .event_constraints = p6_event_constraints,
228
229 .format_attrs = intel_p6_formats_attr,
230 .events_sysfs_show = intel_event_sysfs_show,
231
232}; 115};
233 116
234__init int p6_pmu_init(void) 117static __init int p6_pmu_init(void)
235{ 118{
236 switch (boot_cpu_data.x86_model) { 119 switch (boot_cpu_data.x86_model) {
237 case 1: 120 case 1:
@@ -253,9 +136,7 @@ __init int p6_pmu_init(void)
253 136
254 x86_pmu = p6_pmu; 137 x86_pmu = p6_pmu;
255 138
256 memcpy(hw_cache_event_ids, p6_hw_cache_event_ids,
257 sizeof(hw_cache_event_ids));
258
259
260 return 0; 139 return 0;
261} 140}
141
142#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 2e8caf03f59..966512b2cac 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -56,8 +56,6 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
56 switch (boot_cpu_data.x86) { 56 switch (boot_cpu_data.x86) {
57 case 6: 57 case 6:
58 return msr - MSR_P6_PERFCTR0; 58 return msr - MSR_P6_PERFCTR0;
59 case 11:
60 return msr - MSR_KNC_PERFCTR0;
61 case 15: 59 case 15:
62 return msr - MSR_P4_BPU_PERFCTR0; 60 return msr - MSR_P4_BPU_PERFCTR0;
63 } 61 }
@@ -84,8 +82,6 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
84 switch (boot_cpu_data.x86) { 82 switch (boot_cpu_data.x86) {
85 case 6: 83 case 6:
86 return msr - MSR_P6_EVNTSEL0; 84 return msr - MSR_P6_EVNTSEL0;
87 case 11:
88 return msr - MSR_KNC_EVNTSEL0;
89 case 15: 85 case 15:
90 return msr - MSR_P4_BSU_ESCR0; 86 return msr - MSR_P4_BSU_ESCR0;
91 } 87 }
diff --git a/arch/x86/kernel/cpu/powerflags.c b/arch/x86/kernel/cpu/powerflags.c
index 7b3fe56b1c2..5abbea297e0 100644
--- a/arch/x86/kernel/cpu/powerflags.c
+++ b/arch/x86/kernel/cpu/powerflags.c
@@ -16,6 +16,5 @@ const char *const x86_power_flags[32] = {
16 "100mhzsteps", 16 "100mhzsteps",
17 "hwpstate", 17 "hwpstate",
18 "", /* tsc invariant mapped to constant_tsc */ 18 "", /* tsc invariant mapped to constant_tsc */
19 "cpb", /* core performance boost */ 19 /* nothing */
20 "eff_freq_ro", /* Readonly aperf/mperf */
21}; 20};
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 3286a92e662..62ac8cb6ba2 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -26,6 +26,11 @@ static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
26#ifdef CONFIG_X86_32 26#ifdef CONFIG_X86_32
27static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) 27static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
28{ 28{
29 /*
30 * We use exception 16 if we have hardware math and we've either seen
31 * it or the CPU claims it is internal
32 */
33 int fpu_exception = c->hard_math && (ignore_fpu_irq || cpu_has_fpu);
29 seq_printf(m, 34 seq_printf(m,
30 "fdiv_bug\t: %s\n" 35 "fdiv_bug\t: %s\n"
31 "hlt_bug\t\t: %s\n" 36 "hlt_bug\t\t: %s\n"
@@ -40,7 +45,7 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
40 c->f00f_bug ? "yes" : "no", 45 c->f00f_bug ? "yes" : "no",
41 c->coma_bug ? "yes" : "no", 46 c->coma_bug ? "yes" : "no",
42 c->hard_math ? "yes" : "no", 47 c->hard_math ? "yes" : "no",
43 c->hard_math ? "yes" : "no", 48 fpu_exception ? "yes" : "no",
44 c->cpuid_level, 49 c->cpuid_level,
45 c->wp_works_ok ? "yes" : "no"); 50 c->wp_works_ok ? "yes" : "no");
46} 51}
@@ -59,10 +64,12 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
59static int show_cpuinfo(struct seq_file *m, void *v) 64static int show_cpuinfo(struct seq_file *m, void *v)
60{ 65{
61 struct cpuinfo_x86 *c = v; 66 struct cpuinfo_x86 *c = v;
62 unsigned int cpu; 67 unsigned int cpu = 0;
63 int i; 68 int i;
64 69
70#ifdef CONFIG_SMP
65 cpu = c->cpu_index; 71 cpu = c->cpu_index;
72#endif
66 seq_printf(m, "processor\t: %u\n" 73 seq_printf(m, "processor\t: %u\n"
67 "vendor_id\t: %s\n" 74 "vendor_id\t: %s\n"
68 "cpu family\t: %d\n" 75 "cpu family\t: %d\n"
@@ -78,8 +85,6 @@ static int show_cpuinfo(struct seq_file *m, void *v)
78 seq_printf(m, "stepping\t: %d\n", c->x86_mask); 85 seq_printf(m, "stepping\t: %d\n", c->x86_mask);
79 else 86 else
80 seq_printf(m, "stepping\t: unknown\n"); 87 seq_printf(m, "stepping\t: unknown\n");
81 if (c->microcode)
82 seq_printf(m, "microcode\t: 0x%x\n", c->microcode);
83 88
84 if (cpu_has(c, X86_FEATURE_TSC)) { 89 if (cpu_has(c, X86_FEATURE_TSC)) {
85 unsigned int freq = cpufreq_quick_get(cpu); 90 unsigned int freq = cpufreq_quick_get(cpu);
@@ -135,7 +140,10 @@ static int show_cpuinfo(struct seq_file *m, void *v)
135 140
136static void *c_start(struct seq_file *m, loff_t *pos) 141static void *c_start(struct seq_file *m, loff_t *pos)
137{ 142{
138 *pos = cpumask_next(*pos - 1, cpu_online_mask); 143 if (*pos == 0) /* just in case, cpu 0 is not the first */
144 *pos = cpumask_first(cpu_online_mask);
145 else
146 *pos = cpumask_next(*pos - 1, cpu_online_mask);
139 if ((*pos) < nr_cpu_ids) 147 if ((*pos) < nr_cpu_ids)
140 return &cpu_data(*pos); 148 return &cpu_data(*pos);
141 return NULL; 149 return NULL;
diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c
deleted file mode 100644
index feca286c2bb..00000000000
--- a/arch/x86/kernel/cpu/rdrand.c
+++ /dev/null
@@ -1,73 +0,0 @@
1/*
2 * This file is part of the Linux kernel.
3 *
4 * Copyright (c) 2011, Intel Corporation
5 * Authors: Fenghua Yu <fenghua.yu@intel.com>,
6 * H. Peter Anvin <hpa@linux.intel.com>
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License along with
18 * this program; if not, write to the Free Software Foundation, Inc.,
19 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 */
22
23#include <asm/processor.h>
24#include <asm/archrandom.h>
25#include <asm/sections.h>
26
27static int __init x86_rdrand_setup(char *s)
28{
29 setup_clear_cpu_cap(X86_FEATURE_RDRAND);
30 return 1;
31}
32__setup("nordrand", x86_rdrand_setup);
33
34/* We can't use arch_get_random_long() here since alternatives haven't run */
35static inline int rdrand_long(unsigned long *v)
36{
37 int ok;
38 asm volatile("1: " RDRAND_LONG "\n\t"
39 "jc 2f\n\t"
40 "decl %0\n\t"
41 "jnz 1b\n\t"
42 "2:"
43 : "=r" (ok), "=a" (*v)
44 : "0" (RDRAND_RETRY_LOOPS));
45 return ok;
46}
47
48/*
49 * Force a reseed cycle; we are architecturally guaranteed a reseed
50 * after no more than 512 128-bit chunks of random data. This also
51 * acts as a test of the CPU capability.
52 */
53#define RESEED_LOOP ((512*128)/sizeof(unsigned long))
54
55void __cpuinit x86_init_rdrand(struct cpuinfo_x86 *c)
56{
57#ifdef CONFIG_ARCH_RANDOM
58 unsigned long tmp;
59 int i, count, ok;
60
61 if (!cpu_has(c, X86_FEATURE_RDRAND))
62 return; /* Nothing to do */
63
64 for (count = i = 0; i < RESEED_LOOP; i++) {
65 ok = rdrand_long(&tmp);
66 if (ok)
67 count++;
68 }
69
70 if (count != RESEED_LOOP)
71 clear_cpu_cap(c, X86_FEATURE_RDRAND);
72#endif
73}
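
The deleted rdrand.c retries the RDRAND instruction a bounded number of times and uses a long burst of requests as a reseed-forcing self-test. The snippet below is a hedged user-space restatement of the bounded-retry idea using the compiler intrinsic; it is not the kernel's inline-asm helper, _rdrand64_step() requires building with RDRAND support (e.g. -mrdrnd), and the retry bound is just a plausible stand-in for RDRAND_RETRY_LOOPS.

/* User-space sketch of the retry loop in rdrand_long(); needs an
 * RDRAND-capable CPU and -mrdrnd. */
#include <immintrin.h>
#include <stdio.h>

#define RETRY_LOOPS 10		/* assumed bound, in the spirit of RDRAND_RETRY_LOOPS */

static int rdrand_u64(unsigned long long *v)
{
	for (int i = 0; i < RETRY_LOOPS; i++)
		if (_rdrand64_step(v))
			return 1;	/* carry flag set: value is valid */
	return 0;			/* hardware kept reporting failure */
}

int main(void)
{
	unsigned long long r;

	if (rdrand_u64(&r))
		printf("rdrand: %#llx\n", r);
	else
		printf("rdrand unavailable or persistently failing\n");
	return 0;
}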
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index ee8e9abc859..c7f64e6f537 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -31,7 +31,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
31 const struct cpuid_bit *cb; 31 const struct cpuid_bit *cb;
32 32
33 static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { 33 static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
34 { X86_FEATURE_DTHERM, CR_EAX, 0, 0x00000006, 0 }, 34 { X86_FEATURE_DTS, CR_EAX, 0, 0x00000006, 0 },
35 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 }, 35 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 },
36 { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, 36 { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 },
37 { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 }, 37 { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 },
@@ -40,7 +40,6 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
40 { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, 40 { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 },
41 { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 }, 41 { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 },
42 { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, 42 { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 },
43 { X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 },
44 { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 }, 43 { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 },
45 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, 44 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 },
46 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, 45 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 },
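
For reference on the hunk above: each cpuid_bits[] entry names a feature whose flag lives at an arbitrary register/bit of an arbitrary CPUID leaf, e.g. the thermal-sensor bit is EAX bit 0 of leaf 0x6 and EPB is ECX bit 3 of the same leaf. A user-space probe of that encoding could look like the sketch below; the struct and names here are local stand-ins, not the kernel's types.

/* Hypothetical sketch: probing "scattered" CPUID feature bits by (leaf, reg, bit). */
#include <cpuid.h>
#include <stdio.h>

enum cpuid_reg { REG_EAX, REG_EBX, REG_ECX, REG_EDX };

struct scattered_bit {
        const char *name;
        enum cpuid_reg reg;
        unsigned int bit;
        unsigned int leaf;
};

/* Mirrors a few rows of the cpuid_bits[] table above (leaf 0x6). */
static const struct scattered_bit bits[] = {
        { "DTS (digital thermal sensor)", REG_EAX, 0, 0x00000006 },
        { "IDA (turbo boost)",            REG_EAX, 1, 0x00000006 },
        { "ARAT (always-running timer)",  REG_EAX, 2, 0x00000006 },
        { "PLN (power limit notify)",     REG_EAX, 4, 0x00000006 },
        { "EPB (energy perf bias)",       REG_ECX, 3, 0x00000006 },
};

int main(void)
{
        unsigned int regs[4], i;

        for (i = 0; i < sizeof(bits) / sizeof(bits[0]); i++) {
                if (!__get_cpuid(bits[i].leaf, &regs[REG_EAX], &regs[REG_EBX],
                                 &regs[REG_ECX], &regs[REG_EDX]))
                        continue;       /* leaf not implemented */
                printf("%-32s %s\n", bits[i].name,
                       (regs[bits[i].reg] >> bits[i].bit) & 1 ? "yes" : "no");
        }
        return 0;
}
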
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 60c78917190..212a6a42527 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -43,6 +43,7 @@
43 43
44#include <asm/processor.h> 44#include <asm/processor.h>
45#include <asm/msr.h> 45#include <asm/msr.h>
46#include <asm/system.h>
46 47
47static struct class *cpuid_class; 48static struct class *cpuid_class;
48 49
@@ -176,7 +177,7 @@ static struct notifier_block __refdata cpuid_class_cpu_notifier =
176 .notifier_call = cpuid_class_cpu_callback, 177 .notifier_call = cpuid_class_cpu_callback,
177}; 178};
178 179
179static char *cpuid_devnode(struct device *dev, umode_t *mode) 180static char *cpuid_devnode(struct device *dev, mode_t *mode)
180{ 181{
181 return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); 182 return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt));
182} 183}
@@ -199,14 +200,12 @@ static int __init cpuid_init(void)
199 goto out_chrdev; 200 goto out_chrdev;
200 } 201 }
201 cpuid_class->devnode = cpuid_devnode; 202 cpuid_class->devnode = cpuid_devnode;
202 get_online_cpus();
203 for_each_online_cpu(i) { 203 for_each_online_cpu(i) {
204 err = cpuid_device_create(i); 204 err = cpuid_device_create(i);
205 if (err != 0) 205 if (err != 0)
206 goto out_class; 206 goto out_class;
207 } 207 }
208 register_hotcpu_notifier(&cpuid_class_cpu_notifier); 208 register_hotcpu_notifier(&cpuid_class_cpu_notifier);
209 put_online_cpus();
210 209
211 err = 0; 210 err = 0;
212 goto out; 211 goto out;
@@ -216,7 +215,6 @@ out_class:
216 for_each_online_cpu(i) { 215 for_each_online_cpu(i) {
217 cpuid_device_destroy(i); 216 cpuid_device_destroy(i);
218 } 217 }
219 put_online_cpus();
220 class_destroy(cpuid_class); 218 class_destroy(cpuid_class);
221out_chrdev: 219out_chrdev:
222 __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); 220 __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
@@ -228,13 +226,11 @@ static void __exit cpuid_exit(void)
228{ 226{
229 int cpu = 0; 227 int cpu = 0;
230 228
231 get_online_cpus();
232 for_each_online_cpu(cpu) 229 for_each_online_cpu(cpu)
233 cpuid_device_destroy(cpu); 230 cpuid_device_destroy(cpu);
234 class_destroy(cpuid_class); 231 class_destroy(cpuid_class);
235 __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); 232 __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
236 unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); 233 unregister_hotcpu_notifier(&cpuid_class_cpu_notifier);
237 put_online_cpus();
238} 234}
239 235
240module_init(cpuid_init); 236module_init(cpuid_init);
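
As context for the driver touched above: cpuid_devnode() places the node at /dev/cpu/<N>/cpuid, and, as I read this driver, a 16-byte read at file position P runs CPUID on CPU N with the level taken from the low 32 bits of P (and the count for %ecx-sensitive leaves from the high 32 bits), returning EAX/EBX/ECX/EDX. A minimal user-space sketch of that interface, assuming the cpuid driver is built in and the node is readable:

/* Hypothetical sketch: querying CPUID leaf 0 of CPU 0 via /dev/cpu/0/cpuid. */
#define _POSIX_C_SOURCE 200809L
#define _FILE_OFFSET_BITS 64
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        uint32_t regs[4];               /* EAX, EBX, ECX, EDX */
        uint32_t leaf = 0, subleaf = 0;
        off_t pos = ((off_t)subleaf << 32) | leaf;
        int fd = open("/dev/cpu/0/cpuid", O_RDONLY);

        if (fd < 0) {
                perror("open /dev/cpu/0/cpuid");
                return 1;
        }
        if (pread(fd, regs, sizeof(regs), pos) != sizeof(regs)) {
                perror("pread");
                return 1;
        }
        /* For leaf 0, EBX/EDX/ECX spell the vendor string. */
        printf("max leaf %u, vendor %.4s%.4s%.4s\n", regs[0],
               (char *)&regs[1], (char *)&regs[3], (char *)&regs[2]);
        close(fd);
        return 0;
}
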
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 74467feb4dc..764c7c2b181 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -16,7 +16,6 @@
16#include <linux/delay.h> 16#include <linux/delay.h>
17#include <linux/elf.h> 17#include <linux/elf.h>
18#include <linux/elfcore.h> 18#include <linux/elfcore.h>
19#include <linux/module.h>
20 19
21#include <asm/processor.h> 20#include <asm/processor.h>
22#include <asm/hardirq.h> 21#include <asm/hardirq.h>
@@ -31,35 +30,17 @@
31 30
32int in_crash_kexec; 31int in_crash_kexec;
33 32
34/*
35 * This is used to VMCLEAR all VMCSs loaded on the
36 * processor. And when loading kvm_intel module, the
37 * callback function pointer will be assigned.
38 *
39 * protected by rcu.
40 */
41crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL;
42EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss);
43
44static inline void cpu_crash_vmclear_loaded_vmcss(void)
45{
46 crash_vmclear_fn *do_vmclear_operation = NULL;
47
48 rcu_read_lock();
49 do_vmclear_operation = rcu_dereference(crash_vmclear_loaded_vmcss);
50 if (do_vmclear_operation)
51 do_vmclear_operation();
52 rcu_read_unlock();
53}
54
55#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) 33#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
56 34
57static void kdump_nmi_callback(int cpu, struct pt_regs *regs) 35static void kdump_nmi_callback(int cpu, struct die_args *args)
58{ 36{
37 struct pt_regs *regs;
59#ifdef CONFIG_X86_32 38#ifdef CONFIG_X86_32
60 struct pt_regs fixed_regs; 39 struct pt_regs fixed_regs;
61#endif 40#endif
62 41
42 regs = args->regs;
43
63#ifdef CONFIG_X86_32 44#ifdef CONFIG_X86_32
64 if (!user_mode_vm(regs)) { 45 if (!user_mode_vm(regs)) {
65 crash_fixup_ss_esp(&fixed_regs, regs); 46 crash_fixup_ss_esp(&fixed_regs, regs);
@@ -68,11 +49,6 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
68#endif 49#endif
69 crash_save_cpu(regs, cpu); 50 crash_save_cpu(regs, cpu);
70 51
71 /*
72 * VMCLEAR VMCSs loaded on all cpus if needed.
73 */
74 cpu_crash_vmclear_loaded_vmcss();
75
76 /* Disable VMX or SVM if needed. 52 /* Disable VMX or SVM if needed.
77 * 53 *
78 * We need to disable virtualization on all CPUs. 54 * We need to disable virtualization on all CPUs.
@@ -115,11 +91,6 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
115 91
116 kdump_nmi_shootdown_cpus(); 92 kdump_nmi_shootdown_cpus();
117 93
118 /*
119 * VMCLEAR VMCSs loaded on this cpu if needed.
120 */
121 cpu_crash_vmclear_loaded_vmcss();
122
123 /* Booting kdump kernel with VMX or SVM enabled won't work, 94 /* Booting kdump kernel with VMX or SVM enabled won't work,
124 * because (among other limitations) we can't disable paging 95 * because (among other limitations) we can't disable paging
125 * with the virt flags. 96 * with the virt flags.
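
The block removed above publishes a single callback pointer (assigned when kvm_intel loads) and invokes it under rcu_read_lock() at crash time. There is no RCU outside the kernel, but the publish-then-maybe-invoke shape can be imitated with C11 atomics; the sketch below is only an analogue, and it omits the grace-period wait the kernel needs before the module owning the callback could safely unload.

/* Hypothetical analogue of the removed crash_vmclear hook: a module publishes a
 * callback pointer; the crash path invokes it only if one was registered.
 * C11 atomics stand in for the kernel's rcu_assign_pointer()/rcu_dereference(). */
#include <stdatomic.h>
#include <stdio.h>

typedef void (*crash_hook_fn)(void);

static _Atomic(crash_hook_fn) crash_hook = NULL;

static void register_crash_hook(crash_hook_fn fn)
{
        /* "release" pairs with the "acquire" load in run_crash_hook() */
        atomic_store_explicit(&crash_hook, fn, memory_order_release);
}

static void run_crash_hook(void)
{
        crash_hook_fn fn = atomic_load_explicit(&crash_hook, memory_order_acquire);

        if (fn)
                fn();           /* e.g. VMCLEAR all loaded VMCSs */
        else
                puts("no hook registered, nothing to do");
}

static void fake_vmclear(void)
{
        puts("vmclear hook called");
}

int main(void)
{
        run_crash_hook();                 /* nothing registered yet */
        register_crash_hook(fake_vmclear);
        run_crash_hook();                 /* hook runs */
        return 0;
}
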
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index 11891ca7b71..642f75a68cd 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -62,16 +62,16 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
62 62
63 if (!userbuf) { 63 if (!userbuf) {
64 memcpy(buf, (vaddr + offset), csize); 64 memcpy(buf, (vaddr + offset), csize);
65 kunmap_atomic(vaddr); 65 kunmap_atomic(vaddr, KM_PTE0);
66 } else { 66 } else {
67 if (!kdump_buf_page) { 67 if (!kdump_buf_page) {
68 printk(KERN_WARNING "Kdump: Kdump buffer page not" 68 printk(KERN_WARNING "Kdump: Kdump buffer page not"
69 " allocated\n"); 69 " allocated\n");
70 kunmap_atomic(vaddr); 70 kunmap_atomic(vaddr, KM_PTE0);
71 return -EFAULT; 71 return -EFAULT;
72 } 72 }
73 copy_page(kdump_buf_page, vaddr); 73 copy_page(kdump_buf_page, vaddr);
74 kunmap_atomic(vaddr); 74 kunmap_atomic(vaddr, KM_PTE0);
75 if (copy_to_user(buf, (kdump_buf_page + offset), csize)) 75 if (copy_to_user(buf, (kdump_buf_page + offset), csize))
76 return -EFAULT; 76 return -EFAULT;
77 } 77 }
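
For orientation on copy_oldmem_page() above: it maps the old kernel's page frame pfn and copies csize bytes starting at offset inside it, i.e. physical address pfn * PAGE_SIZE + offset. A heavily hedged user-space analogue of that arithmetic, reading through /dev/mem (usually blocked by CONFIG_STRICT_DEVMEM; real dump tools read /proc/vmcore instead):

/* Hypothetical analogue of copy_oldmem_page(): read csize bytes at offset
 * inside physical page frame pfn via /dev/mem. Root and a permissive
 * CONFIG_STRICT_DEVMEM are assumed. */
#define _POSIX_C_SOURCE 200809L
#define _FILE_OFFSET_BITS 64
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#define PAGE_SIZE 4096UL

static ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
                                size_t csize, unsigned long offset)
{
        off_t pos = (off_t)pfn * PAGE_SIZE + offset;
        ssize_t ret;
        int fd;

        if (csize == 0 || offset + csize > PAGE_SIZE)
                return -1;
        fd = open("/dev/mem", O_RDONLY);
        if (fd < 0)
                return -1;
        ret = pread(fd, buf, csize, pos);
        close(fd);
        return ret;
}

int main(void)
{
        char buf[16];

        /* Page frame 0, offset 0x400: an arbitrary low-memory example. */
        if (copy_oldmem_page(0, buf, sizeof(buf), 0x400) == (ssize_t)sizeof(buf))
                printf("first byte: %02x\n", (unsigned char)buf[0]);
        else
                puts("read failed (expected on locked-down configs)");
        return 0;
}
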
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index b1581527a23..a621f342768 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -2,9 +2,7 @@
2 * Architecture specific OF callbacks. 2 * Architecture specific OF callbacks.
3 */ 3 */
4#include <linux/bootmem.h> 4#include <linux/bootmem.h>
5#include <linux/export.h>
6#include <linux/io.h> 5#include <linux/io.h>
7#include <linux/irqdomain.h>
8#include <linux/interrupt.h> 6#include <linux/interrupt.h>
9#include <linux/list.h> 7#include <linux/list.h>
10#include <linux/of.h> 8#include <linux/of.h>
@@ -18,14 +16,64 @@
18#include <linux/initrd.h> 16#include <linux/initrd.h>
19 17
20#include <asm/hpet.h> 18#include <asm/hpet.h>
19#include <asm/irq_controller.h>
21#include <asm/apic.h> 20#include <asm/apic.h>
22#include <asm/pci_x86.h> 21#include <asm/pci_x86.h>
23 22
24__initdata u64 initial_dtb; 23__initdata u64 initial_dtb;
25char __initdata cmd_line[COMMAND_LINE_SIZE]; 24char __initdata cmd_line[COMMAND_LINE_SIZE];
25static LIST_HEAD(irq_domains);
26static DEFINE_RAW_SPINLOCK(big_irq_lock);
26 27
27int __initdata of_ioapic; 28int __initdata of_ioapic;
28 29
30#ifdef CONFIG_X86_IO_APIC
31static void add_interrupt_host(struct irq_domain *ih)
32{
33 unsigned long flags;
34
35 raw_spin_lock_irqsave(&big_irq_lock, flags);
36 list_add(&ih->l, &irq_domains);
37 raw_spin_unlock_irqrestore(&big_irq_lock, flags);
38}
39#endif
40
41static struct irq_domain *get_ih_from_node(struct device_node *controller)
42{
43 struct irq_domain *ih, *found = NULL;
44 unsigned long flags;
45
46 raw_spin_lock_irqsave(&big_irq_lock, flags);
47 list_for_each_entry(ih, &irq_domains, l) {
48 if (ih->controller == controller) {
49 found = ih;
50 break;
51 }
52 }
53 raw_spin_unlock_irqrestore(&big_irq_lock, flags);
54 return found;
55}
56
57unsigned int irq_create_of_mapping(struct device_node *controller,
58 const u32 *intspec, unsigned int intsize)
59{
60 struct irq_domain *ih;
61 u32 virq, type;
62 int ret;
63
64 ih = get_ih_from_node(controller);
65 if (!ih)
66 return 0;
67 ret = ih->xlate(ih, intspec, intsize, &virq, &type);
68 if (ret)
69 return 0;
70 if (type == IRQ_TYPE_NONE)
71 return virq;
72 irq_set_irq_type(virq, type);
73 return virq;
74}
75EXPORT_SYMBOL_GPL(irq_create_of_mapping);
76
29unsigned long pci_address_to_pio(phys_addr_t address) 77unsigned long pci_address_to_pio(phys_addr_t address)
30{ 78{
31 /* 79 /*
@@ -305,82 +353,34 @@ static struct of_ioapic_type of_ioapic_type[] =
305 }, 353 },
306}; 354};
307 355
308static int ioapic_xlate(struct irq_domain *domain, 356static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize,
309 struct device_node *controller, 357 u32 *out_hwirq, u32 *out_type)
310 const u32 *intspec, u32 intsize,
311 irq_hw_number_t *out_hwirq, u32 *out_type)
312{ 358{
359 struct mp_ioapic_gsi *gsi_cfg;
313 struct io_apic_irq_attr attr; 360 struct io_apic_irq_attr attr;
314 struct of_ioapic_type *it; 361 struct of_ioapic_type *it;
315 u32 line, idx; 362 u32 line, idx, type;
316 int rc;
317 363
318 if (WARN_ON(intsize < 2)) 364 if (intsize < 2)
319 return -EINVAL; 365 return -EINVAL;
320 366
321 line = intspec[0]; 367 line = *intspec;
322 368 idx = (u32) id->priv;
323 if (intspec[1] >= ARRAY_SIZE(of_ioapic_type)) 369 gsi_cfg = mp_ioapic_gsi_routing(idx);
324 return -EINVAL; 370 *out_hwirq = line + gsi_cfg->gsi_base;
325 371
326 it = &of_ioapic_type[intspec[1]]; 372 intspec++;
373 type = *intspec;
327 374
328 idx = (u32) domain->host_data; 375 if (type >= ARRAY_SIZE(of_ioapic_type))
329 set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity); 376 return -EINVAL;
330
331 rc = io_apic_setup_irq_pin_once(irq_find_mapping(domain, line),
332 cpu_to_node(0), &attr);
333 if (rc)
334 return rc;
335 377
336 *out_hwirq = line; 378 it = of_ioapic_type + type;
337 *out_type = it->out_type; 379 *out_type = it->out_type;
338 return 0;
339}
340 380
341const struct irq_domain_ops ioapic_irq_domain_ops = { 381 set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity);
342 .xlate = ioapic_xlate,
343};
344 382
345static void dt_add_ioapic_domain(unsigned int ioapic_num, 383 return io_apic_setup_irq_pin_once(*out_hwirq, cpu_to_node(0), &attr);
346 struct device_node *np)
347{
348 struct irq_domain *id;
349 struct mp_ioapic_gsi *gsi_cfg;
350 int ret;
351 int num;
352
353 gsi_cfg = mp_ioapic_gsi_routing(ioapic_num);
354 num = gsi_cfg->gsi_end - gsi_cfg->gsi_base + 1;
355
356 id = irq_domain_add_linear(np, num, &ioapic_irq_domain_ops,
357 (void *)ioapic_num);
358 BUG_ON(!id);
359 if (gsi_cfg->gsi_base == 0) {
360 /*
361 * The first NR_IRQS_LEGACY irq descs are allocated in
362 * early_irq_init() and need just a mapping. The
363 * remaining irqs need both. All of them are preallocated
364 * and assigned so we can keep the 1:1 mapping which the ioapic
365 * is having.
366 */
367 ret = irq_domain_associate_many(id, 0, 0, NR_IRQS_LEGACY);
368 if (ret)
369 pr_err("Error mapping legacy IRQs: %d\n", ret);
370
371 if (num > NR_IRQS_LEGACY) {
372 ret = irq_create_strict_mappings(id, NR_IRQS_LEGACY,
373 NR_IRQS_LEGACY, num - NR_IRQS_LEGACY);
374 if (ret)
375 pr_err("Error creating mapping for the "
376 "remaining IRQs: %d\n", ret);
377 }
378 irq_set_default_host(id);
379 } else {
380 ret = irq_create_strict_mappings(id, gsi_cfg->gsi_base, 0, num);
381 if (ret)
382 pr_err("Error creating IRQ mapping: %d\n", ret);
383 }
384} 384}
385 385
386static void __init ioapic_add_ofnode(struct device_node *np) 386static void __init ioapic_add_ofnode(struct device_node *np)
@@ -397,7 +397,14 @@ static void __init ioapic_add_ofnode(struct device_node *np)
397 397
398 for (i = 0; i < nr_ioapics; i++) { 398 for (i = 0; i < nr_ioapics; i++) {
399 if (r.start == mpc_ioapic_addr(i)) { 399 if (r.start == mpc_ioapic_addr(i)) {
400 dt_add_ioapic_domain(i, np); 400 struct irq_domain *id;
401
402 id = kzalloc(sizeof(*id), GFP_KERNEL);
403 BUG_ON(!id);
404 id->controller = np;
405 id->xlate = ioapic_xlate;
406 id->priv = (void *)i;
407 add_interrupt_host(id);
401 return; 408 return;
402 } 409 }
403 } 410 }
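
To make the xlate contract above concrete: a two-cell device-tree interrupt specifier is (pin, type index), and the hardware IRQ is the IO-APIC's GSI base plus the pin. A standalone sketch of just that translation follows; the struct layout and the type-name table are illustrative stand-ins, not the kernel's definitions.

/* Hypothetical sketch of the two-cell intspec -> (hwirq, trigger type)
 * translation done by ioapic_xlate() above. */
#include <stdio.h>

struct of_ioapic_type { const char *name; };    /* trigger/polarity elided */

/* Ordering here is illustrative, not the kernel's table. */
static const struct of_ioapic_type of_ioapic_type[] = {
        { "edge rising" }, { "level low" }, { "level high" }, { "edge falling" },
};

struct ioapic { unsigned int gsi_base; unsigned int gsi_end; };

static int ioapic_xlate(const struct ioapic *ioapic, const unsigned int *intspec,
                        unsigned int intsize, unsigned int *out_hwirq,
                        const char **out_type)
{
        unsigned int line, type;

        if (intsize < 2)
                return -1;
        line = intspec[0];                      /* pin on this IO-APIC */
        type = intspec[1];                      /* index into the type table */
        if (type >= sizeof(of_ioapic_type) / sizeof(of_ioapic_type[0]) ||
            line > ioapic->gsi_end - ioapic->gsi_base)
                return -1;
        *out_hwirq = ioapic->gsi_base + line;   /* global system interrupt */
        *out_type = of_ioapic_type[type].name;
        return 0;
}

int main(void)
{
        struct ioapic io = { .gsi_base = 24, .gsi_end = 47 };   /* a second IO-APIC, say */
        unsigned int spec[2] = { 5, 2 };                        /* pin 5, type index 2 */
        unsigned int hwirq;
        const char *type;

        if (!ioapic_xlate(&io, spec, 2, &hwirq, &type))
                printf("pin %u -> GSI %u, %s\n", spec[0], hwirq, type);
        return 0;
}
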
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index ae42418bc50..1aae78f775f 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -27,8 +27,8 @@ static int die_counter;
27 27
28void printk_address(unsigned long address, int reliable) 28void printk_address(unsigned long address, int reliable)
29{ 29{
30 pr_cont(" [<%p>] %s%pB\n", 30 printk(" [<%p>] %s%pB\n", (void *) address,
31 (void *)address, reliable ? "" : "? ", (void *)address); 31 reliable ? "" : "? ", (void *) address);
32} 32}
33 33
34#ifdef CONFIG_FUNCTION_GRAPH_TRACER 34#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -37,16 +37,13 @@ print_ftrace_graph_addr(unsigned long addr, void *data,
37 const struct stacktrace_ops *ops, 37 const struct stacktrace_ops *ops,
38 struct thread_info *tinfo, int *graph) 38 struct thread_info *tinfo, int *graph)
39{ 39{
40 struct task_struct *task; 40 struct task_struct *task = tinfo->task;
41 unsigned long ret_addr; 41 unsigned long ret_addr;
42 int index; 42 int index = task->curr_ret_stack;
43 43
44 if (addr != (unsigned long)return_to_handler) 44 if (addr != (unsigned long)return_to_handler)
45 return; 45 return;
46 46
47 task = tinfo->task;
48 index = task->curr_ret_stack;
49
50 if (!task->ret_stack || index < *graph) 47 if (!task->ret_stack || index < *graph)
51 return; 48 return;
52 49
@@ -255,8 +252,7 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
255 unsigned short ss; 252 unsigned short ss;
256 unsigned long sp; 253 unsigned long sp;
257#endif 254#endif
258 printk(KERN_DEFAULT 255 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
259 "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
260#ifdef CONFIG_PREEMPT 256#ifdef CONFIG_PREEMPT
261 printk("PREEMPT "); 257 printk("PREEMPT ");
262#endif 258#endif
@@ -268,11 +264,10 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
268#endif 264#endif
269 printk("\n"); 265 printk("\n");
270 if (notify_die(DIE_OOPS, str, regs, err, 266 if (notify_die(DIE_OOPS, str, regs, err,
271 current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP) 267 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
272 return 1; 268 return 1;
273 269
274 print_modules(); 270 show_registers(regs);
275 show_regs(regs);
276#ifdef CONFIG_X86_32 271#ifdef CONFIG_X86_32
277 if (user_mode_vm(regs)) { 272 if (user_mode_vm(regs)) {
278 sp = regs->sp; 273 sp = regs->sp;
@@ -312,33 +307,16 @@ void die(const char *str, struct pt_regs *regs, long err)
312 307
313static int __init kstack_setup(char *s) 308static int __init kstack_setup(char *s)
314{ 309{
315 ssize_t ret;
316 unsigned long val;
317
318 if (!s) 310 if (!s)
319 return -EINVAL; 311 return -EINVAL;
320 312 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
321 ret = kstrtoul(s, 0, &val);
322 if (ret)
323 return ret;
324 kstack_depth_to_print = val;
325 return 0; 313 return 0;
326} 314}
327early_param("kstack", kstack_setup); 315early_param("kstack", kstack_setup);
328 316
329static int __init code_bytes_setup(char *s) 317static int __init code_bytes_setup(char *s)
330{ 318{
331 ssize_t ret; 319 code_bytes = simple_strtoul(s, NULL, 0);
332 unsigned long val;
333
334 if (!s)
335 return -EINVAL;
336
337 ret = kstrtoul(s, 0, &val);
338 if (ret)
339 return ret;
340
341 code_bytes = val;
342 if (code_bytes > 8192) 320 if (code_bytes > 8192)
343 code_bytes = 8192; 321 code_bytes = 8192;
344 322
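
On the kstack=/code_bytes= parsing above: the kstrtoul() form being removed rejects trailing junk and overflow, while simple_strtoul() silently stops at the first non-digit. A user-space sketch of the stricter behaviour using strtoul(3), with a made-up parse_kstack_depth() helper:

/* Hypothetical sketch: strict parsing in the spirit of kstrtoul(), versus the
 * permissive simple_strtoul() style the hunk above reverts to. */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static int parse_kstack_depth(const char *s, unsigned long *out)
{
        char *end;
        unsigned long val;

        if (!s || !*s)
                return -EINVAL;
        errno = 0;
        val = strtoul(s, &end, 0);
        if (errno == ERANGE)
                return -ERANGE;         /* overflow */
        if (*end != '\0')
                return -EINVAL;         /* trailing junk, e.g. "64abc" */
        *out = val;
        return 0;
}

int main(void)
{
        const char *samples[] = { "64", "0x40", "64abc", "" };
        unsigned long depth;
        unsigned int i;

        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
                if (parse_kstack_depth(samples[i], &depth) == 0)
                        printf("\"%s\" -> %lu\n", samples[i], depth);
                else
                        printf("\"%s\" -> rejected\n", samples[i]);
        }
        return 0;
}
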
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 1038a417ea5..3b97a80ce32 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -73,24 +73,25 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
73 if (kstack_end(stack)) 73 if (kstack_end(stack))
74 break; 74 break;
75 if (i && ((i % STACKSLOTS_PER_LINE) == 0)) 75 if (i && ((i % STACKSLOTS_PER_LINE) == 0))
76 pr_cont("\n"); 76 printk(KERN_CONT "\n");
77 pr_cont(" %08lx", *stack++); 77 printk(KERN_CONT " %08lx", *stack++);
78 touch_nmi_watchdog(); 78 touch_nmi_watchdog();
79 } 79 }
80 pr_cont("\n"); 80 printk(KERN_CONT "\n");
81 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 81 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
82} 82}
83 83
84 84
85void show_regs(struct pt_regs *regs) 85void show_registers(struct pt_regs *regs)
86{ 86{
87 int i; 87 int i;
88 88
89 __show_regs(regs, !user_mode_vm(regs)); 89 print_modules();
90 __show_regs(regs, 0);
90 91
91 pr_emerg("Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n", 92 printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n",
92 TASK_COMM_LEN, current->comm, task_pid_nr(current), 93 TASK_COMM_LEN, current->comm, task_pid_nr(current),
93 current_thread_info(), current, task_thread_info(current)); 94 current_thread_info(), current, task_thread_info(current));
94 /* 95 /*
95 * When in-kernel, we also print out the stack and code at the 96 * When in-kernel, we also print out the stack and code at the
96 * time of the fault.. 97 * time of the fault..
@@ -101,10 +102,10 @@ void show_regs(struct pt_regs *regs)
101 unsigned char c; 102 unsigned char c;
102 u8 *ip; 103 u8 *ip;
103 104
104 pr_emerg("Stack:\n"); 105 printk(KERN_EMERG "Stack:\n");
105 show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG); 106 show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
106 107
107 pr_emerg("Code:"); 108 printk(KERN_EMERG "Code: ");
108 109
109 ip = (u8 *)regs->ip - code_prologue; 110 ip = (u8 *)regs->ip - code_prologue;
110 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { 111 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
@@ -115,16 +116,16 @@ void show_regs(struct pt_regs *regs)
115 for (i = 0; i < code_len; i++, ip++) { 116 for (i = 0; i < code_len; i++, ip++) {
116 if (ip < (u8 *)PAGE_OFFSET || 117 if (ip < (u8 *)PAGE_OFFSET ||
117 probe_kernel_address(ip, c)) { 118 probe_kernel_address(ip, c)) {
118 pr_cont(" Bad EIP value."); 119 printk(" Bad EIP value.");
119 break; 120 break;
120 } 121 }
121 if (ip == (u8 *)regs->ip) 122 if (ip == (u8 *)regs->ip)
122 pr_cont(" <%02x>", c); 123 printk("<%02x> ", c);
123 else 124 else
124 pr_cont(" %02x", c); 125 printk("%02x ", c);
125 } 126 }
126 } 127 }
127 pr_cont("\n"); 128 printk("\n");
128} 129}
129 130
130int is_valid_bugaddr(unsigned long ip) 131int is_valid_bugaddr(unsigned long ip)
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index b653675d528..19853ad8afc 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -129,7 +129,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
129 if (!stack) { 129 if (!stack) {
130 if (regs) 130 if (regs)
131 stack = (unsigned long *)regs->sp; 131 stack = (unsigned long *)regs->sp;
132 else if (task != current) 132 else if (task && task != current)
133 stack = (unsigned long *)task->thread.sp; 133 stack = (unsigned long *)task->thread.sp;
134 else 134 else
135 stack = &dummy; 135 stack = &dummy;
@@ -228,24 +228,24 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
228 if (stack >= irq_stack && stack <= irq_stack_end) { 228 if (stack >= irq_stack && stack <= irq_stack_end) {
229 if (stack == irq_stack_end) { 229 if (stack == irq_stack_end) {
230 stack = (unsigned long *) (irq_stack_end[-1]); 230 stack = (unsigned long *) (irq_stack_end[-1]);
231 pr_cont(" <EOI> "); 231 printk(KERN_CONT " <EOI> ");
232 } 232 }
233 } else { 233 } else {
234 if (((long) stack & (THREAD_SIZE-1)) == 0) 234 if (((long) stack & (THREAD_SIZE-1)) == 0)
235 break; 235 break;
236 } 236 }
237 if (i && ((i % STACKSLOTS_PER_LINE) == 0)) 237 if (i && ((i % STACKSLOTS_PER_LINE) == 0))
238 pr_cont("\n"); 238 printk(KERN_CONT "\n");
239 pr_cont(" %016lx", *stack++); 239 printk(KERN_CONT " %016lx", *stack++);
240 touch_nmi_watchdog(); 240 touch_nmi_watchdog();
241 } 241 }
242 preempt_enable(); 242 preempt_enable();
243 243
244 pr_cont("\n"); 244 printk(KERN_CONT "\n");
245 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 245 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
246} 246}
247 247
248void show_regs(struct pt_regs *regs) 248void show_registers(struct pt_regs *regs)
249{ 249{
250 int i; 250 int i;
251 unsigned long sp; 251 unsigned long sp;
@@ -254,9 +254,10 @@ void show_regs(struct pt_regs *regs)
254 254
255 sp = regs->sp; 255 sp = regs->sp;
256 printk("CPU %d ", cpu); 256 printk("CPU %d ", cpu);
257 print_modules();
257 __show_regs(regs, 1); 258 __show_regs(regs, 1);
258 printk(KERN_DEFAULT "Process %s (pid: %d, threadinfo %p, task %p)\n", 259 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
259 cur->comm, cur->pid, task_thread_info(cur), cur); 260 cur->comm, cur->pid, task_thread_info(cur), cur);
260 261
261 /* 262 /*
262 * When in-kernel, we also print out the stack and code at the 263 * When in-kernel, we also print out the stack and code at the
@@ -268,11 +269,11 @@ void show_regs(struct pt_regs *regs)
268 unsigned char c; 269 unsigned char c;
269 u8 *ip; 270 u8 *ip;
270 271
271 printk(KERN_DEFAULT "Stack:\n"); 272 printk(KERN_EMERG "Stack:\n");
272 show_stack_log_lvl(NULL, regs, (unsigned long *)sp, 273 show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
273 0, KERN_DEFAULT); 274 0, KERN_EMERG);
274 275
275 printk(KERN_DEFAULT "Code: "); 276 printk(KERN_EMERG "Code: ");
276 277
277 ip = (u8 *)regs->ip - code_prologue; 278 ip = (u8 *)regs->ip - code_prologue;
278 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { 279 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
@@ -283,16 +284,16 @@ void show_regs(struct pt_regs *regs)
283 for (i = 0; i < code_len; i++, ip++) { 284 for (i = 0; i < code_len; i++, ip++) {
284 if (ip < (u8 *)PAGE_OFFSET || 285 if (ip < (u8 *)PAGE_OFFSET ||
285 probe_kernel_address(ip, c)) { 286 probe_kernel_address(ip, c)) {
286 pr_cont(" Bad RIP value."); 287 printk(" Bad RIP value.");
287 break; 288 break;
288 } 289 }
289 if (ip == (u8 *)regs->ip) 290 if (ip == (u8 *)regs->ip)
290 pr_cont("<%02x> ", c); 291 printk("<%02x> ", c);
291 else 292 else
292 pr_cont("%02x ", c); 293 printk("%02x ", c);
293 } 294 }
294 } 295 }
295 pr_cont("\n"); 296 printk("\n");
296} 297}
297 298
298int is_valid_bugaddr(unsigned long ip) 299int is_valid_bugaddr(unsigned long ip)
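
The "Code:" lines produced above print code_bytes opcode bytes around regs->ip (64 by default, capped at 8192 as shown earlier), wrapping the byte at the IP in <>. A user-space imitation that dumps bytes around an address in its own text segment, so no probe_kernel_address() equivalent is needed:

/* Hypothetical sketch of the "Code: ..." dump: print bytes around an
 * instruction pointer, marking the byte at the IP itself with <..>.
 * Casting a function pointer to a data pointer is a GCC/Clang-ism,
 * acceptable for a demo. */
#include <stdio.h>

#define CODE_BYTES      16      /* total bytes to show (kernel default is 64) */
#define CODE_PROLOGUE   8       /* bytes shown before the IP */

static void dump_code(const unsigned char *ip)
{
        const unsigned char *p = ip - CODE_PROLOGUE;
        int i;

        printf("Code:");
        for (i = 0; i < CODE_BYTES; i++, p++) {
                if (p == ip)
                        printf(" <%02x>", *p);
                else
                        printf(" %02x", *p);
        }
        printf("\n");
}

int main(void)
{
        /* Use a point inside main() as a stand-in for the faulting regs->ip. */
        dump_code((const unsigned char *)main + CODE_PROLOGUE);
        return 0;
}
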
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index df06ade26be..3e2ef842531 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -12,14 +12,12 @@
12#include <linux/types.h> 12#include <linux/types.h>
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/crash_dump.h> 14#include <linux/crash_dump.h>
15#include <linux/export.h>
16#include <linux/bootmem.h> 15#include <linux/bootmem.h>
17#include <linux/pfn.h> 16#include <linux/pfn.h>
18#include <linux/suspend.h> 17#include <linux/suspend.h>
19#include <linux/acpi.h> 18#include <linux/acpi.h>
20#include <linux/firmware-map.h> 19#include <linux/firmware-map.h>
21#include <linux/memblock.h> 20#include <linux/memblock.h>
22#include <linux/sort.h>
23 21
24#include <asm/e820.h> 22#include <asm/e820.h>
25#include <asm/proto.h> 23#include <asm/proto.h>
@@ -113,9 +111,7 @@ static void __init __e820_add_region(struct e820map *e820x, u64 start, u64 size,
113 int x = e820x->nr_map; 111 int x = e820x->nr_map;
114 112
115 if (x >= ARRAY_SIZE(e820x->map)) { 113 if (x >= ARRAY_SIZE(e820x->map)) {
116 printk(KERN_ERR "e820: too many entries; ignoring [mem %#010llx-%#010llx]\n", 114 printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
117 (unsigned long long) start,
118 (unsigned long long) (start + size - 1));
119 return; 115 return;
120 } 116 }
121 117
@@ -135,19 +131,19 @@ static void __init e820_print_type(u32 type)
135 switch (type) { 131 switch (type) {
136 case E820_RAM: 132 case E820_RAM:
137 case E820_RESERVED_KERN: 133 case E820_RESERVED_KERN:
138 printk(KERN_CONT "usable"); 134 printk(KERN_CONT "(usable)");
139 break; 135 break;
140 case E820_RESERVED: 136 case E820_RESERVED:
141 printk(KERN_CONT "reserved"); 137 printk(KERN_CONT "(reserved)");
142 break; 138 break;
143 case E820_ACPI: 139 case E820_ACPI:
144 printk(KERN_CONT "ACPI data"); 140 printk(KERN_CONT "(ACPI data)");
145 break; 141 break;
146 case E820_NVS: 142 case E820_NVS:
147 printk(KERN_CONT "ACPI NVS"); 143 printk(KERN_CONT "(ACPI NVS)");
148 break; 144 break;
149 case E820_UNUSABLE: 145 case E820_UNUSABLE:
150 printk(KERN_CONT "unusable"); 146 printk(KERN_CONT "(unusable)");
151 break; 147 break;
152 default: 148 default:
153 printk(KERN_CONT "type %u", type); 149 printk(KERN_CONT "type %u", type);
@@ -160,10 +156,10 @@ void __init e820_print_map(char *who)
160 int i; 156 int i;
161 157
162 for (i = 0; i < e820.nr_map; i++) { 158 for (i = 0; i < e820.nr_map; i++) {
163 printk(KERN_INFO "%s: [mem %#018Lx-%#018Lx] ", who, 159 printk(KERN_INFO " %s: %016Lx - %016Lx ", who,
164 (unsigned long long) e820.map[i].addr, 160 (unsigned long long) e820.map[i].addr,
165 (unsigned long long) 161 (unsigned long long)
166 (e820.map[i].addr + e820.map[i].size - 1)); 162 (e820.map[i].addr + e820.map[i].size));
167 e820_print_type(e820.map[i].type); 163 e820_print_type(e820.map[i].type);
168 printk(KERN_CONT "\n"); 164 printk(KERN_CONT "\n");
169 } 165 }
@@ -230,38 +226,22 @@ void __init e820_print_map(char *who)
230 * ____________________33__ 226 * ____________________33__
231 * ______________________4_ 227 * ______________________4_
232 */ 228 */
233struct change_member {
234 struct e820entry *pbios; /* pointer to original bios entry */
235 unsigned long long addr; /* address for this change point */
236};
237
238static int __init cpcompare(const void *a, const void *b)
239{
240 struct change_member * const *app = a, * const *bpp = b;
241 const struct change_member *ap = *app, *bp = *bpp;
242
243 /*
244 * Inputs are pointers to two elements of change_point[]. If their
245 * addresses are unequal, their difference dominates. If the addresses
246 * are equal, then consider one that represents the end of its region
247 * to be greater than one that does not.
248 */
249 if (ap->addr != bp->addr)
250 return ap->addr > bp->addr ? 1 : -1;
251
252 return (ap->addr != ap->pbios->addr) - (bp->addr != bp->pbios->addr);
253}
254 229
255int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, 230int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
256 u32 *pnr_map) 231 u32 *pnr_map)
257{ 232{
233 struct change_member {
234 struct e820entry *pbios; /* pointer to original bios entry */
235 unsigned long long addr; /* address for this change point */
236 };
258 static struct change_member change_point_list[2*E820_X_MAX] __initdata; 237 static struct change_member change_point_list[2*E820_X_MAX] __initdata;
259 static struct change_member *change_point[2*E820_X_MAX] __initdata; 238 static struct change_member *change_point[2*E820_X_MAX] __initdata;
260 static struct e820entry *overlap_list[E820_X_MAX] __initdata; 239 static struct e820entry *overlap_list[E820_X_MAX] __initdata;
261 static struct e820entry new_bios[E820_X_MAX] __initdata; 240 static struct e820entry new_bios[E820_X_MAX] __initdata;
241 struct change_member *change_tmp;
262 unsigned long current_type, last_type; 242 unsigned long current_type, last_type;
263 unsigned long long last_addr; 243 unsigned long long last_addr;
264 int chgidx; 244 int chgidx, still_changing;
265 int overlap_entries; 245 int overlap_entries;
266 int new_bios_entry; 246 int new_bios_entry;
267 int old_nr, new_nr, chg_nr; 247 int old_nr, new_nr, chg_nr;
@@ -298,7 +278,35 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
298 chg_nr = chgidx; 278 chg_nr = chgidx;
299 279
300 /* sort change-point list by memory addresses (low -> high) */ 280 /* sort change-point list by memory addresses (low -> high) */
301 sort(change_point, chg_nr, sizeof *change_point, cpcompare, NULL); 281 still_changing = 1;
282 while (still_changing) {
283 still_changing = 0;
284 for (i = 1; i < chg_nr; i++) {
285 unsigned long long curaddr, lastaddr;
286 unsigned long long curpbaddr, lastpbaddr;
287
288 curaddr = change_point[i]->addr;
289 lastaddr = change_point[i - 1]->addr;
290 curpbaddr = change_point[i]->pbios->addr;
291 lastpbaddr = change_point[i - 1]->pbios->addr;
292
293 /*
294 * swap entries, when:
295 *
296 * curaddr > lastaddr or
297 * curaddr == lastaddr and curaddr == curpbaddr and
298 * lastaddr != lastpbaddr
299 */
300 if (curaddr < lastaddr ||
301 (curaddr == lastaddr && curaddr == curpbaddr &&
302 lastaddr != lastpbaddr)) {
303 change_tmp = change_point[i];
304 change_point[i] = change_point[i-1];
305 change_point[i-1] = change_tmp;
306 still_changing = 1;
307 }
308 }
309 }
302 310
303 /* create a new bios memory map, removing overlaps */ 311 /* create a new bios memory map, removing overlaps */
304 overlap_entries = 0; /* number of entries in the overlap table */ 312 overlap_entries = 0; /* number of entries in the overlap table */
@@ -430,8 +438,9 @@ static u64 __init __e820_update_range(struct e820map *e820x, u64 start,
430 size = ULLONG_MAX - start; 438 size = ULLONG_MAX - start;
431 439
432 end = start + size; 440 end = start + size;
433 printk(KERN_DEBUG "e820: update [mem %#010Lx-%#010Lx] ", 441 printk(KERN_DEBUG "e820 update range: %016Lx - %016Lx ",
434 (unsigned long long) start, (unsigned long long) (end - 1)); 442 (unsigned long long) start,
443 (unsigned long long) end);
435 e820_print_type(old_type); 444 e820_print_type(old_type);
436 printk(KERN_CONT " ==> "); 445 printk(KERN_CONT " ==> ");
437 e820_print_type(new_type); 446 e820_print_type(new_type);
@@ -510,8 +519,9 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
510 size = ULLONG_MAX - start; 519 size = ULLONG_MAX - start;
511 520
512 end = start + size; 521 end = start + size;
513 printk(KERN_DEBUG "e820: remove [mem %#010Lx-%#010Lx] ", 522 printk(KERN_DEBUG "e820 remove range: %016Lx - %016Lx ",
514 (unsigned long long) start, (unsigned long long) (end - 1)); 523 (unsigned long long) start,
524 (unsigned long long) end);
515 if (checktype) 525 if (checktype)
516 e820_print_type(old_type); 526 e820_print_type(old_type);
517 printk(KERN_CONT "\n"); 527 printk(KERN_CONT "\n");
@@ -567,7 +577,7 @@ void __init update_e820(void)
567 if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map)) 577 if (sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &nr_map))
568 return; 578 return;
569 e820.nr_map = nr_map; 579 e820.nr_map = nr_map;
570 printk(KERN_INFO "e820: modified physical RAM map:\n"); 580 printk(KERN_INFO "modified physical RAM map:\n");
571 e820_print_map("modified"); 581 e820_print_map("modified");
572} 582}
573static void __init update_e820_saved(void) 583static void __init update_e820_saved(void)
@@ -637,8 +647,8 @@ __init void e820_setup_gap(void)
637 if (!found) { 647 if (!found) {
638 gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024; 648 gapstart = (max_pfn << PAGE_SHIFT) + 1024*1024;
639 printk(KERN_ERR 649 printk(KERN_ERR
640 "e820: cannot find a gap in the 32bit address range\n" 650 "PCI: Warning: Cannot find a gap in the 32bit address range\n"
641 "e820: PCI devices with unassigned 32bit BARs may break!\n"); 651 "PCI: Unassigned devices with 32bit resource registers may break!\n");
642 } 652 }
643#endif 653#endif
644 654
@@ -648,8 +658,8 @@ __init void e820_setup_gap(void)
648 pci_mem_start = gapstart; 658 pci_mem_start = gapstart;
649 659
650 printk(KERN_INFO 660 printk(KERN_INFO
651 "e820: [mem %#010lx-%#010lx] available for PCI devices\n", 661 "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
652 gapstart, gapstart + gapsize - 1); 662 pci_mem_start, gapstart, gapsize);
653} 663}
654 664
655/** 665/**
@@ -667,7 +677,7 @@ void __init parse_e820_ext(struct setup_data *sdata)
667 extmap = (struct e820entry *)(sdata->data); 677 extmap = (struct e820entry *)(sdata->data);
668 __append_e820_map(extmap, entries); 678 __append_e820_map(extmap, entries);
669 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 679 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
670 printk(KERN_INFO "e820: extended physical RAM map:\n"); 680 printk(KERN_INFO "extended physical RAM map:\n");
671 e820_print_map("extended"); 681 e820_print_map("extended");
672} 682}
673 683
@@ -703,7 +713,7 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn)
703} 713}
704#endif 714#endif
705 715
706#ifdef CONFIG_ACPI 716#ifdef CONFIG_HIBERNATION
707/** 717/**
708 * Mark ACPI NVS memory region, so that we can save/restore it during 718 * Mark ACPI NVS memory region, so that we can save/restore it during
709 * hibernation and the subsequent resume. 719 * hibernation and the subsequent resume.
@@ -716,7 +726,7 @@ static int __init e820_mark_nvs_memory(void)
716 struct e820entry *ei = &e820.map[i]; 726 struct e820entry *ei = &e820.map[i];
717 727
718 if (ei->type == E820_NVS) 728 if (ei->type == E820_NVS)
719 acpi_nvs_register(ei->addr, ei->size); 729 suspend_nvs_register(ei->addr, ei->size);
720 } 730 }
721 731
722 return 0; 732 return 0;
@@ -727,17 +737,35 @@ core_initcall(e820_mark_nvs_memory);
727/* 737/*
728 * pre allocated 4k and reserved it in memblock and e820_saved 738 * pre allocated 4k and reserved it in memblock and e820_saved
729 */ 739 */
730u64 __init early_reserve_e820(u64 size, u64 align) 740u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
731{ 741{
742 u64 size = 0;
732 u64 addr; 743 u64 addr;
744 u64 start;
733 745
734 addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); 746 for (start = startt; ; start += size) {
735 if (addr) { 747 start = memblock_x86_find_in_range_size(start, &size, align);
736 e820_update_range_saved(addr, size, E820_RAM, E820_RESERVED); 748 if (start == MEMBLOCK_ERROR)
737 printk(KERN_INFO "e820: update e820_saved for early_reserve_e820\n"); 749 return 0;
738 update_e820_saved(); 750 if (size >= sizet)
751 break;
739 } 752 }
740 753
754#ifdef CONFIG_X86_32
755 if (start >= MAXMEM)
756 return 0;
757 if (start + size > MAXMEM)
758 size = MAXMEM - start;
759#endif
760
761 addr = round_down(start + size - sizet, align);
762 if (addr < start)
763 return 0;
764 memblock_x86_reserve_range(addr, addr + sizet, "new next");
765 e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
766 printk(KERN_INFO "update e820_saved for early_reserve_e820\n");
767 update_e820_saved();
768
741 return addr; 769 return addr;
742} 770}
743 771
@@ -784,7 +812,7 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
784 if (last_pfn > max_arch_pfn) 812 if (last_pfn > max_arch_pfn)
785 last_pfn = max_arch_pfn; 813 last_pfn = max_arch_pfn;
786 814
787 printk(KERN_INFO "e820: last_pfn = %#lx max_arch_pfn = %#lx\n", 815 printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n",
788 last_pfn, max_arch_pfn); 816 last_pfn, max_arch_pfn);
789 return last_pfn; 817 return last_pfn;
790} 818}
@@ -888,7 +916,7 @@ void __init finish_e820_parsing(void)
888 early_panic("Invalid user supplied memory map"); 916 early_panic("Invalid user supplied memory map");
889 e820.nr_map = nr; 917 e820.nr_map = nr;
890 918
891 printk(KERN_INFO "e820: user-defined physical RAM map:\n"); 919 printk(KERN_INFO "user-defined physical RAM map:\n");
892 e820_print_map("user"); 920 e820_print_map("user");
893 } 921 }
894} 922}
@@ -944,7 +972,7 @@ void __init e820_reserve_resources(void)
944 for (i = 0; i < e820_saved.nr_map; i++) { 972 for (i = 0; i < e820_saved.nr_map; i++) {
945 struct e820entry *entry = &e820_saved.map[i]; 973 struct e820entry *entry = &e820_saved.map[i];
946 firmware_map_add_early(entry->addr, 974 firmware_map_add_early(entry->addr,
947 entry->addr + entry->size, 975 entry->addr + entry->size - 1,
948 e820_type_to_string(entry->type)); 976 e820_type_to_string(entry->type));
949 } 977 }
950} 978}
@@ -996,9 +1024,8 @@ void __init e820_reserve_resources_late(void)
996 end = MAX_RESOURCE_SIZE; 1024 end = MAX_RESOURCE_SIZE;
997 if (start >= end) 1025 if (start >= end)
998 continue; 1026 continue;
999 printk(KERN_DEBUG 1027 printk(KERN_DEBUG "reserve RAM buffer: %016llx - %016llx ",
1000 "e820: reserve RAM buffer [mem %#010llx-%#010llx]\n", 1028 start, end);
1001 start, end);
1002 reserve_region_with_split(&iomem_resource, start, end, 1029 reserve_region_with_split(&iomem_resource, start, end,
1003 "RAM buffer"); 1030 "RAM buffer");
1004 } 1031 }
@@ -1048,7 +1075,7 @@ void __init setup_memory_map(void)
1048 1075
1049 who = x86_init.resources.memory_setup(); 1076 who = x86_init.resources.memory_setup();
1050 memcpy(&e820_saved, &e820, sizeof(struct e820map)); 1077 memcpy(&e820_saved, &e820, sizeof(struct e820map));
1051 printk(KERN_INFO "e820: BIOS-provided physical RAM map:\n"); 1078 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
1052 e820_print_map(who); 1079 e820_print_map(who);
1053} 1080}
1054 1081
@@ -1062,7 +1089,7 @@ void __init memblock_x86_fill(void)
 1062 * We are safe to enable resizing, because memblock_x86_fill() 1089 * We are safe to enable resizing, because memblock_x86_fill()
 1063 * runs rather late for x86 1090 * runs rather late for x86
1064 */ 1091 */
1065 memblock_allow_resize(); 1092 memblock_can_resize = 1;
1066 1093
1067 for (i = 0; i < e820.nr_map; i++) { 1094 for (i = 0; i < e820.nr_map; i++) {
1068 struct e820entry *ei = &e820.map[i]; 1095 struct e820entry *ei = &e820.map[i];
@@ -1077,39 +1104,22 @@ void __init memblock_x86_fill(void)
1077 memblock_add(ei->addr, ei->size); 1104 memblock_add(ei->addr, ei->size);
1078 } 1105 }
1079 1106
1080 /* throw away partial pages */ 1107 memblock_analyze();
1081 memblock_trim_memory(PAGE_SIZE);
1082
1083 memblock_dump_all(); 1108 memblock_dump_all();
1084} 1109}
1085 1110
1086void __init memblock_find_dma_reserve(void) 1111void __init memblock_find_dma_reserve(void)
1087{ 1112{
1088#ifdef CONFIG_X86_64 1113#ifdef CONFIG_X86_64
1089 u64 nr_pages = 0, nr_free_pages = 0; 1114 u64 free_size_pfn;
1090 unsigned long start_pfn, end_pfn; 1115 u64 mem_size_pfn;
1091 phys_addr_t start, end;
1092 int i;
1093 u64 u;
1094
1095 /* 1116 /*
1096 * need to find out used area below MAX_DMA_PFN 1117 * need to find out used area below MAX_DMA_PFN
1097 * need to use memblock to get free size in [0, MAX_DMA_PFN] 1118 * need to use memblock to get free size in [0, MAX_DMA_PFN]
1098 * at first, and assume boot_mem will not take below MAX_DMA_PFN 1119 * at first, and assume boot_mem will not take below MAX_DMA_PFN
1099 */ 1120 */
1100 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { 1121 mem_size_pfn = memblock_x86_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT;
1101 start_pfn = min_t(unsigned long, start_pfn, MAX_DMA_PFN); 1122 free_size_pfn = memblock_x86_free_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT;
1102 end_pfn = min_t(unsigned long, end_pfn, MAX_DMA_PFN); 1123 set_dma_reserve(mem_size_pfn - free_size_pfn);
1103 nr_pages += end_pfn - start_pfn;
1104 }
1105
1106 for_each_free_mem_range(u, MAX_NUMNODES, &start, &end, NULL) {
1107 start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN);
1108 end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN);
1109 if (start_pfn < end_pfn)
1110 nr_free_pages += end_pfn - start_pfn;
1111 }
1112
1113 set_dma_reserve(nr_pages - nr_free_pages);
1114#endif 1124#endif
1115} 1125}
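
A note on the sanitize_e820_map() hunk above: both versions order the 2*N region start/end "change points" by address, breaking ties so that a point ending its region sorts after one starting a region at the same address; the removed code used sort() with cpcompare(), the restored code an open-coded bubble sort. The same comparator exercised with qsort() in user space (types here are local to the sketch):

/* Hypothetical sketch of the e820 change-point ordering used above. */
#include <stdio.h>
#include <stdlib.h>

struct region { unsigned long long addr, size; };
struct change_member {
        const struct region *pbios;     /* region this point belongs to */
        unsigned long long addr;        /* the change point itself */
};

static int cpcompare(const void *a, const void *b)
{
        const struct change_member *ap = a, *bp = b;

        if (ap->addr != bp->addr)
                return ap->addr > bp->addr ? 1 : -1;
        /* tie: a point that is the end of its region sorts after a start */
        return (ap->addr != ap->pbios->addr) - (bp->addr != bp->pbios->addr);
}

int main(void)
{
        static const struct region map[] = {
                { 0x0000, 0x9000 },     /* ends exactly where the next starts */
                { 0x9000, 0x1000 },
        };
        struct change_member cp[4];
        int i;

        for (i = 0; i < 2; i++) {
                cp[2 * i]     = (struct change_member){ &map[i], map[i].addr };
                cp[2 * i + 1] = (struct change_member){ &map[i], map[i].addr + map[i].size };
        }
        qsort(cp, 4, sizeof(cp[0]), cpcompare);

        for (i = 0; i < 4; i++)
                printf("%#llx (%s)\n", cp[i].addr,
                       cp[i].addr == cp[i].pbios->addr ? "start" : "end");
        return 0;
}
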
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 9b9f18b4991..cd28a350f7f 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -240,14 +240,14 @@ static int __init setup_early_printk(char *buf)
240 if (!strncmp(buf, "xen", 3)) 240 if (!strncmp(buf, "xen", 3))
241 early_console_register(&xenboot_console, keep); 241 early_console_register(&xenboot_console, keep);
242#endif 242#endif
243#ifdef CONFIG_EARLY_PRINTK_INTEL_MID 243#ifdef CONFIG_EARLY_PRINTK_MRST
244 if (!strncmp(buf, "mrst", 4)) { 244 if (!strncmp(buf, "mrst", 4)) {
245 mrst_early_console_init(); 245 mrst_early_console_init();
246 early_console_register(&early_mrst_console, keep); 246 early_console_register(&early_mrst_console, keep);
247 } 247 }
248 248
249 if (!strncmp(buf, "hsu", 3)) { 249 if (!strncmp(buf, "hsu", 3)) {
250 hsu_early_console_init(buf + 3); 250 hsu_early_console_init();
251 early_console_register(&early_hsu_console, keep); 251 early_console_register(&early_hsu_console, keep);
252 } 252 }
253#endif 253#endif
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index ff84d5469d7..f3f6f534400 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -42,7 +42,6 @@
42 */ 42 */
43 43
44#include <linux/linkage.h> 44#include <linux/linkage.h>
45#include <linux/err.h>
46#include <asm/thread_info.h> 45#include <asm/thread_info.h>
47#include <asm/irqflags.h> 46#include <asm/irqflags.h>
48#include <asm/errno.h> 47#include <asm/errno.h>
@@ -56,8 +55,6 @@
56#include <asm/irq_vectors.h> 55#include <asm/irq_vectors.h>
57#include <asm/cpufeature.h> 56#include <asm/cpufeature.h>
58#include <asm/alternative-asm.h> 57#include <asm/alternative-asm.h>
59#include <asm/asm.h>
60#include <asm/smap.h>
61 58
62/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ 59/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
63#include <linux/elf-em.h> 60#include <linux/elf-em.h>
@@ -84,6 +81,8 @@
84 * enough to patch inline, increasing performance. 81 * enough to patch inline, increasing performance.
85 */ 82 */
86 83
84#define nr_syscalls ((syscall_table_size)/4)
85
87#ifdef CONFIG_PREEMPT 86#ifdef CONFIG_PREEMPT
88#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF 87#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
89#else 88#else
@@ -100,6 +99,12 @@
100#endif 99#endif
101.endm 100.endm
102 101
102#ifdef CONFIG_VM86
103#define resume_userspace_sig check_userspace
104#else
105#define resume_userspace_sig resume_userspace
106#endif
107
103/* 108/*
104 * User gs save/restore 109 * User gs save/restore
105 * 110 *
@@ -153,8 +158,10 @@
153.pushsection .fixup, "ax" 158.pushsection .fixup, "ax"
 154 99: movl $0, (%esp) 159 99: movl $0, (%esp)
155 jmp 98b 160 jmp 98b
161.section __ex_table, "a"
162 .align 4
163 .long 98b, 99b
156.popsection 164.popsection
157 _ASM_EXTABLE(98b,99b)
158.endm 165.endm
159 166
160.macro PTGS_TO_GS 167.macro PTGS_TO_GS
@@ -164,8 +171,10 @@
164.pushsection .fixup, "ax" 171.pushsection .fixup, "ax"
 165 99: movl $0, PT_GS(%esp) 172 99: movl $0, PT_GS(%esp)
166 jmp 98b 173 jmp 98b
174.section __ex_table, "a"
175 .align 4
176 .long 98b, 99b
167.popsection 177.popsection
168 _ASM_EXTABLE(98b,99b)
169.endm 178.endm
170 179
171.macro GS_TO_REG reg 180.macro GS_TO_REG reg
@@ -247,10 +256,12 @@
247 jmp 2b 256 jmp 2b
 248 6: movl $0, (%esp) 257 6: movl $0, (%esp)
249 jmp 3b 258 jmp 3b
259.section __ex_table, "a"
260 .align 4
261 .long 1b, 4b
262 .long 2b, 5b
263 .long 3b, 6b
250.popsection 264.popsection
251 _ASM_EXTABLE(1b,4b)
252 _ASM_EXTABLE(2b,5b)
253 _ASM_EXTABLE(3b,6b)
254 POP_GS_EX 265 POP_GS_EX
255.endm 266.endm
256 267
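
The .section __ex_table blocks being added in this file (in place of the _ASM_EXTABLE() macro) each record a pair of addresses: the instruction that may fault and the label to resume at, which the page-fault handler consults so a bad user pointer becomes a graceful -EFAULT instead of an oops. User space has no exception tables, but the recover-at-a-known-label control flow can be imitated with sigsetjmp()/siglongjmp(); the sketch below is purely an analogue of that idea.

/* Hypothetical user-space analogue of a kernel exception-table fixup:
 * a faulting access resumes at a prepared "fixup" point instead of
 * killing the program. The kernel does this with __ex_table entries
 * consulted by the page-fault handler, not with signals. */
#define _POSIX_C_SOURCE 200809L
#include <setjmp.h>
#include <signal.h>
#include <stdio.h>

static sigjmp_buf fixup;

static void segv_handler(int sig)
{
        (void)sig;
        siglongjmp(fixup, 1);           /* jump to the registered fixup */
}

static int get_user_word(const int *addr, int *val)
{
        if (sigsetjmp(fixup, 1))
                return -14;             /* -EFAULT: the access faulted */
        *val = *addr;                   /* the "1:" instruction that may fault */
        return 0;
}

int main(void)
{
        struct sigaction sa;
        int val = 0, ok_word = 42, ret;

        sa.sa_handler = segv_handler;
        sigemptyset(&sa.sa_mask);
        sa.sa_flags = 0;
        sigaction(SIGSEGV, &sa, NULL);

        ret = get_user_word(&ok_word, &val);
        printf("good pointer: ret=%d val=%d\n", ret, val);
        ret = get_user_word((const int *)8, &val);
        printf("bad pointer:  ret=%d (-EFAULT)\n", ret);
        return 0;
}
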
@@ -299,21 +310,6 @@ ENTRY(ret_from_fork)
299 CFI_ENDPROC 310 CFI_ENDPROC
300END(ret_from_fork) 311END(ret_from_fork)
301 312
302ENTRY(ret_from_kernel_thread)
303 CFI_STARTPROC
304 pushl_cfi %eax
305 call schedule_tail
306 GET_THREAD_INFO(%ebp)
307 popl_cfi %eax
308 pushl_cfi $0x0202 # Reset kernel eflags
309 popfl_cfi
310 movl PT_EBP(%esp),%eax
311 call *PT_EBX(%esp)
312 movl $0,PT_EAX(%esp)
313 jmp syscall_exit
314 CFI_ENDPROC
315ENDPROC(ret_from_kernel_thread)
316
317/* 313/*
318 * Interrupt exit functions should be protected against kprobes 314 * Interrupt exit functions should be protected against kprobes
319 */ 315 */
@@ -332,17 +328,10 @@ ret_from_exception:
332 preempt_stop(CLBR_ANY) 328 preempt_stop(CLBR_ANY)
333ret_from_intr: 329ret_from_intr:
334 GET_THREAD_INFO(%ebp) 330 GET_THREAD_INFO(%ebp)
335#ifdef CONFIG_VM86 331check_userspace:
336 movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS 332 movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
337 movb PT_CS(%esp), %al 333 movb PT_CS(%esp), %al
338 andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax 334 andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
339#else
340 /*
341 * We can be coming here from child spawned by kernel_thread().
342 */
343 movl PT_CS(%esp), %eax
344 andl $SEGMENT_RPL_MASK, %eax
345#endif
346 cmpl $USER_RPL, %eax 335 cmpl $USER_RPL, %eax
347 jb resume_kernel # not returning to v8086 or userspace 336 jb resume_kernel # not returning to v8086 or userspace
348 337
@@ -422,18 +411,19 @@ sysenter_past_esp:
422 */ 411 */
423 cmpl $__PAGE_OFFSET-3,%ebp 412 cmpl $__PAGE_OFFSET-3,%ebp
424 jae syscall_fault 413 jae syscall_fault
425 ASM_STAC
 426 1: movl (%ebp),%ebp 414 1: movl (%ebp),%ebp
427 ASM_CLAC
428 movl %ebp,PT_EBP(%esp) 415 movl %ebp,PT_EBP(%esp)
429 _ASM_EXTABLE(1b,syscall_fault) 416.section __ex_table,"a"
417 .align 4
418 .long 1b,syscall_fault
419.previous
430 420
431 GET_THREAD_INFO(%ebp) 421 GET_THREAD_INFO(%ebp)
432 422
433 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) 423 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
434 jnz sysenter_audit 424 jnz sysenter_audit
435sysenter_do_call: 425sysenter_do_call:
436 cmpl $(NR_syscalls), %eax 426 cmpl $(nr_syscalls), %eax
437 jae syscall_badsys 427 jae syscall_badsys
438 call *sys_call_table(,%eax,4) 428 call *sys_call_table(,%eax,4)
439 movl %eax,PT_EAX(%esp) 429 movl %eax,PT_EAX(%esp)
@@ -465,7 +455,7 @@ sysenter_audit:
465 movl %ebx,%ecx /* 3rd arg: 1st syscall arg */ 455 movl %ebx,%ecx /* 3rd arg: 1st syscall arg */
466 movl %eax,%edx /* 2nd arg: syscall number */ 456 movl %eax,%edx /* 2nd arg: syscall number */
467 movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ 457 movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */
468 call __audit_syscall_entry 458 call audit_syscall_entry
469 pushl_cfi %ebx 459 pushl_cfi %ebx
470 movl PT_EAX(%esp),%eax /* reload syscall number */ 460 movl PT_EAX(%esp),%eax /* reload syscall number */
471 jmp sysenter_do_call 461 jmp sysenter_do_call
@@ -476,10 +466,11 @@ sysexit_audit:
476 TRACE_IRQS_ON 466 TRACE_IRQS_ON
477 ENABLE_INTERRUPTS(CLBR_ANY) 467 ENABLE_INTERRUPTS(CLBR_ANY)
478 movl %eax,%edx /* second arg, syscall return value */ 468 movl %eax,%edx /* second arg, syscall return value */
479 cmpl $-MAX_ERRNO,%eax /* is it an error ? */ 469 cmpl $0,%eax /* is it < 0? */
480 setbe %al /* 1 if so, 0 if not */ 470 setl %al /* 1 if so, 0 if not */
481 movzbl %al,%eax /* zero-extend that */ 471 movzbl %al,%eax /* zero-extend that */
482 call __audit_syscall_exit 472 inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
473 call audit_syscall_exit
483 DISABLE_INTERRUPTS(CLBR_ANY) 474 DISABLE_INTERRUPTS(CLBR_ANY)
484 TRACE_IRQS_OFF 475 TRACE_IRQS_OFF
485 movl TI_flags(%ebp), %ecx 476 movl TI_flags(%ebp), %ecx
@@ -493,8 +484,10 @@ sysexit_audit:
493.pushsection .fixup,"ax" 484.pushsection .fixup,"ax"
4942: movl $0,PT_FS(%esp) 4852: movl $0,PT_FS(%esp)
495 jmp 1b 486 jmp 1b
487.section __ex_table,"a"
488 .align 4
489 .long 1b,2b
496.popsection 490.popsection
497 _ASM_EXTABLE(1b,2b)
498 PTGS_TO_GS_EX 491 PTGS_TO_GS_EX
499ENDPROC(ia32_sysenter_target) 492ENDPROC(ia32_sysenter_target)
500 493
@@ -505,14 +498,13 @@ ENDPROC(ia32_sysenter_target)
505 # system call handler stub 498 # system call handler stub
506ENTRY(system_call) 499ENTRY(system_call)
507 RING0_INT_FRAME # can't unwind into user space anyway 500 RING0_INT_FRAME # can't unwind into user space anyway
508 ASM_CLAC
509 pushl_cfi %eax # save orig_eax 501 pushl_cfi %eax # save orig_eax
510 SAVE_ALL 502 SAVE_ALL
511 GET_THREAD_INFO(%ebp) 503 GET_THREAD_INFO(%ebp)
512 # system call tracing in operation / emulation 504 # system call tracing in operation / emulation
513 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) 505 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
514 jnz syscall_trace_entry 506 jnz syscall_trace_entry
515 cmpl $(NR_syscalls), %eax 507 cmpl $(nr_syscalls), %eax
516 jae syscall_badsys 508 jae syscall_badsys
517syscall_call: 509syscall_call:
518 call *sys_call_table(,%eax,4) 510 call *sys_call_table(,%eax,4)
@@ -550,7 +542,10 @@ ENTRY(iret_exc)
550 pushl $do_iret_error 542 pushl $do_iret_error
551 jmp error_code 543 jmp error_code
552.previous 544.previous
553 _ASM_EXTABLE(irq_return,iret_exc) 545.section __ex_table,"a"
546 .align 4
547 .long irq_return,iret_exc
548.previous
554 549
555 CFI_RESTORE_STATE 550 CFI_RESTORE_STATE
556ldt_ss: 551ldt_ss:
@@ -630,29 +625,22 @@ work_notifysig: # deal with pending signals and
630 movl %esp, %eax 625 movl %esp, %eax
631 jne work_notifysig_v86 # returning to kernel-space or 626 jne work_notifysig_v86 # returning to kernel-space or
632 # vm86-space 627 # vm86-space
 633 1:
634#else
635 movl %esp, %eax
636#endif
637 TRACE_IRQS_ON
638 ENABLE_INTERRUPTS(CLBR_NONE)
639 movb PT_CS(%esp), %bl
640 andb $SEGMENT_RPL_MASK, %bl
641 cmpb $USER_RPL, %bl
642 jb resume_kernel
643 xorl %edx, %edx 628 xorl %edx, %edx
644 call do_notify_resume 629 call do_notify_resume
645 jmp resume_userspace 630 jmp resume_userspace_sig
646 631
647#ifdef CONFIG_VM86
648 ALIGN 632 ALIGN
649work_notifysig_v86: 633work_notifysig_v86:
650 pushl_cfi %ecx # save ti_flags for do_notify_resume 634 pushl_cfi %ecx # save ti_flags for do_notify_resume
651 call save_v86_state # %eax contains pt_regs pointer 635 call save_v86_state # %eax contains pt_regs pointer
652 popl_cfi %ecx 636 popl_cfi %ecx
653 movl %eax, %esp 637 movl %eax, %esp
654 jmp 1b 638#else
639 movl %esp, %eax
655#endif 640#endif
641 xorl %edx, %edx
642 call do_notify_resume
643 jmp resume_userspace_sig
656END(work_pending) 644END(work_pending)
657 645
658 # perform syscall exit tracing 646 # perform syscall exit tracing
@@ -662,7 +650,7 @@ syscall_trace_entry:
662 movl %esp, %eax 650 movl %esp, %eax
663 call syscall_trace_enter 651 call syscall_trace_enter
664 /* What it returned is what we'll actually use. */ 652 /* What it returned is what we'll actually use. */
665 cmpl $(NR_syscalls), %eax 653 cmpl $(nr_syscalls), %eax
666 jnae syscall_call 654 jnae syscall_call
667 jmp syscall_exit 655 jmp syscall_exit
668END(syscall_trace_entry) 656END(syscall_trace_entry)
@@ -683,7 +671,6 @@ END(syscall_exit_work)
683 671
684 RING0_INT_FRAME # can't unwind into user space anyway 672 RING0_INT_FRAME # can't unwind into user space anyway
685syscall_fault: 673syscall_fault:
686 ASM_CLAC
687 GET_THREAD_INFO(%ebp) 674 GET_THREAD_INFO(%ebp)
688 movl $-EFAULT,PT_EAX(%esp) 675 movl $-EFAULT,PT_EAX(%esp)
689 jmp resume_userspace 676 jmp resume_userspace
@@ -703,28 +690,29 @@ END(syscall_badsys)
703 * System calls that need a pt_regs pointer. 690 * System calls that need a pt_regs pointer.
704 */ 691 */
705#define PTREGSCALL0(name) \ 692#define PTREGSCALL0(name) \
706ENTRY(ptregs_##name) ; \ 693 ALIGN; \
694ptregs_##name: \
707 leal 4(%esp),%eax; \ 695 leal 4(%esp),%eax; \
708 jmp sys_##name; \ 696 jmp sys_##name;
709ENDPROC(ptregs_##name)
710 697
711#define PTREGSCALL1(name) \ 698#define PTREGSCALL1(name) \
712ENTRY(ptregs_##name) ; \ 699 ALIGN; \
700ptregs_##name: \
713 leal 4(%esp),%edx; \ 701 leal 4(%esp),%edx; \
714 movl (PT_EBX+4)(%esp),%eax; \ 702 movl (PT_EBX+4)(%esp),%eax; \
715 jmp sys_##name; \ 703 jmp sys_##name;
716ENDPROC(ptregs_##name)
717 704
718#define PTREGSCALL2(name) \ 705#define PTREGSCALL2(name) \
719ENTRY(ptregs_##name) ; \ 706 ALIGN; \
707ptregs_##name: \
720 leal 4(%esp),%ecx; \ 708 leal 4(%esp),%ecx; \
721 movl (PT_ECX+4)(%esp),%edx; \ 709 movl (PT_ECX+4)(%esp),%edx; \
722 movl (PT_EBX+4)(%esp),%eax; \ 710 movl (PT_EBX+4)(%esp),%eax; \
723 jmp sys_##name; \ 711 jmp sys_##name;
724ENDPROC(ptregs_##name)
725 712
726#define PTREGSCALL3(name) \ 713#define PTREGSCALL3(name) \
727ENTRY(ptregs_##name) ; \ 714 ALIGN; \
715ptregs_##name: \
728 CFI_STARTPROC; \ 716 CFI_STARTPROC; \
729 leal 4(%esp),%eax; \ 717 leal 4(%esp),%eax; \
730 pushl_cfi %eax; \ 718 pushl_cfi %eax; \
@@ -739,11 +727,32 @@ ENTRY(ptregs_##name) ; \
739ENDPROC(ptregs_##name) 727ENDPROC(ptregs_##name)
740 728
741PTREGSCALL1(iopl) 729PTREGSCALL1(iopl)
730PTREGSCALL0(fork)
731PTREGSCALL0(vfork)
732PTREGSCALL3(execve)
733PTREGSCALL2(sigaltstack)
742PTREGSCALL0(sigreturn) 734PTREGSCALL0(sigreturn)
743PTREGSCALL0(rt_sigreturn) 735PTREGSCALL0(rt_sigreturn)
744PTREGSCALL2(vm86) 736PTREGSCALL2(vm86)
745PTREGSCALL1(vm86old) 737PTREGSCALL1(vm86old)
746 738
739/* Clone is an oddball. The 4th arg is in %edi */
740 ALIGN;
741ptregs_clone:
742 CFI_STARTPROC
743 leal 4(%esp),%eax
744 pushl_cfi %eax
745 pushl_cfi PT_EDI(%eax)
746 movl PT_EDX(%eax),%ecx
747 movl PT_ECX(%eax),%edx
748 movl PT_EBX(%eax),%eax
749 call sys_clone
750 addl $8,%esp
751 CFI_ADJUST_CFA_OFFSET -8
752 ret
753 CFI_ENDPROC
754ENDPROC(ptregs_clone)
755
747.macro FIXUP_ESPFIX_STACK 756.macro FIXUP_ESPFIX_STACK
748/* 757/*
749 * Switch back for ESPFIX stack to the normal zerobased stack 758 * Switch back for ESPFIX stack to the normal zerobased stack
@@ -819,7 +828,6 @@ END(interrupt)
819 */ 828 */
820 .p2align CONFIG_X86_L1_CACHE_SHIFT 829 .p2align CONFIG_X86_L1_CACHE_SHIFT
821common_interrupt: 830common_interrupt:
822 ASM_CLAC
823 addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */ 831 addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */
824 SAVE_ALL 832 SAVE_ALL
825 TRACE_IRQS_OFF 833 TRACE_IRQS_OFF
@@ -836,7 +844,6 @@ ENDPROC(common_interrupt)
836#define BUILD_INTERRUPT3(name, nr, fn) \ 844#define BUILD_INTERRUPT3(name, nr, fn) \
837ENTRY(name) \ 845ENTRY(name) \
838 RING0_INT_FRAME; \ 846 RING0_INT_FRAME; \
839 ASM_CLAC; \
840 pushl_cfi $~(nr); \ 847 pushl_cfi $~(nr); \
841 SAVE_ALL; \ 848 SAVE_ALL; \
842 TRACE_IRQS_OFF \ 849 TRACE_IRQS_OFF \
@@ -853,7 +860,6 @@ ENDPROC(name)
853 860
854ENTRY(coprocessor_error) 861ENTRY(coprocessor_error)
855 RING0_INT_FRAME 862 RING0_INT_FRAME
856 ASM_CLAC
857 pushl_cfi $0 863 pushl_cfi $0
858 pushl_cfi $do_coprocessor_error 864 pushl_cfi $do_coprocessor_error
859 jmp error_code 865 jmp error_code
@@ -862,7 +868,6 @@ END(coprocessor_error)
862 868
863ENTRY(simd_coprocessor_error) 869ENTRY(simd_coprocessor_error)
864 RING0_INT_FRAME 870 RING0_INT_FRAME
865 ASM_CLAC
866 pushl_cfi $0 871 pushl_cfi $0
867#ifdef CONFIG_X86_INVD_BUG 872#ifdef CONFIG_X86_INVD_BUG
868 /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ 873 /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
@@ -884,7 +889,6 @@ END(simd_coprocessor_error)
884 889
885ENTRY(device_not_available) 890ENTRY(device_not_available)
886 RING0_INT_FRAME 891 RING0_INT_FRAME
887 ASM_CLAC
888 pushl_cfi $-1 # mark this as an int 892 pushl_cfi $-1 # mark this as an int
889 pushl_cfi $do_device_not_available 893 pushl_cfi $do_device_not_available
890 jmp error_code 894 jmp error_code
@@ -894,7 +898,10 @@ END(device_not_available)
894#ifdef CONFIG_PARAVIRT 898#ifdef CONFIG_PARAVIRT
895ENTRY(native_iret) 899ENTRY(native_iret)
896 iret 900 iret
897 _ASM_EXTABLE(native_iret, iret_exc) 901.section __ex_table,"a"
902 .align 4
903 .long native_iret, iret_exc
904.previous
898END(native_iret) 905END(native_iret)
899 906
900ENTRY(native_irq_enable_sysexit) 907ENTRY(native_irq_enable_sysexit)
@@ -905,7 +912,6 @@ END(native_irq_enable_sysexit)
905 912
906ENTRY(overflow) 913ENTRY(overflow)
907 RING0_INT_FRAME 914 RING0_INT_FRAME
908 ASM_CLAC
909 pushl_cfi $0 915 pushl_cfi $0
910 pushl_cfi $do_overflow 916 pushl_cfi $do_overflow
911 jmp error_code 917 jmp error_code
@@ -914,7 +920,6 @@ END(overflow)
914 920
915ENTRY(bounds) 921ENTRY(bounds)
916 RING0_INT_FRAME 922 RING0_INT_FRAME
917 ASM_CLAC
918 pushl_cfi $0 923 pushl_cfi $0
919 pushl_cfi $do_bounds 924 pushl_cfi $do_bounds
920 jmp error_code 925 jmp error_code
@@ -923,7 +928,6 @@ END(bounds)
923 928
924ENTRY(invalid_op) 929ENTRY(invalid_op)
925 RING0_INT_FRAME 930 RING0_INT_FRAME
926 ASM_CLAC
927 pushl_cfi $0 931 pushl_cfi $0
928 pushl_cfi $do_invalid_op 932 pushl_cfi $do_invalid_op
929 jmp error_code 933 jmp error_code
@@ -932,7 +936,6 @@ END(invalid_op)
932 936
933ENTRY(coprocessor_segment_overrun) 937ENTRY(coprocessor_segment_overrun)
934 RING0_INT_FRAME 938 RING0_INT_FRAME
935 ASM_CLAC
936 pushl_cfi $0 939 pushl_cfi $0
937 pushl_cfi $do_coprocessor_segment_overrun 940 pushl_cfi $do_coprocessor_segment_overrun
938 jmp error_code 941 jmp error_code
@@ -941,7 +944,6 @@ END(coprocessor_segment_overrun)
941 944
942ENTRY(invalid_TSS) 945ENTRY(invalid_TSS)
943 RING0_EC_FRAME 946 RING0_EC_FRAME
944 ASM_CLAC
945 pushl_cfi $do_invalid_TSS 947 pushl_cfi $do_invalid_TSS
946 jmp error_code 948 jmp error_code
947 CFI_ENDPROC 949 CFI_ENDPROC
@@ -949,7 +951,6 @@ END(invalid_TSS)
949 951
950ENTRY(segment_not_present) 952ENTRY(segment_not_present)
951 RING0_EC_FRAME 953 RING0_EC_FRAME
952 ASM_CLAC
953 pushl_cfi $do_segment_not_present 954 pushl_cfi $do_segment_not_present
954 jmp error_code 955 jmp error_code
955 CFI_ENDPROC 956 CFI_ENDPROC
@@ -957,7 +958,6 @@ END(segment_not_present)
957 958
958ENTRY(stack_segment) 959ENTRY(stack_segment)
959 RING0_EC_FRAME 960 RING0_EC_FRAME
960 ASM_CLAC
961 pushl_cfi $do_stack_segment 961 pushl_cfi $do_stack_segment
962 jmp error_code 962 jmp error_code
963 CFI_ENDPROC 963 CFI_ENDPROC
@@ -965,7 +965,6 @@ END(stack_segment)
965 965
966ENTRY(alignment_check) 966ENTRY(alignment_check)
967 RING0_EC_FRAME 967 RING0_EC_FRAME
968 ASM_CLAC
969 pushl_cfi $do_alignment_check 968 pushl_cfi $do_alignment_check
970 jmp error_code 969 jmp error_code
971 CFI_ENDPROC 970 CFI_ENDPROC
@@ -973,7 +972,6 @@ END(alignment_check)
973 972
974ENTRY(divide_error) 973ENTRY(divide_error)
975 RING0_INT_FRAME 974 RING0_INT_FRAME
976 ASM_CLAC
977 pushl_cfi $0 # no error code 975 pushl_cfi $0 # no error code
978 pushl_cfi $do_divide_error 976 pushl_cfi $do_divide_error
979 jmp error_code 977 jmp error_code
@@ -983,7 +981,6 @@ END(divide_error)
983#ifdef CONFIG_X86_MCE 981#ifdef CONFIG_X86_MCE
984ENTRY(machine_check) 982ENTRY(machine_check)
985 RING0_INT_FRAME 983 RING0_INT_FRAME
986 ASM_CLAC
987 pushl_cfi $0 984 pushl_cfi $0
988 pushl_cfi machine_check_vector 985 pushl_cfi machine_check_vector
989 jmp error_code 986 jmp error_code
@@ -993,7 +990,6 @@ END(machine_check)
993 990
994ENTRY(spurious_interrupt_bug) 991ENTRY(spurious_interrupt_bug)
995 RING0_INT_FRAME 992 RING0_INT_FRAME
996 ASM_CLAC
997 pushl_cfi $0 993 pushl_cfi $0
998 pushl_cfi $do_spurious_interrupt_bug 994 pushl_cfi $do_spurious_interrupt_bug
999 jmp error_code 995 jmp error_code
@@ -1004,6 +1000,16 @@ END(spurious_interrupt_bug)
1004 */ 1000 */
1005 .popsection 1001 .popsection
1006 1002
1003ENTRY(kernel_thread_helper)
1004 pushl $0 # fake return address for unwinder
1005 CFI_STARTPROC
1006 movl %edi,%eax
1007 call *%esi
1008 call do_exit
1009 ud2 # padding for call trace
1010 CFI_ENDPROC
1011ENDPROC(kernel_thread_helper)
1012
1007#ifdef CONFIG_XEN 1013#ifdef CONFIG_XEN
1008/* Xen doesn't set %esp to be precisely what the normal sysenter 1014/* Xen doesn't set %esp to be precisely what the normal sysenter
1009 entrypoint expects, so fix it up before using the normal path. */ 1015 entrypoint expects, so fix it up before using the normal path. */
@@ -1016,7 +1022,7 @@ ENTRY(xen_sysenter_target)
1016 1022
1017ENTRY(xen_hypervisor_callback) 1023ENTRY(xen_hypervisor_callback)
1018 CFI_STARTPROC 1024 CFI_STARTPROC
1019 pushl_cfi $-1 /* orig_ax = -1 => not a system call */ 1025 pushl_cfi $0
1020 SAVE_ALL 1026 SAVE_ALL
1021 TRACE_IRQS_OFF 1027 TRACE_IRQS_OFF
1022 1028
@@ -1058,16 +1064,14 @@ ENTRY(xen_failsafe_callback)
10582: mov 8(%esp),%es 10642: mov 8(%esp),%es
10593: mov 12(%esp),%fs 10653: mov 12(%esp),%fs
10604: mov 16(%esp),%gs 10664: mov 16(%esp),%gs
1061 /* EAX == 0 => Category 1 (Bad segment)
1062 EAX != 0 => Category 2 (Bad IRET) */
1063 testl %eax,%eax 1067 testl %eax,%eax
1064 popl_cfi %eax 1068 popl_cfi %eax
1065 lea 16(%esp),%esp 1069 lea 16(%esp),%esp
1066 CFI_ADJUST_CFA_OFFSET -16 1070 CFI_ADJUST_CFA_OFFSET -16
1067 jz 5f 1071 jz 5f
1068 addl $16,%esp 1072 addl $16,%esp
1069 jmp iret_exc 1073 jmp iret_exc # EAX != 0 => Category 2 (Bad IRET)
10705: pushl_cfi $-1 /* orig_ax = -1 => not a system call */ 10745: pushl_cfi $0 # EAX == 0 => Category 1 (Bad segment)
1071 SAVE_ALL 1075 SAVE_ALL
1072 jmp ret_from_exception 1076 jmp ret_from_exception
1073 CFI_ENDPROC 1077 CFI_ENDPROC
@@ -1086,10 +1090,13 @@ ENTRY(xen_failsafe_callback)
1086 movl %eax,16(%esp) 1090 movl %eax,16(%esp)
1087 jmp 4b 1091 jmp 4b
1088.previous 1092.previous
1089 _ASM_EXTABLE(1b,6b) 1093.section __ex_table,"a"
1090 _ASM_EXTABLE(2b,7b) 1094 .align 4
1091 _ASM_EXTABLE(3b,8b) 1095 .long 1b,6b
1092 _ASM_EXTABLE(4b,9b) 1096 .long 2b,7b
1097 .long 3b,8b
1098 .long 4b,9b
1099.previous
1093ENDPROC(xen_failsafe_callback) 1100ENDPROC(xen_failsafe_callback)
1094 1101
1095BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK, 1102BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK,
@@ -1111,21 +1118,17 @@ ENTRY(ftrace_caller)
1111 pushl %eax 1118 pushl %eax
1112 pushl %ecx 1119 pushl %ecx
1113 pushl %edx 1120 pushl %edx
1114 pushl $0 /* Pass NULL as regs pointer */ 1121 movl 0xc(%esp), %eax
1115 movl 4*4(%esp), %eax
1116 movl 0x4(%ebp), %edx 1122 movl 0x4(%ebp), %edx
1117 leal function_trace_op, %ecx
1118 subl $MCOUNT_INSN_SIZE, %eax 1123 subl $MCOUNT_INSN_SIZE, %eax
1119 1124
1120.globl ftrace_call 1125.globl ftrace_call
1121ftrace_call: 1126ftrace_call:
1122 call ftrace_stub 1127 call ftrace_stub
1123 1128
1124 addl $4,%esp /* skip NULL pointer */
1125 popl %edx 1129 popl %edx
1126 popl %ecx 1130 popl %ecx
1127 popl %eax 1131 popl %eax
1128ftrace_ret:
1129#ifdef CONFIG_FUNCTION_GRAPH_TRACER 1132#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1130.globl ftrace_graph_call 1133.globl ftrace_graph_call
1131ftrace_graph_call: 1134ftrace_graph_call:
@@ -1137,71 +1140,6 @@ ftrace_stub:
1137 ret 1140 ret
1138END(ftrace_caller) 1141END(ftrace_caller)
1139 1142
1140ENTRY(ftrace_regs_caller)
1141 pushf /* push flags before compare (in cs location) */
1142 cmpl $0, function_trace_stop
1143 jne ftrace_restore_flags
1144
1145 /*
1146 * i386 does not save SS and ESP when coming from kernel.
1147 * Instead, to get sp, &regs->sp is used (see ptrace.h).
1148 * Unfortunately, that means eflags must be at the same location
1149 * as the current return ip is. We move the return ip into the
1150 * ip location, and move flags into the return ip location.
1151 */
1152 pushl 4(%esp) /* save return ip into ip slot */
1153
1154 pushl $0 /* Load 0 into orig_ax */
1155 pushl %gs
1156 pushl %fs
1157 pushl %es
1158 pushl %ds
1159 pushl %eax
1160 pushl %ebp
1161 pushl %edi
1162 pushl %esi
1163 pushl %edx
1164 pushl %ecx
1165 pushl %ebx
1166
1167 movl 13*4(%esp), %eax /* Get the saved flags */
1168 movl %eax, 14*4(%esp) /* Move saved flags into regs->flags location */
1169 /* clobbering return ip */
1170 movl $__KERNEL_CS,13*4(%esp)
1171
1172 movl 12*4(%esp), %eax /* Load ip (1st parameter) */
1173 subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */
1174 movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */
1175 leal function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */
1176 pushl %esp /* Save pt_regs as 4th parameter */
1177
1178GLOBAL(ftrace_regs_call)
1179 call ftrace_stub
1180
1181 addl $4, %esp /* Skip pt_regs */
1182 movl 14*4(%esp), %eax /* Move flags back into cs */
1183 movl %eax, 13*4(%esp) /* Needed to keep addl from modifying flags */
1184 movl 12*4(%esp), %eax /* Get return ip from regs->ip */
1185 movl %eax, 14*4(%esp) /* Put return ip back for ret */
1186
1187 popl %ebx
1188 popl %ecx
1189 popl %edx
1190 popl %esi
1191 popl %edi
1192 popl %ebp
1193 popl %eax
1194 popl %ds
1195 popl %es
1196 popl %fs
1197 popl %gs
1198 addl $8, %esp /* Skip orig_ax and ip */
1199 popf /* Pop flags at end (no addl to corrupt flags) */
1200 jmp ftrace_ret
1201
1202ftrace_restore_flags:
1203 popf
1204 jmp ftrace_stub
1205#else /* ! CONFIG_DYNAMIC_FTRACE */ 1143#else /* ! CONFIG_DYNAMIC_FTRACE */
1206 1144
1207ENTRY(mcount) 1145ENTRY(mcount)
@@ -1242,6 +1180,9 @@ END(mcount)
1242 1180
1243#ifdef CONFIG_FUNCTION_GRAPH_TRACER 1181#ifdef CONFIG_FUNCTION_GRAPH_TRACER
1244ENTRY(ftrace_graph_caller) 1182ENTRY(ftrace_graph_caller)
1183 cmpl $0, function_trace_stop
1184 jne ftrace_stub
1185
1245 pushl %eax 1186 pushl %eax
1246 pushl %ecx 1187 pushl %ecx
1247 pushl %edx 1188 pushl %edx
@@ -1268,6 +1209,11 @@ return_to_handler:
1268 jmp *%ecx 1209 jmp *%ecx
1269#endif 1210#endif
1270 1211
1212.section .rodata,"a"
1213#include "syscall_table_32.S"
1214
1215syscall_table_size=(.-sys_call_table)
1216
1271/* 1217/*
1272 * Some functions should be protected against kprobes 1218 * Some functions should be protected against kprobes
1273 */ 1219 */
@@ -1275,7 +1221,6 @@ return_to_handler:
1275 1221
1276ENTRY(page_fault) 1222ENTRY(page_fault)
1277 RING0_EC_FRAME 1223 RING0_EC_FRAME
1278 ASM_CLAC
1279 pushl_cfi $do_page_fault 1224 pushl_cfi $do_page_fault
1280 ALIGN 1225 ALIGN
1281error_code: 1226error_code:
@@ -1348,7 +1293,6 @@ END(page_fault)
1348 1293
1349ENTRY(debug) 1294ENTRY(debug)
1350 RING0_INT_FRAME 1295 RING0_INT_FRAME
1351 ASM_CLAC
1352 cmpl $ia32_sysenter_target,(%esp) 1296 cmpl $ia32_sysenter_target,(%esp)
1353 jne debug_stack_correct 1297 jne debug_stack_correct
1354 FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn 1298 FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
@@ -1373,7 +1317,6 @@ END(debug)
1373 */ 1317 */
1374ENTRY(nmi) 1318ENTRY(nmi)
1375 RING0_INT_FRAME 1319 RING0_INT_FRAME
1376 ASM_CLAC
1377 pushl_cfi %eax 1320 pushl_cfi %eax
1378 movl %ss, %eax 1321 movl %ss, %eax
1379 cmpw $__ESPFIX_SS, %ax 1322 cmpw $__ESPFIX_SS, %ax
@@ -1444,7 +1387,6 @@ END(nmi)
1444 1387
1445ENTRY(int3) 1388ENTRY(int3)
1446 RING0_INT_FRAME 1389 RING0_INT_FRAME
1447 ASM_CLAC
1448 pushl_cfi $-1 # mark this as an int 1390 pushl_cfi $-1 # mark this as an int
1449 SAVE_ALL 1391 SAVE_ALL
1450 TRACE_IRQS_OFF 1392 TRACE_IRQS_OFF
@@ -1465,7 +1407,6 @@ END(general_protection)
1465#ifdef CONFIG_KVM_GUEST 1407#ifdef CONFIG_KVM_GUEST
1466ENTRY(async_page_fault) 1408ENTRY(async_page_fault)
1467 RING0_EC_FRAME 1409 RING0_EC_FRAME
1468 ASM_CLAC
1469 pushl_cfi $do_async_page_fault 1410 pushl_cfi $do_async_page_fault
1470 jmp error_code 1411 jmp error_code
1471 CFI_ENDPROC 1412 CFI_ENDPROC
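Several hunks in this file (and in entry_64.S below) replace the _ASM_EXTABLE(from, to) helper with open-coded __ex_table sections: .long pairs here, .quad pairs on 64-bit. Both spellings emit the same data, a table of (faulting instruction, fixup) address pairs that the fault handler consults before declaring a kernel-mode fault fatal. A minimal, illustrative C version of that lookup (the kernel's own search_extable() works over a sorted table; the linear scan and the names here are just for exposition):

/* Entry layout matching the .long/.quad pairs above: an address that may
 * fault, and the label to resume at if it does (iret_exc, bad_iret, bad_gs). */
struct exception_table_entry {
	unsigned long insn;
	unsigned long fixup;
};

/* If the faulting IP is listed, the trap handler rewrites the saved IP to the
 * fixup address and resumes instead of oopsing. Returns 0 when not covered. */
static unsigned long search_fixup(const struct exception_table_entry *table,
				  unsigned int nentries, unsigned long fault_ip)
{
	unsigned int i;

	for (i = 0; i < nentries; i++)
		if (table[i].insn == fault_ip)
			return table[i].fixup;
	return 0;
}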
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 07a7a04529b..6419bb05ecd 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -55,10 +55,6 @@
55#include <asm/paravirt.h> 55#include <asm/paravirt.h>
56#include <asm/ftrace.h> 56#include <asm/ftrace.h>
57#include <asm/percpu.h> 57#include <asm/percpu.h>
58#include <asm/asm.h>
59#include <asm/context_tracking.h>
60#include <asm/smap.h>
61#include <linux/err.h>
62 58
63/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ 59/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
64#include <linux/elf-em.h> 60#include <linux/elf-em.h>
@@ -70,51 +66,25 @@
70 .section .entry.text, "ax" 66 .section .entry.text, "ax"
71 67
72#ifdef CONFIG_FUNCTION_TRACER 68#ifdef CONFIG_FUNCTION_TRACER
73
74#ifdef CC_USING_FENTRY
75# define function_hook __fentry__
76#else
77# define function_hook mcount
78#endif
79
80#ifdef CONFIG_DYNAMIC_FTRACE 69#ifdef CONFIG_DYNAMIC_FTRACE
81 70ENTRY(mcount)
82ENTRY(function_hook)
83 retq 71 retq
84END(function_hook) 72END(mcount)
85
86/* skip is set if stack has been adjusted */
87.macro ftrace_caller_setup skip=0
88 MCOUNT_SAVE_FRAME \skip
89
90 /* Load the ftrace_ops into the 3rd parameter */
91 leaq function_trace_op, %rdx
92
93 /* Load ip into the first parameter */
94 movq RIP(%rsp), %rdi
95 subq $MCOUNT_INSN_SIZE, %rdi
96 /* Load the parent_ip into the second parameter */
97#ifdef CC_USING_FENTRY
98 movq SS+16(%rsp), %rsi
99#else
100 movq 8(%rbp), %rsi
101#endif
102.endm
103 73
104ENTRY(ftrace_caller) 74ENTRY(ftrace_caller)
105 /* Check if tracing was disabled (quick check) */
106 cmpl $0, function_trace_stop 75 cmpl $0, function_trace_stop
107 jne ftrace_stub 76 jne ftrace_stub
108 77
109 ftrace_caller_setup 78 MCOUNT_SAVE_FRAME
110 /* regs go into 4th parameter (but make it NULL) */ 79
111 movq $0, %rcx 80 movq 0x38(%rsp), %rdi
81 movq 8(%rbp), %rsi
82 subq $MCOUNT_INSN_SIZE, %rdi
112 83
113GLOBAL(ftrace_call) 84GLOBAL(ftrace_call)
114 call ftrace_stub 85 call ftrace_stub
115 86
116 MCOUNT_RESTORE_FRAME 87 MCOUNT_RESTORE_FRAME
117ftrace_return:
118 88
119#ifdef CONFIG_FUNCTION_GRAPH_TRACER 89#ifdef CONFIG_FUNCTION_GRAPH_TRACER
120GLOBAL(ftrace_graph_call) 90GLOBAL(ftrace_graph_call)
@@ -125,78 +95,8 @@ GLOBAL(ftrace_stub)
125 retq 95 retq
126END(ftrace_caller) 96END(ftrace_caller)
127 97
128ENTRY(ftrace_regs_caller)
129 /* Save the current flags before compare (in SS location)*/
130 pushfq
131
132 /* Check if tracing was disabled (quick check) */
133 cmpl $0, function_trace_stop
134 jne ftrace_restore_flags
135
136 /* skip=8 to skip flags saved in SS */
137 ftrace_caller_setup 8
138
139 /* Save the rest of pt_regs */
140 movq %r15, R15(%rsp)
141 movq %r14, R14(%rsp)
142 movq %r13, R13(%rsp)
143 movq %r12, R12(%rsp)
144 movq %r11, R11(%rsp)
145 movq %r10, R10(%rsp)
146 movq %rbp, RBP(%rsp)
147 movq %rbx, RBX(%rsp)
148 /* Copy saved flags */
149 movq SS(%rsp), %rcx
150 movq %rcx, EFLAGS(%rsp)
151 /* Kernel segments */
152 movq $__KERNEL_DS, %rcx
153 movq %rcx, SS(%rsp)
154 movq $__KERNEL_CS, %rcx
155 movq %rcx, CS(%rsp)
156 /* Stack - skipping return address */
157 leaq SS+16(%rsp), %rcx
158 movq %rcx, RSP(%rsp)
159
160 /* regs go into 4th parameter */
161 leaq (%rsp), %rcx
162
163GLOBAL(ftrace_regs_call)
164 call ftrace_stub
165
166 /* Copy flags back to SS, to restore them */
167 movq EFLAGS(%rsp), %rax
168 movq %rax, SS(%rsp)
169
170 /* Handlers can change the RIP */
171 movq RIP(%rsp), %rax
172 movq %rax, SS+8(%rsp)
173
174 /* restore the rest of pt_regs */
175 movq R15(%rsp), %r15
176 movq R14(%rsp), %r14
177 movq R13(%rsp), %r13
178 movq R12(%rsp), %r12
179 movq R10(%rsp), %r10
180 movq RBP(%rsp), %rbp
181 movq RBX(%rsp), %rbx
182
183 /* skip=8 to skip flags saved in SS */
184 MCOUNT_RESTORE_FRAME 8
185
186 /* Restore flags */
187 popfq
188
189 jmp ftrace_return
190ftrace_restore_flags:
191 popfq
192 jmp ftrace_stub
193
194END(ftrace_regs_caller)
195
196
197#else /* ! CONFIG_DYNAMIC_FTRACE */ 98#else /* ! CONFIG_DYNAMIC_FTRACE */
198 99ENTRY(mcount)
199ENTRY(function_hook)
200 cmpl $0, function_trace_stop 100 cmpl $0, function_trace_stop
201 jne ftrace_stub 101 jne ftrace_stub
202 102
@@ -217,12 +117,8 @@ GLOBAL(ftrace_stub)
217trace: 117trace:
218 MCOUNT_SAVE_FRAME 118 MCOUNT_SAVE_FRAME
219 119
220 movq RIP(%rsp), %rdi 120 movq 0x38(%rsp), %rdi
221#ifdef CC_USING_FENTRY
222 movq SS+16(%rsp), %rsi
223#else
224 movq 8(%rbp), %rsi 121 movq 8(%rbp), %rsi
225#endif
226 subq $MCOUNT_INSN_SIZE, %rdi 122 subq $MCOUNT_INSN_SIZE, %rdi
227 123
228 call *ftrace_trace_function 124 call *ftrace_trace_function
@@ -230,22 +126,20 @@ trace:
230 MCOUNT_RESTORE_FRAME 126 MCOUNT_RESTORE_FRAME
231 127
232 jmp ftrace_stub 128 jmp ftrace_stub
233END(function_hook) 129END(mcount)
234#endif /* CONFIG_DYNAMIC_FTRACE */ 130#endif /* CONFIG_DYNAMIC_FTRACE */
235#endif /* CONFIG_FUNCTION_TRACER */ 131#endif /* CONFIG_FUNCTION_TRACER */
236 132
237#ifdef CONFIG_FUNCTION_GRAPH_TRACER 133#ifdef CONFIG_FUNCTION_GRAPH_TRACER
238ENTRY(ftrace_graph_caller) 134ENTRY(ftrace_graph_caller)
135 cmpl $0, function_trace_stop
136 jne ftrace_stub
137
239 MCOUNT_SAVE_FRAME 138 MCOUNT_SAVE_FRAME
240 139
241#ifdef CC_USING_FENTRY
242 leaq SS+16(%rsp), %rdi
243 movq $0, %rdx /* No framepointers needed */
244#else
245 leaq 8(%rbp), %rdi 140 leaq 8(%rbp), %rdi
141 movq 0x38(%rsp), %rsi
246 movq (%rbp), %rdx 142 movq (%rbp), %rdx
247#endif
248 movq RIP(%rsp), %rsi
249 subq $MCOUNT_INSN_SIZE, %rsi 143 subq $MCOUNT_INSN_SIZE, %rsi
250 144
251 call prepare_ftrace_return 145 call prepare_ftrace_return
@@ -295,44 +189,6 @@ ENDPROC(native_usergs_sysret64)
295.endm 189.endm
296 190
297/* 191/*
298 * When dynamic function tracer is enabled it will add a breakpoint
299 * to all locations that it is about to modify, sync CPUs, update
300 * all the code, sync CPUs, then remove the breakpoints. In this time
301 * if lockdep is enabled, it might jump back into the debug handler
302 * outside the updating of the IST protection. (TRACE_IRQS_ON/OFF).
303 *
304 * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to
305 * make sure the stack pointer does not get reset back to the top
306 * of the debug stack, and instead just reuses the current stack.
307 */
308#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS)
309
310.macro TRACE_IRQS_OFF_DEBUG
311 call debug_stack_set_zero
312 TRACE_IRQS_OFF
313 call debug_stack_reset
314.endm
315
316.macro TRACE_IRQS_ON_DEBUG
317 call debug_stack_set_zero
318 TRACE_IRQS_ON
319 call debug_stack_reset
320.endm
321
322.macro TRACE_IRQS_IRETQ_DEBUG offset=ARGOFFSET
323 bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */
324 jnc 1f
325 TRACE_IRQS_ON_DEBUG
3261:
327.endm
328
329#else
330# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF
331# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON
332# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ
333#endif
334
335/*
336 * C code is not supposed to know about undefined top of stack. Every time 192 * C code is not supposed to know about undefined top of stack. Every time
337 * a C function with a pt_regs argument is called from the SYSCALL based 193 * a C function with a pt_regs argument is called from the SYSCALL based
338 * fast path FIXUP_TOP_OF_STACK is needed. 194 * fast path FIXUP_TOP_OF_STACK is needed.
@@ -365,7 +221,7 @@ ENDPROC(native_usergs_sysret64)
365 /*CFI_REL_OFFSET ss,0*/ 221 /*CFI_REL_OFFSET ss,0*/
366 pushq_cfi %rax /* rsp */ 222 pushq_cfi %rax /* rsp */
367 CFI_REL_OFFSET rsp,0 223 CFI_REL_OFFSET rsp,0
368 pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_BIT1) /* eflags - interrupts on */ 224 pushq_cfi $X86_EFLAGS_IF /* eflags - interrupts on */
369 /*CFI_REL_OFFSET rflags,0*/ 225 /*CFI_REL_OFFSET rflags,0*/
370 pushq_cfi $__KERNEL_CS /* cs */ 226 pushq_cfi $__KERNEL_CS /* cs */
371 /*CFI_REL_OFFSET cs,0*/ 227 /*CFI_REL_OFFSET cs,0*/
@@ -446,15 +302,15 @@ ENDPROC(native_usergs_sysret64)
446 .macro SAVE_ARGS_IRQ 302 .macro SAVE_ARGS_IRQ
447 cld 303 cld
448 /* start from rbp in pt_regs and jump over */ 304 /* start from rbp in pt_regs and jump over */
449 movq_cfi rdi, (RDI-RBP) 305 movq_cfi rdi, RDI-RBP
450 movq_cfi rsi, (RSI-RBP) 306 movq_cfi rsi, RSI-RBP
451 movq_cfi rdx, (RDX-RBP) 307 movq_cfi rdx, RDX-RBP
452 movq_cfi rcx, (RCX-RBP) 308 movq_cfi rcx, RCX-RBP
453 movq_cfi rax, (RAX-RBP) 309 movq_cfi rax, RAX-RBP
454 movq_cfi r8, (R8-RBP) 310 movq_cfi r8, R8-RBP
455 movq_cfi r9, (R9-RBP) 311 movq_cfi r9, R9-RBP
456 movq_cfi r10, (R10-RBP) 312 movq_cfi r10, R10-RBP
457 movq_cfi r11, (R11-RBP) 313 movq_cfi r11, R11-RBP
458 314
459 /* Save rbp so that we can unwind from get_irq_regs() */ 315 /* Save rbp so that we can unwind from get_irq_regs() */
460 movq_cfi rbp, 0 316 movq_cfi rbp, 0
@@ -463,7 +319,7 @@ ENDPROC(native_usergs_sysret64)
463 movq %rsp, %rsi 319 movq %rsp, %rsi
464 320
465 leaq -RBP(%rsp),%rdi /* arg1 for handler */ 321 leaq -RBP(%rsp),%rdi /* arg1 for handler */
466 testl $3, CS-RBP(%rsi) 322 testl $3, CS(%rdi)
467 je 1f 323 je 1f
468 SWAPGS 324 SWAPGS
469 /* 325 /*
@@ -473,22 +329,18 @@ ENDPROC(native_usergs_sysret64)
473 * moving irq_enter into assembly, which would be too much work) 329 * moving irq_enter into assembly, which would be too much work)
474 */ 330 */
4751: incl PER_CPU_VAR(irq_count) 3311: incl PER_CPU_VAR(irq_count)
476 cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp 332 jne 2f
477 CFI_DEF_CFA_REGISTER rsi 333 mov PER_CPU_VAR(irq_stack_ptr),%rsp
334 EMPTY_FRAME 0
478 335
479 /* Store previous stack value */ 3362: /* Store previous stack value */
480 pushq %rsi 337 pushq %rsi
481 CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
482 0x77 /* DW_OP_breg7 */, 0, \
483 0x06 /* DW_OP_deref */, \
484 0x08 /* DW_OP_const1u */, SS+8-RBP, \
485 0x22 /* DW_OP_plus */
486 /* We entered an interrupt context - irqs are off: */ 338 /* We entered an interrupt context - irqs are off: */
487 TRACE_IRQS_OFF 339 TRACE_IRQS_OFF
488 .endm 340 .endm
489 341
490ENTRY(save_rest) 342ENTRY(save_rest)
491 PARTIAL_FRAME 1 (REST_SKIP+8) 343 PARTIAL_FRAME 1 REST_SKIP+8
492 movq 5*8+16(%rsp), %r11 /* save return address */ 344 movq 5*8+16(%rsp), %r11 /* save return address */
493 movq_cfi rbx, RBX+16 345 movq_cfi rbx, RBX+16
494 movq_cfi rbp, RBP+16 346 movq_cfi rbp, RBP+16
@@ -544,7 +396,7 @@ ENTRY(ret_from_fork)
544 396
545 LOCK ; btr $TIF_FORK,TI_flags(%r8) 397 LOCK ; btr $TIF_FORK,TI_flags(%r8)
546 398
547 pushq_cfi $0x0002 399 pushq_cfi kernel_eflags(%rip)
548 popfq_cfi # reset kernel eflags 400 popfq_cfi # reset kernel eflags
549 401
550 call schedule_tail # rdi: 'prev' task parameter 402 call schedule_tail # rdi: 'prev' task parameter
@@ -554,7 +406,7 @@ ENTRY(ret_from_fork)
554 RESTORE_REST 406 RESTORE_REST
555 407
556 testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread? 408 testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
557 jz 1f 409 je int_ret_from_sys_call
558 410
559 testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET 411 testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET
560 jnz int_ret_from_sys_call 412 jnz int_ret_from_sys_call
@@ -562,14 +414,6 @@ ENTRY(ret_from_fork)
562 RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET 414 RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
563 jmp ret_from_sys_call # go to the SYSRET fastpath 415 jmp ret_from_sys_call # go to the SYSRET fastpath
564 416
5651:
566 subq $REST_SKIP, %rsp # leave space for volatiles
567 CFI_ADJUST_CFA_OFFSET REST_SKIP
568 movq %rbp, %rdi
569 call *%rbx
570 movl $0, RAX(%rsp)
571 RESTORE_REST
572 jmp int_ret_from_sys_call
573 CFI_ENDPROC 417 CFI_ENDPROC
574END(ret_from_fork) 418END(ret_from_fork)
575 419
@@ -577,8 +421,7 @@ END(ret_from_fork)
577 * System call entry. Up to 6 arguments in registers are supported. 421 * System call entry. Up to 6 arguments in registers are supported.
578 * 422 *
579 * SYSCALL does not save anything on the stack and does not change the 423 * SYSCALL does not save anything on the stack and does not change the
580 * stack pointer. However, it does mask the flags register for us, so 424 * stack pointer.
581 * CLD and CLAC are not needed.
582 */ 425 */
583 426
584/* 427/*
@@ -617,7 +460,7 @@ ENTRY(system_call)
617 * after the swapgs, so that it can do the swapgs 460 * after the swapgs, so that it can do the swapgs
618 * for the guest and jump here on syscall. 461 * for the guest and jump here on syscall.
619 */ 462 */
620GLOBAL(system_call_after_swapgs) 463ENTRY(system_call_after_swapgs)
621 464
622 movq %rsp,PER_CPU_VAR(old_rsp) 465 movq %rsp,PER_CPU_VAR(old_rsp)
623 movq PER_CPU_VAR(kernel_stack),%rsp 466 movq PER_CPU_VAR(kernel_stack),%rsp
@@ -630,15 +473,11 @@ GLOBAL(system_call_after_swapgs)
630 movq %rax,ORIG_RAX-ARGOFFSET(%rsp) 473 movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
631 movq %rcx,RIP-ARGOFFSET(%rsp) 474 movq %rcx,RIP-ARGOFFSET(%rsp)
632 CFI_REL_OFFSET rip,RIP-ARGOFFSET 475 CFI_REL_OFFSET rip,RIP-ARGOFFSET
633 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) 476 GET_THREAD_INFO(%rcx)
477 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
634 jnz tracesys 478 jnz tracesys
635system_call_fastpath: 479system_call_fastpath:
636#if __SYSCALL_MASK == ~0
637 cmpq $__NR_syscall_max,%rax 480 cmpq $__NR_syscall_max,%rax
638#else
639 andl $__SYSCALL_MASK,%eax
640 cmpl $__NR_syscall_max,%eax
641#endif
642 ja badsys 481 ja badsys
643 movq %r10,%rcx 482 movq %r10,%rcx
644 call *sys_call_table(,%rax,8) # XXX: rip relative 483 call *sys_call_table(,%rax,8) # XXX: rip relative
@@ -652,9 +491,10 @@ ret_from_sys_call:
652 /* edi: flagmask */ 491 /* edi: flagmask */
653sysret_check: 492sysret_check:
654 LOCKDEP_SYS_EXIT 493 LOCKDEP_SYS_EXIT
494 GET_THREAD_INFO(%rcx)
655 DISABLE_INTERRUPTS(CLBR_NONE) 495 DISABLE_INTERRUPTS(CLBR_NONE)
656 TRACE_IRQS_OFF 496 TRACE_IRQS_OFF
657 movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx 497 movl TI_flags(%rcx),%edx
658 andl %edi,%edx 498 andl %edi,%edx
659 jnz sysret_careful 499 jnz sysret_careful
660 CFI_REMEMBER_STATE 500 CFI_REMEMBER_STATE
@@ -678,7 +518,7 @@ sysret_careful:
678 TRACE_IRQS_ON 518 TRACE_IRQS_ON
679 ENABLE_INTERRUPTS(CLBR_NONE) 519 ENABLE_INTERRUPTS(CLBR_NONE)
680 pushq_cfi %rdi 520 pushq_cfi %rdi
681 SCHEDULE_USER 521 call schedule
682 popq_cfi %rdi 522 popq_cfi %rdi
683 jmp sysret_check 523 jmp sysret_check
684 524
@@ -705,7 +545,7 @@ badsys:
705#ifdef CONFIG_AUDITSYSCALL 545#ifdef CONFIG_AUDITSYSCALL
706 /* 546 /*
707 * Fast path for syscall audit without full syscall trace. 547 * Fast path for syscall audit without full syscall trace.
708 * We just call __audit_syscall_entry() directly, and then 548 * We just call audit_syscall_entry() directly, and then
709 * jump back to the normal fast path. 549 * jump back to the normal fast path.
710 */ 550 */
711auditsys: 551auditsys:
@@ -715,21 +555,22 @@ auditsys:
715 movq %rdi,%rdx /* 3rd arg: 1st syscall arg */ 555 movq %rdi,%rdx /* 3rd arg: 1st syscall arg */
716 movq %rax,%rsi /* 2nd arg: syscall number */ 556 movq %rax,%rsi /* 2nd arg: syscall number */
717 movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */ 557 movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */
718 call __audit_syscall_entry 558 call audit_syscall_entry
719 LOAD_ARGS 0 /* reload call-clobbered registers */ 559 LOAD_ARGS 0 /* reload call-clobbered registers */
720 jmp system_call_fastpath 560 jmp system_call_fastpath
721 561
722 /* 562 /*
723 * Return fast path for syscall audit. Call __audit_syscall_exit() 563 * Return fast path for syscall audit. Call audit_syscall_exit()
724 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT 564 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
725 * masked off. 565 * masked off.
726 */ 566 */
727sysret_audit: 567sysret_audit:
728 movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */ 568 movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */
729 cmpq $-MAX_ERRNO,%rsi /* is it < -MAX_ERRNO? */ 569 cmpq $0,%rsi /* is it < 0? */
730 setbe %al /* 1 if so, 0 if not */ 570 setl %al /* 1 if so, 0 if not */
731 movzbl %al,%edi /* zero-extend that into %edi */ 571 movzbl %al,%edi /* zero-extend that into %edi */
732 call __audit_syscall_exit 572 inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
573 call audit_syscall_exit
733 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi 574 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
734 jmp sysret_check 575 jmp sysret_check
735#endif /* CONFIG_AUDITSYSCALL */ 576#endif /* CONFIG_AUDITSYSCALL */
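The sysret_audit hunk above swaps __audit_syscall_exit(), which takes a success boolean, for the older audit_syscall_exit() convention in which the first argument is computed from the syscall return value exactly as the setl/inc comments spell out. The same mapping in C, for illustration only (the constant values follow the inline comment):

enum { AUDITSC_SUCCESS = 1, AUDITSC_FAILURE = 2 };	/* values per the comment */

static int audit_result_from_retval(long retval)
{
	int failed = (retval < 0);	/* setl %al: 1 if the return value is negative */
	return failed + 1;		/* inc %edi: 0 -> AUDITSC_SUCCESS, 1 -> AUDITSC_FAILURE */
}

The -MAX_ERRNO comparison on the left-hand side exists so that large unsigned return values (such as addresses returned by mmap()) are not misreported as failures.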
@@ -737,7 +578,7 @@ sysret_audit:
737 /* Do syscall tracing */ 578 /* Do syscall tracing */
738tracesys: 579tracesys:
739#ifdef CONFIG_AUDITSYSCALL 580#ifdef CONFIG_AUDITSYSCALL
740 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) 581 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
741 jz auditsys 582 jz auditsys
742#endif 583#endif
743 SAVE_REST 584 SAVE_REST
@@ -752,12 +593,7 @@ tracesys:
752 */ 593 */
753 LOAD_ARGS ARGOFFSET, 1 594 LOAD_ARGS ARGOFFSET, 1
754 RESTORE_REST 595 RESTORE_REST
755#if __SYSCALL_MASK == ~0
756 cmpq $__NR_syscall_max,%rax 596 cmpq $__NR_syscall_max,%rax
757#else
758 andl $__SYSCALL_MASK,%eax
759 cmpl $__NR_syscall_max,%eax
760#endif
761 ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */ 597 ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
762 movq %r10,%rcx /* fixup for C */ 598 movq %r10,%rcx /* fixup for C */
763 call *sys_call_table(,%rax,8) 599 call *sys_call_table(,%rax,8)
@@ -771,6 +607,8 @@ tracesys:
771GLOBAL(int_ret_from_sys_call) 607GLOBAL(int_ret_from_sys_call)
772 DISABLE_INTERRUPTS(CLBR_NONE) 608 DISABLE_INTERRUPTS(CLBR_NONE)
773 TRACE_IRQS_OFF 609 TRACE_IRQS_OFF
610 testl $3,CS-ARGOFFSET(%rsp)
611 je retint_restore_args
774 movl $_TIF_ALLWORK_MASK,%edi 612 movl $_TIF_ALLWORK_MASK,%edi
775 /* edi: mask to check */ 613 /* edi: mask to check */
776GLOBAL(int_with_check) 614GLOBAL(int_with_check)
@@ -791,7 +629,7 @@ int_careful:
791 TRACE_IRQS_ON 629 TRACE_IRQS_ON
792 ENABLE_INTERRUPTS(CLBR_NONE) 630 ENABLE_INTERRUPTS(CLBR_NONE)
793 pushq_cfi %rdi 631 pushq_cfi %rdi
794 SCHEDULE_USER 632 call schedule
795 popq_cfi %rdi 633 popq_cfi %rdi
796 DISABLE_INTERRUPTS(CLBR_NONE) 634 DISABLE_INTERRUPTS(CLBR_NONE)
797 TRACE_IRQS_OFF 635 TRACE_IRQS_OFF
@@ -845,25 +683,10 @@ ENTRY(\label)
845END(\label) 683END(\label)
846 .endm 684 .endm
847 685
848 .macro FORK_LIKE func 686 PTREGSCALL stub_clone, sys_clone, %r8
849ENTRY(stub_\func) 687 PTREGSCALL stub_fork, sys_fork, %rdi
850 CFI_STARTPROC 688 PTREGSCALL stub_vfork, sys_vfork, %rdi
851 popq %r11 /* save return address */ 689 PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
852 PARTIAL_FRAME 0
853 SAVE_REST
854 pushq %r11 /* put it back on stack */
855 FIXUP_TOP_OF_STACK %r11, 8
856 DEFAULT_FRAME 0 8 /* offset 8: return address */
857 call sys_\func
858 RESTORE_TOP_OF_STACK %r11, 8
859 ret $REST_SKIP /* pop extended registers */
860 CFI_ENDPROC
861END(stub_\func)
862 .endm
863
864 FORK_LIKE clone
865 FORK_LIKE fork
866 FORK_LIKE vfork
867 PTREGSCALL stub_iopl, sys_iopl, %rsi 690 PTREGSCALL stub_iopl, sys_iopl, %rsi
868 691
869ENTRY(ptregscall_common) 692ENTRY(ptregscall_common)
@@ -885,6 +708,7 @@ ENTRY(stub_execve)
885 PARTIAL_FRAME 0 708 PARTIAL_FRAME 0
886 SAVE_REST 709 SAVE_REST
887 FIXUP_TOP_OF_STACK %r11 710 FIXUP_TOP_OF_STACK %r11
711 movq %rsp, %rcx
888 call sys_execve 712 call sys_execve
889 RESTORE_TOP_OF_STACK %r11 713 RESTORE_TOP_OF_STACK %r11
890 movq %rax,RAX(%rsp) 714 movq %rax,RAX(%rsp)
@@ -911,37 +735,6 @@ ENTRY(stub_rt_sigreturn)
911 CFI_ENDPROC 735 CFI_ENDPROC
912END(stub_rt_sigreturn) 736END(stub_rt_sigreturn)
913 737
914#ifdef CONFIG_X86_X32_ABI
915ENTRY(stub_x32_rt_sigreturn)
916 CFI_STARTPROC
917 addq $8, %rsp
918 PARTIAL_FRAME 0
919 SAVE_REST
920 movq %rsp,%rdi
921 FIXUP_TOP_OF_STACK %r11
922 call sys32_x32_rt_sigreturn
923 movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
924 RESTORE_REST
925 jmp int_ret_from_sys_call
926 CFI_ENDPROC
927END(stub_x32_rt_sigreturn)
928
929ENTRY(stub_x32_execve)
930 CFI_STARTPROC
931 addq $8, %rsp
932 PARTIAL_FRAME 0
933 SAVE_REST
934 FIXUP_TOP_OF_STACK %r11
935 call compat_sys_execve
936 RESTORE_TOP_OF_STACK %r11
937 movq %rax,RAX(%rsp)
938 RESTORE_REST
939 jmp int_ret_from_sys_call
940 CFI_ENDPROC
941END(stub_x32_execve)
942
943#endif
944
945/* 738/*
946 * Build the entry stubs and pointer table with some assembler magic. 739 * Build the entry stubs and pointer table with some assembler magic.
947 * We pack 7 stubs into a single 32-byte chunk, which will fit in a 740 * We pack 7 stubs into a single 32-byte chunk, which will fit in a
@@ -995,6 +788,7 @@ END(interrupt)
995 subq $ORIG_RAX-RBP, %rsp 788 subq $ORIG_RAX-RBP, %rsp
996 CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP 789 CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
997 SAVE_ARGS_IRQ 790 SAVE_ARGS_IRQ
791 PARTIAL_FRAME 0
998 call \func 792 call \func
999 .endm 793 .endm
1000 794
@@ -1009,7 +803,6 @@ END(interrupt)
1009 .p2align CONFIG_X86_L1_CACHE_SHIFT 803 .p2align CONFIG_X86_L1_CACHE_SHIFT
1010common_interrupt: 804common_interrupt:
1011 XCPT_FRAME 805 XCPT_FRAME
1012 ASM_CLAC
1013 addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ 806 addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */
1014 interrupt do_IRQ 807 interrupt do_IRQ
1015 /* 0(%rsp): old_rsp-ARGOFFSET */ 808 /* 0(%rsp): old_rsp-ARGOFFSET */
@@ -1020,10 +813,10 @@ ret_from_intr:
1020 813
1021 /* Restore saved previous stack */ 814 /* Restore saved previous stack */
1022 popq %rsi 815 popq %rsi
1023 CFI_DEF_CFA rsi,SS+8-RBP /* reg/off reset after def_cfa_expr */ 816 leaq 16(%rsi), %rsp
1024 leaq ARGOFFSET-RBP(%rsi), %rsp 817
1025 CFI_DEF_CFA_REGISTER rsp 818 CFI_DEF_CFA_REGISTER rsp
1026 CFI_ADJUST_CFA_OFFSET RBP-ARGOFFSET 819 CFI_ADJUST_CFA_OFFSET -16
1027 820
1028exit_intr: 821exit_intr:
1029 GET_THREAD_INFO(%rcx) 822 GET_THREAD_INFO(%rcx)
@@ -1064,12 +857,18 @@ restore_args:
1064 857
1065irq_return: 858irq_return:
1066 INTERRUPT_RETURN 859 INTERRUPT_RETURN
1067 _ASM_EXTABLE(irq_return, bad_iret) 860
861 .section __ex_table, "a"
862 .quad irq_return, bad_iret
863 .previous
1068 864
1069#ifdef CONFIG_PARAVIRT 865#ifdef CONFIG_PARAVIRT
1070ENTRY(native_iret) 866ENTRY(native_iret)
1071 iretq 867 iretq
1072 _ASM_EXTABLE(native_iret, bad_iret) 868
869 .section __ex_table,"a"
870 .quad native_iret, bad_iret
871 .previous
1073#endif 872#endif
1074 873
1075 .section .fixup,"ax" 874 .section .fixup,"ax"
@@ -1099,7 +898,7 @@ retint_careful:
1099 TRACE_IRQS_ON 898 TRACE_IRQS_ON
1100 ENABLE_INTERRUPTS(CLBR_NONE) 899 ENABLE_INTERRUPTS(CLBR_NONE)
1101 pushq_cfi %rdi 900 pushq_cfi %rdi
1102 SCHEDULE_USER 901 call schedule
1103 popq_cfi %rdi 902 popq_cfi %rdi
1104 GET_THREAD_INFO(%rcx) 903 GET_THREAD_INFO(%rcx)
1105 DISABLE_INTERRUPTS(CLBR_NONE) 904 DISABLE_INTERRUPTS(CLBR_NONE)
@@ -1149,9 +948,7 @@ END(common_interrupt)
1149.macro apicinterrupt num sym do_sym 948.macro apicinterrupt num sym do_sym
1150ENTRY(\sym) 949ENTRY(\sym)
1151 INTR_FRAME 950 INTR_FRAME
1152 ASM_CLAC
1153 pushq_cfi $~(\num) 951 pushq_cfi $~(\num)
1154.Lcommon_\sym:
1155 interrupt \do_sym 952 interrupt \do_sym
1156 jmp ret_from_intr 953 jmp ret_from_intr
1157 CFI_ENDPROC 954 CFI_ENDPROC
@@ -1174,6 +971,16 @@ apicinterrupt LOCAL_TIMER_VECTOR \
1174apicinterrupt X86_PLATFORM_IPI_VECTOR \ 971apicinterrupt X86_PLATFORM_IPI_VECTOR \
1175 x86_platform_ipi smp_x86_platform_ipi 972 x86_platform_ipi smp_x86_platform_ipi
1176 973
974#ifdef CONFIG_SMP
975.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
976 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
977.if NUM_INVALIDATE_TLB_VECTORS > \idx
978apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \
979 invalidate_interrupt\idx smp_invalidate_interrupt
980.endif
981.endr
982#endif
983
1177apicinterrupt THRESHOLD_APIC_VECTOR \ 984apicinterrupt THRESHOLD_APIC_VECTOR \
1178 threshold_interrupt smp_threshold_interrupt 985 threshold_interrupt smp_threshold_interrupt
1179apicinterrupt THERMAL_APIC_VECTOR \ 986apicinterrupt THERMAL_APIC_VECTOR \
@@ -1204,7 +1011,6 @@ apicinterrupt IRQ_WORK_VECTOR \
1204.macro zeroentry sym do_sym 1011.macro zeroentry sym do_sym
1205ENTRY(\sym) 1012ENTRY(\sym)
1206 INTR_FRAME 1013 INTR_FRAME
1207 ASM_CLAC
1208 PARAVIRT_ADJUST_EXCEPTION_FRAME 1014 PARAVIRT_ADJUST_EXCEPTION_FRAME
1209 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ 1015 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1210 subq $ORIG_RAX-R15, %rsp 1016 subq $ORIG_RAX-R15, %rsp
@@ -1222,7 +1028,6 @@ END(\sym)
1222.macro paranoidzeroentry sym do_sym 1028.macro paranoidzeroentry sym do_sym
1223ENTRY(\sym) 1029ENTRY(\sym)
1224 INTR_FRAME 1030 INTR_FRAME
1225 ASM_CLAC
1226 PARAVIRT_ADJUST_EXCEPTION_FRAME 1031 PARAVIRT_ADJUST_EXCEPTION_FRAME
1227 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ 1032 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1228 subq $ORIG_RAX-R15, %rsp 1033 subq $ORIG_RAX-R15, %rsp
@@ -1241,13 +1046,12 @@ END(\sym)
1241.macro paranoidzeroentry_ist sym do_sym ist 1046.macro paranoidzeroentry_ist sym do_sym ist
1242ENTRY(\sym) 1047ENTRY(\sym)
1243 INTR_FRAME 1048 INTR_FRAME
1244 ASM_CLAC
1245 PARAVIRT_ADJUST_EXCEPTION_FRAME 1049 PARAVIRT_ADJUST_EXCEPTION_FRAME
1246 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ 1050 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1247 subq $ORIG_RAX-R15, %rsp 1051 subq $ORIG_RAX-R15, %rsp
1248 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 1052 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1249 call save_paranoid 1053 call save_paranoid
1250 TRACE_IRQS_OFF_DEBUG 1054 TRACE_IRQS_OFF
1251 movq %rsp,%rdi /* pt_regs pointer */ 1055 movq %rsp,%rdi /* pt_regs pointer */
1252 xorl %esi,%esi /* no error code */ 1056 xorl %esi,%esi /* no error code */
1253 subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) 1057 subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
@@ -1261,7 +1065,6 @@ END(\sym)
1261.macro errorentry sym do_sym 1065.macro errorentry sym do_sym
1262ENTRY(\sym) 1066ENTRY(\sym)
1263 XCPT_FRAME 1067 XCPT_FRAME
1264 ASM_CLAC
1265 PARAVIRT_ADJUST_EXCEPTION_FRAME 1068 PARAVIRT_ADJUST_EXCEPTION_FRAME
1266 subq $ORIG_RAX-R15, %rsp 1069 subq $ORIG_RAX-R15, %rsp
1267 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 1070 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
@@ -1280,7 +1083,6 @@ END(\sym)
1280.macro paranoiderrorentry sym do_sym 1083.macro paranoiderrorentry sym do_sym
1281ENTRY(\sym) 1084ENTRY(\sym)
1282 XCPT_FRAME 1085 XCPT_FRAME
1283 ASM_CLAC
1284 PARAVIRT_ADJUST_EXCEPTION_FRAME 1086 PARAVIRT_ADJUST_EXCEPTION_FRAME
1285 subq $ORIG_RAX-R15, %rsp 1087 subq $ORIG_RAX-R15, %rsp
1286 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 1088 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
@@ -1327,7 +1129,10 @@ gs_change:
1327 CFI_ENDPROC 1129 CFI_ENDPROC
1328END(native_load_gs_index) 1130END(native_load_gs_index)
1329 1131
1330 _ASM_EXTABLE(gs_change,bad_gs) 1132 .section __ex_table,"a"
1133 .align 8
1134 .quad gs_change,bad_gs
1135 .previous
1331 .section .fixup,"ax" 1136 .section .fixup,"ax"
1332 /* running with kernelgs */ 1137 /* running with kernelgs */
1333bad_gs: 1138bad_gs:
@@ -1337,6 +1142,52 @@ bad_gs:
1337 jmp 2b 1142 jmp 2b
1338 .previous 1143 .previous
1339 1144
1145ENTRY(kernel_thread_helper)
1146 pushq $0 # fake return address
1147 CFI_STARTPROC
1148 /*
1149 * Here we are in the child and the registers are set as they were
1150 * at kernel_thread() invocation in the parent.
1151 */
1152 call *%rsi
1153 # exit
1154 mov %eax, %edi
1155 call do_exit
1156 ud2 # padding for call trace
1157 CFI_ENDPROC
1158END(kernel_thread_helper)
1159
1160/*
1161 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
1162 *
1163 * C extern interface:
1164 * extern long execve(const char *name, char **argv, char **envp)
1165 *
1166 * asm input arguments:
1167 * rdi: name, rsi: argv, rdx: envp
1168 *
1169 * We want to fall back into:
1170 * extern long sys_execve(const char *name, char **argv,char **envp, struct pt_regs *regs)
1171 *
1172 * do_sys_execve asm fallback arguments:
1173 * rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
1174 */
1175ENTRY(kernel_execve)
1176 CFI_STARTPROC
1177 FAKE_STACK_FRAME $0
1178 SAVE_ALL
1179 movq %rsp,%rcx
1180 call sys_execve
1181 movq %rax, RAX(%rsp)
1182 RESTORE_REST
1183 testq %rax,%rax
1184 je int_ret_from_sys_call
1185 RESTORE_ARGS
1186 UNFAKE_STACK_FRAME
1187 ret
1188 CFI_ENDPROC
1189END(kernel_execve)
1190
1340/* Call softirq on interrupt stack. Interrupts are off. */ 1191/* Call softirq on interrupt stack. Interrupts are off. */
1341ENTRY(call_softirq) 1192ENTRY(call_softirq)
1342 CFI_STARTPROC 1193 CFI_STARTPROC
@@ -1448,7 +1299,7 @@ ENTRY(xen_failsafe_callback)
1448 CFI_RESTORE r11 1299 CFI_RESTORE r11
1449 addq $0x30,%rsp 1300 addq $0x30,%rsp
1450 CFI_ADJUST_CFA_OFFSET -0x30 1301 CFI_ADJUST_CFA_OFFSET -0x30
1451 pushq_cfi $-1 /* orig_ax = -1 => not a system call */ 1302 pushq_cfi $0
1452 SAVE_ALL 1303 SAVE_ALL
1453 jmp error_exit 1304 jmp error_exit
1454 CFI_ENDPROC 1305 CFI_ENDPROC
@@ -1498,7 +1349,7 @@ paranoidzeroentry machine_check *machine_check_vector(%rip)
1498ENTRY(paranoid_exit) 1349ENTRY(paranoid_exit)
1499 DEFAULT_FRAME 1350 DEFAULT_FRAME
1500 DISABLE_INTERRUPTS(CLBR_NONE) 1351 DISABLE_INTERRUPTS(CLBR_NONE)
1501 TRACE_IRQS_OFF_DEBUG 1352 TRACE_IRQS_OFF
1502 testl %ebx,%ebx /* swapgs needed? */ 1353 testl %ebx,%ebx /* swapgs needed? */
1503 jnz paranoid_restore 1354 jnz paranoid_restore
1504 testl $3,CS(%rsp) 1355 testl $3,CS(%rsp)
@@ -1509,7 +1360,7 @@ paranoid_swapgs:
1509 RESTORE_ALL 8 1360 RESTORE_ALL 8
1510 jmp irq_return 1361 jmp irq_return
1511paranoid_restore: 1362paranoid_restore:
1512 TRACE_IRQS_IRETQ_DEBUG 0 1363 TRACE_IRQS_IRETQ 0
1513 RESTORE_ALL 8 1364 RESTORE_ALL 8
1514 jmp irq_return 1365 jmp irq_return
1515paranoid_userspace: 1366paranoid_userspace:
@@ -1534,7 +1385,7 @@ paranoid_userspace:
1534paranoid_schedule: 1385paranoid_schedule:
1535 TRACE_IRQS_ON 1386 TRACE_IRQS_ON
1536 ENABLE_INTERRUPTS(CLBR_ANY) 1387 ENABLE_INTERRUPTS(CLBR_ANY)
1537 SCHEDULE_USER 1388 call schedule
1538 DISABLE_INTERRUPTS(CLBR_ANY) 1389 DISABLE_INTERRUPTS(CLBR_ANY)
1539 TRACE_IRQS_OFF 1390 TRACE_IRQS_OFF
1540 jmp paranoid_userspace 1391 jmp paranoid_userspace
@@ -1620,258 +1471,60 @@ ENTRY(error_exit)
1620 CFI_ENDPROC 1471 CFI_ENDPROC
1621END(error_exit) 1472END(error_exit)
1622 1473
1623/*
1624 * Test if a given stack is an NMI stack or not.
1625 */
1626 .macro test_in_nmi reg stack nmi_ret normal_ret
1627 cmpq %\reg, \stack
1628 ja \normal_ret
1629 subq $EXCEPTION_STKSZ, %\reg
1630 cmpq %\reg, \stack
1631 jb \normal_ret
1632 jmp \nmi_ret
1633 .endm
1634 1474
1635 /* runs on exception stack */ 1475 /* runs on exception stack */
1636ENTRY(nmi) 1476ENTRY(nmi)
1637 INTR_FRAME 1477 INTR_FRAME
1638 PARAVIRT_ADJUST_EXCEPTION_FRAME 1478 PARAVIRT_ADJUST_EXCEPTION_FRAME
1639 /* 1479 pushq_cfi $-1
1640 * We allow breakpoints in NMIs. If a breakpoint occurs, then
1641 * the iretq it performs will take us out of NMI context.
1642 * This means that we can have nested NMIs where the next
1643 * NMI is using the top of the stack of the previous NMI. We
1644 * can't let it execute because the nested NMI will corrupt the
1645 * stack of the previous NMI. NMI handlers are not re-entrant
1646 * anyway.
1647 *
1648 * To handle this case we do the following:
1650 * Check a special location on the stack that contains
1650 * a variable that is set when NMIs are executing.
1651 * The interrupted task's stack is also checked to see if it
1652 * is an NMI stack.
1653 * If the variable is not set and the stack is not the NMI
1654 * stack then:
1655 * o Set the special variable on the stack
1656 * o Copy the interrupt frame into a "saved" location on the stack
1657 * o Copy the interrupt frame into a "copy" location on the stack
1658 * o Continue processing the NMI
1659 * If the variable is set or the previous stack is the NMI stack:
1660 * o Modify the "copy" location to jump to the repeat_nmi
1661 * o return back to the first NMI
1662 *
1663 * Now on exit of the first NMI, we first clear the stack variable
1664 * The NMI stack will tell any nested NMIs at that point that it is
1665 * nested. Then we pop the stack normally with iret, and if there was
1666 * a nested NMI that updated the copy interrupt stack frame, a
1667 * jump will be made to the repeat_nmi code that will handle the second
1668 * NMI.
1669 */
1670
1671 /* Use %rdx as our temp variable throughout */
1672 pushq_cfi %rdx
1673 CFI_REL_OFFSET rdx, 0
1674
1675 /*
1676 * If %cs was not the kernel segment, then the NMI triggered in user
1677 * space, which means it is definitely not nested.
1678 */
1679 cmpl $__KERNEL_CS, 16(%rsp)
1680 jne first_nmi
1681
1682 /*
1683 * Check the special variable on the stack to see if NMIs are
1684 * executing.
1685 */
1686 cmpl $1, -8(%rsp)
1687 je nested_nmi
1688
1689 /*
1690 * Now test if the previous stack was an NMI stack.
1691 * We need the double check. We check the NMI stack to satisfy the
1692 * race when the first NMI clears the variable before returning.
1693 * We check the variable because the first NMI could be in a
1694 * breakpoint routine using a breakpoint stack.
1695 */
1696 lea 6*8(%rsp), %rdx
1697 test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi
1698 CFI_REMEMBER_STATE
1699
1700nested_nmi:
1701 /*
1702 * Do nothing if we interrupted the fixup in repeat_nmi.
1703 * It's about to repeat the NMI handler, so we are fine
1704 * with ignoring this one.
1705 */
1706 movq $repeat_nmi, %rdx
1707 cmpq 8(%rsp), %rdx
1708 ja 1f
1709 movq $end_repeat_nmi, %rdx
1710 cmpq 8(%rsp), %rdx
1711 ja nested_nmi_out
1712
17131:
1714 /* Set up the interrupted NMIs stack to jump to repeat_nmi */
1715 leaq -1*8(%rsp), %rdx
1716 movq %rdx, %rsp
1717 CFI_ADJUST_CFA_OFFSET 1*8
1718 leaq -10*8(%rsp), %rdx
1719 pushq_cfi $__KERNEL_DS
1720 pushq_cfi %rdx
1721 pushfq_cfi
1722 pushq_cfi $__KERNEL_CS
1723 pushq_cfi $repeat_nmi
1724
1725 /* Put stack back */
1726 addq $(6*8), %rsp
1727 CFI_ADJUST_CFA_OFFSET -6*8
1728
1729nested_nmi_out:
1730 popq_cfi %rdx
1731 CFI_RESTORE rdx
1732
1733 /* No need to check faults here */
1734 INTERRUPT_RETURN
1735
1736 CFI_RESTORE_STATE
1737first_nmi:
1738 /*
1739 * Because nested NMIs will use the pushed location that we
1740 * stored in rdx, we must keep that space available.
1741 * Here's what our stack frame will look like:
1742 * +-------------------------+
1743 * | original SS |
1744 * | original Return RSP |
1745 * | original RFLAGS |
1746 * | original CS |
1747 * | original RIP |
1748 * +-------------------------+
1749 * | temp storage for rdx |
1750 * +-------------------------+
1751 * | NMI executing variable |
1752 * +-------------------------+
1753 * | copied SS |
1754 * | copied Return RSP |
1755 * | copied RFLAGS |
1756 * | copied CS |
1757 * | copied RIP |
1758 * +-------------------------+
1759 * | Saved SS |
1760 * | Saved Return RSP |
1761 * | Saved RFLAGS |
1762 * | Saved CS |
1763 * | Saved RIP |
1764 * +-------------------------+
1765 * | pt_regs |
1766 * +-------------------------+
1767 *
1768 * The saved stack frame is used to fix up the copied stack frame
1769 * that a nested NMI may change to make the interrupted NMI iret jump
1770 * to the repeat_nmi. The original stack frame and the temp storage
1771 * are also used by nested NMIs and cannot be trusted on exit.
1772 */
1773 /* Do not pop rdx, nested NMIs will corrupt that part of the stack */
1774 movq (%rsp), %rdx
1775 CFI_RESTORE rdx
1776
1777 /* Set the NMI executing variable on the stack. */
1778 pushq_cfi $1
1779
1780 /*
1781 * Leave room for the "copied" frame
1782 */
1783 subq $(5*8), %rsp
1784
1785 /* Copy the stack frame to the Saved frame */
1786 .rept 5
1787 pushq_cfi 11*8(%rsp)
1788 .endr
1789 CFI_DEF_CFA_OFFSET SS+8-RIP
1790
1791 /* Everything up to here is safe from nested NMIs */
1792
1793 /*
1794 * If there was a nested NMI, the first NMI's iret will return
1795 * here. But NMIs are still enabled and we can take another
1796 * nested NMI. The nested NMI checks the interrupted RIP to see
1797 * if it is between repeat_nmi and end_repeat_nmi, and if so
1798 * it will just return, as we are about to repeat an NMI anyway.
1799 * This makes it safe to copy to the stack frame that a nested
1800 * NMI will update.
1801 */
1802repeat_nmi:
1803 /*
1804 * Update the stack variable to say we are still in NMI (the update
1805 * is benign for the non-repeat case, where 1 was pushed just above
1806 * to this very stack slot).
1807 */
1808 movq $1, 10*8(%rsp)
1809
1810 /* Make another copy, this one may be modified by nested NMIs */
1811 addq $(10*8), %rsp
1812 CFI_ADJUST_CFA_OFFSET -10*8
1813 .rept 5
1814 pushq_cfi -6*8(%rsp)
1815 .endr
1816 subq $(5*8), %rsp
1817 CFI_DEF_CFA_OFFSET SS+8-RIP
1818end_repeat_nmi:
1819
1820 /*
1821 * Everything below this point can be preempted by a nested
1822 * NMI if the first NMI took an exception and reset our iret stack
1823 * so that we repeat another NMI.
1824 */
1825 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1826 subq $ORIG_RAX-R15, %rsp 1480 subq $ORIG_RAX-R15, %rsp
1827 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 1481 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1828 /*
1829 * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit
1830 * as we should not be calling schedule in NMI context.
1831 * Even with normal interrupts enabled. An NMI should not be
1832 * setting NEED_RESCHED or anything that normal interrupts and
1833 * exceptions might do.
1834 */
1835 call save_paranoid 1482 call save_paranoid
1836 DEFAULT_FRAME 0 1483 DEFAULT_FRAME 0
1837
1838 /*
1839 * Save off the CR2 register. If we take a page fault in the NMI then
1840 * it could corrupt the CR2 value. If the NMI preempts a page fault
1841 * handler before it was able to read the CR2 register, and then the
1842 * NMI itself takes a page fault, the page fault that was preempted
1843 * will read the information from the NMI page fault and not the
1844 * origin fault. Save it off and restore it if it changes.
1845 * Use the r12 callee-saved register.
1846 */
1847 movq %cr2, %r12
1848
1849 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ 1484 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
1850 movq %rsp,%rdi 1485 movq %rsp,%rdi
1851 movq $-1,%rsi 1486 movq $-1,%rsi
1852 call do_nmi 1487 call do_nmi
1853 1488#ifdef CONFIG_TRACE_IRQFLAGS
1854 /* Did the NMI take a page fault? Restore cr2 if it did */ 1489 /* paranoidexit; without TRACE_IRQS_OFF */
1855 movq %cr2, %rcx 1490 /* ebx: no swapgs flag */
1856 cmpq %rcx, %r12 1491 DISABLE_INTERRUPTS(CLBR_NONE)
1857 je 1f
1858 movq %r12, %cr2
18591:
1860
1861 testl %ebx,%ebx /* swapgs needed? */ 1492 testl %ebx,%ebx /* swapgs needed? */
1862 jnz nmi_restore 1493 jnz nmi_restore
1494 testl $3,CS(%rsp)
1495 jnz nmi_userspace
1863nmi_swapgs: 1496nmi_swapgs:
1864 SWAPGS_UNSAFE_STACK 1497 SWAPGS_UNSAFE_STACK
1865nmi_restore: 1498nmi_restore:
1866 RESTORE_ALL 8 1499 RESTORE_ALL 8
1867
1868 /* Pop the extra iret frame */
1869 addq $(5*8), %rsp
1870
1871 /* Clear the NMI executing stack variable */
1872 movq $0, 5*8(%rsp)
1873 jmp irq_return 1500 jmp irq_return
1501nmi_userspace:
1502 GET_THREAD_INFO(%rcx)
1503 movl TI_flags(%rcx),%ebx
1504 andl $_TIF_WORK_MASK,%ebx
1505 jz nmi_swapgs
1506 movq %rsp,%rdi /* &pt_regs */
1507 call sync_regs
1508 movq %rax,%rsp /* switch stack for scheduling */
1509 testl $_TIF_NEED_RESCHED,%ebx
1510 jnz nmi_schedule
1511 movl %ebx,%edx /* arg3: thread flags */
1512 ENABLE_INTERRUPTS(CLBR_NONE)
1513 xorl %esi,%esi /* arg2: oldset */
1514 movq %rsp,%rdi /* arg1: &pt_regs */
1515 call do_notify_resume
1516 DISABLE_INTERRUPTS(CLBR_NONE)
1517 jmp nmi_userspace
1518nmi_schedule:
1519 ENABLE_INTERRUPTS(CLBR_ANY)
1520 call schedule
1521 DISABLE_INTERRUPTS(CLBR_ANY)
1522 jmp nmi_userspace
1874 CFI_ENDPROC 1523 CFI_ENDPROC
1524#else
1525 jmp paranoid_exit
1526 CFI_ENDPROC
1527#endif
1875END(nmi) 1528END(nmi)
1876 1529
1877ENTRY(ignore_sysret) 1530ENTRY(ignore_sysret)
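The largest removal in this file is the nested-NMI handling: the test_in_nmi macro, the "NMI executing" stack variable, the copied and saved IRET frames, and the CR2 save/restore. The stack-range test at the core of it reduces to a few lines of C; the sketch below reuses the names from the removed assembly (EXCEPTION_STKSZ, the saved RSP, the top of the NMI stack) purely for illustration:

/* Sketch of the removed test_in_nmi check: the interrupted RSP counts as
 * being on the NMI stack iff it lies within (top - EXCEPTION_STKSZ, top].
 * A hit means this NMI interrupted another NMI and must not reuse its frame. */
static int interrupted_on_nmi_stack(unsigned long saved_rsp,
				    unsigned long nmi_stack_top,
				    unsigned long exception_stksz)
{
	if (saved_rsp > nmi_stack_top)				/* above the NMI stack */
		return 0;
	if (saved_rsp < nmi_stack_top - exception_stksz)	/* below the NMI stack */
		return 0;
	return 1;						/* nested NMI */
}

On a hit, the removed code patched the "copy" IRET frame so the first NMI's iretq lands in repeat_nmi, which is what the .rept 5 frame-copying above implements.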
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 1d414029f1d..c9a281f272f 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -24,21 +24,40 @@
24#include <trace/syscall.h> 24#include <trace/syscall.h>
25 25
26#include <asm/cacheflush.h> 26#include <asm/cacheflush.h>
27#include <asm/kprobes.h>
28#include <asm/ftrace.h> 27#include <asm/ftrace.h>
29#include <asm/nops.h> 28#include <asm/nops.h>
29#include <asm/nmi.h>
30
30 31
31#ifdef CONFIG_DYNAMIC_FTRACE 32#ifdef CONFIG_DYNAMIC_FTRACE
32 33
34/*
35 * modifying_code is set to notify NMIs that they need to use
36 * memory barriers when entering or exiting. But we don't want
37 * to burden NMIs with unnecessary memory barriers when code
38 * modification is not being done (which is most of the time).
39 *
40 * A mutex is already held when ftrace_arch_code_modify_prepare
41 * and post_process are called. No locks need to be taken here.
42 *
43 * Stop machine will make sure currently running NMIs are done
44 * and new NMIs will see the updated variable before we need
45 * to worry about NMIs doing memory barriers.
46 */
47static int modifying_code __read_mostly;
48static DEFINE_PER_CPU(int, save_modifying_code);
49
33int ftrace_arch_code_modify_prepare(void) 50int ftrace_arch_code_modify_prepare(void)
34{ 51{
35 set_kernel_text_rw(); 52 set_kernel_text_rw();
36 set_all_modules_text_rw(); 53 set_all_modules_text_rw();
54 modifying_code = 1;
37 return 0; 55 return 0;
38} 56}
39 57
40int ftrace_arch_code_modify_post_process(void) 58int ftrace_arch_code_modify_post_process(void)
41{ 59{
60 modifying_code = 0;
42 set_all_modules_text_ro(); 61 set_all_modules_text_ro();
43 set_kernel_text_ro(); 62 set_kernel_text_ro();
44 return 0; 63 return 0;
@@ -71,204 +90,142 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
71 return calc.code; 90 return calc.code;
72} 91}
73 92
74static inline int 93/*
75within(unsigned long addr, unsigned long start, unsigned long end) 94 * Modifying code must take extra care. On an SMP machine, if
76{ 95 * the code being modified is also being executed on another CPU,
77 return addr >= start && addr < end; 96 * that CPU will have undefined results and possibly take a GPF.
78} 97 * We use kstop_machine to stop other CPUs from executing code.
98 * But this does not stop NMIs from happening. We still need
99 * to protect against that. We separate out the modification of
100 * the code to take care of this.
101 *
102 * Two buffers are added: An IP buffer and a "code" buffer.
103 *
104 * 1) Put the instruction pointer into the IP buffer
105 * and the new code into the "code" buffer.
106 * 2) Wait for any running NMIs to finish and set a flag that says
107 * we are modifying code, it is done in an atomic operation.
108 * 3) Write the code
109 * 4) clear the flag.
110 * 5) Wait for any running NMIs to finish.
111 *
112 * If an NMI is executed, the first thing it does is to call
113 * "ftrace_nmi_enter". This will check if the flag is set to write
114 * and if it is, it will write what is in the IP and "code" buffers.
115 *
116 * The trick is, it does not matter if everyone is writing the same
117 * content to the code location. Also, if a CPU is executing code
118 * it is OK to write to that code location if the contents being written
119 * are the same as what exists.
120 */
79 121
80static int 122#define MOD_CODE_WRITE_FLAG (1 << 31) /* set when NMI should do the write */
81do_ftrace_mod_code(unsigned long ip, const void *new_code) 123static atomic_t nmi_running = ATOMIC_INIT(0);
82{ 124static int mod_code_status; /* holds return value of text write */
83 /* 125static void *mod_code_ip; /* holds the IP to write to */
84 * On x86_64, kernel text mappings are mapped read-only with 126static const void *mod_code_newcode; /* holds the text to write to the IP */
85 * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead
86 * of the kernel text mapping to modify the kernel text.
87 *
88 * For 32bit kernels, these mappings are same and we can use
89 * kernel identity mapping to modify code.
90 */
91 if (within(ip, (unsigned long)_text, (unsigned long)_etext))
92 ip = (unsigned long)__va(__pa(ip));
93 127
94 return probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE); 128static unsigned nmi_wait_count;
95} 129static atomic_t nmi_update_count = ATOMIC_INIT(0);
96 130
97static const unsigned char *ftrace_nop_replace(void) 131int ftrace_arch_read_dyn_info(char *buf, int size)
98{ 132{
99 return ideal_nops[NOP_ATOMIC5]; 133 int r;
134
135 r = snprintf(buf, size, "%u %u",
136 nmi_wait_count,
137 atomic_read(&nmi_update_count));
138 return r;
100} 139}
101 140
102static int 141static void clear_mod_flag(void)
103ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code,
104 unsigned const char *new_code)
105{ 142{
106 unsigned char replaced[MCOUNT_INSN_SIZE]; 143 int old = atomic_read(&nmi_running);
107
108 /*
109 * Note: Due to modules and __init, code can
110 * disappear and change, we need to protect against faulting
111 * as well as code changing. We do this by using the
112 * probe_kernel_* functions.
113 *
114 * No real locking needed, this code is run through
115 * kstop_machine, or before SMP starts.
116 */
117 144
118 /* read the text we want to modify */ 145 for (;;) {
119 if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) 146 int new = old & ~MOD_CODE_WRITE_FLAG;
120 return -EFAULT;
121 147
122 /* Make sure it is what we expect it to be */ 148 if (old == new)
123 if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0) 149 break;
124 return -EINVAL;
125 150
126 /* replace the text with the new text */ 151 old = atomic_cmpxchg(&nmi_running, old, new);
127 if (do_ftrace_mod_code(ip, new_code)) 152 }
128 return -EPERM;
129
130 sync_core();
131
132 return 0;
133} 153}
134 154
135int ftrace_make_nop(struct module *mod, 155static void ftrace_mod_code(void)
136 struct dyn_ftrace *rec, unsigned long addr)
137{ 156{
138 unsigned const char *new, *old;
139 unsigned long ip = rec->ip;
140
141 old = ftrace_call_replace(ip, addr);
142 new = ftrace_nop_replace();
143
144 /* 157 /*
145 * On boot up, and when modules are loaded, the MCOUNT_ADDR 158 * Yes, more than one CPU process can be writing to mod_code_status.
146 * is converted to a nop, and will never become MCOUNT_ADDR 159 * (and the code itself)
147 * again. This code is either running before SMP (on boot up) 160 * But if one were to fail, then they all should, and if one were
148 * or before the code will ever be executed (module load). 161 * to succeed, then they all should.
149 * We do not want to use the breakpoint version in this case,
150 * just modify the code directly.
151 */ 162 */
152 if (addr == MCOUNT_ADDR) 163 mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode,
153 return ftrace_modify_code_direct(rec->ip, old, new); 164 MCOUNT_INSN_SIZE);
154 165
155 /* Normal cases use add_brk_on_nop */ 166 /* if we fail, then kill any new writers */
156 WARN_ONCE(1, "invalid use of ftrace_make_nop"); 167 if (mod_code_status)
157 return -EINVAL; 168 clear_mod_flag();
158} 169}
159 170
160int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) 171void ftrace_nmi_enter(void)
161{ 172{
162 unsigned const char *new, *old; 173 __this_cpu_write(save_modifying_code, modifying_code);
163 unsigned long ip = rec->ip;
164 174
165 old = ftrace_nop_replace(); 175 if (!__this_cpu_read(save_modifying_code))
166 new = ftrace_call_replace(ip, addr); 176 return;
167 177
168 /* Should only be called when module is loaded */ 178 if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) {
169 return ftrace_modify_code_direct(rec->ip, old, new); 179 smp_rmb();
180 ftrace_mod_code();
181 atomic_inc(&nmi_update_count);
182 }
183 /* Must have previous changes seen before executions */
184 smp_mb();
170} 185}
171 186
172/* 187void ftrace_nmi_exit(void)
173 * The modifying_ftrace_code is used to tell the breakpoint
174 * handler to call ftrace_int3_handler(). If it fails to
175 * call this handler for a breakpoint added by ftrace, then
176 * the kernel may crash.
177 *
178 * As atomic_writes on x86 do not need a barrier, we do not
179 * need to add smp_mb()s for this to work. It is also considered
180 * that we can not read the modifying_ftrace_code before
181 * executing the breakpoint. That would be quite remarkable if
182 * it could do that. Here's the flow that is required:
183 *
184 * CPU-0 CPU-1
185 *
186 * atomic_inc(mfc);
187 * write int3s
188 * <trap-int3> // implicit (r)mb
189 * if (atomic_read(mfc))
190 * call ftrace_int3_handler()
191 *
192 * Then when we are finished:
193 *
194 * atomic_dec(mfc);
195 *
196 * If we hit a breakpoint that was not set by ftrace, it does not
197 * matter if ftrace_int3_handler() is called or not. It will
198 * simply be ignored. But it is crucial that a ftrace nop/caller
199 * breakpoint is handled. No other user should ever place a
200 * breakpoint on an ftrace nop/caller location. It must only
201 * be done by this code.
202 */
203atomic_t modifying_ftrace_code __read_mostly;
204
205static int
206ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
207 unsigned const char *new_code);
208
209/*
210 * Should never be called:
211 * As it is only called by __ftrace_replace_code() which is called by
212 * ftrace_replace_code() that x86 overrides, and by ftrace_update_code()
213 * which is called to turn mcount into nops or nops into function calls
214 * but not to convert a function from not using regs to one that uses
215 * regs, which ftrace_modify_call() is for.
216 */
217int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr,
218 unsigned long addr)
219{ 188{
220 WARN_ON(1); 189 if (!__this_cpu_read(save_modifying_code))
221 return -EINVAL; 190 return;
191
192 /* Finish all executions before clearing nmi_running */
193 smp_mb();
194 atomic_dec(&nmi_running);
222} 195}
223 196
224int ftrace_update_ftrace_func(ftrace_func_t func) 197static void wait_for_nmi_and_set_mod_flag(void)
225{ 198{
226 unsigned long ip = (unsigned long)(&ftrace_call); 199 if (!atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG))
227 unsigned char old[MCOUNT_INSN_SIZE], *new; 200 return;
228 int ret;
229
230 memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE);
231 new = ftrace_call_replace(ip, (unsigned long)func);
232
233 /* See comment above by declaration of modifying_ftrace_code */
234 atomic_inc(&modifying_ftrace_code);
235
236 ret = ftrace_modify_code(ip, old, new);
237
238 /* Also update the regs callback function */
239 if (!ret) {
240 ip = (unsigned long)(&ftrace_regs_call);
241 memcpy(old, &ftrace_regs_call, MCOUNT_INSN_SIZE);
242 new = ftrace_call_replace(ip, (unsigned long)func);
243 ret = ftrace_modify_code(ip, old, new);
244 }
245 201
246 atomic_dec(&modifying_ftrace_code); 202 do {
203 cpu_relax();
204 } while (atomic_cmpxchg(&nmi_running, 0, MOD_CODE_WRITE_FLAG));
247 205
248 return ret; 206 nmi_wait_count++;
249} 207}
250 208
251/* 209static void wait_for_nmi(void)
252 * A breakpoint was added to the code address we are about to
253 * modify, and this is the handler that will just skip over it.
254 * We are either changing a nop into a trace call, or a trace
255 * call to a nop. While the change is taking place, we treat
256 * it just like it was a nop.
257 */
258int ftrace_int3_handler(struct pt_regs *regs)
259{ 210{
260 if (WARN_ON_ONCE(!regs)) 211 if (!atomic_read(&nmi_running))
261 return 0; 212 return;
262 213
263 if (!ftrace_location(regs->ip - 1)) 214 do {
264 return 0; 215 cpu_relax();
216 } while (atomic_read(&nmi_running));
265 217
266 regs->ip += MCOUNT_INSN_SIZE - 1; 218 nmi_wait_count++;
219}
267 220
268 return 1; 221static inline int
222within(unsigned long addr, unsigned long start, unsigned long end)
223{
224 return addr >= start && addr < end;
269} 225}
270 226
271static int ftrace_write(unsigned long ip, const char *val, int size) 227static int
228do_ftrace_mod_code(unsigned long ip, const void *new_code)
272{ 229{
273 /* 230 /*
274 * On x86_64, kernel text mappings are mapped read-only with 231 * On x86_64, kernel text mappings are mapped read-only with
@@ -281,374 +238,100 @@ static int ftrace_write(unsigned long ip, const char *val, int size)
281 if (within(ip, (unsigned long)_text, (unsigned long)_etext)) 238 if (within(ip, (unsigned long)_text, (unsigned long)_etext))
282 ip = (unsigned long)__va(__pa(ip)); 239 ip = (unsigned long)__va(__pa(ip));
283 240
284 return probe_kernel_write((void *)ip, val, size); 241 mod_code_ip = (void *)ip;
285} 242 mod_code_newcode = new_code;
286 243
287static int add_break(unsigned long ip, const char *old) 244 /* The buffers need to be visible before we let NMIs write them */
288{ 245 smp_mb();
289 unsigned char replaced[MCOUNT_INSN_SIZE];
290 unsigned char brk = BREAKPOINT_INSTRUCTION;
291 246
292 if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE)) 247 wait_for_nmi_and_set_mod_flag();
293 return -EFAULT;
294 248
295 /* Make sure it is what we expect it to be */ 249 /* Make sure all running NMIs have finished before we write the code */
296 if (memcmp(replaced, old, MCOUNT_INSN_SIZE) != 0) 250 smp_mb();
297 return -EINVAL;
298 251
299 if (ftrace_write(ip, &brk, 1)) 252 ftrace_mod_code();
300 return -EPERM;
301 253
302 return 0; 254 /* Make sure the write happens before clearing the bit */
303} 255 smp_mb();
304 256
305static int add_brk_on_call(struct dyn_ftrace *rec, unsigned long addr) 257 clear_mod_flag();
306{ 258 wait_for_nmi();
307 unsigned const char *old;
308 unsigned long ip = rec->ip;
309
310 old = ftrace_call_replace(ip, addr);
311 259
312 return add_break(rec->ip, old); 260 return mod_code_status;
313}
314
315
316static int add_brk_on_nop(struct dyn_ftrace *rec)
317{
318 unsigned const char *old;
319
320 old = ftrace_nop_replace();
321
322 return add_break(rec->ip, old);
323}
324
325/*
326 * If the record has the FTRACE_FL_REGS set, that means that it
327 * wants to convert to a callback that saves all regs. If FTRACE_FL_REGS
328 * is not set, then it wants to convert to the normal callback.
329 */
330static unsigned long get_ftrace_addr(struct dyn_ftrace *rec)
331{
332 if (rec->flags & FTRACE_FL_REGS)
333 return (unsigned long)FTRACE_REGS_ADDR;
334 else
335 return (unsigned long)FTRACE_ADDR;
336} 261}
337 262
338/* 263static const unsigned char *ftrace_nop_replace(void)
339 * The FTRACE_FL_REGS_EN is set when the record already points to
340 * a function that saves all the regs. Basically the '_EN' version
341 * represents the current state of the function.
342 */
343static unsigned long get_ftrace_old_addr(struct dyn_ftrace *rec)
344{
345 if (rec->flags & FTRACE_FL_REGS_EN)
346 return (unsigned long)FTRACE_REGS_ADDR;
347 else
348 return (unsigned long)FTRACE_ADDR;
349}
350
351static int add_breakpoints(struct dyn_ftrace *rec, int enable)
352{ 264{
353 unsigned long ftrace_addr; 265 return ideal_nops[NOP_ATOMIC5];
354 int ret;
355
356 ret = ftrace_test_record(rec, enable);
357
358 ftrace_addr = get_ftrace_addr(rec);
359
360 switch (ret) {
361 case FTRACE_UPDATE_IGNORE:
362 return 0;
363
364 case FTRACE_UPDATE_MAKE_CALL:
365 /* converting nop to call */
366 return add_brk_on_nop(rec);
367
368 case FTRACE_UPDATE_MODIFY_CALL_REGS:
369 case FTRACE_UPDATE_MODIFY_CALL:
370 ftrace_addr = get_ftrace_old_addr(rec);
371 /* fall through */
372 case FTRACE_UPDATE_MAKE_NOP:
373 /* converting a call to a nop */
374 return add_brk_on_call(rec, ftrace_addr);
375 }
376 return 0;
377} 266}
378 267
379/* 268static int
380 * On error, we need to remove breakpoints. This needs to 269ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
381 * be done carefully. If the address does not currently have a 270 unsigned const char *new_code)
382 * breakpoint, we know we are done. Otherwise, we look at the
383 * remaining 4 bytes of the instruction. If it matches a nop
384 * we replace the breakpoint with the nop. Otherwise we replace
385 * it with the call instruction.
386 */
387static int remove_breakpoint(struct dyn_ftrace *rec)
388{ 271{
389 unsigned char ins[MCOUNT_INSN_SIZE]; 272 unsigned char replaced[MCOUNT_INSN_SIZE];
390 unsigned char brk = BREAKPOINT_INSTRUCTION;
391 const unsigned char *nop;
392 unsigned long ftrace_addr;
393 unsigned long ip = rec->ip;
394
395 /* If we fail the read, just give up */
396 if (probe_kernel_read(ins, (void *)ip, MCOUNT_INSN_SIZE))
397 return -EFAULT;
398
399 /* If this does not have a breakpoint, we are done */
400 if (ins[0] != brk)
401 return -1;
402
403 nop = ftrace_nop_replace();
404 273
405 /* 274 /*
406 * If the last 4 bytes of the instruction do not match 275 * Note: Due to modules and __init, code can
407 * a nop, then we assume that this is a call to ftrace_addr. 276 * disappear and change, we need to protect against faulting
277 * as well as code changing. We do this by using the
278 * probe_kernel_* functions.
279 *
280 * No real locking needed, this code is run through
281 * kstop_machine, or before SMP starts.
408 */ 282 */
409 if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0) {
410 /*
411 * For extra paranoia, we check if the breakpoint is on
412 * a call that would actually jump to the ftrace_addr.
413 * If not, don't touch the breakpoint, or we may just create
414 * a disaster.
415 */
416 ftrace_addr = get_ftrace_addr(rec);
417 nop = ftrace_call_replace(ip, ftrace_addr);
418
419 if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) == 0)
420 goto update;
421
422 /* Check both ftrace_addr and ftrace_old_addr */
423 ftrace_addr = get_ftrace_old_addr(rec);
424 nop = ftrace_call_replace(ip, ftrace_addr);
425
426 if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0)
427 return -EINVAL;
428 }
429 283
430 update: 284 /* read the text we want to modify */
431 return probe_kernel_write((void *)ip, &nop[0], 1); 285 if (probe_kernel_read(replaced, (void *)ip, MCOUNT_INSN_SIZE))
432} 286 return -EFAULT;
433 287
434static int add_update_code(unsigned long ip, unsigned const char *new) 288 /* Make sure it is what we expect it to be */
435{ 289 if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
436 /* skip breakpoint */ 290 return -EINVAL;
437 ip++; 291
438 new++; 292 /* replace the text with the new text */
439 if (ftrace_write(ip, new, MCOUNT_INSN_SIZE - 1)) 293 if (do_ftrace_mod_code(ip, new_code))
440 return -EPERM; 294 return -EPERM;
441 return 0;
442}
443 295
444static int add_update_call(struct dyn_ftrace *rec, unsigned long addr) 296 sync_core();
445{
446 unsigned long ip = rec->ip;
447 unsigned const char *new;
448 297
449 new = ftrace_call_replace(ip, addr); 298 return 0;
450 return add_update_code(ip, new);
451} 299}
452 300
453static int add_update_nop(struct dyn_ftrace *rec) 301int ftrace_make_nop(struct module *mod,
302 struct dyn_ftrace *rec, unsigned long addr)
454{ 303{
304 unsigned const char *new, *old;
455 unsigned long ip = rec->ip; 305 unsigned long ip = rec->ip;
456 unsigned const char *new;
457 306
307 old = ftrace_call_replace(ip, addr);
458 new = ftrace_nop_replace(); 308 new = ftrace_nop_replace();
459 return add_update_code(ip, new);
460}
461
462static int add_update(struct dyn_ftrace *rec, int enable)
463{
464 unsigned long ftrace_addr;
465 int ret;
466
467 ret = ftrace_test_record(rec, enable);
468
469 ftrace_addr = get_ftrace_addr(rec);
470
471 switch (ret) {
472 case FTRACE_UPDATE_IGNORE:
473 return 0;
474 309
475 case FTRACE_UPDATE_MODIFY_CALL_REGS: 310 return ftrace_modify_code(rec->ip, old, new);
476 case FTRACE_UPDATE_MODIFY_CALL:
477 case FTRACE_UPDATE_MAKE_CALL:
478 /* converting nop to call */
479 return add_update_call(rec, ftrace_addr);
480
481 case FTRACE_UPDATE_MAKE_NOP:
482 /* converting a call to a nop */
483 return add_update_nop(rec);
484 }
485
486 return 0;
487} 311}
488 312
489static int finish_update_call(struct dyn_ftrace *rec, unsigned long addr) 313int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
490{ 314{
315 unsigned const char *new, *old;
491 unsigned long ip = rec->ip; 316 unsigned long ip = rec->ip;
492 unsigned const char *new;
493 317
318 old = ftrace_nop_replace();
494 new = ftrace_call_replace(ip, addr); 319 new = ftrace_call_replace(ip, addr);
495 320
496 if (ftrace_write(ip, new, 1)) 321 return ftrace_modify_code(rec->ip, old, new);
497 return -EPERM;
498
499 return 0;
500}
501
502static int finish_update_nop(struct dyn_ftrace *rec)
503{
504 unsigned long ip = rec->ip;
505 unsigned const char *new;
506
507 new = ftrace_nop_replace();
508
509 if (ftrace_write(ip, new, 1))
510 return -EPERM;
511 return 0;
512}
513
514static int finish_update(struct dyn_ftrace *rec, int enable)
515{
516 unsigned long ftrace_addr;
517 int ret;
518
519 ret = ftrace_update_record(rec, enable);
520
521 ftrace_addr = get_ftrace_addr(rec);
522
523 switch (ret) {
524 case FTRACE_UPDATE_IGNORE:
525 return 0;
526
527 case FTRACE_UPDATE_MODIFY_CALL_REGS:
528 case FTRACE_UPDATE_MODIFY_CALL:
529 case FTRACE_UPDATE_MAKE_CALL:
530 /* converting nop to call */
531 return finish_update_call(rec, ftrace_addr);
532
533 case FTRACE_UPDATE_MAKE_NOP:
534 /* converting a call to a nop */
535 return finish_update_nop(rec);
536 }
537
538 return 0;
539}
540
541static void do_sync_core(void *data)
542{
543 sync_core();
544}
545
546static void run_sync(void)
547{
548 int enable_irqs = irqs_disabled();
549
550 /* We may be called with interrupts disabled (on bootup). */
551 if (enable_irqs)
552 local_irq_enable();
553 on_each_cpu(do_sync_core, NULL, 1);
554 if (enable_irqs)
555 local_irq_disable();
556}
557
558void ftrace_replace_code(int enable)
559{
560 struct ftrace_rec_iter *iter;
561 struct dyn_ftrace *rec;
562 const char *report = "adding breakpoints";
563 int count = 0;
564 int ret;
565
566 for_ftrace_rec_iter(iter) {
567 rec = ftrace_rec_iter_record(iter);
568
569 ret = add_breakpoints(rec, enable);
570 if (ret)
571 goto remove_breakpoints;
572 count++;
573 }
574
575 run_sync();
576
577 report = "updating code";
578
579 for_ftrace_rec_iter(iter) {
580 rec = ftrace_rec_iter_record(iter);
581
582 ret = add_update(rec, enable);
583 if (ret)
584 goto remove_breakpoints;
585 }
586
587 run_sync();
588
589 report = "removing breakpoints";
590
591 for_ftrace_rec_iter(iter) {
592 rec = ftrace_rec_iter_record(iter);
593
594 ret = finish_update(rec, enable);
595 if (ret)
596 goto remove_breakpoints;
597 }
598
599 run_sync();
600
601 return;
602
603 remove_breakpoints:
604 ftrace_bug(ret, rec ? rec->ip : 0);
605 printk(KERN_WARNING "Failed on %s (%d):\n", report, count);
606 for_ftrace_rec_iter(iter) {
607 rec = ftrace_rec_iter_record(iter);
608 remove_breakpoint(rec);
609 }
610} 322}
611 323
612static int 324int ftrace_update_ftrace_func(ftrace_func_t func)
613ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
614 unsigned const char *new_code)
615{ 325{
326 unsigned long ip = (unsigned long)(&ftrace_call);
327 unsigned char old[MCOUNT_INSN_SIZE], *new;
616 int ret; 328 int ret;
617 329
618 ret = add_break(ip, old_code); 330 memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE);
619 if (ret) 331 new = ftrace_call_replace(ip, (unsigned long)func);
620 goto out; 332 ret = ftrace_modify_code(ip, old, new);
621
622 run_sync();
623
624 ret = add_update_code(ip, new_code);
625 if (ret)
626 goto fail_update;
627
628 run_sync();
629 333
630 ret = ftrace_write(ip, new_code, 1);
631 if (ret) {
632 ret = -EPERM;
633 goto out;
634 }
635 run_sync();
636 out:
637 return ret; 334 return ret;
638
639 fail_update:
640 probe_kernel_write((void *)ip, &old_code[0], 1);
641 goto out;
642}
643
644void arch_ftrace_update_code(int command)
645{
646 /* See comment above by declaration of modifying_ftrace_code */
647 atomic_inc(&modifying_ftrace_code);
648
649 ftrace_modify_all_code(command);
650
651 atomic_dec(&modifying_ftrace_code);
652} 335}
653 336
654int __init ftrace_dyn_arch_init(void *data) 337int __init ftrace_dyn_arch_init(void *data)
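The two ftrace_modify_code() strategies shown side by side in this hunk differ mainly in how they keep NMIs away from the instruction being rewritten: the removed variant plants an int3 breakpoint and relies on modifying_ftrace_code plus ftrace_int3_handler(), while the restored variant spins on nmi_running with a MOD_CODE_WRITE_FLAG bit. Below is a minimal standalone sketch of that flag protocol using C11 atomics in place of the kernel's atomic_t helpers; fake_nmi() and main() are illustrative scaffolding, not kernel code.

#include <stdatomic.h>
#include <stdio.h>

#define MOD_CODE_WRITE_FLAG (1u << 31)	/* same bit position the kernel uses */

static atomic_uint nmi_running;

/* Mirrors wait_for_nmi_and_set_mod_flag(): claim the flag once no NMI is in flight. */
static void wait_for_nmi_and_set_mod_flag(void)
{
	unsigned int expected = 0;

	while (!atomic_compare_exchange_weak(&nmi_running, &expected,
					     MOD_CODE_WRITE_FLAG))
		expected = 0;	/* retry until the in-NMI count drops back to zero */
}

/* Mirrors clear_mod_flag(): drop the flag without touching the NMI count. */
static void clear_mod_flag(void)
{
	unsigned int old = atomic_load(&nmi_running);

	for (;;) {
		unsigned int new = old & ~MOD_CODE_WRITE_FLAG;

		if (old == new)
			break;
		if (atomic_compare_exchange_weak(&nmi_running, &old, new))
			break;
	}
}

/* Roughly what ftrace_nmi_enter()/ftrace_nmi_exit() do around the write. */
static void fake_nmi(void)
{
	if ((atomic_fetch_add(&nmi_running, 1) + 1) & MOD_CODE_WRITE_FLAG)
		puts("NMI would perform the pending code write itself");
	atomic_fetch_sub(&nmi_running, 1);
}

int main(void)
{
	wait_for_nmi_and_set_mod_flag();	/* writer claims the flag */
	fake_nmi();				/* a late NMI sees the flag set */
	clear_mod_flag();
	return 0;
}

In the kernel the same handshake is bracketed by the smp_mb() calls visible in do_ftrace_mod_code() above, so the buffers are published before any NMI can write them.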
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c
index 48d9d4ea102..af0699ba48c 100644
--- a/arch/x86/kernel/head.c
+++ b/arch/x86/kernel/head.c
@@ -52,5 +52,5 @@ void __init reserve_ebda_region(void)
52 lowmem = 0x9f000; 52 lowmem = 0x9f000;
53 53
54 /* reserve all memory between lowmem and the 1MB mark */ 54 /* reserve all memory between lowmem and the 1MB mark */
55 memblock_reserve(lowmem, 0x100000 - lowmem); 55 memblock_x86_reserve_range(lowmem, 0x100000, "* BIOS reserved");
56} 56}
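The only change to reserve_ebda_region() above is the reservation helper: memblock_reserve() takes a base and a size, while the older memblock_x86_reserve_range() took a start, an exclusive end and a debug label. A tiny illustration of the two calling conventions, using made-up stand-in functions rather than the real kernel prototypes:

#include <stdio.h>

typedef unsigned long long phys_t;	/* stand-in for phys_addr_t */

/* base + size, like memblock_reserve() */
static void reserve_base_size(phys_t base, phys_t size)
{
	printf("reserve [%#llx, %#llx)\n", base, base + size);
}

/* start + exclusive end + label, like the older memblock_x86_reserve_range() */
static void reserve_start_end(phys_t start, phys_t end, const char *name)
{
	printf("reserve [%#llx, %#llx) for %s\n", start, end, name);
}

int main(void)
{
	phys_t lowmem = 0x9f000;

	reserve_base_size(lowmem, 0x100000 - lowmem);
	reserve_start_end(lowmem, 0x100000, "* BIOS reserved");
	return 0;
}

Both calls describe the same physical range; only the way the end of the region is expressed differs.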
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index c18f59d1010..3bb08509a7a 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -14,6 +14,7 @@
14#include <asm/sections.h> 14#include <asm/sections.h>
15#include <asm/e820.h> 15#include <asm/e820.h>
16#include <asm/page.h> 16#include <asm/page.h>
17#include <asm/trampoline.h>
17#include <asm/apic.h> 18#include <asm/apic.h>
18#include <asm/io_apic.h> 19#include <asm/io_apic.h>
19#include <asm/bios_ebda.h> 20#include <asm/bios_ebda.h>
@@ -30,8 +31,9 @@ static void __init i386_default_early_setup(void)
30 31
31void __init i386_start_kernel(void) 32void __init i386_start_kernel(void)
32{ 33{
33 memblock_reserve(__pa_symbol(&_text), 34 memblock_init();
34 __pa_symbol(&__bss_stop) - __pa_symbol(&_text)); 35
36 memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
35 37
36#ifdef CONFIG_BLK_DEV_INITRD 38#ifdef CONFIG_BLK_DEV_INITRD
37 /* Reserve INITRD */ 39 /* Reserve INITRD */
@@ -40,7 +42,7 @@ void __init i386_start_kernel(void)
40 u64 ramdisk_image = boot_params.hdr.ramdisk_image; 42 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
41 u64 ramdisk_size = boot_params.hdr.ramdisk_size; 43 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
42 u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); 44 u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
43 memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image); 45 memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK");
44 } 46 }
45#endif 47#endif
46 48
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 037df57a99a..5655c2272ad 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -24,6 +24,7 @@
24#include <asm/sections.h> 24#include <asm/sections.h>
25#include <asm/kdebug.h> 25#include <asm/kdebug.h>
26#include <asm/e820.h> 26#include <asm/e820.h>
27#include <asm/trampoline.h>
27#include <asm/bios_ebda.h> 28#include <asm/bios_ebda.h>
28 29
29static void __init zap_identity_mappings(void) 30static void __init zap_identity_mappings(void)
@@ -97,8 +98,9 @@ void __init x86_64_start_reservations(char *real_mode_data)
97{ 98{
98 copy_bootdata(__va(real_mode_data)); 99 copy_bootdata(__va(real_mode_data));
99 100
100 memblock_reserve(__pa_symbol(&_text), 101 memblock_init();
101 __pa_symbol(&__bss_stop) - __pa_symbol(&_text)); 102
103 memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
102 104
103#ifdef CONFIG_BLK_DEV_INITRD 105#ifdef CONFIG_BLK_DEV_INITRD
104 /* Reserve INITRD */ 106 /* Reserve INITRD */
@@ -107,7 +109,7 @@ void __init x86_64_start_reservations(char *real_mode_data)
107 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; 109 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
108 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; 110 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
109 unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); 111 unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
110 memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image); 112 memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK");
111 } 113 }
112#endif 114#endif
113 115
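Both the head32.c and head64.c hunks compute ramdisk_end with PAGE_ALIGN() before reserving the region. A self-contained sketch of that rounding, assuming 4 KiB pages and made-up boot_params values (the macro below is a simplified stand-in for the kernel's):

#include <stdio.h>

#define PAGE_SIZE	4096ULL				/* 4 KiB pages assumed */
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long long ramdisk_image = 0x37f2000ULL;	/* illustrative */
	unsigned long long ramdisk_size  = 0x12345ULL;		/* illustrative */
	unsigned long long ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size);

	printf("reserve RAMDISK [%#llx, %#llx)\n", ramdisk_image, ramdisk_end);
	return 0;
}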
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 8e7f6556028..ce0be7cd085 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -21,7 +21,6 @@
21#include <asm/msr-index.h> 21#include <asm/msr-index.h>
22#include <asm/cpufeature.h> 22#include <asm/cpufeature.h>
23#include <asm/percpu.h> 23#include <asm/percpu.h>
24#include <asm/nops.h>
25 24
26/* Physical address */ 25/* Physical address */
27#define pa(X) ((X) - __PAGE_OFFSET) 26#define pa(X) ((X) - __PAGE_OFFSET)
@@ -266,19 +265,6 @@ num_subarch_entries = (. - subarch_entries) / 4
266 jmp default_entry 265 jmp default_entry
267#endif /* CONFIG_PARAVIRT */ 266#endif /* CONFIG_PARAVIRT */
268 267
269#ifdef CONFIG_HOTPLUG_CPU
270/*
271 * Boot CPU0 entry point. It's called from play_dead(). Everything has been set
272 * up already except stack. We just set up stack here. Then call
273 * start_secondary().
274 */
275ENTRY(start_cpu0)
276 movl stack_start, %ecx
277 movl %ecx, %esp
278 jmp *(initial_code)
279ENDPROC(start_cpu0)
280#endif
281
282/* 268/*
283 * Non-boot CPU entry point; entered from trampoline.S 269 * Non-boot CPU entry point; entered from trampoline.S
284 * We can't lgdt here, because lgdt itself uses a data segment, but 270 * We can't lgdt here, because lgdt itself uses a data segment, but
@@ -287,7 +273,10 @@ ENDPROC(start_cpu0)
287 * If cpu hotplug is not supported then this code can go in init section 273 * If cpu hotplug is not supported then this code can go in init section
288 * which will be freed later 274 * which will be freed later
289 */ 275 */
276
290__CPUINIT 277__CPUINIT
278
279#ifdef CONFIG_SMP
291ENTRY(startup_32_smp) 280ENTRY(startup_32_smp)
292 cld 281 cld
293 movl $(__BOOT_DS),%eax 282 movl $(__BOOT_DS),%eax
@@ -298,35 +287,29 @@ ENTRY(startup_32_smp)
298 movl pa(stack_start),%ecx 287 movl pa(stack_start),%ecx
299 movl %eax,%ss 288 movl %eax,%ss
300 leal -__PAGE_OFFSET(%ecx),%esp 289 leal -__PAGE_OFFSET(%ecx),%esp
301 290#endif /* CONFIG_SMP */
302default_entry: 291default_entry:
292
303/* 293/*
304 * New page tables may be in 4Mbyte page mode and may 294 * New page tables may be in 4Mbyte page mode and may
305 * be using the global pages. 295 * be using the global pages.
306 * 296 *
307 * NOTE! If we are on a 486 we may have no cr4 at all! 297 * NOTE! If we are on a 486 we may have no cr4 at all!
308 * Specifically, cr4 exists if and only if CPUID exists 298 * So we do not try to touch it unless we really have
309 * and has flags other than the FPU flag set. 299 * some bits in it to set. This won't work if the BSP
300 * implements cr4 but this AP does not -- very unlikely
301 * but be warned! The same applies to the pse feature
302 * if not equally supported. --macro
303 *
304 * NOTE! We have to correct for the fact that we're
305 * not yet offset PAGE_OFFSET..
310 */ 306 */
311 movl $X86_EFLAGS_ID,%ecx 307#define cr4_bits pa(mmu_cr4_features)
312 pushl %ecx 308 movl cr4_bits,%edx
313 popfl 309 andl %edx,%edx
314 pushfl 310 jz 6f
315 popl %eax 311 movl %cr4,%eax # Turn on paging options (PSE,PAE,..)
316 pushl $0 312 orl %edx,%eax
317 popfl
318 pushfl
319 popl %edx
320 xorl %edx,%eax
321 testl %ecx,%eax
322 jz 6f # No ID flag = no CPUID = no CR4
323
324 movl $1,%eax
325 cpuid
326 andl $~1,%edx # Ignore CPUID.FPU
327 jz 6f # No flags or only CPUID.FPU = no CR4
328
329 movl pa(mmu_cr4_features),%eax
330 movl %eax,%cr4 313 movl %eax,%cr4
331 314
332 testb $X86_CR4_PAE, %al # check if PAE is enabled 315 testb $X86_CR4_PAE, %al # check if PAE is enabled
@@ -380,23 +363,28 @@ default_entry:
380 pushl $0 363 pushl $0
381 popfl 364 popfl
382 365
366#ifdef CONFIG_SMP
367 cmpb $0, ready
368 jnz checkCPUtype
369#endif /* CONFIG_SMP */
370
383/* 371/*
384 * start system 32-bit setup. We need to re-do some of the things done 372 * start system 32-bit setup. We need to re-do some of the things done
385 * in 16-bit mode for the "real" operations. 373 * in 16-bit mode for the "real" operations.
386 */ 374 */
387 movl setup_once_ref,%eax 375 call setup_idt
388 andl %eax,%eax 376
389 jz 1f # Did we do this already? 377checkCPUtype:
390 call *%eax 378
3911: 379 movl $-1,X86_CPUID # -1 for no CPUID initially
392 380
393/* check if it is 486 or 386. */ 381/* check if it is 486 or 386. */
394/* 382/*
395 * XXX - this does a lot of unnecessary setup. Alignment checks don't 383 * XXX - this does a lot of unnecessary setup. Alignment checks don't
396 * apply at our cpl of 0 and the stack ought to be aligned already, and 384 * apply at our cpl of 0 and the stack ought to be aligned already, and
397 * we don't need to preserve eflags. 385 * we don't need to preserve eflags.
398 */ 386 */
399 movl $-1,X86_CPUID # -1 for no CPUID initially 387
400 movb $3,X86 # at least 386 388 movb $3,X86 # at least 386
401 pushfl # push EFLAGS 389 pushfl # push EFLAGS
402 popl %eax # get EFLAGS 390 popl %eax # get EFLAGS
@@ -462,6 +450,21 @@ is386: movl $2,%ecx # set MP
462 movl $(__KERNEL_PERCPU), %eax 450 movl $(__KERNEL_PERCPU), %eax
463 movl %eax,%fs # set this cpu's percpu 451 movl %eax,%fs # set this cpu's percpu
464 452
453#ifdef CONFIG_CC_STACKPROTECTOR
454 /*
455 * The linker can't handle this by relocation. Manually set
456 * base address in stack canary segment descriptor.
457 */
458 cmpb $0,ready
459 jne 1f
460 movl $gdt_page,%eax
461 movl $stack_canary,%ecx
462 movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
463 shrl $16, %ecx
464 movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
465 movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax)
4661:
467#endif
465 movl $(__KERNEL_STACK_CANARY),%eax 468 movl $(__KERNEL_STACK_CANARY),%eax
466 movl %eax,%gs 469 movl %eax,%gs
467 470
@@ -470,6 +473,7 @@ is386: movl $2,%ecx # set MP
470 473
471 cld # gcc2 wants the direction flag cleared at all times 474 cld # gcc2 wants the direction flag cleared at all times
472 pushl $0 # fake return address for unwinder 475 pushl $0 # fake return address for unwinder
476 movb $1, ready
473 jmp *(initial_code) 477 jmp *(initial_code)
474 478
475/* 479/*
@@ -491,122 +495,81 @@ check_x87:
491 .byte 0xDB,0xE4 /* fsetpm for 287, ignored by 387 */ 495 .byte 0xDB,0xE4 /* fsetpm for 287, ignored by 387 */
492 ret 496 ret
493 497
494
495#include "verify_cpu.S"
496
497/* 498/*
498 * setup_once 499 * setup_idt
499 * 500 *
500 * The setup work we only want to run on the BSP. 501 * sets up a idt with 256 entries pointing to
502 * ignore_int, interrupt gates. It doesn't actually load
503 * idt - that can be done only after paging has been enabled
504 * and the kernel moved to PAGE_OFFSET. Interrupts
505 * are enabled elsewhere, when we can be relatively
506 * sure everything is ok.
501 * 507 *
502 * Warning: %esi is live across this function. 508 * Warning: %esi is live across this function.
503 */ 509 */
504__INIT 510setup_idt:
505setup_once: 511 lea ignore_int,%edx
506 /*
507 * Set up a idt with 256 entries pointing to ignore_int,
508 * interrupt gates. It doesn't actually load idt - that needs
509 * to be done on each CPU. Interrupts are enabled elsewhere,
510 * when we can be relatively sure everything is ok.
511 */
512
513 movl $idt_table,%edi
514 movl $early_idt_handlers,%eax
515 movl $NUM_EXCEPTION_VECTORS,%ecx
5161:
517 movl %eax,(%edi)
518 movl %eax,4(%edi)
519 /* interrupt gate, dpl=0, present */
520 movl $(0x8E000000 + __KERNEL_CS),2(%edi)
521 addl $9,%eax
522 addl $8,%edi
523 loop 1b
524
525 movl $256 - NUM_EXCEPTION_VECTORS,%ecx
526 movl $ignore_int,%edx
527 movl $(__KERNEL_CS << 16),%eax 512 movl $(__KERNEL_CS << 16),%eax
528 movw %dx,%ax /* selector = 0x0010 = cs */ 513 movw %dx,%ax /* selector = 0x0010 = cs */
529 movw $0x8E00,%dx /* interrupt gate - dpl=0, present */ 514 movw $0x8E00,%dx /* interrupt gate - dpl=0, present */
5302: 515
516 lea idt_table,%edi
517 mov $256,%ecx
518rp_sidt:
531 movl %eax,(%edi) 519 movl %eax,(%edi)
532 movl %edx,4(%edi) 520 movl %edx,4(%edi)
533 addl $8,%edi 521 addl $8,%edi
534 loop 2b 522 dec %ecx
523 jne rp_sidt
535 524
536#ifdef CONFIG_CC_STACKPROTECTOR 525.macro set_early_handler handler,trapno
537 /* 526 lea \handler,%edx
538 * Configure the stack canary. The linker can't handle this by 527 movl $(__KERNEL_CS << 16),%eax
539 * relocation. Manually set base address in stack canary 528 movw %dx,%ax
540 * segment descriptor. 529 movw $0x8E00,%dx /* interrupt gate - dpl=0, present */
541 */ 530 lea idt_table,%edi
542 movl $gdt_page,%eax 531 movl %eax,8*\trapno(%edi)
543 movl $stack_canary,%ecx 532 movl %edx,8*\trapno+4(%edi)
544 movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) 533.endm
545 shrl $16, %ecx 534
546 movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) 535 set_early_handler handler=early_divide_err,trapno=0
547 movb %ch, 8 * GDT_ENTRY_STACK_CANARY + 7(%eax) 536 set_early_handler handler=early_illegal_opcode,trapno=6
548#endif 537 set_early_handler handler=early_protection_fault,trapno=13
538 set_early_handler handler=early_page_fault,trapno=14
549 539
550 andl $0,setup_once_ref /* Once is enough, thanks */
551 ret 540 ret
552 541
553ENTRY(early_idt_handlers) 542early_divide_err:
554 # 36(%esp) %eflags 543 xor %edx,%edx
555 # 32(%esp) %cs 544 pushl $0 /* fake errcode */
556 # 28(%esp) %eip 545 jmp early_fault
557 # 24(%rsp) error code
558 i = 0
559 .rept NUM_EXCEPTION_VECTORS
560 .if (EXCEPTION_ERRCODE_MASK >> i) & 1
561 ASM_NOP2
562 .else
563 pushl $0 # Dummy error code, to make stack frame uniform
564 .endif
565 pushl $i # 20(%esp) Vector number
566 jmp early_idt_handler
567 i = i + 1
568 .endr
569ENDPROC(early_idt_handlers)
570
571 /* This is global to keep gas from relaxing the jumps */
572ENTRY(early_idt_handler)
573 cld
574 cmpl $2,%ss:early_recursion_flag
575 je hlt_loop
576 incl %ss:early_recursion_flag
577 546
578 push %eax # 16(%esp) 547early_illegal_opcode:
579 push %ecx # 12(%esp) 548 movl $6,%edx
580 push %edx # 8(%esp) 549 pushl $0 /* fake errcode */
581 push %ds # 4(%esp) 550 jmp early_fault
582 push %es # 0(%esp)
583 movl $(__KERNEL_DS),%eax
584 movl %eax,%ds
585 movl %eax,%es
586 551
587 cmpl $(__KERNEL_CS),32(%esp) 552early_protection_fault:
588 jne 10f 553 movl $13,%edx
554 jmp early_fault
589 555
590 leal 28(%esp),%eax # Pointer to %eip 556early_page_fault:
591 call early_fixup_exception 557 movl $14,%edx
592 andl %eax,%eax 558 jmp early_fault
593 jnz ex_entry /* found an exception entry */
594 559
59510: 560early_fault:
561 cld
596#ifdef CONFIG_PRINTK 562#ifdef CONFIG_PRINTK
597 xorl %eax,%eax 563 pusha
598 movw %ax,2(%esp) /* clean up the segment values on some cpus */ 564 movl $(__KERNEL_DS),%eax
599 movw %ax,6(%esp) 565 movl %eax,%ds
600 movw %ax,34(%esp) 566 movl %eax,%es
601 leal 40(%esp),%eax 567 cmpl $2,early_recursion_flag
602 pushl %eax /* %esp before the exception */ 568 je hlt_loop
603 pushl %ebx 569 incl early_recursion_flag
604 pushl %ebp
605 pushl %esi
606 pushl %edi
607 movl %cr2,%eax 570 movl %cr2,%eax
608 pushl %eax 571 pushl %eax
609 pushl (20+6*4)(%esp) /* trapno */ 572 pushl %edx /* trapno */
610 pushl $fault_msg 573 pushl $fault_msg
611 call printk 574 call printk
612#endif 575#endif
@@ -615,17 +578,6 @@ hlt_loop:
615 hlt 578 hlt
616 jmp hlt_loop 579 jmp hlt_loop
617 580
618ex_entry:
619 pop %es
620 pop %ds
621 pop %edx
622 pop %ecx
623 pop %eax
624 addl $8,%esp /* drop vector number and error code */
625 decl %ss:early_recursion_flag
626 iret
627ENDPROC(early_idt_handler)
628
629/* This is the default interrupt "handler" :-) */ 581/* This is the default interrupt "handler" :-) */
630 ALIGN 582 ALIGN
631ignore_int: 583ignore_int:
@@ -659,18 +611,13 @@ ignore_int:
659 popl %eax 611 popl %eax
660#endif 612#endif
661 iret 613 iret
662ENDPROC(ignore_int)
663__INITDATA
664 .align 4
665early_recursion_flag:
666 .long 0
667 614
668__REFDATA 615#include "verify_cpu.S"
669 .align 4 616
617 __REFDATA
618.align 4
670ENTRY(initial_code) 619ENTRY(initial_code)
671 .long i386_start_kernel 620 .long i386_start_kernel
672ENTRY(setup_once_ref)
673 .long setup_once
674 621
675/* 622/*
676 * BSS section 623 * BSS section
@@ -723,19 +670,22 @@ ENTRY(initial_page_table)
723ENTRY(stack_start) 670ENTRY(stack_start)
724 .long init_thread_union+THREAD_SIZE 671 .long init_thread_union+THREAD_SIZE
725 672
726__INITRODATA 673early_recursion_flag:
674 .long 0
675
676ready: .byte 0
677
727int_msg: 678int_msg:
728 .asciz "Unknown interrupt or fault at: %p %p %p\n" 679 .asciz "Unknown interrupt or fault at: %p %p %p\n"
729 680
730fault_msg: 681fault_msg:
731/* fault info: */ 682/* fault info: */
732 .ascii "BUG: Int %d: CR2 %p\n" 683 .ascii "BUG: Int %d: CR2 %p\n"
733/* regs pushed in early_idt_handler: */ 684/* pusha regs: */
734 .ascii " EDI %p ESI %p EBP %p EBX %p\n" 685 .ascii " EDI %p ESI %p EBP %p ESP %p\n"
735 .ascii " ESP %p ES %p DS %p\n" 686 .ascii " EBX %p EDX %p ECX %p EAX %p\n"
736 .ascii " EDX %p ECX %p EAX %p\n"
737/* fault frame: */ 687/* fault frame: */
738 .ascii " vec %p err %p EIP %p CS %p flg %p\n" 688 .ascii " err %p EIP %p CS %p flg %p\n"
739 .ascii "Stack: %p %p %p %p %p %p %p %p\n" 689 .ascii "Stack: %p %p %p %p %p %p %p %p\n"
740 .ascii " %p %p %p %p %p %p %p %p\n" 690 .ascii " %p %p %p %p %p %p %p %p\n"
741 .asciz " %p %p %p %p %p %p %p %p\n" 691 .asciz " %p %p %p %p %p %p %p %p\n"
@@ -749,7 +699,6 @@ fault_msg:
749 * segment size, and 32-bit linear address value: 699 * segment size, and 32-bit linear address value:
750 */ 700 */
751 701
752 .data
753.globl boot_gdt_descr 702.globl boot_gdt_descr
754.globl idt_descr 703.globl idt_descr
755 704
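The restored setup_idt/rp_sidt loop above builds each 32-bit IDT entry by hand: the low dword carries the selector in its top half and the low 16 bits of the handler address in its bottom half, and the high dword carries the upper 16 address bits plus the 0x8E00 type/attribute word (present, DPL 0, 32-bit interrupt gate). A hedged C sketch of the same packing; the handler address is made up and the selector value is simply the 0x0010 quoted in the old comment, used here for illustration only:

#include <stdint.h>
#include <stdio.h>

struct idt_entry {		/* 8 bytes, matching the (%edi)/4(%edi) stores */
	uint32_t low;		/* selector:16 | offset_low:16 */
	uint32_t high;		/* offset_high:16 | 0x8E00     */
};

static struct idt_entry make_intr_gate(uint32_t handler, uint16_t selector)
{
	struct idt_entry e;

	e.low  = ((uint32_t)selector << 16) | (handler & 0xFFFFu);
	e.high = (handler & 0xFFFF0000u) | 0x8E00u;	/* present, DPL=0, interrupt gate */
	return e;
}

int main(void)
{
	struct idt_entry e = make_intr_gate(0xC1001234u, 0x10);

	printf("low=%08x high=%08x\n", e.low, e.high);
	return 0;
}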
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 980053c4b9c..e11e39478a4 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -19,15 +19,12 @@
19#include <asm/cache.h> 19#include <asm/cache.h>
20#include <asm/processor-flags.h> 20#include <asm/processor-flags.h>
21#include <asm/percpu.h> 21#include <asm/percpu.h>
22#include <asm/nops.h>
23 22
24#ifdef CONFIG_PARAVIRT 23#ifdef CONFIG_PARAVIRT
25#include <asm/asm-offsets.h> 24#include <asm/asm-offsets.h>
26#include <asm/paravirt.h> 25#include <asm/paravirt.h>
27#define GET_CR2_INTO(reg) GET_CR2_INTO_RAX ; movq %rax, reg
28#else 26#else
29#define GET_CR2_INTO(reg) movq %cr2, reg 27#define GET_CR2_INTO_RCX movq %cr2, %rcx
30#define INTERRUPT_RETURN iretq
31#endif 28#endif
32 29
33/* we are not able to switch in one step to the final KERNEL ADDRESS SPACE 30/* we are not able to switch in one step to the final KERNEL ADDRESS SPACE
@@ -139,6 +136,10 @@ ident_complete:
139 /* Fixup phys_base */ 136 /* Fixup phys_base */
140 addq %rbp, phys_base(%rip) 137 addq %rbp, phys_base(%rip)
141 138
139 /* Fixup trampoline */
140 addq %rbp, trampoline_level4_pgt + 0(%rip)
141 addq %rbp, trampoline_level4_pgt + (511*8)(%rip)
142
142 /* Due to ENTRY(), sometimes the empty space gets filled with 143 /* Due to ENTRY(), sometimes the empty space gets filled with
143 * zeros. Better take a jmp than relying on empty space being 144 * zeros. Better take a jmp than relying on empty space being
144 * filled with 0x90 (nop) 145 * filled with 0x90 (nop)
@@ -252,22 +253,6 @@ ENTRY(secondary_startup_64)
252 pushq %rax # target address in negative space 253 pushq %rax # target address in negative space
253 lretq 254 lretq
254 255
255#ifdef CONFIG_HOTPLUG_CPU
256/*
257 * Boot CPU0 entry point. It's called from play_dead(). Everything has been set
258 * up already except stack. We just set up stack here. Then call
259 * start_secondary().
260 */
261ENTRY(start_cpu0)
262 movq stack_start(%rip),%rsp
263 movq initial_code(%rip),%rax
264 pushq $0 # fake return address to stop unwinder
265 pushq $__KERNEL_CS # set correct cs
266 pushq %rax # target address in negative space
267 lretq
268ENDPROC(start_cpu0)
269#endif
270
271 /* SMP bootup changes these two */ 256 /* SMP bootup changes these two */
272 __REFDATA 257 __REFDATA
273 .align 8 258 .align 8
@@ -285,56 +270,36 @@ bad_address:
285 jmp bad_address 270 jmp bad_address
286 271
287 .section ".init.text","ax" 272 .section ".init.text","ax"
273#ifdef CONFIG_EARLY_PRINTK
288 .globl early_idt_handlers 274 .globl early_idt_handlers
289early_idt_handlers: 275early_idt_handlers:
290 # 104(%rsp) %rflags
291 # 96(%rsp) %cs
292 # 88(%rsp) %rip
293 # 80(%rsp) error code
294 i = 0 276 i = 0
295 .rept NUM_EXCEPTION_VECTORS 277 .rept NUM_EXCEPTION_VECTORS
296 .if (EXCEPTION_ERRCODE_MASK >> i) & 1 278 movl $i, %esi
297 ASM_NOP2
298 .else
299 pushq $0 # Dummy error code, to make stack frame uniform
300 .endif
301 pushq $i # 72(%rsp) Vector number
302 jmp early_idt_handler 279 jmp early_idt_handler
303 i = i + 1 280 i = i + 1
304 .endr 281 .endr
282#endif
305 283
306ENTRY(early_idt_handler) 284ENTRY(early_idt_handler)
307 cld 285#ifdef CONFIG_EARLY_PRINTK
308
309 cmpl $2,early_recursion_flag(%rip) 286 cmpl $2,early_recursion_flag(%rip)
310 jz 1f 287 jz 1f
311 incl early_recursion_flag(%rip) 288 incl early_recursion_flag(%rip)
312 289 GET_CR2_INTO_RCX
313 pushq %rax # 64(%rsp) 290 movq %rcx,%r9
314 pushq %rcx # 56(%rsp) 291 xorl %r8d,%r8d # zero for error code
315 pushq %rdx # 48(%rsp) 292 movl %esi,%ecx # get vector number
316 pushq %rsi # 40(%rsp) 293 # Test %ecx against mask of vectors that push error code.
317 pushq %rdi # 32(%rsp) 294 cmpl $31,%ecx
318 pushq %r8 # 24(%rsp) 295 ja 0f
319 pushq %r9 # 16(%rsp) 296 movl $1,%eax
320 pushq %r10 # 8(%rsp) 297 salq %cl,%rax
321 pushq %r11 # 0(%rsp) 298 testl $0x27d00,%eax
322 299 je 0f
323 cmpl $__KERNEL_CS,96(%rsp) 300 popq %r8 # get error code
324 jne 10f 3010: movq 0(%rsp),%rcx # get ip
325 302 movq 8(%rsp),%rdx # get cs
326 leaq 88(%rsp),%rdi # Pointer to %rip
327 call early_fixup_exception
328 andl %eax,%eax
329 jnz 20f # Found an exception entry
330
33110:
332#ifdef CONFIG_EARLY_PRINTK
333 GET_CR2_INTO(%r9) # can clobber any volatile register if pv
334 movl 80(%rsp),%r8d # error code
335 movl 72(%rsp),%esi # vector number
336 movl 96(%rsp),%edx # %cs
337 movq 88(%rsp),%rcx # %rip
338 xorl %eax,%eax 303 xorl %eax,%eax
339 leaq early_idt_msg(%rip),%rdi 304 leaq early_idt_msg(%rip),%rdi
340 call early_printk 305 call early_printk
@@ -343,32 +308,17 @@ ENTRY(early_idt_handler)
343 call dump_stack 308 call dump_stack
344#ifdef CONFIG_KALLSYMS 309#ifdef CONFIG_KALLSYMS
345 leaq early_idt_ripmsg(%rip),%rdi 310 leaq early_idt_ripmsg(%rip),%rdi
346 movq 40(%rsp),%rsi # %rip again 311 movq 0(%rsp),%rsi # get rip again
347 call __print_symbol 312 call __print_symbol
348#endif 313#endif
349#endif /* EARLY_PRINTK */ 314#endif /* EARLY_PRINTK */
3501: hlt 3151: hlt
351 jmp 1b 316 jmp 1b
352 317
35320: # Exception table entry found 318#ifdef CONFIG_EARLY_PRINTK
354 popq %r11
355 popq %r10
356 popq %r9
357 popq %r8
358 popq %rdi
359 popq %rsi
360 popq %rdx
361 popq %rcx
362 popq %rax
363 addq $16,%rsp # drop vector number and error code
364 decl early_recursion_flag(%rip)
365 INTERRUPT_RETURN
366
367 .balign 4
368early_recursion_flag: 319early_recursion_flag:
369 .long 0 320 .long 0
370 321
371#ifdef CONFIG_EARLY_PRINTK
372early_idt_msg: 322early_idt_msg:
373 .asciz "PANIC: early exception %02lx rip %lx:%lx error %lx cr2 %lx\n" 323 .asciz "PANIC: early exception %02lx rip %lx:%lx error %lx cr2 %lx\n"
374early_idt_ripmsg: 324early_idt_ripmsg:
@@ -467,10 +417,6 @@ ENTRY(phys_base)
467ENTRY(idt_table) 417ENTRY(idt_table)
468 .skip IDT_ENTRIES * 16 418 .skip IDT_ENTRIES * 16
469 419
470 .align L1_CACHE_BYTES
471ENTRY(nmi_idt_table)
472 .skip IDT_ENTRIES * 16
473
474 __PAGE_ALIGNED_BSS 420 __PAGE_ALIGNED_BSS
475 .align PAGE_SIZE 421 .align PAGE_SIZE
476ENTRY(empty_zero_page) 422ENTRY(empty_zero_page)
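In the restored 64-bit early_idt_handler, the "testl $0x27d00,%eax" decides whether the vector in %ecx pushed a hardware error code that must be popped into %r8. The constant is just one bit per vector; the short program below expands the same mask so the encoding is easier to read (illustrative only):

#include <stdio.h>

#define EARLY_ERRCODE_MASK 0x27d00u	/* value taken from the testl above:
					 * vectors 8, 10-14, 17 (#DF, #TS, #NP,
					 * #SS, #GP, #PF, #AC) */

int main(void)
{
	for (unsigned int vec = 0; vec < 32; vec++)
		if (EARLY_ERRCODE_MASK & (1u << vec))
			printf("vector %2u pushes an error code\n", vec);
	return 0;
}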
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index e28670f9a58..4d5a1005420 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -1,7 +1,7 @@
1#include <linux/clocksource.h> 1#include <linux/clocksource.h>
2#include <linux/clockchips.h> 2#include <linux/clockchips.h>
3#include <linux/interrupt.h> 3#include <linux/interrupt.h>
4#include <linux/export.h> 4#include <linux/sysdev.h>
5#include <linux/delay.h> 5#include <linux/delay.h>
6#include <linux/errno.h> 6#include <linux/errno.h>
7#include <linux/i8253.h> 7#include <linux/i8253.h>
@@ -31,6 +31,8 @@
31#define HPET_MIN_CYCLES 128 31#define HPET_MIN_CYCLES 128
32#define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1)) 32#define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1))
33 33
34#define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt)
35
34/* 36/*
35 * HPET address is set in acpi/boot.c, when an ACPI entry exists 37 * HPET address is set in acpi/boot.c, when an ACPI entry exists
36 */ 38 */
@@ -52,11 +54,6 @@ struct hpet_dev {
52 char name[10]; 54 char name[10];
53}; 55};
54 56
55inline struct hpet_dev *EVT_TO_HPET_DEV(struct clock_event_device *evtdev)
56{
57 return container_of(evtdev, struct hpet_dev, evt);
58}
59
60inline unsigned int hpet_readl(unsigned int a) 57inline unsigned int hpet_readl(unsigned int a)
61{ 58{
62 return readl(hpet_virt_address + a); 59 return readl(hpet_virt_address + a);
@@ -94,18 +91,13 @@ static int hpet_verbose;
94 91
95static int __init hpet_setup(char *str) 92static int __init hpet_setup(char *str)
96{ 93{
97 while (str) { 94 if (str) {
98 char *next = strchr(str, ',');
99
100 if (next)
101 *next++ = 0;
102 if (!strncmp("disable", str, 7)) 95 if (!strncmp("disable", str, 7))
103 boot_hpet_disable = 1; 96 boot_hpet_disable = 1;
104 if (!strncmp("force", str, 5)) 97 if (!strncmp("force", str, 5))
105 hpet_force_user = 1; 98 hpet_force_user = 1;
106 if (!strncmp("verbose", str, 7)) 99 if (!strncmp("verbose", str, 7))
107 hpet_verbose = 1; 100 hpet_verbose = 1;
108 str = next;
109 } 101 }
110 return 1; 102 return 1;
111} 103}
@@ -324,6 +316,8 @@ static void hpet_set_mode(enum clock_event_mode mode,
324 now = hpet_readl(HPET_COUNTER); 316 now = hpet_readl(HPET_COUNTER);
325 cmp = now + (unsigned int) delta; 317 cmp = now + (unsigned int) delta;
326 cfg = hpet_readl(HPET_Tn_CFG(timer)); 318 cfg = hpet_readl(HPET_Tn_CFG(timer));
319 /* Make sure we use edge triggered interrupts */
320 cfg &= ~HPET_TN_LEVEL;
327 cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | 321 cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
328 HPET_TN_SETVAL | HPET_TN_32BIT; 322 HPET_TN_SETVAL | HPET_TN_32BIT;
329 hpet_writel(cfg, HPET_Tn_CFG(timer)); 323 hpet_writel(cfg, HPET_Tn_CFG(timer));
@@ -434,7 +428,7 @@ void hpet_msi_unmask(struct irq_data *data)
434 428
435 /* unmask it */ 429 /* unmask it */
436 cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); 430 cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
437 cfg |= HPET_TN_ENABLE | HPET_TN_FSB; 431 cfg |= HPET_TN_FSB;
438 hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); 432 hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
439} 433}
440 434
@@ -445,7 +439,7 @@ void hpet_msi_mask(struct irq_data *data)
445 439
446 /* mask it */ 440 /* mask it */
447 cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); 441 cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
448 cfg &= ~(HPET_TN_ENABLE | HPET_TN_FSB); 442 cfg &= ~HPET_TN_FSB;
449 hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); 443 hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
450} 444}
451 445
@@ -790,16 +784,15 @@ static int hpet_clocksource_register(void)
790 return 0; 784 return 0;
791} 785}
792 786
793static u32 *hpet_boot_cfg;
794
795/** 787/**
796 * hpet_enable - Try to setup the HPET timer. Returns 1 on success. 788 * hpet_enable - Try to setup the HPET timer. Returns 1 on success.
797 */ 789 */
798int __init hpet_enable(void) 790int __init hpet_enable(void)
799{ 791{
800 u32 hpet_period, cfg, id; 792 unsigned long hpet_period;
793 unsigned int id;
801 u64 freq; 794 u64 freq;
802 unsigned int i, last; 795 int i;
803 796
804 if (!is_hpet_capable()) 797 if (!is_hpet_capable())
805 return 0; 798 return 0;
@@ -851,45 +844,15 @@ int __init hpet_enable(void)
851 id = hpet_readl(HPET_ID); 844 id = hpet_readl(HPET_ID);
852 hpet_print_config(); 845 hpet_print_config();
853 846
854 last = (id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT;
855
856#ifdef CONFIG_HPET_EMULATE_RTC 847#ifdef CONFIG_HPET_EMULATE_RTC
857 /* 848 /*
858 * The legacy routing mode needs at least two channels, tick timer 849 * The legacy routing mode needs at least two channels, tick timer
859 * and the rtc emulation channel. 850 * and the rtc emulation channel.
860 */ 851 */
861 if (!last) 852 if (!(id & HPET_ID_NUMBER))
862 goto out_nohpet; 853 goto out_nohpet;
863#endif 854#endif
864 855
865 cfg = hpet_readl(HPET_CFG);
866 hpet_boot_cfg = kmalloc((last + 2) * sizeof(*hpet_boot_cfg),
867 GFP_KERNEL);
868 if (hpet_boot_cfg)
869 *hpet_boot_cfg = cfg;
870 else
871 pr_warn("HPET initial state will not be saved\n");
872 cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY);
873 hpet_writel(cfg, HPET_CFG);
874 if (cfg)
875 pr_warn("HPET: Unrecognized bits %#x set in global cfg\n",
876 cfg);
877
878 for (i = 0; i <= last; ++i) {
879 cfg = hpet_readl(HPET_Tn_CFG(i));
880 if (hpet_boot_cfg)
881 hpet_boot_cfg[i + 1] = cfg;
882 cfg &= ~(HPET_TN_ENABLE | HPET_TN_LEVEL | HPET_TN_FSB);
883 hpet_writel(cfg, HPET_Tn_CFG(i));
884 cfg &= ~(HPET_TN_PERIODIC | HPET_TN_PERIODIC_CAP
885 | HPET_TN_64BIT_CAP | HPET_TN_32BIT | HPET_TN_ROUTE
886 | HPET_TN_FSB | HPET_TN_FSB_CAP);
887 if (cfg)
888 pr_warn("HPET: Unrecognized bits %#x set in cfg#%u\n",
889 cfg, i);
890 }
891 hpet_print_config();
892
893 if (hpet_clocksource_register()) 856 if (hpet_clocksource_register())
894 goto out_nohpet; 857 goto out_nohpet;
895 858
@@ -957,28 +920,14 @@ fs_initcall(hpet_late_init);
957void hpet_disable(void) 920void hpet_disable(void)
958{ 921{
959 if (is_hpet_capable() && hpet_virt_address) { 922 if (is_hpet_capable() && hpet_virt_address) {
960 unsigned int cfg = hpet_readl(HPET_CFG), id, last; 923 unsigned int cfg = hpet_readl(HPET_CFG);
961 924
962 if (hpet_boot_cfg) 925 if (hpet_legacy_int_enabled) {
963 cfg = *hpet_boot_cfg;
964 else if (hpet_legacy_int_enabled) {
965 cfg &= ~HPET_CFG_LEGACY; 926 cfg &= ~HPET_CFG_LEGACY;
966 hpet_legacy_int_enabled = 0; 927 hpet_legacy_int_enabled = 0;
967 } 928 }
968 cfg &= ~HPET_CFG_ENABLE; 929 cfg &= ~HPET_CFG_ENABLE;
969 hpet_writel(cfg, HPET_CFG); 930 hpet_writel(cfg, HPET_CFG);
970
971 if (!hpet_boot_cfg)
972 return;
973
974 id = hpet_readl(HPET_ID);
975 last = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT);
976
977 for (id = 0; id <= last; ++id)
978 hpet_writel(hpet_boot_cfg[id + 1], HPET_Tn_CFG(id));
979
980 if (*hpet_boot_cfg & HPET_CFG_ENABLE)
981 hpet_writel(*hpet_boot_cfg, HPET_CFG);
982 } 931 }
983} 932}
984 933
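Several of the hpet.c changes above are single-bit adjustments to a timer's HPET_Tn_CFG register done with the usual read-modify-write pattern: read the config, clear HPET_TN_LEVEL so the interrupt stays edge triggered, set the enable/periodic bits, write it back. A standalone sketch of that pattern, with the register faked by a plain variable and illustrative bit values rather than the real HPET_TN_* definitions:

#include <stdint.h>
#include <stdio.h>

#define TN_LEVEL	(1u << 1)	/* illustrative bit positions */
#define TN_ENABLE	(1u << 2)
#define TN_PERIODIC	(1u << 3)

static uint32_t fake_tn_cfg = TN_LEVEL;		/* pretend firmware left level-trigger on */

int main(void)
{
	uint32_t cfg = fake_tn_cfg;		/* stands in for hpet_readl(HPET_Tn_CFG(timer)) */

	cfg &= ~TN_LEVEL;			/* make sure we use edge triggered interrupts */
	cfg |= TN_ENABLE | TN_PERIODIC;
	fake_tn_cfg = cfg;			/* stands in for hpet_writel(cfg, HPET_Tn_CFG(timer)) */

	printf("cfg = %#x\n", fake_tn_cfg);
	return 0;
}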
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 245a71db401..739d8598f78 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -16,108 +16,39 @@
16#include <asm/uaccess.h> 16#include <asm/uaccess.h>
17#include <asm/ptrace.h> 17#include <asm/ptrace.h>
18#include <asm/i387.h> 18#include <asm/i387.h>
19#include <asm/fpu-internal.h>
20#include <asm/user.h> 19#include <asm/user.h>
21 20
22/* 21#ifdef CONFIG_X86_64
23 * Were we in an interrupt that interrupted kernel mode? 22# include <asm/sigcontext32.h>
24 * 23# include <asm/user32.h>
25 * For now, with eagerfpu we will return interrupted kernel FPU 24#else
26 * state as not-idle. TBD: Ideally we can change the return value 25# define save_i387_xstate_ia32 save_i387_xstate
27 * to something like __thread_has_fpu(current). But we need to 26# define restore_i387_xstate_ia32 restore_i387_xstate
28 * be careful of doing __thread_clear_has_fpu() before saving 27# define _fpstate_ia32 _fpstate
29 * the FPU etc for supporting nested uses etc. For now, take 28# define _xstate_ia32 _xstate
30 * the simple route! 29# define sig_xstate_ia32_size sig_xstate_size
31 * 30# define fx_sw_reserved_ia32 fx_sw_reserved
32 * On others, we can do a kernel_fpu_begin/end() pair *ONLY* if that 31# define user_i387_ia32_struct user_i387_struct
33 * pair does nothing at all: the thread must not have fpu (so 32# define user32_fxsr_struct user_fxsr_struct
34 * that we don't try to save the FPU state), and TS must 33#endif
35 * be set (so that the clts/stts pair does nothing that is
36 * visible in the interrupted kernel thread).
37 */
38static inline bool interrupted_kernel_fpu_idle(void)
39{
40 if (use_eager_fpu())
41 return 0;
42
43 return !__thread_has_fpu(current) &&
44 (read_cr0() & X86_CR0_TS);
45}
46
47/*
48 * Were we in user mode (or vm86 mode) when we were
49 * interrupted?
50 *
51 * Doing kernel_fpu_begin/end() is ok if we are running
52 * in an interrupt context from user mode - we'll just
53 * save the FPU state as required.
54 */
55static inline bool interrupted_user_mode(void)
56{
57 struct pt_regs *regs = get_irq_regs();
58 return regs && user_mode_vm(regs);
59}
60
61/*
62 * Can we use the FPU in kernel mode with the
63 * whole "kernel_fpu_begin/end()" sequence?
64 *
65 * It's always ok in process context (ie "not interrupt")
66 * but it is sometimes ok even from an irq.
67 */
68bool irq_fpu_usable(void)
69{
70 return !in_interrupt() ||
71 interrupted_user_mode() ||
72 interrupted_kernel_fpu_idle();
73}
74EXPORT_SYMBOL(irq_fpu_usable);
75
76void __kernel_fpu_begin(void)
77{
78 struct task_struct *me = current;
79
80 if (__thread_has_fpu(me)) {
81 __save_init_fpu(me);
82 __thread_clear_has_fpu(me);
83 /* We do 'stts()' in __kernel_fpu_end() */
84 } else if (!use_eager_fpu()) {
85 this_cpu_write(fpu_owner_task, NULL);
86 clts();
87 }
88}
89EXPORT_SYMBOL(__kernel_fpu_begin);
90
91void __kernel_fpu_end(void)
92{
93 if (use_eager_fpu())
94 math_state_restore();
95 else
96 stts();
97}
98EXPORT_SYMBOL(__kernel_fpu_end);
99 34
100void unlazy_fpu(struct task_struct *tsk) 35#ifdef CONFIG_MATH_EMULATION
101{ 36# define HAVE_HWFP (boot_cpu_data.hard_math)
102 preempt_disable(); 37#else
103 if (__thread_has_fpu(tsk)) { 38# define HAVE_HWFP 1
104 __save_init_fpu(tsk); 39#endif
105 __thread_fpu_end(tsk);
106 } else
107 tsk->fpu_counter = 0;
108 preempt_enable();
109}
110EXPORT_SYMBOL(unlazy_fpu);
111 40
112unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; 41static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu;
113unsigned int xstate_size; 42unsigned int xstate_size;
114EXPORT_SYMBOL_GPL(xstate_size); 43EXPORT_SYMBOL_GPL(xstate_size);
44unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32);
115static struct i387_fxsave_struct fx_scratch __cpuinitdata; 45static struct i387_fxsave_struct fx_scratch __cpuinitdata;
116 46
117static void __cpuinit mxcsr_feature_mask_init(void) 47void __cpuinit mxcsr_feature_mask_init(void)
118{ 48{
119 unsigned long mask = 0; 49 unsigned long mask = 0;
120 50
51 clts();
121 if (cpu_has_fxsr) { 52 if (cpu_has_fxsr) {
122 memset(&fx_scratch, 0, sizeof(struct i387_fxsave_struct)); 53 memset(&fx_scratch, 0, sizeof(struct i387_fxsave_struct));
123 asm volatile("fxsave %0" : : "m" (fx_scratch)); 54 asm volatile("fxsave %0" : : "m" (fx_scratch));
@@ -126,6 +57,7 @@ static void __cpuinit mxcsr_feature_mask_init(void)
126 mask = 0x0000ffbf; 57 mask = 0x0000ffbf;
127 } 58 }
128 mxcsr_feature_mask &= mask; 59 mxcsr_feature_mask &= mask;
60 stts();
129} 61}
130 62
131static void __cpuinit init_thread_xstate(void) 63static void __cpuinit init_thread_xstate(void)
@@ -175,16 +107,13 @@ void __cpuinit fpu_init(void)
175 cr0 |= X86_CR0_EM; 107 cr0 |= X86_CR0_EM;
176 write_cr0(cr0); 108 write_cr0(cr0);
177 109
178 /* 110 if (!smp_processor_id())
179 * init_thread_xstate is only called once to avoid overriding
180 * xstate_size during boot time or during CPU hotplug.
181 */
182 if (xstate_size == 0)
183 init_thread_xstate(); 111 init_thread_xstate();
184 112
185 mxcsr_feature_mask_init(); 113 mxcsr_feature_mask_init();
186 xsave_init(); 114 /* clean state in init */
187 eager_fpu_init(); 115 current_thread_info()->status = 0;
116 clear_used_math();
188} 117}
189 118
190void fpu_finit(struct fpu *fpu) 119void fpu_finit(struct fpu *fpu)
@@ -195,7 +124,12 @@ void fpu_finit(struct fpu *fpu)
195 } 124 }
196 125
197 if (cpu_has_fxsr) { 126 if (cpu_has_fxsr) {
198 fx_finit(&fpu->state->fxsave); 127 struct i387_fxsave_struct *fx = &fpu->state->fxsave;
128
129 memset(fx, 0, xstate_size);
130 fx->cwd = 0x37f;
131 if (cpu_has_xmm)
132 fx->mxcsr = MXCSR_DEFAULT;
199 } else { 133 } else {
200 struct i387_fsave_struct *fp = &fpu->state->fsave; 134 struct i387_fsave_struct *fp = &fpu->state->fsave;
201 memset(fp, 0, xstate_size); 135 memset(fp, 0, xstate_size);
@@ -220,7 +154,6 @@ int init_fpu(struct task_struct *tsk)
220 if (tsk_used_math(tsk)) { 154 if (tsk_used_math(tsk)) {
221 if (HAVE_HWFP && tsk == current) 155 if (HAVE_HWFP && tsk == current)
222 unlazy_fpu(tsk); 156 unlazy_fpu(tsk);
223 tsk->thread.fpu.last_cpu = ~0;
224 return 0; 157 return 0;
225 } 158 }
226 159
@@ -439,7 +372,7 @@ static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave)
439 * FXSR floating point environment conversions. 372 * FXSR floating point environment conversions.
440 */ 373 */
441 374
442void 375static void
443convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) 376convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
444{ 377{
445 struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave; 378 struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave;
@@ -476,8 +409,8 @@ convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
476 memcpy(&to[i], &from[i], sizeof(to[0])); 409 memcpy(&to[i], &from[i], sizeof(to[0]));
477} 410}
478 411
479void convert_to_fxsr(struct task_struct *tsk, 412static void convert_to_fxsr(struct task_struct *tsk,
480 const struct user_i387_ia32_struct *env) 413 const struct user_i387_ia32_struct *env)
481 414
482{ 415{
483 struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave; 416 struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave;
@@ -574,6 +507,223 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
574} 507}
575 508
576/* 509/*
510 * Signal frame handlers.
511 */
512
513static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf)
514{
515 struct task_struct *tsk = current;
516 struct i387_fsave_struct *fp = &tsk->thread.fpu.state->fsave;
517
518 fp->status = fp->swd;
519 if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct)))
520 return -1;
521 return 1;
522}
523
524static int save_i387_fxsave(struct _fpstate_ia32 __user *buf)
525{
526 struct task_struct *tsk = current;
527 struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave;
528 struct user_i387_ia32_struct env;
529 int err = 0;
530
531 convert_from_fxsr(&env, tsk);
532 if (__copy_to_user(buf, &env, sizeof(env)))
533 return -1;
534
535 err |= __put_user(fx->swd, &buf->status);
536 err |= __put_user(X86_FXSR_MAGIC, &buf->magic);
537 if (err)
538 return -1;
539
540 if (__copy_to_user(&buf->_fxsr_env[0], fx, xstate_size))
541 return -1;
542 return 1;
543}
544
545static int save_i387_xsave(void __user *buf)
546{
547 struct task_struct *tsk = current;
548 struct _fpstate_ia32 __user *fx = buf;
549 int err = 0;
550
551
552 sanitize_i387_state(tsk);
553
554 /*
555 * For legacy compatibility, we always set the FP/SSE bits in the bit
556 * vector while saving the state to the user context.
557 * This lets us capture any changes (during sigreturn) to
558 * the FP/SSE bits made by legacy applications which don't touch
559 * xstate_bv in the xsave header.
560 *
561 * xsave aware applications can change the xstate_bv in the xsave
562 * header as well as change any contents in the memory layout.
563 * xrestore as part of sigreturn will capture all the changes.
564 */
565 tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
566
567 if (save_i387_fxsave(fx) < 0)
568 return -1;
569
570 err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved_ia32,
571 sizeof(struct _fpx_sw_bytes));
572 err |= __put_user(FP_XSTATE_MAGIC2,
573 (__u32 __user *) (buf + sig_xstate_ia32_size
574 - FP_XSTATE_MAGIC2_SIZE));
575 if (err)
576 return -1;
577
578 return 1;
579}
580
581int save_i387_xstate_ia32(void __user *buf)
582{
583 struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf;
584 struct task_struct *tsk = current;
585
586 if (!used_math())
587 return 0;
588
589 if (!access_ok(VERIFY_WRITE, buf, sig_xstate_ia32_size))
590 return -EACCES;
591 /*
592 * This will cause a "finit" to be triggered by the next
593 * attempted FPU operation by the 'current' process.
594 */
595 clear_used_math();
596
597 if (!HAVE_HWFP) {
598 return fpregs_soft_get(current, NULL,
599 0, sizeof(struct user_i387_ia32_struct),
600 NULL, fp) ? -1 : 1;
601 }
602
603 unlazy_fpu(tsk);
604
605 if (cpu_has_xsave)
606 return save_i387_xsave(fp);
607 if (cpu_has_fxsr)
608 return save_i387_fxsave(fp);
609 else
610 return save_i387_fsave(fp);
611}
612
613static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf)
614{
615 struct task_struct *tsk = current;
616
617 return __copy_from_user(&tsk->thread.fpu.state->fsave, buf,
618 sizeof(struct i387_fsave_struct));
619}
620
621static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf,
622 unsigned int size)
623{
624 struct task_struct *tsk = current;
625 struct user_i387_ia32_struct env;
626 int err;
627
628 err = __copy_from_user(&tsk->thread.fpu.state->fxsave, &buf->_fxsr_env[0],
629 size);
630 /* mxcsr reserved bits must be masked to zero for security reasons */
631 tsk->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
632 if (err || __copy_from_user(&env, buf, sizeof(env)))
633 return 1;
634 convert_to_fxsr(tsk, &env);
635
636 return 0;
637}
638
639static int restore_i387_xsave(void __user *buf)
640{
641 struct _fpx_sw_bytes fx_sw_user;
642 struct _fpstate_ia32 __user *fx_user =
643 ((struct _fpstate_ia32 __user *) buf);
644 struct i387_fxsave_struct __user *fx =
645 (struct i387_fxsave_struct __user *) &fx_user->_fxsr_env[0];
646 struct xsave_hdr_struct *xsave_hdr =
647 &current->thread.fpu.state->xsave.xsave_hdr;
648 u64 mask;
649 int err;
650
651 if (check_for_xstate(fx, buf, &fx_sw_user))
652 goto fx_only;
653
654 mask = fx_sw_user.xstate_bv;
655
656 err = restore_i387_fxsave(buf, fx_sw_user.xstate_size);
657
658 xsave_hdr->xstate_bv &= pcntxt_mask;
659 /*
660 * These bits must be zero.
661 */
662 xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0;
663
664 /*
665 * Init the state that is not present in the memory layout
666 * and enabled by the OS.
667 */
668 mask = ~(pcntxt_mask & ~mask);
669 xsave_hdr->xstate_bv &= mask;
670
671 return err;
672fx_only:
673 /*
674 * Couldn't find the extended state information in the memory
675 * layout. Restore the FP/SSE and init the other extended state
676 * enabled by the OS.
677 */
678 xsave_hdr->xstate_bv = XSTATE_FPSSE;
679 return restore_i387_fxsave(buf, sizeof(struct i387_fxsave_struct));
680}
681
682int restore_i387_xstate_ia32(void __user *buf)
683{
684 int err;
685 struct task_struct *tsk = current;
686 struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf;
687
688 if (HAVE_HWFP)
689 clear_fpu(tsk);
690
691 if (!buf) {
692 if (used_math()) {
693 clear_fpu(tsk);
694 clear_used_math();
695 }
696
697 return 0;
698 } else
699 if (!access_ok(VERIFY_READ, buf, sig_xstate_ia32_size))
700 return -EACCES;
701
702 if (!used_math()) {
703 err = init_fpu(tsk);
704 if (err)
705 return err;
706 }
707
708 if (HAVE_HWFP) {
709 if (cpu_has_xsave)
710 err = restore_i387_xsave(buf);
711 else if (cpu_has_fxsr)
712 err = restore_i387_fxsave(fp, sizeof(struct
713 i387_fxsave_struct));
714 else
715 err = restore_i387_fsave(fp);
716 } else {
717 err = fpregs_soft_set(current, NULL,
718 0, sizeof(struct user_i387_ia32_struct),
719 NULL, fp) != 0;
720 }
721 set_used_math();
722
723 return err;
724}
725
726/*
577 * FPU state for core dumps. 727 * FPU state for core dumps.
578 * This is only used for a.out dumps now. 728 * This is only used for a.out dumps now.
579 * It is declared generically using elf_fpregset_t (which is 729 * It is declared generically using elf_fpregset_t (which is
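The xstate_bv handling in restore_i387_xsave() above reduces to a small piece of bit arithmetic: any feature the OS enables (pcntxt_mask) but that is absent from the signal frame's bit vector must be cleared so that state is re-initialized on the next restore. A minimal stand-alone sketch of that step, with made-up feature bits rather than the kernel's:

#include <stdio.h>
#include <stdint.h>

/* Hypothetical feature bits, stand-ins for XSTATE_FP/SSE/YMM etc. */
#define FEAT_FP   (1ull << 0)
#define FEAT_SSE  (1ull << 1)
#define FEAT_YMM  (1ull << 2)

int main(void)
{
	/* Features the OS has enabled (the kernel's pcntxt_mask). */
	uint64_t os_enabled = FEAT_FP | FEAT_SSE | FEAT_YMM;
	/* Features actually present in the signal frame (user's xstate_bv). */
	uint64_t in_frame   = FEAT_FP | FEAT_SSE;
	/* Header bit vector as it sits in memory before masking. */
	uint64_t xstate_bv  = os_enabled;

	/* Same expression as the hunk: keep a bit only if it is present in
	 * the frame or not managed by the OS at all; OS-enabled bits that
	 * are missing from the frame are cleared so that state is re-inited. */
	uint64_t keep = ~(os_enabled & ~in_frame);
	xstate_bv &= keep;

	printf("xstate_bv after restore: %#llx\n",
	       (unsigned long long)xstate_bv);   /* FP|SSE, YMM dropped */
	return 0;
}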
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 9a5c460404d..610485223bd 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -15,6 +15,7 @@
15#include <linux/delay.h> 15#include <linux/delay.h>
16 16
17#include <linux/atomic.h> 17#include <linux/atomic.h>
18#include <asm/system.h>
18#include <asm/timer.h> 19#include <asm/timer.h>
19#include <asm/hw_irq.h> 20#include <asm/hw_irq.h>
20#include <asm/pgtable.h> 21#include <asm/pgtable.h>
@@ -263,7 +264,7 @@ static void i8259A_shutdown(void)
263 * out of. 264 * out of.
264 */ 265 */
265 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ 266 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
266 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ 267 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */
267} 268}
268 269
269static struct syscore_ops i8259_syscore_ops = { 270static struct syscore_ops i8259_syscore_ops = {
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index e4595f10591..6c0802eb2f7 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -9,7 +9,6 @@
9#include <linux/smp.h> 9#include <linux/smp.h>
10#include <linux/ftrace.h> 10#include <linux/ftrace.h>
11#include <linux/delay.h> 11#include <linux/delay.h>
12#include <linux/export.h>
13 12
14#include <asm/apic.h> 13#include <asm/apic.h>
15#include <asm/io_apic.h> 14#include <asm/io_apic.h>
@@ -74,10 +73,6 @@ int arch_show_interrupts(struct seq_file *p, int prec)
74 for_each_online_cpu(j) 73 for_each_online_cpu(j)
75 seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs); 74 seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs);
76 seq_printf(p, " IRQ work interrupts\n"); 75 seq_printf(p, " IRQ work interrupts\n");
77 seq_printf(p, "%*s: ", prec, "RTR");
78 for_each_online_cpu(j)
79 seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count);
80 seq_printf(p, " APIC ICR read retries\n");
81#endif 76#endif
82 if (x86_platform_ipi_callback) { 77 if (x86_platform_ipi_callback) {
83 seq_printf(p, "%*s: ", prec, "PLT"); 78 seq_printf(p, "%*s: ", prec, "PLT");
@@ -92,8 +87,7 @@ int arch_show_interrupts(struct seq_file *p, int prec)
92 seq_printf(p, " Rescheduling interrupts\n"); 87 seq_printf(p, " Rescheduling interrupts\n");
93 seq_printf(p, "%*s: ", prec, "CAL"); 88 seq_printf(p, "%*s: ", prec, "CAL");
94 for_each_online_cpu(j) 89 for_each_online_cpu(j)
95 seq_printf(p, "%10u ", irq_stats(j)->irq_call_count - 90 seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
96 irq_stats(j)->irq_tlb_count);
97 seq_printf(p, " Function call interrupts\n"); 91 seq_printf(p, " Function call interrupts\n");
98 seq_printf(p, "%*s: ", prec, "TLB"); 92 seq_printf(p, "%*s: ", prec, "TLB");
99 for_each_online_cpu(j) 93 for_each_online_cpu(j)
@@ -141,13 +135,13 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
141 sum += irq_stats(cpu)->irq_spurious_count; 135 sum += irq_stats(cpu)->irq_spurious_count;
142 sum += irq_stats(cpu)->apic_perf_irqs; 136 sum += irq_stats(cpu)->apic_perf_irqs;
143 sum += irq_stats(cpu)->apic_irq_work_irqs; 137 sum += irq_stats(cpu)->apic_irq_work_irqs;
144 sum += irq_stats(cpu)->icr_read_retry_count;
145#endif 138#endif
146 if (x86_platform_ipi_callback) 139 if (x86_platform_ipi_callback)
147 sum += irq_stats(cpu)->x86_platform_ipis; 140 sum += irq_stats(cpu)->x86_platform_ipis;
148#ifdef CONFIG_SMP 141#ifdef CONFIG_SMP
149 sum += irq_stats(cpu)->irq_resched_count; 142 sum += irq_stats(cpu)->irq_resched_count;
150 sum += irq_stats(cpu)->irq_call_count; 143 sum += irq_stats(cpu)->irq_call_count;
144 sum += irq_stats(cpu)->irq_tlb_count;
151#endif 145#endif
152#ifdef CONFIG_X86_THERMAL_VECTOR 146#ifdef CONFIG_X86_THERMAL_VECTOR
153 sum += irq_stats(cpu)->irq_thermal_count; 147 sum += irq_stats(cpu)->irq_thermal_count;
@@ -186,8 +180,8 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
186 unsigned vector = ~regs->orig_ax; 180 unsigned vector = ~regs->orig_ax;
187 unsigned irq; 181 unsigned irq;
188 182
189 irq_enter();
190 exit_idle(); 183 exit_idle();
184 irq_enter();
191 185
192 irq = __this_cpu_read(vector_irq[vector]); 186 irq = __this_cpu_read(vector_irq[vector]);
193 187
@@ -214,10 +208,10 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
214 208
215 ack_APIC_irq(); 209 ack_APIC_irq();
216 210
217 irq_enter();
218
219 exit_idle(); 211 exit_idle();
220 212
213 irq_enter();
214
221 inc_irq_stat(x86_platform_ipis); 215 inc_irq_stat(x86_platform_ipis);
222 216
223 if (x86_platform_ipi_callback) 217 if (x86_platform_ipi_callback)
@@ -270,7 +264,7 @@ void fixup_irqs(void)
270 264
271 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { 265 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
272 break_affinity = 1; 266 break_affinity = 1;
273 affinity = cpu_online_mask; 267 affinity = cpu_all_mask;
274 } 268 }
275 269
276 chip = irq_data_get_irq_chip(data); 270 chip = irq_data_get_irq_chip(data);
@@ -282,21 +276,16 @@ void fixup_irqs(void)
282 else if (!(warned++)) 276 else if (!(warned++))
283 set_affinity = 0; 277 set_affinity = 0;
284 278
285 /*
286 * We unmask if the irq was not marked masked by the
287 * core code. That respects the lazy irq disable
288 * behaviour.
289 */
290 if (!irqd_can_move_in_process_context(data) && 279 if (!irqd_can_move_in_process_context(data) &&
291 !irqd_irq_masked(data) && chip->irq_unmask) 280 !irqd_irq_disabled(data) && chip->irq_unmask)
292 chip->irq_unmask(data); 281 chip->irq_unmask(data);
293 282
294 raw_spin_unlock(&desc->lock); 283 raw_spin_unlock(&desc->lock);
295 284
296 if (break_affinity && set_affinity) 285 if (break_affinity && set_affinity)
297 pr_notice("Broke affinity for irq %i\n", irq); 286 printk("Broke affinity for irq %i\n", irq);
298 else if (!set_affinity) 287 else if (!set_affinity)
299 pr_notice("Cannot set affinity for irq %i\n", irq); 288 printk("Cannot set affinity for irq %i\n", irq);
300 } 289 }
301 290
302 /* 291 /*
@@ -328,7 +317,6 @@ void fixup_irqs(void)
328 chip->irq_retrigger(data); 317 chip->irq_retrigger(data);
329 raw_spin_unlock(&desc->lock); 318 raw_spin_unlock(&desc->lock);
330 } 319 }
331 __this_cpu_write(vector_irq[vector], -1);
332 } 320 }
333} 321}
334#endif 322#endif
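The affinity fallback in fixup_irqs() above amounts to: if the IRQ's affinity mask no longer intersects the set of online CPUs, fall back to a wider mask and remember that affinity was broken. A toy version with plain bitmasks standing in for cpumasks (values are illustrative):

#include <stdio.h>

int main(void)
{
	unsigned int affinity = 0x0c;  /* IRQ allowed on CPUs 2-3 */
	unsigned int online   = 0x03;  /* only CPUs 0-1 are still online */
	int break_affinity = 0;

	/* Equivalent of cpumask_any_and(affinity, cpu_online_mask)
	 * finding no CPU: the intersection is empty. */
	if ((affinity & online) == 0) {
		break_affinity = 1;
		affinity = online;     /* fall back, as the hunk does */
	}

	printf("affinity=%#x broken=%d\n", affinity, break_affinity);
	return 0;
}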
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 344faf8d0d6..72090705a65 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -28,9 +28,6 @@ DEFINE_PER_CPU(struct pt_regs *, irq_regs);
28EXPORT_PER_CPU_SYMBOL(irq_regs); 28EXPORT_PER_CPU_SYMBOL(irq_regs);
29 29
30#ifdef CONFIG_DEBUG_STACKOVERFLOW 30#ifdef CONFIG_DEBUG_STACKOVERFLOW
31
32int sysctl_panic_on_stackoverflow __read_mostly;
33
34/* Debugging check for stack overflow: is there less than 1KB free? */ 31/* Debugging check for stack overflow: is there less than 1KB free? */
35static int check_stack_overflow(void) 32static int check_stack_overflow(void)
36{ 33{
@@ -46,8 +43,6 @@ static void print_stack_overflow(void)
46{ 43{
47 printk(KERN_WARNING "low stack detected by irq handler\n"); 44 printk(KERN_WARNING "low stack detected by irq handler\n");
48 dump_stack(); 45 dump_stack();
49 if (sysctl_panic_on_stackoverflow)
50 panic("low stack detected by irq handler - check messages\n");
51} 46}
52 47
53#else 48#else
@@ -100,8 +95,13 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
100 irqctx->tinfo.task = curctx->tinfo.task; 95 irqctx->tinfo.task = curctx->tinfo.task;
101 irqctx->tinfo.previous_esp = current_stack_pointer; 96 irqctx->tinfo.previous_esp = current_stack_pointer;
102 97
103 /* Copy the preempt_count so that the [soft]irq checks work. */ 98 /*
104 irqctx->tinfo.preempt_count = curctx->tinfo.preempt_count; 99 * Copy the softirq bits in preempt_count so that the
100 * softirq checks work in the hardirq context.
101 */
102 irqctx->tinfo.preempt_count =
103 (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
104 (curctx->tinfo.preempt_count & SOFTIRQ_MASK);
105 105
106 if (unlikely(overflow)) 106 if (unlikely(overflow))
107 call_on_stack(print_stack_overflow, isp); 107 call_on_stack(print_stack_overflow, isp);
@@ -127,8 +127,8 @@ void __cpuinit irq_ctx_init(int cpu)
127 return; 127 return;
128 128
129 irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), 129 irqctx = page_address(alloc_pages_node(cpu_to_node(cpu),
130 THREADINFO_GFP, 130 THREAD_FLAGS,
131 THREAD_SIZE_ORDER)); 131 THREAD_ORDER));
132 memset(&irqctx->tinfo, 0, sizeof(struct thread_info)); 132 memset(&irqctx->tinfo, 0, sizeof(struct thread_info));
133 irqctx->tinfo.cpu = cpu; 133 irqctx->tinfo.cpu = cpu;
134 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; 134 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
@@ -137,8 +137,8 @@ void __cpuinit irq_ctx_init(int cpu)
137 per_cpu(hardirq_ctx, cpu) = irqctx; 137 per_cpu(hardirq_ctx, cpu) = irqctx;
138 138
139 irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), 139 irqctx = page_address(alloc_pages_node(cpu_to_node(cpu),
140 THREADINFO_GFP, 140 THREAD_FLAGS,
141 THREAD_SIZE_ORDER)); 141 THREAD_ORDER));
142 memset(&irqctx->tinfo, 0, sizeof(struct thread_info)); 142 memset(&irqctx->tinfo, 0, sizeof(struct thread_info));
143 irqctx->tinfo.cpu = cpu; 143 irqctx->tinfo.cpu = cpu;
144 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); 144 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
@@ -191,7 +191,7 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
191 if (unlikely(!desc)) 191 if (unlikely(!desc))
192 return false; 192 return false;
193 193
194 if (user_mode_vm(regs) || !execute_on_irq_stack(overflow, desc, irq)) { 194 if (!execute_on_irq_stack(overflow, desc, irq)) {
195 if (unlikely(overflow)) 195 if (unlikely(overflow))
196 print_stack_overflow(); 196 print_stack_overflow();
197 desc->handle_irq(irq, desc); 197 desc->handle_irq(irq, desc);
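The preempt_count copy on the new side of the execute_on_irq_stack() hunk above is a masked merge: the hardirq stack keeps its own count but inherits only the SOFTIRQ bits of the interrupted context. A stand-alone sketch of that merge, with an invented mask layout:

#include <stdio.h>

#define SOFTIRQ_MASK 0x0000ff00u   /* illustrative layout, not the kernel's */

int main(void)
{
	unsigned int irqctx_count = 0x00010000; /* hardirq context's own count */
	unsigned int curctx_count = 0x00000100; /* interrupted task, softirq active */

	/* Keep everything of irqctx_count except the SOFTIRQ field, then
	 * splice in the SOFTIRQ field of the interrupted context so the
	 * softirq checks still see it while running on the irq stack. */
	unsigned int merged = (irqctx_count & ~SOFTIRQ_MASK) |
			      (curctx_count &  SOFTIRQ_MASK);

	printf("merged preempt_count = %#x\n", merged);
	return 0;
}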
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index d04d3ecded6..acf8fbf8fbd 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -26,8 +26,6 @@ EXPORT_PER_CPU_SYMBOL(irq_stat);
26DEFINE_PER_CPU(struct pt_regs *, irq_regs); 26DEFINE_PER_CPU(struct pt_regs *, irq_regs);
27EXPORT_PER_CPU_SYMBOL(irq_regs); 27EXPORT_PER_CPU_SYMBOL(irq_regs);
28 28
29int sysctl_panic_on_stackoverflow;
30
31/* 29/*
32 * Probabilistic stack overflow check: 30 * Probabilistic stack overflow check:
33 * 31 *
@@ -38,39 +36,15 @@ int sysctl_panic_on_stackoverflow;
38static inline void stack_overflow_check(struct pt_regs *regs) 36static inline void stack_overflow_check(struct pt_regs *regs)
39{ 37{
40#ifdef CONFIG_DEBUG_STACKOVERFLOW 38#ifdef CONFIG_DEBUG_STACKOVERFLOW
41#define STACK_TOP_MARGIN 128
42 struct orig_ist *oist;
43 u64 irq_stack_top, irq_stack_bottom;
44 u64 estack_top, estack_bottom;
45 u64 curbase = (u64)task_stack_page(current); 39 u64 curbase = (u64)task_stack_page(current);
46 40
47 if (user_mode_vm(regs)) 41 WARN_ONCE(regs->sp >= curbase &&
48 return; 42 regs->sp <= curbase + THREAD_SIZE &&
49 43 regs->sp < curbase + sizeof(struct thread_info) +
50 if (regs->sp >= curbase + sizeof(struct thread_info) + 44 sizeof(struct pt_regs) + 128,
51 sizeof(struct pt_regs) + STACK_TOP_MARGIN &&
52 regs->sp <= curbase + THREAD_SIZE)
53 return;
54
55 irq_stack_top = (u64)__get_cpu_var(irq_stack_union.irq_stack) +
56 STACK_TOP_MARGIN;
57 irq_stack_bottom = (u64)__get_cpu_var(irq_stack_ptr);
58 if (regs->sp >= irq_stack_top && regs->sp <= irq_stack_bottom)
59 return;
60
61 oist = &__get_cpu_var(orig_ist);
62 estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ + STACK_TOP_MARGIN;
63 estack_bottom = (u64)oist->ist[N_EXCEPTION_STACKS - 1];
64 if (regs->sp >= estack_top && regs->sp <= estack_bottom)
65 return;
66
67 WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n",
68 current->comm, curbase, regs->sp,
69 irq_stack_top, irq_stack_bottom,
70 estack_top, estack_bottom);
71 45
72 if (sysctl_panic_on_stackoverflow) 46 "do_IRQ: %s near stack overflow (cur:%Lx,sp:%lx)\n",
73 panic("low stack detected by irq handler - check messages\n"); 47 current->comm, curbase, regs->sp);
74#endif 48#endif
75} 49}
76 50
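The simplified stack_overflow_check() on the right-hand side of the hunk above warns when the saved stack pointer lies inside the task stack but within a small margin of the thread_info and pt_regs area at its base. A stand-alone sketch of that range test, with all sizes invented for illustration:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Pretend layout: a 16 KiB task stack with metadata at its base. */
	uint64_t curbase     = 0x100000;          /* task_stack_page(current) */
	uint64_t thread_size = 16 * 1024;         /* THREAD_SIZE */
	uint64_t metadata    = 256;               /* thread_info + pt_regs */
	uint64_t margin      = 128;               /* extra guard region */

	uint64_t sp = curbase + metadata + 64;    /* dangerously low sp */

	/* Same shape as the WARN_ONCE condition: inside the stack, but
	 * below the metadata-plus-margin watermark. */
	int near_overflow = sp >= curbase &&
			    sp <= curbase + thread_size &&
			    sp <  curbase + metadata + margin;

	printf("near overflow: %s\n", near_overflow ? "yes" : "no");
	return 0;
}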
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 7dc4e459c2b..b3300e6bace 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -9,13 +9,14 @@
9#include <linux/kprobes.h> 9#include <linux/kprobes.h>
10#include <linux/init.h> 10#include <linux/init.h>
11#include <linux/kernel_stat.h> 11#include <linux/kernel_stat.h>
12#include <linux/device.h> 12#include <linux/sysdev.h>
13#include <linux/bitops.h> 13#include <linux/bitops.h>
14#include <linux/acpi.h> 14#include <linux/acpi.h>
15#include <linux/io.h> 15#include <linux/io.h>
16#include <linux/delay.h> 16#include <linux/delay.h>
17 17
18#include <linux/atomic.h> 18#include <linux/atomic.h>
19#include <asm/system.h>
19#include <asm/timer.h> 20#include <asm/timer.h>
20#include <asm/hw_irq.h> 21#include <asm/hw_irq.h>
21#include <asm/pgtable.h> 22#include <asm/pgtable.h>
@@ -42,6 +43,39 @@
42 * (these are usually mapped into the 0x30-0xff vector range) 43 * (these are usually mapped into the 0x30-0xff vector range)
43 */ 44 */
44 45
46#ifdef CONFIG_X86_32
47/*
48 * Note that on a 486, we don't want to do a SIGFPE on an irq13
49 * as the irq is unreliable, and exception 16 works correctly
50 * (ie as explained in the intel literature). On a 386, you
51 * can't use exception 16 due to bad IBM design, so we have to
52 * rely on the less exact irq13.
53 *
 54	 * Careful.. Not only is IRQ13 unreliable, but it also
 55	 * leads to races. IBM designers who came up with it should
56 * be shot.
57 */
58
59static irqreturn_t math_error_irq(int cpl, void *dev_id)
60{
61 outb(0, 0xF0);
62 if (ignore_fpu_irq || !boot_cpu_data.hard_math)
63 return IRQ_NONE;
64 math_error(get_irq_regs(), 0, 16);
65 return IRQ_HANDLED;
66}
67
68/*
69 * New motherboards sometimes make IRQ 13 be a PCI interrupt,
70 * so allow interrupt sharing.
71 */
72static struct irqaction fpu_irq = {
73 .handler = math_error_irq,
74 .name = "fpu",
75 .flags = IRQF_NO_THREAD,
76};
77#endif
78
45/* 79/*
46 * IRQ2 is cascade interrupt to second interrupt controller 80 * IRQ2 is cascade interrupt to second interrupt controller
47 */ 81 */
@@ -138,6 +172,79 @@ static void __init smp_intr_init(void)
138 */ 172 */
139 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); 173 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
140 174
175 /* IPIs for invalidation */
176#define ALLOC_INVTLB_VEC(NR) \
177 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+NR, \
178 invalidate_interrupt##NR)
179
180 switch (NUM_INVALIDATE_TLB_VECTORS) {
181 default:
182 ALLOC_INVTLB_VEC(31);
183 case 31:
184 ALLOC_INVTLB_VEC(30);
185 case 30:
186 ALLOC_INVTLB_VEC(29);
187 case 29:
188 ALLOC_INVTLB_VEC(28);
189 case 28:
190 ALLOC_INVTLB_VEC(27);
191 case 27:
192 ALLOC_INVTLB_VEC(26);
193 case 26:
194 ALLOC_INVTLB_VEC(25);
195 case 25:
196 ALLOC_INVTLB_VEC(24);
197 case 24:
198 ALLOC_INVTLB_VEC(23);
199 case 23:
200 ALLOC_INVTLB_VEC(22);
201 case 22:
202 ALLOC_INVTLB_VEC(21);
203 case 21:
204 ALLOC_INVTLB_VEC(20);
205 case 20:
206 ALLOC_INVTLB_VEC(19);
207 case 19:
208 ALLOC_INVTLB_VEC(18);
209 case 18:
210 ALLOC_INVTLB_VEC(17);
211 case 17:
212 ALLOC_INVTLB_VEC(16);
213 case 16:
214 ALLOC_INVTLB_VEC(15);
215 case 15:
216 ALLOC_INVTLB_VEC(14);
217 case 14:
218 ALLOC_INVTLB_VEC(13);
219 case 13:
220 ALLOC_INVTLB_VEC(12);
221 case 12:
222 ALLOC_INVTLB_VEC(11);
223 case 11:
224 ALLOC_INVTLB_VEC(10);
225 case 10:
226 ALLOC_INVTLB_VEC(9);
227 case 9:
228 ALLOC_INVTLB_VEC(8);
229 case 8:
230 ALLOC_INVTLB_VEC(7);
231 case 7:
232 ALLOC_INVTLB_VEC(6);
233 case 6:
234 ALLOC_INVTLB_VEC(5);
235 case 5:
236 ALLOC_INVTLB_VEC(4);
237 case 4:
238 ALLOC_INVTLB_VEC(3);
239 case 3:
240 ALLOC_INVTLB_VEC(2);
241 case 2:
242 ALLOC_INVTLB_VEC(1);
243 case 1:
244 ALLOC_INVTLB_VEC(0);
245 break;
246 }
247
141 /* IPI for generic function call */ 248 /* IPI for generic function call */
142 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); 249 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
143 250
@@ -199,16 +306,23 @@ void __init native_init_IRQ(void)
199 * us. (some of these will be overridden and become 306 * us. (some of these will be overridden and become
200 * 'special' SMP interrupts) 307 * 'special' SMP interrupts)
201 */ 308 */
202 i = FIRST_EXTERNAL_VECTOR; 309 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
203 for_each_clear_bit_from(i, used_vectors, NR_VECTORS) {
204 /* IA32_SYSCALL_VECTOR could be used in trap_init already. */ 310 /* IA32_SYSCALL_VECTOR could be used in trap_init already. */
205 set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); 311 if (!test_bit(i, used_vectors))
312 set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
206 } 313 }
207 314
208 if (!acpi_ioapic && !of_ioapic) 315 if (!acpi_ioapic && !of_ioapic)
209 setup_irq(2, &irq2); 316 setup_irq(2, &irq2);
210 317
211#ifdef CONFIG_X86_32 318#ifdef CONFIG_X86_32
319 /*
320 * External FPU? Set up irq13 if so, for
321 * original braindamaged IBM FERR coupling.
322 */
323 if (boot_cpu_data.hard_math && !cpu_has_fpu)
324 setup_irq(FPU_IRQ, &fpu_irq);
325
212 irq_ctx_init(smp_processor_id()); 326 irq_ctx_init(smp_processor_id());
213#endif 327#endif
214} 328}
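The long switch in the smp_intr_init() hunk above leans on deliberate case fall-through: entering at the configured vector count registers that vector and every lower-numbered one down to 0. The same idiom in a compact, compilable form; register_vector() here is only a stand-in for alloc_intr_gate():

#include <stdio.h>

static void register_vector(int nr)
{
	printf("registered invalidate vector %d\n", nr);
}

static void setup_invalidate_vectors(int num_vectors)
{
	/* Deliberate fall-through: each case registers its vector and
	 * then falls into the next case, so everything below the entry
	 * point gets registered as well. */
	switch (num_vectors) {
	default:
	case 4: register_vector(3); /* fall through */
	case 3: register_vector(2); /* fall through */
	case 2: register_vector(1); /* fall through */
	case 1: register_vector(0);
		break;
	case 0:
		break;
	}
}

int main(void)
{
	setup_invalidate_vectors(3);  /* registers vectors 2, 1, 0 */
	return 0;
}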
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index 2889b3d4388..3fee346ef54 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -24,9 +24,8 @@ union jump_code_union {
24 } __attribute__((packed)); 24 } __attribute__((packed));
25}; 25};
26 26
27static void __jump_label_transform(struct jump_entry *entry, 27void arch_jump_label_transform(struct jump_entry *entry,
28 enum jump_label_type type, 28 enum jump_label_type type)
29 void *(*poker)(void *, const void *, size_t))
30{ 29{
31 union jump_code_union code; 30 union jump_code_union code;
32 31
@@ -36,24 +35,17 @@ static void __jump_label_transform(struct jump_entry *entry,
36 (entry->code + JUMP_LABEL_NOP_SIZE); 35 (entry->code + JUMP_LABEL_NOP_SIZE);
37 } else 36 } else
38 memcpy(&code, ideal_nops[NOP_ATOMIC5], JUMP_LABEL_NOP_SIZE); 37 memcpy(&code, ideal_nops[NOP_ATOMIC5], JUMP_LABEL_NOP_SIZE);
39
40 (*poker)((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE);
41}
42
43void arch_jump_label_transform(struct jump_entry *entry,
44 enum jump_label_type type)
45{
46 get_online_cpus(); 38 get_online_cpus();
47 mutex_lock(&text_mutex); 39 mutex_lock(&text_mutex);
48 __jump_label_transform(entry, type, text_poke_smp); 40 text_poke_smp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE);
49 mutex_unlock(&text_mutex); 41 mutex_unlock(&text_mutex);
50 put_online_cpus(); 42 put_online_cpus();
51} 43}
52 44
53__init_or_module void arch_jump_label_transform_static(struct jump_entry *entry, 45void arch_jump_label_text_poke_early(jump_label_t addr)
54 enum jump_label_type type)
55{ 46{
56 __jump_label_transform(entry, type, text_poke_early); 47 text_poke_early((void *)addr, ideal_nops[NOP_ATOMIC5],
48 JUMP_LABEL_NOP_SIZE);
57} 49}
58 50
59#endif 51#endif
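arch_jump_label_transform() above writes either a 5-byte NOP or an e9 jmp rel32 whose displacement is measured from the end of the instruction. A sketch of just that encoding step, with local stand-ins for the kernel's union and constants (no live text is patched; x86 little-endian byte order assumed):

#include <stdio.h>
#include <stdint.h>

#define JUMP_LABEL_NOP_SIZE 5
#define JMP_REL32_OPCODE    0xe9

/* Mirrors the shape of the kernel's jump_code_union. */
union jump_code {
	unsigned char code[JUMP_LABEL_NOP_SIZE];
	struct {
		unsigned char jump;
		int32_t       offset;
	} __attribute__((packed));
};

int main(void)
{
	uint64_t site   = 0x400100;  /* address of the patched instruction */
	uint64_t target = 0x400180;  /* jump destination */
	union jump_code code;

	code.jump   = JMP_REL32_OPCODE;
	/* rel32 is relative to the byte after the 5-byte instruction. */
	code.offset = (int32_t)(target - (site + JUMP_LABEL_NOP_SIZE));

	for (int i = 0; i < JUMP_LABEL_NOP_SIZE; i++)
		printf("%02x ", code.code[i]);   /* e9 7b 00 00 00 */
	printf("\n");
	return 0;
}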
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index dc1404bf8e4..90fcf62854b 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -68,9 +68,16 @@ static ssize_t setup_data_read(struct file *file, char __user *user_buf,
68 return count; 68 return count;
69} 69}
70 70
71static int setup_data_open(struct inode *inode, struct file *file)
72{
73 file->private_data = inode->i_private;
74
75 return 0;
76}
77
71static const struct file_operations fops_setup_data = { 78static const struct file_operations fops_setup_data = {
72 .read = setup_data_read, 79 .read = setup_data_read,
73 .open = simple_open, 80 .open = setup_data_open,
74 .llseek = default_llseek, 81 .llseek = default_llseek,
75}; 82};
76 83
@@ -107,7 +114,7 @@ static int __init create_setup_data_nodes(struct dentry *parent)
107{ 114{
108 struct setup_data_node *node; 115 struct setup_data_node *node;
109 struct setup_data *data; 116 struct setup_data *data;
110 int error; 117 int error = -ENOMEM;
111 struct dentry *d; 118 struct dentry *d;
112 struct page *pg; 119 struct page *pg;
113 u64 pa_data; 120 u64 pa_data;
@@ -121,10 +128,8 @@ static int __init create_setup_data_nodes(struct dentry *parent)
121 128
122 while (pa_data) { 129 while (pa_data) {
123 node = kmalloc(sizeof(*node), GFP_KERNEL); 130 node = kmalloc(sizeof(*node), GFP_KERNEL);
124 if (!node) { 131 if (!node)
125 error = -ENOMEM;
126 goto err_dir; 132 goto err_dir;
127 }
128 133
129 pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT); 134 pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT);
130 if (PageHighMem(pg)) { 135 if (PageHighMem(pg)) {
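The kdebugfs hunk above only moves where the -ENOMEM value is assigned inside a goto-based error path. The underlying pattern, one error variable funnelled through a single set of exit labels, in a self-contained form (structure and names are made up for illustration):

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

struct node { int dummy; };

static int create_nodes(int count)
{
	struct node **nodes;
	int error = -ENOMEM;    /* default, as the new side of the hunk does */
	int i;

	nodes = calloc(count, sizeof(*nodes));
	if (!nodes)
		goto err;

	for (i = 0; i < count; i++) {
		nodes[i] = malloc(sizeof(*nodes[i]));
		if (!nodes[i])
			goto err_free;   /* error already holds -ENOMEM */
	}

	/* Success: nothing to keep in this toy, so release everything. */
	for (i = 0; i < count; i++)
		free(nodes[i]);
	free(nodes);
	return 0;

err_free:
	while (i-- > 0)
		free(nodes[i]);
	free(nodes);
err:
	return error;
}

int main(void)
{
	printf("create_nodes: %d\n", create_nodes(4));
	return 0;
}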
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 836f8322960..00354d4919a 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -43,11 +43,10 @@
43#include <linux/smp.h> 43#include <linux/smp.h>
44#include <linux/nmi.h> 44#include <linux/nmi.h>
45#include <linux/hw_breakpoint.h> 45#include <linux/hw_breakpoint.h>
46#include <linux/uaccess.h>
47#include <linux/memory.h>
48 46
49#include <asm/debugreg.h> 47#include <asm/debugreg.h>
50#include <asm/apicdef.h> 48#include <asm/apicdef.h>
49#include <asm/system.h>
51#include <asm/apic.h> 50#include <asm/apic.h>
52#include <asm/nmi.h> 51#include <asm/nmi.h>
53 52
@@ -68,6 +67,8 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
68 { "ss", 4, offsetof(struct pt_regs, ss) }, 67 { "ss", 4, offsetof(struct pt_regs, ss) },
69 { "ds", 4, offsetof(struct pt_regs, ds) }, 68 { "ds", 4, offsetof(struct pt_regs, ds) },
70 { "es", 4, offsetof(struct pt_regs, es) }, 69 { "es", 4, offsetof(struct pt_regs, es) },
70 { "fs", 4, -1 },
71 { "gs", 4, -1 },
71#else 72#else
72 { "ax", 8, offsetof(struct pt_regs, ax) }, 73 { "ax", 8, offsetof(struct pt_regs, ax) },
73 { "bx", 8, offsetof(struct pt_regs, bx) }, 74 { "bx", 8, offsetof(struct pt_regs, bx) },
@@ -89,11 +90,7 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
89 { "flags", 4, offsetof(struct pt_regs, flags) }, 90 { "flags", 4, offsetof(struct pt_regs, flags) },
90 { "cs", 4, offsetof(struct pt_regs, cs) }, 91 { "cs", 4, offsetof(struct pt_regs, cs) },
91 { "ss", 4, offsetof(struct pt_regs, ss) }, 92 { "ss", 4, offsetof(struct pt_regs, ss) },
92 { "ds", 4, -1 },
93 { "es", 4, -1 },
94#endif 93#endif
95 { "fs", 4, -1 },
96 { "gs", 4, -1 },
97}; 94};
98 95
99int dbg_set_reg(int regno, void *mem, struct pt_regs *regs) 96int dbg_set_reg(int regno, void *mem, struct pt_regs *regs)
@@ -444,12 +441,12 @@ void kgdb_roundup_cpus(unsigned long flags)
444 441
445/** 442/**
446 * kgdb_arch_handle_exception - Handle architecture specific GDB packets. 443 * kgdb_arch_handle_exception - Handle architecture specific GDB packets.
447 * @e_vector: The error vector of the exception that happened. 444 * @vector: The error vector of the exception that happened.
448 * @signo: The signal number of the exception that happened. 445 * @signo: The signal number of the exception that happened.
449 * @err_code: The error code of the exception that happened. 446 * @err_code: The error code of the exception that happened.
450 * @remcomInBuffer: The buffer of the packet we have read. 447 * @remcom_in_buffer: The buffer of the packet we have read.
451 * @remcomOutBuffer: The buffer of %BUFMAX bytes to write a packet into. 448 * @remcom_out_buffer: The buffer of %BUFMAX bytes to write a packet into.
452 * @linux_regs: The &struct pt_regs of the current process. 449 * @regs: The &struct pt_regs of the current process.
453 * 450 *
454 * This function MUST handle the 'c' and 's' command packets, 451 * This function MUST handle the 'c' and 's' command packets,
455 * as well packets to set / remove a hardware breakpoint, if used. 452 * as well packets to set / remove a hardware breakpoint, if used.
@@ -514,37 +511,28 @@ single_step_cont(struct pt_regs *regs, struct die_args *args)
514 511
515static int was_in_debug_nmi[NR_CPUS]; 512static int was_in_debug_nmi[NR_CPUS];
516 513
517static int kgdb_nmi_handler(unsigned int cmd, struct pt_regs *regs) 514static int __kgdb_notify(struct die_args *args, unsigned long cmd)
518{ 515{
516 struct pt_regs *regs = args->regs;
517
519 switch (cmd) { 518 switch (cmd) {
520 case NMI_LOCAL: 519 case DIE_NMI:
521 if (atomic_read(&kgdb_active) != -1) { 520 if (atomic_read(&kgdb_active) != -1) {
522 /* KGDB CPU roundup */ 521 /* KGDB CPU roundup */
523 kgdb_nmicallback(raw_smp_processor_id(), regs); 522 kgdb_nmicallback(raw_smp_processor_id(), regs);
524 was_in_debug_nmi[raw_smp_processor_id()] = 1; 523 was_in_debug_nmi[raw_smp_processor_id()] = 1;
525 touch_nmi_watchdog(); 524 touch_nmi_watchdog();
526 return NMI_HANDLED; 525 return NOTIFY_STOP;
527 } 526 }
528 break; 527 return NOTIFY_DONE;
529 528
530 case NMI_UNKNOWN: 529 case DIE_NMIUNKNOWN:
531 if (was_in_debug_nmi[raw_smp_processor_id()]) { 530 if (was_in_debug_nmi[raw_smp_processor_id()]) {
532 was_in_debug_nmi[raw_smp_processor_id()] = 0; 531 was_in_debug_nmi[raw_smp_processor_id()] = 0;
533 return NMI_HANDLED; 532 return NOTIFY_STOP;
534 } 533 }
535 break; 534 return NOTIFY_DONE;
536 default:
537 /* do nothing */
538 break;
539 }
540 return NMI_DONE;
541}
542
543static int __kgdb_notify(struct die_args *args, unsigned long cmd)
544{
545 struct pt_regs *regs = args->regs;
546 535
547 switch (cmd) {
548 case DIE_DEBUG: 536 case DIE_DEBUG:
549 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) { 537 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
550 if (user_mode(regs)) 538 if (user_mode(regs))
@@ -602,6 +590,11 @@ kgdb_notify(struct notifier_block *self, unsigned long cmd, void *ptr)
602 590
603static struct notifier_block kgdb_notifier = { 591static struct notifier_block kgdb_notifier = {
604 .notifier_call = kgdb_notify, 592 .notifier_call = kgdb_notify,
593
594 /*
595 * Lowest-prio notifier priority, we want to be notified last:
596 */
597 .priority = NMI_LOCAL_LOW_PRIOR,
605}; 598};
606 599
607/** 600/**
@@ -612,31 +605,7 @@ static struct notifier_block kgdb_notifier = {
612 */ 605 */
613int kgdb_arch_init(void) 606int kgdb_arch_init(void)
614{ 607{
615 int retval; 608 return register_die_notifier(&kgdb_notifier);
616
617 retval = register_die_notifier(&kgdb_notifier);
618 if (retval)
619 goto out;
620
621 retval = register_nmi_handler(NMI_LOCAL, kgdb_nmi_handler,
622 0, "kgdb");
623 if (retval)
624 goto out1;
625
626 retval = register_nmi_handler(NMI_UNKNOWN, kgdb_nmi_handler,
627 0, "kgdb");
628
629 if (retval)
630 goto out2;
631
632 return retval;
633
634out2:
635 unregister_nmi_handler(NMI_LOCAL, "kgdb");
636out1:
637 unregister_die_notifier(&kgdb_notifier);
638out:
639 return retval;
640} 609}
641 610
642static void kgdb_hw_overflow_handler(struct perf_event *event, 611static void kgdb_hw_overflow_handler(struct perf_event *event,
@@ -704,8 +673,6 @@ void kgdb_arch_exit(void)
704 breakinfo[i].pev = NULL; 673 breakinfo[i].pev = NULL;
705 } 674 }
706 } 675 }
707 unregister_nmi_handler(NMI_UNKNOWN, "kgdb");
708 unregister_nmi_handler(NMI_LOCAL, "kgdb");
709 unregister_die_notifier(&kgdb_notifier); 676 unregister_die_notifier(&kgdb_notifier);
710} 677}
711 678
@@ -743,66 +710,6 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip)
743 regs->ip = ip; 710 regs->ip = ip;
744} 711}
745 712
746int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
747{
748 int err;
749#ifdef CONFIG_DEBUG_RODATA
750 char opc[BREAK_INSTR_SIZE];
751#endif /* CONFIG_DEBUG_RODATA */
752
753 bpt->type = BP_BREAKPOINT;
754 err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr,
755 BREAK_INSTR_SIZE);
756 if (err)
757 return err;
758 err = probe_kernel_write((char *)bpt->bpt_addr,
759 arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE);
760#ifdef CONFIG_DEBUG_RODATA
761 if (!err)
762 return err;
763 /*
764 * It is safe to call text_poke() because normal kernel execution
765 * is stopped on all cores, so long as the text_mutex is not locked.
766 */
767 if (mutex_is_locked(&text_mutex))
768 return -EBUSY;
769 text_poke((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr,
770 BREAK_INSTR_SIZE);
771 err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE);
772 if (err)
773 return err;
774 if (memcmp(opc, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE))
775 return -EINVAL;
776 bpt->type = BP_POKE_BREAKPOINT;
777#endif /* CONFIG_DEBUG_RODATA */
778 return err;
779}
780
781int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
782{
783#ifdef CONFIG_DEBUG_RODATA
784 int err;
785 char opc[BREAK_INSTR_SIZE];
786
787 if (bpt->type != BP_POKE_BREAKPOINT)
788 goto knl_write;
789 /*
790 * It is safe to call text_poke() because normal kernel execution
791 * is stopped on all cores, so long as the text_mutex is not locked.
792 */
793 if (mutex_is_locked(&text_mutex))
794 goto knl_write;
795 text_poke((void *)bpt->bpt_addr, bpt->saved_instr, BREAK_INSTR_SIZE);
796 err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE);
797 if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE))
798 goto knl_write;
799 return err;
800knl_write:
801#endif /* CONFIG_DEBUG_RODATA */
802 return probe_kernel_write((char *)bpt->bpt_addr,
803 (char *)bpt->saved_instr, BREAK_INSTR_SIZE);
804}
805
806struct kgdb_arch arch_kgdb_ops = { 713struct kgdb_arch arch_kgdb_ops = {
807 /* Breakpoint instruction: */ 714 /* Breakpoint instruction: */
808 .gdb_bpt_instr = { 0xcc }, 715 .gdb_bpt_instr = { 0xcc },
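The dbg_reg_def[] entries edited above pair a register name with its byte offset inside struct pt_regs, using -1 for registers the saved frame does not hold. The same offsetof()-based table technique over a stand-in register frame (names and layout are illustrative):

#include <stdio.h>
#include <stddef.h>

/* Stand-in for struct pt_regs. */
struct regs {
	unsigned long ax;
	unsigned long bx;
	unsigned long ip;
};

struct reg_def {
	const char *name;
	int         size;
	int         offset;   /* -1 means "not present in the frame" */
};

static const struct reg_def reg_def[] = {
	{ "ax", sizeof(unsigned long), offsetof(struct regs, ax) },
	{ "bx", sizeof(unsigned long), offsetof(struct regs, bx) },
	{ "ip", sizeof(unsigned long), offsetof(struct regs, ip) },
	{ "fs", 4, -1 },   /* like the kgdb table: not saved in the frame */
};

int main(void)
{
	for (size_t i = 0; i < sizeof(reg_def) / sizeof(reg_def[0]); i++)
		printf("%-3s size=%d offset=%d\n",
		       reg_def[i].name, reg_def[i].size, reg_def[i].offset);
	return 0;
}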
diff --git a/arch/x86/kernel/kprobes-common.h b/arch/x86/kernel/kprobes-common.h
deleted file mode 100644
index 3230b68ef29..00000000000
--- a/arch/x86/kernel/kprobes-common.h
+++ /dev/null
@@ -1,102 +0,0 @@
1#ifndef __X86_KERNEL_KPROBES_COMMON_H
2#define __X86_KERNEL_KPROBES_COMMON_H
3
4/* Kprobes and Optprobes common header */
5
6#ifdef CONFIG_X86_64
7#define SAVE_REGS_STRING \
8 /* Skip cs, ip, orig_ax. */ \
9 " subq $24, %rsp\n" \
10 " pushq %rdi\n" \
11 " pushq %rsi\n" \
12 " pushq %rdx\n" \
13 " pushq %rcx\n" \
14 " pushq %rax\n" \
15 " pushq %r8\n" \
16 " pushq %r9\n" \
17 " pushq %r10\n" \
18 " pushq %r11\n" \
19 " pushq %rbx\n" \
20 " pushq %rbp\n" \
21 " pushq %r12\n" \
22 " pushq %r13\n" \
23 " pushq %r14\n" \
24 " pushq %r15\n"
25#define RESTORE_REGS_STRING \
26 " popq %r15\n" \
27 " popq %r14\n" \
28 " popq %r13\n" \
29 " popq %r12\n" \
30 " popq %rbp\n" \
31 " popq %rbx\n" \
32 " popq %r11\n" \
33 " popq %r10\n" \
34 " popq %r9\n" \
35 " popq %r8\n" \
36 " popq %rax\n" \
37 " popq %rcx\n" \
38 " popq %rdx\n" \
39 " popq %rsi\n" \
40 " popq %rdi\n" \
41 /* Skip orig_ax, ip, cs */ \
42 " addq $24, %rsp\n"
43#else
44#define SAVE_REGS_STRING \
45 /* Skip cs, ip, orig_ax and gs. */ \
46 " subl $16, %esp\n" \
47 " pushl %fs\n" \
48 " pushl %es\n" \
49 " pushl %ds\n" \
50 " pushl %eax\n" \
51 " pushl %ebp\n" \
52 " pushl %edi\n" \
53 " pushl %esi\n" \
54 " pushl %edx\n" \
55 " pushl %ecx\n" \
56 " pushl %ebx\n"
57#define RESTORE_REGS_STRING \
58 " popl %ebx\n" \
59 " popl %ecx\n" \
60 " popl %edx\n" \
61 " popl %esi\n" \
62 " popl %edi\n" \
63 " popl %ebp\n" \
64 " popl %eax\n" \
65 /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\
66 " addl $24, %esp\n"
67#endif
68
 69/* Check whether the instruction can be boosted */
70extern int can_boost(kprobe_opcode_t *instruction);
71/* Recover instruction if given address is probed */
72extern unsigned long recover_probed_instruction(kprobe_opcode_t *buf,
73 unsigned long addr);
74/*
75 * Copy an instruction and adjust the displacement if the instruction
76 * uses the %rip-relative addressing mode.
77 */
78extern int __copy_instruction(u8 *dest, u8 *src);
79
80/* Generate a relative-jump/call instruction */
81extern void synthesize_reljump(void *from, void *to);
82extern void synthesize_relcall(void *from, void *to);
83
84#ifdef CONFIG_OPTPROBES
85extern int arch_init_optprobes(void);
86extern int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter);
87extern unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr);
88#else /* !CONFIG_OPTPROBES */
89static inline int arch_init_optprobes(void)
90{
91 return 0;
92}
93static inline int setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
94{
95 return 0;
96}
97static inline unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr)
98{
99 return addr;
100}
101#endif
102#endif
diff --git a/arch/x86/kernel/kprobes-opt.c b/arch/x86/kernel/kprobes-opt.c
deleted file mode 100644
index c5e410eed40..00000000000
--- a/arch/x86/kernel/kprobes-opt.c
+++ /dev/null
@@ -1,512 +0,0 @@
1/*
2 * Kernel Probes Jump Optimization (Optprobes)
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2002, 2004
19 * Copyright (C) Hitachi Ltd., 2012
20 */
21#include <linux/kprobes.h>
22#include <linux/ptrace.h>
23#include <linux/string.h>
24#include <linux/slab.h>
25#include <linux/hardirq.h>
26#include <linux/preempt.h>
27#include <linux/module.h>
28#include <linux/kdebug.h>
29#include <linux/kallsyms.h>
30#include <linux/ftrace.h>
31
32#include <asm/cacheflush.h>
33#include <asm/desc.h>
34#include <asm/pgtable.h>
35#include <asm/uaccess.h>
36#include <asm/alternative.h>
37#include <asm/insn.h>
38#include <asm/debugreg.h>
39
40#include "kprobes-common.h"
41
42unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, unsigned long addr)
43{
44 struct optimized_kprobe *op;
45 struct kprobe *kp;
46 long offs;
47 int i;
48
49 for (i = 0; i < RELATIVEJUMP_SIZE; i++) {
50 kp = get_kprobe((void *)addr - i);
51 /* This function only handles jump-optimized kprobe */
52 if (kp && kprobe_optimized(kp)) {
53 op = container_of(kp, struct optimized_kprobe, kp);
54 /* If op->list is not empty, op is under optimizing */
55 if (list_empty(&op->list))
56 goto found;
57 }
58 }
59
60 return addr;
61found:
62 /*
 63 * If the kprobe has been optimized, the original bytes may have
 64 * been overwritten by the jump destination address. In that case,
 65 * they must be recovered from the op->optinsn.copied_insn buffer.
66 */
67 memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
68 if (addr == (unsigned long)kp->addr) {
69 buf[0] = kp->opcode;
70 memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
71 } else {
72 offs = addr - (unsigned long)kp->addr - 1;
73 memcpy(buf, op->optinsn.copied_insn + offs, RELATIVE_ADDR_SIZE - offs);
74 }
75
76 return (unsigned long)buf;
77}
78
79/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
80static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr, unsigned long val)
81{
82#ifdef CONFIG_X86_64
83 *addr++ = 0x48;
84 *addr++ = 0xbf;
85#else
86 *addr++ = 0xb8;
87#endif
88 *(unsigned long *)addr = val;
89}
90
91static void __used __kprobes kprobes_optinsn_template_holder(void)
92{
93 asm volatile (
94 ".global optprobe_template_entry\n"
95 "optprobe_template_entry:\n"
96#ifdef CONFIG_X86_64
97 /* We don't bother saving the ss register */
98 " pushq %rsp\n"
99 " pushfq\n"
100 SAVE_REGS_STRING
101 " movq %rsp, %rsi\n"
102 ".global optprobe_template_val\n"
103 "optprobe_template_val:\n"
104 ASM_NOP5
105 ASM_NOP5
106 ".global optprobe_template_call\n"
107 "optprobe_template_call:\n"
108 ASM_NOP5
109 /* Move flags to rsp */
110 " movq 144(%rsp), %rdx\n"
111 " movq %rdx, 152(%rsp)\n"
112 RESTORE_REGS_STRING
113 /* Skip flags entry */
114 " addq $8, %rsp\n"
115 " popfq\n"
116#else /* CONFIG_X86_32 */
117 " pushf\n"
118 SAVE_REGS_STRING
119 " movl %esp, %edx\n"
120 ".global optprobe_template_val\n"
121 "optprobe_template_val:\n"
122 ASM_NOP5
123 ".global optprobe_template_call\n"
124 "optprobe_template_call:\n"
125 ASM_NOP5
126 RESTORE_REGS_STRING
127 " addl $4, %esp\n" /* skip cs */
128 " popf\n"
129#endif
130 ".global optprobe_template_end\n"
131 "optprobe_template_end:\n");
132}
133
134#define TMPL_MOVE_IDX \
135 ((long)&optprobe_template_val - (long)&optprobe_template_entry)
136#define TMPL_CALL_IDX \
137 ((long)&optprobe_template_call - (long)&optprobe_template_entry)
138#define TMPL_END_IDX \
139 ((long)&optprobe_template_end - (long)&optprobe_template_entry)
140
141#define INT3_SIZE sizeof(kprobe_opcode_t)
142
143/* Optimized kprobe call back function: called from optinsn */
144static void __kprobes optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
145{
146 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
147 unsigned long flags;
148
149 /* This is possible if op is under delayed unoptimizing */
150 if (kprobe_disabled(&op->kp))
151 return;
152
153 local_irq_save(flags);
154 if (kprobe_running()) {
155 kprobes_inc_nmissed_count(&op->kp);
156 } else {
157 /* Save skipped registers */
158#ifdef CONFIG_X86_64
159 regs->cs = __KERNEL_CS;
160#else
161 regs->cs = __KERNEL_CS | get_kernel_rpl();
162 regs->gs = 0;
163#endif
164 regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
165 regs->orig_ax = ~0UL;
166
167 __this_cpu_write(current_kprobe, &op->kp);
168 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
169 opt_pre_handler(&op->kp, regs);
170 __this_cpu_write(current_kprobe, NULL);
171 }
172 local_irq_restore(flags);
173}
174
175static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
176{
177 int len = 0, ret;
178
179 while (len < RELATIVEJUMP_SIZE) {
180 ret = __copy_instruction(dest + len, src + len);
181 if (!ret || !can_boost(dest + len))
182 return -EINVAL;
183 len += ret;
184 }
185 /* Check whether the address range is reserved */
186 if (ftrace_text_reserved(src, src + len - 1) ||
187 alternatives_text_reserved(src, src + len - 1) ||
188 jump_label_text_reserved(src, src + len - 1))
189 return -EBUSY;
190
191 return len;
192}
193
194/* Check whether insn is indirect jump */
195static int __kprobes insn_is_indirect_jump(struct insn *insn)
196{
197 return ((insn->opcode.bytes[0] == 0xff &&
198 (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
199 insn->opcode.bytes[0] == 0xea); /* Segment based jump */
200}
201
202/* Check whether insn jumps into specified address range */
203static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
204{
205 unsigned long target = 0;
206
207 switch (insn->opcode.bytes[0]) {
208 case 0xe0: /* loopne */
209 case 0xe1: /* loope */
210 case 0xe2: /* loop */
211 case 0xe3: /* jcxz */
212 case 0xe9: /* near relative jump */
213 case 0xeb: /* short relative jump */
214 break;
215 case 0x0f:
216 if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */
217 break;
218 return 0;
219 default:
220 if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */
221 break;
222 return 0;
223 }
224 target = (unsigned long)insn->next_byte + insn->immediate.value;
225
226 return (start <= target && target <= start + len);
227}
228
229/* Decode whole function to ensure any instructions don't jump into target */
230static int __kprobes can_optimize(unsigned long paddr)
231{
232 unsigned long addr, size = 0, offset = 0;
233 struct insn insn;
234 kprobe_opcode_t buf[MAX_INSN_SIZE];
235
236 /* Lookup symbol including addr */
237 if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
238 return 0;
239
240 /*
241 * Do not optimize in the entry code due to the unstable
242 * stack handling.
243 */
244 if ((paddr >= (unsigned long)__entry_text_start) &&
245 (paddr < (unsigned long)__entry_text_end))
246 return 0;
247
248 /* Check there is enough space for a relative jump. */
249 if (size - offset < RELATIVEJUMP_SIZE)
250 return 0;
251
252 /* Decode instructions */
253 addr = paddr - offset;
254 while (addr < paddr - offset + size) { /* Decode until function end */
255 if (search_exception_tables(addr))
256 /*
 257 * Since some fixup code will jump into this function,
 258 * we can't optimize kprobes in this function.
259 */
260 return 0;
261 kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, addr));
262 insn_get_length(&insn);
263 /* Another subsystem puts a breakpoint */
264 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
265 return 0;
266 /* Recover address */
267 insn.kaddr = (void *)addr;
268 insn.next_byte = (void *)(addr + insn.length);
269 /* Check any instructions don't jump into target */
270 if (insn_is_indirect_jump(&insn) ||
271 insn_jump_into_range(&insn, paddr + INT3_SIZE,
272 RELATIVE_ADDR_SIZE))
273 return 0;
274 addr += insn.length;
275 }
276
277 return 1;
278}
279
280/* Check optimized_kprobe can actually be optimized. */
281int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op)
282{
283 int i;
284 struct kprobe *p;
285
286 for (i = 1; i < op->optinsn.size; i++) {
287 p = get_kprobe(op->kp.addr + i);
288 if (p && !kprobe_disabled(p))
289 return -EEXIST;
290 }
291
292 return 0;
293}
294
295/* Check the addr is within the optimized instructions. */
296int __kprobes
297arch_within_optimized_kprobe(struct optimized_kprobe *op, unsigned long addr)
298{
299 return ((unsigned long)op->kp.addr <= addr &&
300 (unsigned long)op->kp.addr + op->optinsn.size > addr);
301}
302
303/* Free optimized instruction slot */
304static __kprobes
305void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
306{
307 if (op->optinsn.insn) {
308 free_optinsn_slot(op->optinsn.insn, dirty);
309 op->optinsn.insn = NULL;
310 op->optinsn.size = 0;
311 }
312}
313
314void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op)
315{
316 __arch_remove_optimized_kprobe(op, 1);
317}
318
319/*
320 * Copy replacing target instructions
321 * Target instructions MUST be relocatable (checked inside)
322 * This is called when new aggr(opt)probe is allocated or reused.
323 */
324int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
325{
326 u8 *buf;
327 int ret;
328 long rel;
329
330 if (!can_optimize((unsigned long)op->kp.addr))
331 return -EILSEQ;
332
333 op->optinsn.insn = get_optinsn_slot();
334 if (!op->optinsn.insn)
335 return -ENOMEM;
336
337 /*
338 * Verify if the address gap is in 2GB range, because this uses
339 * a relative jump.
340 */
341 rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE;
342 if (abs(rel) > 0x7fffffff)
343 return -ERANGE;
344
345 buf = (u8 *)op->optinsn.insn;
346
347 /* Copy instructions into the out-of-line buffer */
348 ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr);
349 if (ret < 0) {
350 __arch_remove_optimized_kprobe(op, 0);
351 return ret;
352 }
353 op->optinsn.size = ret;
354
355 /* Copy arch-dep-instance from template */
356 memcpy(buf, &optprobe_template_entry, TMPL_END_IDX);
357
358 /* Set probe information */
359 synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);
360
361 /* Set probe function call */
362 synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback);
363
364 /* Set returning jmp instruction at the tail of out-of-line buffer */
365 synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size,
366 (u8 *)op->kp.addr + op->optinsn.size);
367
368 flush_icache_range((unsigned long) buf,
369 (unsigned long) buf + TMPL_END_IDX +
370 op->optinsn.size + RELATIVEJUMP_SIZE);
371 return 0;
372}
373
374#define MAX_OPTIMIZE_PROBES 256
375static struct text_poke_param *jump_poke_params;
376static struct jump_poke_buffer {
377 u8 buf[RELATIVEJUMP_SIZE];
378} *jump_poke_bufs;
379
380static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm,
381 u8 *insn_buf,
382 struct optimized_kprobe *op)
383{
384 s32 rel = (s32)((long)op->optinsn.insn -
385 ((long)op->kp.addr + RELATIVEJUMP_SIZE));
386
387 /* Backup instructions which will be replaced by jump address */
388 memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
389 RELATIVE_ADDR_SIZE);
390
391 insn_buf[0] = RELATIVEJUMP_OPCODE;
392 *(s32 *)(&insn_buf[1]) = rel;
393
394 tprm->addr = op->kp.addr;
395 tprm->opcode = insn_buf;
396 tprm->len = RELATIVEJUMP_SIZE;
397}
398
399/*
400 * Replace breakpoints (int3) with relative jumps.
401 * Caller must call with locking kprobe_mutex and text_mutex.
402 */
403void __kprobes arch_optimize_kprobes(struct list_head *oplist)
404{
405 struct optimized_kprobe *op, *tmp;
406 int c = 0;
407
408 list_for_each_entry_safe(op, tmp, oplist, list) {
409 WARN_ON(kprobe_disabled(&op->kp));
410 /* Setup param */
411 setup_optimize_kprobe(&jump_poke_params[c],
412 jump_poke_bufs[c].buf, op);
413 list_del_init(&op->list);
414 if (++c >= MAX_OPTIMIZE_PROBES)
415 break;
416 }
417
418 /*
 419 * text_poke_smp doesn't support modifying code from NMI/MCE
 420 * context. However, since kprobes itself doesn't support probing
 421 * code from NMI/MCE context either, this is not a problem.
422 */
423 text_poke_smp_batch(jump_poke_params, c);
424}
425
426static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm,
427 u8 *insn_buf,
428 struct optimized_kprobe *op)
429{
430 /* Set int3 to first byte for kprobes */
431 insn_buf[0] = BREAKPOINT_INSTRUCTION;
432 memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
433
434 tprm->addr = op->kp.addr;
435 tprm->opcode = insn_buf;
436 tprm->len = RELATIVEJUMP_SIZE;
437}
438
439/*
440 * Recover original instructions and breakpoints from relative jumps.
441 * Caller must call with locking kprobe_mutex.
442 */
443extern void arch_unoptimize_kprobes(struct list_head *oplist,
444 struct list_head *done_list)
445{
446 struct optimized_kprobe *op, *tmp;
447 int c = 0;
448
449 list_for_each_entry_safe(op, tmp, oplist, list) {
450 /* Setup param */
451 setup_unoptimize_kprobe(&jump_poke_params[c],
452 jump_poke_bufs[c].buf, op);
453 list_move(&op->list, done_list);
454 if (++c >= MAX_OPTIMIZE_PROBES)
455 break;
456 }
457
458 /*
 459 * text_poke_smp doesn't support modifying code from NMI/MCE
 460 * context. However, since kprobes itself doesn't support probing
 461 * code from NMI/MCE context either, this is not a problem.
462 */
463 text_poke_smp_batch(jump_poke_params, c);
464}
465
466/* Replace a relative jump with a breakpoint (int3). */
467void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op)
468{
469 u8 buf[RELATIVEJUMP_SIZE];
470
471 /* Set int3 to first byte for kprobes */
472 buf[0] = BREAKPOINT_INSTRUCTION;
473 memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
474 text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE);
475}
476
477int __kprobes
478setup_detour_execution(struct kprobe *p, struct pt_regs *regs, int reenter)
479{
480 struct optimized_kprobe *op;
481
482 if (p->flags & KPROBE_FLAG_OPTIMIZED) {
483 /* This kprobe is really able to run optimized path. */
484 op = container_of(p, struct optimized_kprobe, kp);
485 /* Detour through copied instructions */
486 regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
487 if (!reenter)
488 reset_current_kprobe();
489 preempt_enable_no_resched();
490 return 1;
491 }
492 return 0;
493}
494
495int __kprobes arch_init_optprobes(void)
496{
497 /* Allocate code buffer and parameter array */
498 jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) *
499 MAX_OPTIMIZE_PROBES, GFP_KERNEL);
500 if (!jump_poke_bufs)
501 return -ENOMEM;
502
503 jump_poke_params = kmalloc(sizeof(struct text_poke_param) *
504 MAX_OPTIMIZE_PROBES, GFP_KERNEL);
505 if (!jump_poke_params) {
506 kfree(jump_poke_bufs);
507 jump_poke_bufs = NULL;
508 return -ENOMEM;
509 }
510
511 return 0;
512}
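recover_probed_instruction() and __recover_optprobed_insn() in the kprobes code above and below share one trick: copy MAX_INSN_SIZE bytes of probed text into a scratch buffer and write the saved opcode back over the breakpoint byte, so a decoder sees the original instruction rather than int3. A detached sketch of that buffer patch (bytes are illustrative, no real probe is involved):

#include <stdio.h>
#include <string.h>

#define MAX_INSN_SIZE 16
#define INT3          0xcc

int main(void)
{
	/* Pretend this is live text that a probe has already patched:
	 * the first byte of "mov $0x12345678, %eax" was replaced by int3. */
	unsigned char text[MAX_INSN_SIZE] = { INT3, 0x78, 0x56, 0x34, 0x12 };
	unsigned char saved_opcode = 0xb8;   /* the byte the probe displaced */
	unsigned char buf[MAX_INSN_SIZE];

	/* Copy, then restore byte 0 in the copy only; the live text keeps
	 * its breakpoint, exactly as the kernel helpers do. */
	memcpy(buf, text, MAX_INSN_SIZE);
	buf[0] = saved_opcode;

	for (int i = 0; i < 5; i++)
		printf("%02x ", buf[i]);     /* b8 78 56 34 12 */
	printf("\n");
	return 0;
}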
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 57916c0d3cf..794bc95134c 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -30,15 +30,16 @@
30 * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi 30 * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
31 * <prasanna@in.ibm.com> added function-return probes. 31 * <prasanna@in.ibm.com> added function-return probes.
32 * 2005-May Rusty Lynch <rusty.lynch@intel.com> 32 * 2005-May Rusty Lynch <rusty.lynch@intel.com>
33 * Added function return probes functionality 33 * Added function return probes functionality
34 * 2006-Feb Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp> added 34 * 2006-Feb Masami Hiramatsu <hiramatu@sdl.hitachi.co.jp> added
35 * kprobe-booster and kretprobe-booster for i386. 35 * kprobe-booster and kretprobe-booster for i386.
36 * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com> added kprobe-booster 36 * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com> added kprobe-booster
37 * and kretprobe-booster for x86-64 37 * and kretprobe-booster for x86-64
38 * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com>, Arjan van de Ven 38 * 2007-Dec Masami Hiramatsu <mhiramat@redhat.com>, Arjan van de Ven
39 * <arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com> 39 * <arjan@infradead.org> and Jim Keniston <jkenisto@us.ibm.com>
40 * unified x86 kprobes code. 40 * unified x86 kprobes code.
41 */ 41 */
42
42#include <linux/kprobes.h> 43#include <linux/kprobes.h>
43#include <linux/ptrace.h> 44#include <linux/ptrace.h>
44#include <linux/string.h> 45#include <linux/string.h>
@@ -58,8 +59,6 @@
58#include <asm/insn.h> 59#include <asm/insn.h>
59#include <asm/debugreg.h> 60#include <asm/debugreg.h>
60 61
61#include "kprobes-common.h"
62
63void jprobe_return_end(void); 62void jprobe_return_end(void);
64 63
65DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; 64DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
@@ -76,11 +75,10 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
76 /* 75 /*
77 * Undefined/reserved opcodes, conditional jump, Opcode Extension 76 * Undefined/reserved opcodes, conditional jump, Opcode Extension
78 * Groups, and some special opcodes can not boost. 77 * Groups, and some special opcodes can not boost.
79 * This is non-const and volatile to keep gcc from statically 78 * This is non-const to keep gcc from statically optimizing it out, as
80 * optimizing it out, as variable_test_bit makes gcc think only 79 * variable_test_bit makes gcc think only *(unsigned long*) is used.
81 * *(unsigned long*) is used.
82 */ 80 */
83static volatile u32 twobyte_is_boostable[256 / 32] = { 81static u32 twobyte_is_boostable[256 / 32] = {
84 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 82 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
85 /* ---------------------------------------------- */ 83 /* ---------------------------------------------- */
86 W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) | /* 00 */ 84 W(0x00, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0) | /* 00 */
@@ -109,7 +107,6 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = {
109 doesn't switch kernel stack.*/ 107 doesn't switch kernel stack.*/
110 {NULL, NULL} /* Terminator */ 108 {NULL, NULL} /* Terminator */
111}; 109};
112
113const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); 110const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
114 111
115static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op) 112static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)
@@ -125,17 +122,11 @@ static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)
125} 122}
126 123
127/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ 124/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
128void __kprobes synthesize_reljump(void *from, void *to) 125static void __kprobes synthesize_reljump(void *from, void *to)
129{ 126{
130 __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE); 127 __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE);
131} 128}
132 129
133/* Insert a call instruction at address 'from', which calls address 'to'.*/
134void __kprobes synthesize_relcall(void *from, void *to)
135{
136 __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);
137}
138
139/* 130/*
140 * Skip the prefixes of the instruction. 131 * Skip the prefixes of the instruction.
141 */ 132 */
@@ -159,7 +150,7 @@ static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn)
159 * Returns non-zero if opcode is boostable. 150 * Returns non-zero if opcode is boostable.
160 * RIP relative instructions are adjusted at copying time in 64 bits mode 151 * RIP relative instructions are adjusted at copying time in 64 bits mode
161 */ 152 */
162int __kprobes can_boost(kprobe_opcode_t *opcodes) 153static int __kprobes can_boost(kprobe_opcode_t *opcodes)
163{ 154{
164 kprobe_opcode_t opcode; 155 kprobe_opcode_t opcode;
165 kprobe_opcode_t *orig_opcodes = opcodes; 156 kprobe_opcode_t *orig_opcodes = opcodes;
@@ -215,15 +206,13 @@ retry:
215 } 206 }
216} 207}
217 208
218static unsigned long 209/* Recover the probed instruction at addr for further analysis. */
219__recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr) 210static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
220{ 211{
221 struct kprobe *kp; 212 struct kprobe *kp;
222
223 kp = get_kprobe((void *)addr); 213 kp = get_kprobe((void *)addr);
224 /* There is no probe, return original address */
225 if (!kp) 214 if (!kp)
226 return addr; 215 return -EINVAL;
227 216
228 /* 217 /*
229 * Basically, kp->ainsn.insn has an original instruction. 218 * Basically, kp->ainsn.insn has an original instruction.
@@ -240,29 +229,14 @@ __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr)
240 */ 229 */
241 memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); 230 memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
242 buf[0] = kp->opcode; 231 buf[0] = kp->opcode;
243 return (unsigned long)buf; 232 return 0;
244}
245
246/*
247 * Recover the probed instruction at addr for further analysis.
248 * Caller must lock kprobes by kprobe_mutex, or disable preemption
249 * for preventing to release referencing kprobes.
250 */
251unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
252{
253 unsigned long __addr;
254
255 __addr = __recover_optprobed_insn(buf, addr);
256 if (__addr != addr)
257 return __addr;
258
259 return __recover_probed_insn(buf, addr);
260} 233}
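
After this change recover_probed_instruction() returns 0 on success (or -EINVAL when no kprobe is registered at the address) and leaves the recovered instruction in the caller's buffer, with the first byte replaced by the opcode the int3 displaced. A hedged userspace sketch of that "patch the saved byte back into a copy" step, using an invented probe record in place of the kernel's kprobe lookup:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MAX_INSN_BYTES 16
#define INT3 0xcc

/* Stand-in for a registered probe: its address plus the displaced byte. */
struct fake_probe {
	const uint8_t *addr;
	uint8_t saved_opcode;
};

static int recover_instruction(uint8_t *buf, const struct fake_probe *kp,
			       const uint8_t *addr)
{
	if (!kp || kp->addr != addr)
		return -EINVAL;		/* nothing registered here */

	memcpy(buf, addr, MAX_INSN_BYTES);
	buf[0] = kp->saved_opcode;	/* undo the int3 in the copy only */
	return 0;
}

int main(void)
{
	uint8_t text[MAX_INSN_BYTES] = { INT3, 0x05, 0x01, 0x00, 0x00, 0x00 };
	struct fake_probe kp = { .addr = text, .saved_opcode = 0x83 };
	uint8_t buf[MAX_INSN_BYTES];

	if (!recover_instruction(buf, &kp, text))
		printf("recovered first byte: %#x\n", buf[0]);
	return 0;
}
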
261 234
262/* Check if paddr is at an instruction boundary */ 235/* Check if paddr is at an instruction boundary */
263static int __kprobes can_probe(unsigned long paddr) 236static int __kprobes can_probe(unsigned long paddr)
264{ 237{
265 unsigned long addr, __addr, offset = 0; 238 int ret;
239 unsigned long addr, offset = 0;
266 struct insn insn; 240 struct insn insn;
267 kprobe_opcode_t buf[MAX_INSN_SIZE]; 241 kprobe_opcode_t buf[MAX_INSN_SIZE];
268 242
@@ -272,24 +246,26 @@ static int __kprobes can_probe(unsigned long paddr)
272 /* Decode instructions */ 246 /* Decode instructions */
273 addr = paddr - offset; 247 addr = paddr - offset;
274 while (addr < paddr) { 248 while (addr < paddr) {
249 kernel_insn_init(&insn, (void *)addr);
250 insn_get_opcode(&insn);
251
275 /* 252 /*
276 * Check if the instruction has been modified by another 253 * Check if the instruction has been modified by another
277 * kprobe, in which case we replace the breakpoint by the 254 * kprobe, in which case we replace the breakpoint by the
278 * original instruction in our buffer. 255 * original instruction in our buffer.
279 * Also, jump optimization will change the breakpoint to
280 * relative-jump. Since the relative-jump itself is
281 * normally used, we just go through if there is no kprobe.
282 */ 256 */
283 __addr = recover_probed_instruction(buf, addr); 257 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
284 kernel_insn_init(&insn, (void *)__addr); 258 ret = recover_probed_instruction(buf, addr);
259 if (ret)
260 /*
261 * Another debugging subsystem might insert
262 * this breakpoint. In that case, we can't
263 * recover it.
264 */
265 return 0;
266 kernel_insn_init(&insn, buf);
267 }
285 insn_get_length(&insn); 268 insn_get_length(&insn);
286
287 /*
288 * Another debugging subsystem might insert this breakpoint.
289 * In that case, we can't recover it.
290 */
291 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
292 return 0;
293 addr += insn.length; 269 addr += insn.length;
294 } 270 }
295 271
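
can_probe() walks forward from the start of the containing symbol, advancing by each decoded instruction's length, and only accepts the probe if the walk lands exactly on paddr. The sketch below shows that boundary walk with a toy fixed-rule "decoder" standing in for the kernel's real x86 instruction decoder (kernel_insn_init()/insn_get_length()):

#include <stddef.h>
#include <stdio.h>

/* Toy decoder: pretend each leading byte encodes a 1..3 byte length. */
static size_t toy_insn_length(const unsigned char *p)
{
	return (*p % 3) + 1;
}

static int lands_on_boundary(const unsigned char *func, size_t func_len,
			     const unsigned char *probe)
{
	const unsigned char *addr = func;

	while (addr < probe && addr < func + func_len)
		addr += toy_insn_length(addr);

	return addr == probe;
}

int main(void)
{
	unsigned char code[8] = { 0, 1, 2, 0, 1, 2, 0, 1 };

	printf("offset 1 on a boundary? %d\n",
	       lands_on_boundary(code, sizeof(code), code + 1));
	printf("offset 2 on a boundary? %d\n",
	       lands_on_boundary(code, sizeof(code), code + 2));
	return 0;
}
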
@@ -322,16 +298,24 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
322 * If not, return null. 298 * If not, return null.
323 * Only applicable to 64-bit x86. 299 * Only applicable to 64-bit x86.
324 */ 300 */
325int __kprobes __copy_instruction(u8 *dest, u8 *src) 301static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover)
326{ 302{
327 struct insn insn; 303 struct insn insn;
304 int ret;
328 kprobe_opcode_t buf[MAX_INSN_SIZE]; 305 kprobe_opcode_t buf[MAX_INSN_SIZE];
329 306
330 kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, (unsigned long)src)); 307 kernel_insn_init(&insn, src);
308 if (recover) {
309 insn_get_opcode(&insn);
310 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
311 ret = recover_probed_instruction(buf,
312 (unsigned long)src);
313 if (ret)
314 return 0;
315 kernel_insn_init(&insn, buf);
316 }
317 }
331 insn_get_length(&insn); 318 insn_get_length(&insn);
332 /* Another subsystem puts a breakpoint, failed to recover */
333 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
334 return 0;
335 memcpy(dest, insn.kaddr, insn.length); 319 memcpy(dest, insn.kaddr, insn.length);
336 320
337#ifdef CONFIG_X86_64 321#ifdef CONFIG_X86_64
@@ -352,7 +336,8 @@ int __kprobes __copy_instruction(u8 *dest, u8 *src)
352 * extension of the original signed 32-bit displacement would 336 * extension of the original signed 32-bit displacement would
353 * have given. 337 * have given.
354 */ 338 */
355 newdisp = (u8 *) src + (s64) insn.displacement.value - (u8 *) dest; 339 newdisp = (u8 *) src + (s64) insn.displacement.value -
340 (u8 *) dest;
356 BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */ 341 BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */
357 disp = (u8 *) dest + insn_offset_displacement(&insn); 342 disp = (u8 *) dest + insn_offset_displacement(&insn);
358 *(s32 *) disp = (s32) newdisp; 343 *(s32 *) disp = (s32) newdisp;
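
On x86-64, copying an instruction that uses RIP-relative addressing to a different address requires recomputing its displacement so it still reaches the original target: newdisp = src + olddisp - dest, with the BUG_ON above asserting that the result still fits in a signed 32-bit field. A standalone sketch of that arithmetic with invented addresses:

#include <stdint.h>
#include <stdio.h>

/*
 * Recompute a RIP-relative displacement when an instruction originally at
 * 'src' is copied to 'dest'. Returns 0 and stores the new displacement if
 * the target is still within +/-2GB of the copy.
 */
static int adjust_riprel(uint64_t src, uint64_t dest, int32_t olddisp,
			 int32_t *newdisp)
{
	int64_t d = (int64_t)(src - dest) + olddisp;

	if ((int64_t)(int32_t)d != d)
		return -1;	/* out of rel32 reach after the move */
	*newdisp = (int32_t)d;
	return 0;
}

int main(void)
{
	int32_t nd;

	if (!adjust_riprel(0xffffffff81000000ULL, 0xffffffff81200000ULL,
			   0x1000, &nd))
		printf("new displacement: %d\n", nd);
	return 0;
}
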
@@ -363,20 +348,18 @@ int __kprobes __copy_instruction(u8 *dest, u8 *src)
363 348
364static void __kprobes arch_copy_kprobe(struct kprobe *p) 349static void __kprobes arch_copy_kprobe(struct kprobe *p)
365{ 350{
366 /* Copy an instruction with recovering if other optprobe modifies it.*/
367 __copy_instruction(p->ainsn.insn, p->addr);
368
369 /* 351 /*
370 * __copy_instruction can modify the displacement of the instruction, 352 * Copy an instruction without recovering int3, because it will be
371 * but it doesn't affect boostable check. 353 * put by another subsystem.
372 */ 354 */
373 if (can_boost(p->ainsn.insn)) 355 __copy_instruction(p->ainsn.insn, p->addr, 0);
356
357 if (can_boost(p->addr))
374 p->ainsn.boostable = 0; 358 p->ainsn.boostable = 0;
375 else 359 else
376 p->ainsn.boostable = -1; 360 p->ainsn.boostable = -1;
377 361
378 /* Also, displacement change doesn't affect the first byte */ 362 p->opcode = *p->addr;
379 p->opcode = p->ainsn.insn[0];
380} 363}
381 364
382int __kprobes arch_prepare_kprobe(struct kprobe *p) 365int __kprobes arch_prepare_kprobe(struct kprobe *p)
@@ -458,8 +441,8 @@ static void __kprobes restore_btf(void)
458 } 441 }
459} 442}
460 443
461void __kprobes 444void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
462arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) 445 struct pt_regs *regs)
463{ 446{
464 unsigned long *sara = stack_addr(regs); 447 unsigned long *sara = stack_addr(regs);
465 448
@@ -469,8 +452,16 @@ arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
469 *sara = (unsigned long) &kretprobe_trampoline; 452 *sara = (unsigned long) &kretprobe_trampoline;
470} 453}
471 454
472static void __kprobes 455#ifdef CONFIG_OPTPROBES
473setup_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb, int reenter) 456static int __kprobes setup_detour_execution(struct kprobe *p,
457 struct pt_regs *regs,
458 int reenter);
459#else
460#define setup_detour_execution(p, regs, reenter) (0)
461#endif
462
463static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
464 struct kprobe_ctlblk *kcb, int reenter)
474{ 465{
475 if (setup_detour_execution(p, regs, reenter)) 466 if (setup_detour_execution(p, regs, reenter))
476 return; 467 return;
@@ -512,8 +503,8 @@ setup_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *k
512 * within the handler. We save the original kprobes variables and just single 503 * within the handler. We save the original kprobes variables and just single
513 * step on the instruction of the new probe without calling any user handlers. 504 * step on the instruction of the new probe without calling any user handlers.
514 */ 505 */
515static int __kprobes 506static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
516reenter_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb) 507 struct kprobe_ctlblk *kcb)
517{ 508{
518 switch (kcb->kprobe_status) { 509 switch (kcb->kprobe_status) {
519 case KPROBE_HIT_SSDONE: 510 case KPROBE_HIT_SSDONE:
@@ -541,23 +532,6 @@ reenter_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb
541 return 1; 532 return 1;
542} 533}
543 534
544#ifdef KPROBES_CAN_USE_FTRACE
545static void __kprobes skip_singlestep(struct kprobe *p, struct pt_regs *regs,
546 struct kprobe_ctlblk *kcb)
547{
548 /*
549 * Emulate singlestep (and also recover regs->ip)
550 * as if there is a 5byte nop
551 */
552 regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE;
553 if (unlikely(p->post_handler)) {
554 kcb->kprobe_status = KPROBE_HIT_SSDONE;
555 p->post_handler(p, regs, 0);
556 }
557 __this_cpu_write(current_kprobe, NULL);
558}
559#endif
560
561/* 535/*
562 * Interrupts are disabled on entry as trap3 is an interrupt gate and they 536 * Interrupts are disabled on entry as trap3 is an interrupt gate and they
563 * remain disabled throughout this function. 537 * remain disabled throughout this function.
@@ -616,12 +590,6 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
616 } else if (kprobe_running()) { 590 } else if (kprobe_running()) {
617 p = __this_cpu_read(current_kprobe); 591 p = __this_cpu_read(current_kprobe);
618 if (p->break_handler && p->break_handler(p, regs)) { 592 if (p->break_handler && p->break_handler(p, regs)) {
619#ifdef KPROBES_CAN_USE_FTRACE
620 if (kprobe_ftrace(p)) {
621 skip_singlestep(p, regs, kcb);
622 return 1;
623 }
624#endif
625 setup_singlestep(p, regs, kcb, 0); 593 setup_singlestep(p, regs, kcb, 0);
626 return 1; 594 return 1;
627 } 595 }
@@ -631,6 +599,69 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
631 return 0; 599 return 0;
632} 600}
633 601
602#ifdef CONFIG_X86_64
603#define SAVE_REGS_STRING \
604 /* Skip cs, ip, orig_ax. */ \
605 " subq $24, %rsp\n" \
606 " pushq %rdi\n" \
607 " pushq %rsi\n" \
608 " pushq %rdx\n" \
609 " pushq %rcx\n" \
610 " pushq %rax\n" \
611 " pushq %r8\n" \
612 " pushq %r9\n" \
613 " pushq %r10\n" \
614 " pushq %r11\n" \
615 " pushq %rbx\n" \
616 " pushq %rbp\n" \
617 " pushq %r12\n" \
618 " pushq %r13\n" \
619 " pushq %r14\n" \
620 " pushq %r15\n"
621#define RESTORE_REGS_STRING \
622 " popq %r15\n" \
623 " popq %r14\n" \
624 " popq %r13\n" \
625 " popq %r12\n" \
626 " popq %rbp\n" \
627 " popq %rbx\n" \
628 " popq %r11\n" \
629 " popq %r10\n" \
630 " popq %r9\n" \
631 " popq %r8\n" \
632 " popq %rax\n" \
633 " popq %rcx\n" \
634 " popq %rdx\n" \
635 " popq %rsi\n" \
636 " popq %rdi\n" \
637 /* Skip orig_ax, ip, cs */ \
638 " addq $24, %rsp\n"
639#else
640#define SAVE_REGS_STRING \
641 /* Skip cs, ip, orig_ax and gs. */ \
642 " subl $16, %esp\n" \
643 " pushl %fs\n" \
644 " pushl %es\n" \
645 " pushl %ds\n" \
646 " pushl %eax\n" \
647 " pushl %ebp\n" \
648 " pushl %edi\n" \
649 " pushl %esi\n" \
650 " pushl %edx\n" \
651 " pushl %ecx\n" \
652 " pushl %ebx\n"
653#define RESTORE_REGS_STRING \
654 " popl %ebx\n" \
655 " popl %ecx\n" \
656 " popl %edx\n" \
657 " popl %esi\n" \
658 " popl %edi\n" \
659 " popl %ebp\n" \
660 " popl %eax\n" \
661 /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\
662 " addl $24, %esp\n"
663#endif
664
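
The optprobe template further down reads the saved flags from 144(%rsp) and stores them at 152(%rsp); both offsets follow from the frame these macros build on x86-64: "pushq %rsp; pushfq" followed by SAVE_REGS_STRING leaves 15 pushed general-purpose registers (15 * 8 = 120 bytes) plus the 24 skipped bytes, so flags sits at 120 + 24 = 144 and the saved %rsp one slot above at 152. A small check of that arithmetic against a simplified mirror of the frame (field order inferred from the push sequence above, not taken from a kernel header):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Lowest address first: what the stack looks like after SAVE_REGS_STRING. */
struct fake_frame {
	uint64_t r15, r14, r13, r12, bp, bx;		/* pushed last */
	uint64_t r11, r10, r9, r8, ax, cx, dx, si, di;	/* pushed first */
	uint64_t skipped[3];	/* orig_ax, ip, cs skipped by "subq $24" */
	uint64_t flags;		/* from pushfq */
	uint64_t sp;		/* from pushq %rsp */
};

int main(void)
{
	printf("flags at %zu(%%rsp)\n", offsetof(struct fake_frame, flags));
	printf("saved rsp at %zu(%%rsp)\n", offsetof(struct fake_frame, sp));
	return 0;
}
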
634/* 665/*
635 * When a retprobed function returns, this code saves registers and 666 * When a retprobed function returns, this code saves registers and
636 * calls trampoline_handler(), which calls the kretprobe's handler. 667
@@ -784,8 +815,8 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
784 * jump instruction after the copied instruction, that jumps to the next 815 * jump instruction after the copied instruction, that jumps to the next
785 * instruction after the probepoint. 816 * instruction after the probepoint.
786 */ 817 */
787static void __kprobes 818static void __kprobes resume_execution(struct kprobe *p,
788resume_execution(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb) 819 struct pt_regs *regs, struct kprobe_ctlblk *kcb)
789{ 820{
790 unsigned long *tos = stack_addr(regs); 821 unsigned long *tos = stack_addr(regs);
791 unsigned long copy_ip = (unsigned long)p->ainsn.insn; 822 unsigned long copy_ip = (unsigned long)p->ainsn.insn;
@@ -964,8 +995,8 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
964/* 995/*
965 * Wrapper routine for handling exceptions. 996 * Wrapper routine for handling exceptions.
966 */ 997 */
967int __kprobes 998int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
968kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data) 999 unsigned long val, void *data)
969{ 1000{
970 struct die_args *args = data; 1001 struct die_args *args = data;
971 int ret = NOTIFY_DONE; 1002 int ret = NOTIFY_DONE;
@@ -1060,9 +1091,9 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
1060 "current sp %p does not match saved sp %p\n", 1091 "current sp %p does not match saved sp %p\n",
1061 stack_addr(regs), kcb->jprobe_saved_sp); 1092 stack_addr(regs), kcb->jprobe_saved_sp);
1062 printk(KERN_ERR "Saved registers for jprobe %p\n", jp); 1093 printk(KERN_ERR "Saved registers for jprobe %p\n", jp);
1063 show_regs(saved_regs); 1094 show_registers(saved_regs);
1064 printk(KERN_ERR "Current registers\n"); 1095 printk(KERN_ERR "Current registers\n");
1065 show_regs(regs); 1096 show_registers(regs);
1066 BUG(); 1097 BUG();
1067 } 1098 }
1068 *regs = kcb->jprobe_saved_regs; 1099 *regs = kcb->jprobe_saved_regs;
@@ -1075,53 +1106,466 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
1075 return 0; 1106 return 0;
1076} 1107}
1077 1108
1078#ifdef KPROBES_CAN_USE_FTRACE 1109
1079/* Ftrace callback handler for kprobes */ 1110#ifdef CONFIG_OPTPROBES
1080void __kprobes kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, 1111
1081 struct ftrace_ops *ops, struct pt_regs *regs) 1112/* Insert a call instruction at address 'from', which calls address 'to'.*/
1113static void __kprobes synthesize_relcall(void *from, void *to)
1082{ 1114{
1083 struct kprobe *p; 1115 __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);
1084 struct kprobe_ctlblk *kcb; 1116}
1085 unsigned long flags;
1086 1117
1087 /* Disable irq for emulating a breakpoint and avoiding preempt */ 1118/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
1088 local_irq_save(flags); 1119static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr,
1120 unsigned long val)
1121{
1122#ifdef CONFIG_X86_64
1123 *addr++ = 0x48;
1124 *addr++ = 0xbf;
1125#else
1126 *addr++ = 0xb8;
1127#endif
1128 *(unsigned long *)addr = val;
1129}
1089 1130
1090 p = get_kprobe((kprobe_opcode_t *)ip); 1131static void __used __kprobes kprobes_optinsn_template_holder(void)
1091 if (unlikely(!p) || kprobe_disabled(p)) 1132{
1092 goto end; 1133 asm volatile (
1134 ".global optprobe_template_entry\n"
1135 "optprobe_template_entry: \n"
1136#ifdef CONFIG_X86_64
1137 /* We don't bother saving the ss register */
1138 " pushq %rsp\n"
1139 " pushfq\n"
1140 SAVE_REGS_STRING
1141 " movq %rsp, %rsi\n"
1142 ".global optprobe_template_val\n"
1143 "optprobe_template_val: \n"
1144 ASM_NOP5
1145 ASM_NOP5
1146 ".global optprobe_template_call\n"
1147 "optprobe_template_call: \n"
1148 ASM_NOP5
1149 /* Move flags to rsp */
1150 " movq 144(%rsp), %rdx\n"
1151 " movq %rdx, 152(%rsp)\n"
1152 RESTORE_REGS_STRING
1153 /* Skip flags entry */
1154 " addq $8, %rsp\n"
1155 " popfq\n"
1156#else /* CONFIG_X86_32 */
1157 " pushf\n"
1158 SAVE_REGS_STRING
1159 " movl %esp, %edx\n"
1160 ".global optprobe_template_val\n"
1161 "optprobe_template_val: \n"
1162 ASM_NOP5
1163 ".global optprobe_template_call\n"
1164 "optprobe_template_call: \n"
1165 ASM_NOP5
1166 RESTORE_REGS_STRING
1167 " addl $4, %esp\n" /* skip cs */
1168 " popf\n"
1169#endif
1170 ".global optprobe_template_end\n"
1171 "optprobe_template_end: \n");
1172}
1093 1173
1094 kcb = get_kprobe_ctlblk(); 1174#define TMPL_MOVE_IDX \
1175 ((long)&optprobe_template_val - (long)&optprobe_template_entry)
1176#define TMPL_CALL_IDX \
1177 ((long)&optprobe_template_call - (long)&optprobe_template_entry)
1178#define TMPL_END_IDX \
1179 ((long)&optprobe_template_end - (long)&optprobe_template_entry)
1180
1181#define INT3_SIZE sizeof(kprobe_opcode_t)
1182
1183/* Optimized kprobe call back function: called from optinsn */
1184static void __kprobes optimized_callback(struct optimized_kprobe *op,
1185 struct pt_regs *regs)
1186{
1187 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
1188 unsigned long flags;
1189
1190 /* This is possible if op is under delayed unoptimizing */
1191 if (kprobe_disabled(&op->kp))
1192 return;
1193
1194 local_irq_save(flags);
1095 if (kprobe_running()) { 1195 if (kprobe_running()) {
1096 kprobes_inc_nmissed_count(p); 1196 kprobes_inc_nmissed_count(&op->kp);
1097 } else { 1197 } else {
1098 /* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */ 1198 /* Save skipped registers */
1099 regs->ip = ip + sizeof(kprobe_opcode_t); 1199#ifdef CONFIG_X86_64
1200 regs->cs = __KERNEL_CS;
1201#else
1202 regs->cs = __KERNEL_CS | get_kernel_rpl();
1203 regs->gs = 0;
1204#endif
1205 regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
1206 regs->orig_ax = ~0UL;
1100 1207
1101 __this_cpu_write(current_kprobe, p); 1208 __this_cpu_write(current_kprobe, &op->kp);
1102 kcb->kprobe_status = KPROBE_HIT_ACTIVE; 1209 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
1103 if (!p->pre_handler || !p->pre_handler(p, regs)) 1210 opt_pre_handler(&op->kp, regs);
1104 skip_singlestep(p, regs, kcb); 1211 __this_cpu_write(current_kprobe, NULL);
1105 /*
1106 * If pre_handler returns !0, it sets regs->ip and
1107 * resets current kprobe.
1108 */
1109 } 1212 }
1110end:
1111 local_irq_restore(flags); 1213 local_irq_restore(flags);
1112} 1214}
1113 1215
1114int __kprobes arch_prepare_kprobe_ftrace(struct kprobe *p) 1216static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
1217{
1218 int len = 0, ret;
1219
1220 while (len < RELATIVEJUMP_SIZE) {
1221 ret = __copy_instruction(dest + len, src + len, 1);
1222 if (!ret || !can_boost(dest + len))
1223 return -EINVAL;
1224 len += ret;
1225 }
1226 /* Check whether the address range is reserved */
1227 if (ftrace_text_reserved(src, src + len - 1) ||
1228 alternatives_text_reserved(src, src + len - 1) ||
1229 jump_label_text_reserved(src, src + len - 1))
1230 return -EBUSY;
1231
1232 return len;
1233}
1234
1235/* Check whether insn is indirect jump */
1236static int __kprobes insn_is_indirect_jump(struct insn *insn)
1237{
1238 return ((insn->opcode.bytes[0] == 0xff &&
1239 (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
1240 insn->opcode.bytes[0] == 0xea); /* Segment based jump */
1241}
1242
1243/* Check whether insn jumps into specified address range */
1244static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
1245{
1246 unsigned long target = 0;
1247
1248 switch (insn->opcode.bytes[0]) {
1249 case 0xe0: /* loopne */
1250 case 0xe1: /* loope */
1251 case 0xe2: /* loop */
1252 case 0xe3: /* jcxz */
1253 case 0xe9: /* near relative jump */
1254 case 0xeb: /* short relative jump */
1255 break;
1256 case 0x0f:
1257 if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */
1258 break;
1259 return 0;
1260 default:
1261 if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */
1262 break;
1263 return 0;
1264 }
1265 target = (unsigned long)insn->next_byte + insn->immediate.value;
1266
1267 return (start <= target && target <= start + len);
1268}
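
insn_jump_into_range() resolves a relative jump's destination as the address of the following instruction plus the signed immediate, then tests whether that destination falls inside [start, start + len]. A standalone sketch of the resolution and the range test, with invented addresses:

#include <stdint.h>
#include <stdio.h>

/* Destination of a relative jump = address after the jump + immediate. */
static uint64_t rel_jump_target(uint64_t insn_addr, unsigned int insn_len,
				int32_t imm)
{
	return insn_addr + insn_len + (int64_t)imm;
}

static int jumps_into_range(uint64_t target, uint64_t start, uint64_t len)
{
	return start <= target && target <= start + len;
}

int main(void)
{
	/* a 2-byte short jump at 0x1000 with imm8 = 0x10 lands at 0x1012 */
	uint64_t t = rel_jump_target(0x1000, 2, 0x10);

	printf("target %#llx inside [0x1010, 0x1015]? %d\n",
	       (unsigned long long)t, jumps_into_range(t, 0x1010, 5));
	return 0;
}
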
1269
1270/* Decode whole function to ensure any instructions don't jump into target */
1271static int __kprobes can_optimize(unsigned long paddr)
1272{
1273 int ret;
1274 unsigned long addr, size = 0, offset = 0;
1275 struct insn insn;
1276 kprobe_opcode_t buf[MAX_INSN_SIZE];
1277
1278 /* Lookup symbol including addr */
1279 if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
1280 return 0;
1281
1282 /*
1283 * Do not optimize in the entry code due to the unstable
1284 * stack handling.
1285 */
1286 if ((paddr >= (unsigned long )__entry_text_start) &&
1287 (paddr < (unsigned long )__entry_text_end))
1288 return 0;
1289
1290 /* Check there is enough space for a relative jump. */
1291 if (size - offset < RELATIVEJUMP_SIZE)
1292 return 0;
1293
1294 /* Decode instructions */
1295 addr = paddr - offset;
1296 while (addr < paddr - offset + size) { /* Decode until function end */
1297 if (search_exception_tables(addr))
1298 /*
1299 * Since some fixup code will jump into this function,
1300 * we can't optimize kprobe in this function.
1301 */
1302 return 0;
1303 kernel_insn_init(&insn, (void *)addr);
1304 insn_get_opcode(&insn);
1305 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
1306 ret = recover_probed_instruction(buf, addr);
1307 if (ret)
1308 return 0;
1309 kernel_insn_init(&insn, buf);
1310 }
1311 insn_get_length(&insn);
1312 /* Recover address */
1313 insn.kaddr = (void *)addr;
1314 insn.next_byte = (void *)(addr + insn.length);
1315 /* Check any instructions don't jump into target */
1316 if (insn_is_indirect_jump(&insn) ||
1317 insn_jump_into_range(&insn, paddr + INT3_SIZE,
1318 RELATIVE_ADDR_SIZE))
1319 return 0;
1320 addr += insn.length;
1321 }
1322
1323 return 1;
1324}
1325
1326/* Check optimized_kprobe can actually be optimized. */
1327int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op)
1328{
1329 int i;
1330 struct kprobe *p;
1331
1332 for (i = 1; i < op->optinsn.size; i++) {
1333 p = get_kprobe(op->kp.addr + i);
1334 if (p && !kprobe_disabled(p))
1335 return -EEXIST;
1336 }
1337
1338 return 0;
1339}
1340
1341/* Check the addr is within the optimized instructions. */
1342int __kprobes arch_within_optimized_kprobe(struct optimized_kprobe *op,
1343 unsigned long addr)
1344{
1345 return ((unsigned long)op->kp.addr <= addr &&
1346 (unsigned long)op->kp.addr + op->optinsn.size > addr);
1347}
1348
1349/* Free optimized instruction slot */
1350static __kprobes
1351void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
1352{
1353 if (op->optinsn.insn) {
1354 free_optinsn_slot(op->optinsn.insn, dirty);
1355 op->optinsn.insn = NULL;
1356 op->optinsn.size = 0;
1357 }
1358}
1359
1360void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op)
1361{
1362 __arch_remove_optimized_kprobe(op, 1);
1363}
1364
1365/*
1366 * Copy replacing target instructions
1367 * Target instructions MUST be relocatable (checked inside)
1368 */
1369int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
1370{
1371 u8 *buf;
1372 int ret;
1373 long rel;
1374
1375 if (!can_optimize((unsigned long)op->kp.addr))
1376 return -EILSEQ;
1377
1378 op->optinsn.insn = get_optinsn_slot();
1379 if (!op->optinsn.insn)
1380 return -ENOMEM;
1381
1382 /*
1383 * Verify if the address gap is in 2GB range, because this uses
1384 * a relative jump.
1385 */
1386 rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE;
1387 if (abs(rel) > 0x7fffffff)
1388 return -ERANGE;
1389
1390 buf = (u8 *)op->optinsn.insn;
1391
1392 /* Copy instructions into the out-of-line buffer */
1393 ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr);
1394 if (ret < 0) {
1395 __arch_remove_optimized_kprobe(op, 0);
1396 return ret;
1397 }
1398 op->optinsn.size = ret;
1399
1400 /* Copy arch-dep-instance from template */
1401 memcpy(buf, &optprobe_template_entry, TMPL_END_IDX);
1402
1403 /* Set probe information */
1404 synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);
1405
1406 /* Set probe function call */
1407 synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback);
1408
1409 /* Set returning jmp instruction at the tail of out-of-line buffer */
1410 synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size,
1411 (u8 *)op->kp.addr + op->optinsn.size);
1412
1413 flush_icache_range((unsigned long) buf,
1414 (unsigned long) buf + TMPL_END_IDX +
1415 op->optinsn.size + RELATIVEJUMP_SIZE);
1416 return 0;
1417}
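
arch_prepare_optimized_kprobe() bails out with -ERANGE when the out-of-line buffer sits more than about 2GB from the probed address, because the detour jump only carries a signed 32-bit displacement. A hedged sketch of such a reachability test (it checks the actual jmp rel32 displacement; the addresses are invented):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define RELATIVEJUMP_SIZE 5	/* jmp opcode + rel32 */

/* 1 if a jmp rel32 placed at 'probe_addr' can reach 'detour'. */
static int detour_in_reach(uint64_t detour, uint64_t probe_addr)
{
	int64_t rel = (int64_t)(detour - (probe_addr + RELATIVEJUMP_SIZE));

	return llabs(rel) <= 0x7fffffffLL;
}

int main(void)
{
	uint64_t probe = 0x100000000ULL;	/* invented addresses */

	printf("1GB away reachable? %d\n",
	       detour_in_reach(probe + (1ULL << 30), probe));
	printf("3GB away reachable? %d\n",
	       detour_in_reach(probe + (3ULL << 30), probe));
	return 0;
}
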
1418
1419#define MAX_OPTIMIZE_PROBES 256
1420static struct text_poke_param *jump_poke_params;
1421static struct jump_poke_buffer {
1422 u8 buf[RELATIVEJUMP_SIZE];
1423} *jump_poke_bufs;
1424
1425static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm,
1426 u8 *insn_buf,
1427 struct optimized_kprobe *op)
1428{
1429 s32 rel = (s32)((long)op->optinsn.insn -
1430 ((long)op->kp.addr + RELATIVEJUMP_SIZE));
1431
1432 /* Backup instructions which will be replaced by jump address */
1433 memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
1434 RELATIVE_ADDR_SIZE);
1435
1436 insn_buf[0] = RELATIVEJUMP_OPCODE;
1437 *(s32 *)(&insn_buf[1]) = rel;
1438
1439 tprm->addr = op->kp.addr;
1440 tprm->opcode = insn_buf;
1441 tprm->len = RELATIVEJUMP_SIZE;
1442}
1443
1444/*
1445 * Replace breakpoints (int3) with relative jumps.
1446 * Caller must call with locking kprobe_mutex and text_mutex.
1447 */
1448void __kprobes arch_optimize_kprobes(struct list_head *oplist)
1449{
1450 struct optimized_kprobe *op, *tmp;
1451 int c = 0;
1452
1453 list_for_each_entry_safe(op, tmp, oplist, list) {
1454 WARN_ON(kprobe_disabled(&op->kp));
1455 /* Setup param */
1456 setup_optimize_kprobe(&jump_poke_params[c],
1457 jump_poke_bufs[c].buf, op);
1458 list_del_init(&op->list);
1459 if (++c >= MAX_OPTIMIZE_PROBES)
1460 break;
1461 }
1462
1463 /*
1464 * text_poke_smp doesn't support NMI/MCE code modifying.
1465 * However, since kprobes itself also doesn't support NMI/MCE
1466 * code probing, it's not a problem.
1467 */
1468 text_poke_smp_batch(jump_poke_params, c);
1469}
1470
1471static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm,
1472 u8 *insn_buf,
1473 struct optimized_kprobe *op)
1474{
1475 /* Set int3 to first byte for kprobes */
1476 insn_buf[0] = BREAKPOINT_INSTRUCTION;
1477 memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
1478
1479 tprm->addr = op->kp.addr;
1480 tprm->opcode = insn_buf;
1481 tprm->len = RELATIVEJUMP_SIZE;
1482}
1483
1484/*
1485 * Recover original instructions and breakpoints from relative jumps.
1486 * Caller must call with locking kprobe_mutex.
1487 */
1488extern void arch_unoptimize_kprobes(struct list_head *oplist,
1489 struct list_head *done_list)
1490{
1491 struct optimized_kprobe *op, *tmp;
1492 int c = 0;
1493
1494 list_for_each_entry_safe(op, tmp, oplist, list) {
1495 /* Setup param */
1496 setup_unoptimize_kprobe(&jump_poke_params[c],
1497 jump_poke_bufs[c].buf, op);
1498 list_move(&op->list, done_list);
1499 if (++c >= MAX_OPTIMIZE_PROBES)
1500 break;
1501 }
1502
1503 /*
1504 * text_poke_smp doesn't support NMI/MCE code modifying.
1505 * However, since kprobes itself also doesn't support NMI/MCE
1506 * code probing, it's not a problem.
1507 */
1508 text_poke_smp_batch(jump_poke_params, c);
1509}
1510
1511/* Replace a relative jump with a breakpoint (int3). */
1512void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op)
1513{
1514 u8 buf[RELATIVEJUMP_SIZE];
1515
1516 /* Set int3 to first byte for kprobes */
1517 buf[0] = BREAKPOINT_INSTRUCTION;
1518 memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
1519 text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE);
1520}
1521
1522static int __kprobes setup_detour_execution(struct kprobe *p,
1523 struct pt_regs *regs,
1524 int reenter)
1525{
1526 struct optimized_kprobe *op;
1527
1528 if (p->flags & KPROBE_FLAG_OPTIMIZED) {
1529 /* This kprobe is really able to run optimized path. */
1530 op = container_of(p, struct optimized_kprobe, kp);
1531 /* Detour through copied instructions */
1532 regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
1533 if (!reenter)
1534 reset_current_kprobe();
1535 preempt_enable_no_resched();
1536 return 1;
1537 }
1538 return 0;
1539}
1540
1541static int __kprobes init_poke_params(void)
1542{
1543 /* Allocate code buffer and parameter array */
1544 jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) *
1545 MAX_OPTIMIZE_PROBES, GFP_KERNEL);
1546 if (!jump_poke_bufs)
1547 return -ENOMEM;
1548
1549 jump_poke_params = kmalloc(sizeof(struct text_poke_param) *
1550 MAX_OPTIMIZE_PROBES, GFP_KERNEL);
1551 if (!jump_poke_params) {
1552 kfree(jump_poke_bufs);
1553 jump_poke_bufs = NULL;
1554 return -ENOMEM;
1555 }
1556
1557 return 0;
1558}
1559#else /* !CONFIG_OPTPROBES */
1560static int __kprobes init_poke_params(void)
1115{ 1561{
1116 p->ainsn.insn = NULL;
1117 p->ainsn.boostable = -1;
1118 return 0; 1562 return 0;
1119} 1563}
1120#endif 1564#endif
1121 1565
1122int __init arch_init_kprobes(void) 1566int __init arch_init_kprobes(void)
1123{ 1567{
1124 return arch_init_optprobes(); 1568 return init_poke_params();
1125} 1569}
1126 1570
1127int __kprobes arch_trampoline_kprobe(struct kprobe *p) 1571int __kprobes arch_trampoline_kprobe(struct kprobe *p)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 9c2bd8bd4b4..a9c2116001d 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -38,12 +38,8 @@
38#include <asm/traps.h> 38#include <asm/traps.h>
39#include <asm/desc.h> 39#include <asm/desc.h>
40#include <asm/tlbflush.h> 40#include <asm/tlbflush.h>
41#include <asm/idle.h> 41
42#include <asm/apic.h> 42#define MMU_QUEUE_SIZE 1024
43#include <asm/apicdef.h>
44#include <asm/hypervisor.h>
45#include <asm/kvm_guest.h>
46#include <asm/context_tracking.h>
47 43
48static int kvmapf = 1; 44static int kvmapf = 1;
49 45
@@ -64,19 +60,21 @@ static int parse_no_stealacc(char *arg)
64 60
65early_param("no-steal-acc", parse_no_stealacc); 61early_param("no-steal-acc", parse_no_stealacc);
66 62
67static int kvmclock_vsyscall = 1; 63struct kvm_para_state {
68static int parse_no_kvmclock_vsyscall(char *arg) 64 u8 mmu_queue[MMU_QUEUE_SIZE];
69{ 65 int mmu_queue_len;
70 kvmclock_vsyscall = 0; 66};
71 return 0;
72}
73
74early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
75 67
68static DEFINE_PER_CPU(struct kvm_para_state, para_state);
76static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); 69static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
77static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64); 70static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
78static int has_steal_clock = 0; 71static int has_steal_clock = 0;
79 72
73static struct kvm_para_state *kvm_para_state(void)
74{
75 return &per_cpu(para_state, raw_smp_processor_id());
76}
77
80/* 78/*
81 * No need for any "IO delay" on KVM 79 * No need for any "IO delay" on KVM
82 */ 80 */
@@ -93,6 +91,7 @@ struct kvm_task_sleep_node {
93 u32 token; 91 u32 token;
94 int cpu; 92 int cpu;
95 bool halted; 93 bool halted;
94 struct mm_struct *mm;
96}; 95};
97 96
98static struct kvm_task_sleep_head { 97static struct kvm_task_sleep_head {
@@ -121,8 +120,11 @@ void kvm_async_pf_task_wait(u32 token)
121 struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; 120 struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
122 struct kvm_task_sleep_node n, *e; 121 struct kvm_task_sleep_node n, *e;
123 DEFINE_WAIT(wait); 122 DEFINE_WAIT(wait);
123 int cpu, idle;
124 124
125 rcu_irq_enter(); 125 cpu = get_cpu();
126 idle = idle_cpu(cpu);
127 put_cpu();
126 128
127 spin_lock(&b->lock); 129 spin_lock(&b->lock);
128 e = _find_apf_task(b, token); 130 e = _find_apf_task(b, token);
@@ -131,14 +133,14 @@ void kvm_async_pf_task_wait(u32 token)
131 hlist_del(&e->link); 133 hlist_del(&e->link);
132 kfree(e); 134 kfree(e);
133 spin_unlock(&b->lock); 135 spin_unlock(&b->lock);
134
135 rcu_irq_exit();
136 return; 136 return;
137 } 137 }
138 138
139 n.token = token; 139 n.token = token;
140 n.cpu = smp_processor_id(); 140 n.cpu = smp_processor_id();
141 n.halted = is_idle_task(current) || preempt_count() > 1; 141 n.mm = current->active_mm;
142 n.halted = idle || preempt_count() > 1;
143 atomic_inc(&n.mm->mm_count);
142 init_waitqueue_head(&n.wq); 144 init_waitqueue_head(&n.wq);
143 hlist_add_head(&n.link, &b->list); 145 hlist_add_head(&n.link, &b->list);
144 spin_unlock(&b->lock); 146 spin_unlock(&b->lock);
@@ -157,16 +159,13 @@ void kvm_async_pf_task_wait(u32 token)
157 /* 159 /*
158 * We cannot reschedule. So halt. 160 * We cannot reschedule. So halt.
159 */ 161 */
160 rcu_irq_exit();
161 native_safe_halt(); 162 native_safe_halt();
162 rcu_irq_enter();
163 local_irq_disable(); 163 local_irq_disable();
164 } 164 }
165 } 165 }
166 if (!n.halted) 166 if (!n.halted)
167 finish_wait(&n.wq, &wait); 167 finish_wait(&n.wq, &wait);
168 168
169 rcu_irq_exit();
170 return; 169 return;
171} 170}
172EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait); 171EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);
@@ -174,6 +173,9 @@ EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);
174static void apf_task_wake_one(struct kvm_task_sleep_node *n) 173static void apf_task_wake_one(struct kvm_task_sleep_node *n)
175{ 174{
176 hlist_del_init(&n->link); 175 hlist_del_init(&n->link);
176 if (!n->mm)
177 return;
178 mmdrop(n->mm);
177 if (n->halted) 179 if (n->halted)
178 smp_send_reschedule(n->cpu); 180 smp_send_reschedule(n->cpu);
179 else if (waitqueue_active(&n->wq)) 181 else if (waitqueue_active(&n->wq))
@@ -217,7 +219,7 @@ again:
217 * async PF was not yet handled. 219 * async PF was not yet handled.
218 * Add dummy entry for the token. 220 * Add dummy entry for the token.
219 */ 221 */
220 n = kzalloc(sizeof(*n), GFP_ATOMIC); 222 n = kmalloc(sizeof(*n), GFP_ATOMIC);
221 if (!n) { 223 if (!n) {
222 /* 224 /*
223 * Allocation failed! Busy wait while other cpu 225 * Allocation failed! Busy wait while other cpu
@@ -229,6 +231,7 @@ again:
229 } 231 }
230 n->token = token; 232 n->token = token;
231 n->cpu = smp_processor_id(); 233 n->cpu = smp_processor_id();
234 n->mm = NULL;
232 init_waitqueue_head(&n->wq); 235 init_waitqueue_head(&n->wq);
233 hlist_add_head(&n->link, &b->list); 236 hlist_add_head(&n->link, &b->list);
234 } else 237 } else
@@ -260,20 +263,159 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
260 break; 263 break;
261 case KVM_PV_REASON_PAGE_NOT_PRESENT: 264 case KVM_PV_REASON_PAGE_NOT_PRESENT:
262 /* page is swapped out by the host. */ 265 /* page is swapped out by the host. */
263 exception_enter(regs);
264 exit_idle();
265 kvm_async_pf_task_wait((u32)read_cr2()); 266 kvm_async_pf_task_wait((u32)read_cr2());
266 exception_exit(regs);
267 break; 267 break;
268 case KVM_PV_REASON_PAGE_READY: 268 case KVM_PV_REASON_PAGE_READY:
269 rcu_irq_enter();
270 exit_idle();
271 kvm_async_pf_task_wake((u32)read_cr2()); 269 kvm_async_pf_task_wake((u32)read_cr2());
272 rcu_irq_exit();
273 break; 270 break;
274 } 271 }
275} 272}
276 273
274static void kvm_mmu_op(void *buffer, unsigned len)
275{
276 int r;
277 unsigned long a1, a2;
278
279 do {
280 a1 = __pa(buffer);
281 a2 = 0; /* on i386 __pa() always returns <4G */
282 r = kvm_hypercall3(KVM_HC_MMU_OP, len, a1, a2);
283 buffer += r;
284 len -= r;
285 } while (len);
286}
287
288static void mmu_queue_flush(struct kvm_para_state *state)
289{
290 if (state->mmu_queue_len) {
291 kvm_mmu_op(state->mmu_queue, state->mmu_queue_len);
292 state->mmu_queue_len = 0;
293 }
294}
295
296static void kvm_deferred_mmu_op(void *buffer, int len)
297{
298 struct kvm_para_state *state = kvm_para_state();
299
300 if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) {
301 kvm_mmu_op(buffer, len);
302 return;
303 }
304 if (state->mmu_queue_len + len > sizeof state->mmu_queue)
305 mmu_queue_flush(state);
306 memcpy(state->mmu_queue + state->mmu_queue_len, buffer, len);
307 state->mmu_queue_len += len;
308}
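
kvm_deferred_mmu_op() batches MMU hypercalls while the CPU is in lazy MMU mode: a request is appended to the per-CPU byte queue, the queue is flushed first if appending would overflow it, and a final flush happens when lazy mode is left (see kvm_leave_lazy_mmu() further down); outside lazy mode the request is issued immediately. A minimal userspace sketch of the append-or-flush pattern, where the flush just reports the batch size instead of issuing KVM_HC_MMU_OP:

#include <stdio.h>
#include <string.h>

#define QUEUE_SIZE 64	/* the driver uses MMU_QUEUE_SIZE (1024) */

struct op_queue {
	unsigned char buf[QUEUE_SIZE];
	int len;
};

static void flush_queue(struct op_queue *q)
{
	if (q->len) {
		printf("flushing %d queued bytes\n", q->len);
		q->len = 0;
	}
}

static void queue_op(struct op_queue *q, const void *op, int len)
{
	if (q->len + len > (int)sizeof(q->buf))
		flush_queue(q);		/* would overflow: flush first */
	memcpy(q->buf + q->len, op, len);
	q->len += len;
}

int main(void)
{
	struct op_queue q = { .len = 0 };
	unsigned char op[20] = { 0 };

	for (int i = 0; i < 5; i++)
		queue_op(&q, op, sizeof(op));
	flush_queue(&q);	/* final flush, as on leaving lazy mode */
	return 0;
}
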
309
310static void kvm_mmu_write(void *dest, u64 val)
311{
312 __u64 pte_phys;
313 struct kvm_mmu_op_write_pte wpte;
314
315#ifdef CONFIG_HIGHPTE
316 struct page *page;
317 unsigned long dst = (unsigned long) dest;
318
319 page = kmap_atomic_to_page(dest);
320 pte_phys = page_to_pfn(page);
321 pte_phys <<= PAGE_SHIFT;
322 pte_phys += (dst & ~(PAGE_MASK));
323#else
324 pte_phys = (unsigned long)__pa(dest);
325#endif
326 wpte.header.op = KVM_MMU_OP_WRITE_PTE;
327 wpte.pte_val = val;
328 wpte.pte_phys = pte_phys;
329
330 kvm_deferred_mmu_op(&wpte, sizeof wpte);
331}
332
333/*
334 * We only need to hook operations that are MMU writes. We hook these so that
335 * we can use lazy MMU mode to batch these operations. We could probably
336 * improve the performance of the host code if we used some of the information
337 * here to simplify processing of batched writes.
338 */
339static void kvm_set_pte(pte_t *ptep, pte_t pte)
340{
341 kvm_mmu_write(ptep, pte_val(pte));
342}
343
344static void kvm_set_pte_at(struct mm_struct *mm, unsigned long addr,
345 pte_t *ptep, pte_t pte)
346{
347 kvm_mmu_write(ptep, pte_val(pte));
348}
349
350static void kvm_set_pmd(pmd_t *pmdp, pmd_t pmd)
351{
352 kvm_mmu_write(pmdp, pmd_val(pmd));
353}
354
355#if PAGETABLE_LEVELS >= 3
356#ifdef CONFIG_X86_PAE
357static void kvm_set_pte_atomic(pte_t *ptep, pte_t pte)
358{
359 kvm_mmu_write(ptep, pte_val(pte));
360}
361
362static void kvm_pte_clear(struct mm_struct *mm,
363 unsigned long addr, pte_t *ptep)
364{
365 kvm_mmu_write(ptep, 0);
366}
367
368static void kvm_pmd_clear(pmd_t *pmdp)
369{
370 kvm_mmu_write(pmdp, 0);
371}
372#endif
373
374static void kvm_set_pud(pud_t *pudp, pud_t pud)
375{
376 kvm_mmu_write(pudp, pud_val(pud));
377}
378
379#if PAGETABLE_LEVELS == 4
380static void kvm_set_pgd(pgd_t *pgdp, pgd_t pgd)
381{
382 kvm_mmu_write(pgdp, pgd_val(pgd));
383}
384#endif
385#endif /* PAGETABLE_LEVELS >= 3 */
386
387static void kvm_flush_tlb(void)
388{
389 struct kvm_mmu_op_flush_tlb ftlb = {
390 .header.op = KVM_MMU_OP_FLUSH_TLB,
391 };
392
393 kvm_deferred_mmu_op(&ftlb, sizeof ftlb);
394}
395
396static void kvm_release_pt(unsigned long pfn)
397{
398 struct kvm_mmu_op_release_pt rpt = {
399 .header.op = KVM_MMU_OP_RELEASE_PT,
400 .pt_phys = (u64)pfn << PAGE_SHIFT,
401 };
402
403 kvm_mmu_op(&rpt, sizeof rpt);
404}
405
406static void kvm_enter_lazy_mmu(void)
407{
408 paravirt_enter_lazy_mmu();
409}
410
411static void kvm_leave_lazy_mmu(void)
412{
413 struct kvm_para_state *state = kvm_para_state();
414
415 mmu_queue_flush(state);
416 paravirt_leave_lazy_mmu();
417}
418
277static void __init paravirt_ops_setup(void) 419static void __init paravirt_ops_setup(void)
278{ 420{
279 pv_info.name = "KVM"; 421 pv_info.name = "KVM";
@@ -282,6 +424,29 @@ static void __init paravirt_ops_setup(void)
282 if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) 424 if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
283 pv_cpu_ops.io_delay = kvm_io_delay; 425 pv_cpu_ops.io_delay = kvm_io_delay;
284 426
427 if (kvm_para_has_feature(KVM_FEATURE_MMU_OP)) {
428 pv_mmu_ops.set_pte = kvm_set_pte;
429 pv_mmu_ops.set_pte_at = kvm_set_pte_at;
430 pv_mmu_ops.set_pmd = kvm_set_pmd;
431#if PAGETABLE_LEVELS >= 3
432#ifdef CONFIG_X86_PAE
433 pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic;
434 pv_mmu_ops.pte_clear = kvm_pte_clear;
435 pv_mmu_ops.pmd_clear = kvm_pmd_clear;
436#endif
437 pv_mmu_ops.set_pud = kvm_set_pud;
438#if PAGETABLE_LEVELS == 4
439 pv_mmu_ops.set_pgd = kvm_set_pgd;
440#endif
441#endif
442 pv_mmu_ops.flush_tlb_user = kvm_flush_tlb;
443 pv_mmu_ops.release_pte = kvm_release_pt;
444 pv_mmu_ops.release_pmd = kvm_release_pt;
445 pv_mmu_ops.release_pud = kvm_release_pt;
446
447 pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu;
448 pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu;
449 }
285#ifdef CONFIG_X86_IO_APIC 450#ifdef CONFIG_X86_IO_APIC
286 no_timer_check = 1; 451 no_timer_check = 1;
287#endif 452#endif
@@ -302,22 +467,6 @@ static void kvm_register_steal_time(void)
302 cpu, __pa(st)); 467 cpu, __pa(st));
303} 468}
304 469
305static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
306
307static void kvm_guest_apic_eoi_write(u32 reg, u32 val)
308{
309 /**
310 * This relies on __test_and_clear_bit to modify the memory
311 * in a way that is atomic with respect to the local CPU.
312 * The hypervisor only accesses this memory from the local CPU so
313 * there's no need for lock or memory barriers.
314 * An optimization barrier is implied in apic write.
315 */
316 if (__test_and_clear_bit(KVM_PV_EOI_BIT, &__get_cpu_var(kvm_apic_eoi)))
317 return;
318 apic_write(APIC_EOI, APIC_EOI_ACK);
319}
320
321void __cpuinit kvm_guest_cpu_init(void) 470void __cpuinit kvm_guest_cpu_init(void)
322{ 471{
323 if (!kvm_para_available()) 472 if (!kvm_para_available())
@@ -335,20 +484,11 @@ void __cpuinit kvm_guest_cpu_init(void)
335 smp_processor_id()); 484 smp_processor_id());
336 } 485 }
337 486
338 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
339 unsigned long pa;
340 /* Size alignment is implied but just to make it explicit. */
341 BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4);
342 __get_cpu_var(kvm_apic_eoi) = 0;
343 pa = __pa(&__get_cpu_var(kvm_apic_eoi)) | KVM_MSR_ENABLED;
344 wrmsrl(MSR_KVM_PV_EOI_EN, pa);
345 }
346
347 if (has_steal_clock) 487 if (has_steal_clock)
348 kvm_register_steal_time(); 488 kvm_register_steal_time();
349} 489}
350 490
351static void kvm_pv_disable_apf(void) 491static void kvm_pv_disable_apf(void *unused)
352{ 492{
353 if (!__get_cpu_var(apf_reason).enabled) 493 if (!__get_cpu_var(apf_reason).enabled)
354 return; 494 return;
@@ -360,24 +500,11 @@ static void kvm_pv_disable_apf(void)
360 smp_processor_id()); 500 smp_processor_id());
361} 501}
362 502
363static void kvm_pv_guest_cpu_reboot(void *unused)
364{
365 /*
366 * We disable PV EOI before we load a new kernel by kexec,
367 * since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory.
368 * New kernel can re-enable when it boots.
369 */
370 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
371 wrmsrl(MSR_KVM_PV_EOI_EN, 0);
372 kvm_pv_disable_apf();
373 kvm_disable_steal_time();
374}
375
376static int kvm_pv_reboot_notify(struct notifier_block *nb, 503static int kvm_pv_reboot_notify(struct notifier_block *nb,
377 unsigned long code, void *unused) 504 unsigned long code, void *unused)
378{ 505{
379 if (code == SYS_RESTART) 506 if (code == SYS_RESTART)
380 on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1); 507 on_each_cpu(kvm_pv_disable_apf, NULL, 1);
381 return NOTIFY_DONE; 508 return NOTIFY_DONE;
382} 509}
383 510
@@ -413,7 +540,9 @@ void kvm_disable_steal_time(void)
413#ifdef CONFIG_SMP 540#ifdef CONFIG_SMP
414static void __init kvm_smp_prepare_boot_cpu(void) 541static void __init kvm_smp_prepare_boot_cpu(void)
415{ 542{
543#ifdef CONFIG_KVM_CLOCK
416 WARN_ON(kvm_register_clock("primary cpu clock")); 544 WARN_ON(kvm_register_clock("primary cpu clock"));
545#endif
417 kvm_guest_cpu_init(); 546 kvm_guest_cpu_init();
418 native_smp_prepare_boot_cpu(); 547 native_smp_prepare_boot_cpu();
419} 548}
@@ -426,9 +555,7 @@ static void __cpuinit kvm_guest_cpu_online(void *dummy)
426static void kvm_guest_cpu_offline(void *dummy) 555static void kvm_guest_cpu_offline(void *dummy)
427{ 556{
428 kvm_disable_steal_time(); 557 kvm_disable_steal_time();
429 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) 558 kvm_pv_disable_apf(NULL);
430 wrmsrl(MSR_KVM_PV_EOI_EN, 0);
431 kvm_pv_disable_apf();
432 apf_task_wake_all(); 559 apf_task_wake_all();
433} 560}
434 561
@@ -481,12 +608,6 @@ void __init kvm_guest_init(void)
481 pv_time_ops.steal_clock = kvm_steal_clock; 608 pv_time_ops.steal_clock = kvm_steal_clock;
482 } 609 }
483 610
484 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
485 apic_set_eoi_write(kvm_guest_apic_eoi_write);
486
487 if (kvmclock_vsyscall)
488 kvm_setup_vsyscall_timeinfo();
489
490#ifdef CONFIG_SMP 611#ifdef CONFIG_SMP
491 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; 612 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
492 register_cpu_notifier(&kvm_cpu_notifier); 613 register_cpu_notifier(&kvm_cpu_notifier);
@@ -495,25 +616,12 @@ void __init kvm_guest_init(void)
495#endif 616#endif
496} 617}
497 618
498static bool __init kvm_detect(void)
499{
500 if (!kvm_para_available())
501 return false;
502 return true;
503}
504
505const struct hypervisor_x86 x86_hyper_kvm __refconst = {
506 .name = "KVM",
507 .detect = kvm_detect,
508};
509EXPORT_SYMBOL_GPL(x86_hyper_kvm);
510
511static __init int activate_jump_labels(void) 619static __init int activate_jump_labels(void)
512{ 620{
513 if (has_steal_clock) { 621 if (has_steal_clock) {
514 static_key_slow_inc(&paravirt_steal_enabled); 622 jump_label_inc(&paravirt_steal_enabled);
515 if (steal_acc) 623 if (steal_acc)
516 static_key_slow_inc(&paravirt_steal_rq_enabled); 624 jump_label_inc(&paravirt_steal_rq_enabled);
517 } 625 }
518 626
519 return 0; 627 return 0;
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 220a360010f..44842d756b2 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -22,8 +22,6 @@
22#include <asm/msr.h> 22#include <asm/msr.h>
23#include <asm/apic.h> 23#include <asm/apic.h>
24#include <linux/percpu.h> 24#include <linux/percpu.h>
25#include <linux/hardirq.h>
26#include <linux/memblock.h>
27 25
28#include <asm/x86_init.h> 26#include <asm/x86_init.h>
29#include <asm/reboot.h> 27#include <asm/reboot.h>
@@ -40,7 +38,7 @@ static int parse_no_kvmclock(char *arg)
40early_param("no-kvmclock", parse_no_kvmclock); 38early_param("no-kvmclock", parse_no_kvmclock);
41 39
42/* The hypervisor will put information about time periodically here */ 40/* The hypervisor will put information about time periodically here */
43static struct pvclock_vsyscall_time_info *hv_clock; 41static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock);
44static struct pvclock_wall_clock wall_clock; 42static struct pvclock_wall_clock wall_clock;
45 43
46/* 44/*
@@ -53,20 +51,15 @@ static unsigned long kvm_get_wallclock(void)
53 struct pvclock_vcpu_time_info *vcpu_time; 51 struct pvclock_vcpu_time_info *vcpu_time;
54 struct timespec ts; 52 struct timespec ts;
55 int low, high; 53 int low, high;
56 int cpu;
57 54
58 low = (int)__pa_symbol(&wall_clock); 55 low = (int)__pa_symbol(&wall_clock);
59 high = ((u64)__pa_symbol(&wall_clock) >> 32); 56 high = ((u64)__pa_symbol(&wall_clock) >> 32);
60 57
61 native_write_msr(msr_kvm_wall_clock, low, high); 58 native_write_msr(msr_kvm_wall_clock, low, high);
62 59
63 preempt_disable(); 60 vcpu_time = &get_cpu_var(hv_clock);
64 cpu = smp_processor_id();
65
66 vcpu_time = &hv_clock[cpu].pvti;
67 pvclock_read_wallclock(&wall_clock, vcpu_time, &ts); 61 pvclock_read_wallclock(&wall_clock, vcpu_time, &ts);
68 62 put_cpu_var(hv_clock);
69 preempt_enable();
70 63
71 return ts.tv_sec; 64 return ts.tv_sec;
72} 65}
@@ -80,11 +73,9 @@ static cycle_t kvm_clock_read(void)
80{ 73{
81 struct pvclock_vcpu_time_info *src; 74 struct pvclock_vcpu_time_info *src;
82 cycle_t ret; 75 cycle_t ret;
83 int cpu;
84 76
85 preempt_disable_notrace(); 77 preempt_disable_notrace();
86 cpu = smp_processor_id(); 78 src = &__get_cpu_var(hv_clock);
87 src = &hv_clock[cpu].pvti;
88 ret = pvclock_clocksource_read(src); 79 ret = pvclock_clocksource_read(src);
89 preempt_enable_notrace(); 80 preempt_enable_notrace();
90 return ret; 81 return ret;
@@ -107,15 +98,8 @@ static cycle_t kvm_clock_get_cycles(struct clocksource *cs)
107static unsigned long kvm_get_tsc_khz(void) 98static unsigned long kvm_get_tsc_khz(void)
108{ 99{
109 struct pvclock_vcpu_time_info *src; 100 struct pvclock_vcpu_time_info *src;
110 int cpu; 101 src = &per_cpu(hv_clock, 0);
111 unsigned long tsc_khz; 102 return pvclock_tsc_khz(src);
112
113 preempt_disable();
114 cpu = smp_processor_id();
115 src = &hv_clock[cpu].pvti;
116 tsc_khz = pvclock_tsc_khz(src);
117 preempt_enable();
118 return tsc_khz;
119} 103}
120 104
121static void kvm_get_preset_lpj(void) 105static void kvm_get_preset_lpj(void)
@@ -130,24 +114,6 @@ static void kvm_get_preset_lpj(void)
130 preset_lpj = lpj; 114 preset_lpj = lpj;
131} 115}
132 116
133bool kvm_check_and_clear_guest_paused(void)
134{
135 bool ret = false;
136 struct pvclock_vcpu_time_info *src;
137 int cpu = smp_processor_id();
138
139 if (!hv_clock)
140 return ret;
141
142 src = &hv_clock[cpu].pvti;
143 if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) {
144 src->flags &= ~PVCLOCK_GUEST_STOPPED;
145 ret = true;
146 }
147
148 return ret;
149}
150
151static struct clocksource kvm_clock = { 117static struct clocksource kvm_clock = {
152 .name = "kvm-clock", 118 .name = "kvm-clock",
153 .read = kvm_clock_get_cycles, 119 .read = kvm_clock_get_cycles,
@@ -160,10 +126,9 @@ int kvm_register_clock(char *txt)
160{ 126{
161 int cpu = smp_processor_id(); 127 int cpu = smp_processor_id();
162 int low, high, ret; 128 int low, high, ret;
163 struct pvclock_vcpu_time_info *src = &hv_clock[cpu].pvti;
164 129
165 low = (int)__pa(src) | 1; 130 low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;
166 high = ((u64)__pa(src) >> 32); 131 high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
167 ret = native_write_msr_safe(msr_kvm_system_time, low, high); 132 ret = native_write_msr_safe(msr_kvm_system_time, low, high);
168 printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", 133 printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
169 cpu, high, low, txt); 134 cpu, high, low, txt);
@@ -171,15 +136,6 @@ int kvm_register_clock(char *txt)
171 return ret; 136 return ret;
172} 137}
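
kvm_register_clock() hands the physical address of the per-CPU pvclock area to the hypervisor through an MSR write, splitting the 64-bit address into a low half (with bit 0 set as the enable flag) and a high half. A sketch of that split with a made-up address:

#include <stdint.h>
#include <stdio.h>

/*
 * Split a physical address into the two 32-bit halves an MSR write
 * takes, setting bit 0 of the low half as the "enabled" flag.
 */
static void split_for_msr(uint64_t pa, uint32_t *low, uint32_t *high)
{
	*low  = (uint32_t)pa | 1;	/* enable bit */
	*high = (uint32_t)(pa >> 32);
}

int main(void)
{
	uint32_t lo, hi;

	split_for_msr(0x123456780ULL, &lo, &hi);
	printf("msr write: high=%#x low=%#x\n", hi, lo);
	return 0;
}
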
173 138
174static void kvm_save_sched_clock_state(void)
175{
176}
177
178static void kvm_restore_sched_clock_state(void)
179{
180 kvm_register_clock("primary cpu clock, resume");
181}
182
183#ifdef CONFIG_X86_LOCAL_APIC 139#ifdef CONFIG_X86_LOCAL_APIC
184static void __cpuinit kvm_setup_secondary_clock(void) 140static void __cpuinit kvm_setup_secondary_clock(void)
185{ 141{
@@ -188,6 +144,8 @@ static void __cpuinit kvm_setup_secondary_clock(void)
188 * we shouldn't fail. 144 * we shouldn't fail.
189 */ 145 */
190 WARN_ON(kvm_register_clock("secondary cpu clock")); 146 WARN_ON(kvm_register_clock("secondary cpu clock"));
147 /* ok, done with our trickery, call native */
148 setup_secondary_APIC_clock();
191} 149}
192#endif 150#endif
193 151
@@ -217,8 +175,6 @@ static void kvm_shutdown(void)
217 175
218void __init kvmclock_init(void) 176void __init kvmclock_init(void)
219{ 177{
220 unsigned long mem;
221
222 if (!kvm_para_available()) 178 if (!kvm_para_available())
223 return; 179 return;
224 180
@@ -231,28 +187,16 @@ void __init kvmclock_init(void)
231 printk(KERN_INFO "kvm-clock: Using msrs %x and %x", 187 printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
232 msr_kvm_system_time, msr_kvm_wall_clock); 188 msr_kvm_system_time, msr_kvm_wall_clock);
233 189
234 mem = memblock_alloc(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS, 190 if (kvm_register_clock("boot clock"))
235 PAGE_SIZE);
236 if (!mem)
237 return;
238 hv_clock = __va(mem);
239
240 if (kvm_register_clock("boot clock")) {
241 hv_clock = NULL;
242 memblock_free(mem,
243 sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
244 return; 191 return;
245 }
246 pv_time_ops.sched_clock = kvm_clock_read; 192 pv_time_ops.sched_clock = kvm_clock_read;
247 x86_platform.calibrate_tsc = kvm_get_tsc_khz; 193 x86_platform.calibrate_tsc = kvm_get_tsc_khz;
248 x86_platform.get_wallclock = kvm_get_wallclock; 194 x86_platform.get_wallclock = kvm_get_wallclock;
249 x86_platform.set_wallclock = kvm_set_wallclock; 195 x86_platform.set_wallclock = kvm_set_wallclock;
250#ifdef CONFIG_X86_LOCAL_APIC 196#ifdef CONFIG_X86_LOCAL_APIC
251 x86_cpuinit.early_percpu_clock_init = 197 x86_cpuinit.setup_percpu_clockev =
252 kvm_setup_secondary_clock; 198 kvm_setup_secondary_clock;
253#endif 199#endif
254 x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
255 x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
256 machine_ops.shutdown = kvm_shutdown; 200 machine_ops.shutdown = kvm_shutdown;
257#ifdef CONFIG_KEXEC 201#ifdef CONFIG_KEXEC
258 machine_ops.crash_shutdown = kvm_crash_shutdown; 202 machine_ops.crash_shutdown = kvm_crash_shutdown;
@@ -265,37 +209,3 @@ void __init kvmclock_init(void)
265 if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) 209 if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
266 pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); 210 pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
267} 211}
268
269int __init kvm_setup_vsyscall_timeinfo(void)
270{
271#ifdef CONFIG_X86_64
272 int cpu;
273 int ret;
274 u8 flags;
275 struct pvclock_vcpu_time_info *vcpu_time;
276 unsigned int size;
277
278 size = sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS;
279
280 preempt_disable();
281 cpu = smp_processor_id();
282
283 vcpu_time = &hv_clock[cpu].pvti;
284 flags = pvclock_read_flags(vcpu_time);
285
286 if (!(flags & PVCLOCK_TSC_STABLE_BIT)) {
287 preempt_enable();
288 return 1;
289 }
290
291 if ((ret = pvclock_init_vsyscall(hv_clock, size))) {
292 preempt_enable();
293 return ret;
294 }
295
296 preempt_enable();
297
298 kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK;
299#endif
300 return 0;
301}
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index ebc98739892..ea697263b37 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -15,6 +15,7 @@
15#include <linux/vmalloc.h> 15#include <linux/vmalloc.h>
16#include <linux/uaccess.h> 16#include <linux/uaccess.h>
17 17
18#include <asm/system.h>
18#include <asm/ldt.h> 19#include <asm/ldt.h>
19#include <asm/desc.h> 20#include <asm/desc.h>
20#include <asm/mmu_context.h> 21#include <asm/mmu_context.h>
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 5b19e4d78b0..a3fa43ba5d3 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -23,6 +23,7 @@
23#include <asm/apic.h> 23#include <asm/apic.h>
24#include <asm/cpufeature.h> 24#include <asm/cpufeature.h>
25#include <asm/desc.h> 25#include <asm/desc.h>
26#include <asm/system.h>
26#include <asm/cacheflush.h> 27#include <asm/cacheflush.h>
27#include <asm/debugreg.h> 28#include <asm/debugreg.h>
28 29
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index efdec7cd8e0..591be0ee193 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -1,18 +1,14 @@
1/* 1/*
2 * AMD CPU Microcode Update Driver for Linux 2 * AMD CPU Microcode Update Driver for Linux
3 * Copyright (C) 2008-2011 Advanced Micro Devices Inc. 3 * Copyright (C) 2008 Advanced Micro Devices Inc.
4 * 4 *
5 * Author: Peter Oruba <peter.oruba@amd.com> 5 * Author: Peter Oruba <peter.oruba@amd.com>
6 * 6 *
7 * Based on work by: 7 * Based on work by:
8 * Tigran Aivazian <tigran@aivazian.fsnet.co.uk> 8 * Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
9 * 9 *
10 * Maintainers: 10 * This driver allows to upgrade microcode on AMD
11 * Andreas Herrmann <herrmann.der.user@googlemail.com> 11 * family 0x10 and 0x11 processors.
12 * Borislav Petkov <bp@alien8.de>
13 *
14 * This driver allows to upgrade microcode on F10h AMD
15 * CPUs and later.
16 * 12 *
17 * Licensed under the terms of the GNU General Public 13 * Licensed under the terms of the GNU General Public
18 * License version 2. See file COPYING for details. 14 * License version 2. See file COPYING for details.
@@ -75,122 +71,97 @@ struct microcode_amd {
75 71
76static struct equiv_cpu_entry *equiv_cpu_table; 72static struct equiv_cpu_entry *equiv_cpu_table;
77 73
78struct ucode_patch { 74static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
79 struct list_head plist;
80 void *data;
81 u32 patch_id;
82 u16 equiv_cpu;
83};
84
85static LIST_HEAD(pcache);
86
87static u16 find_equiv_id(unsigned int cpu)
88{ 75{
89 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 76 struct cpuinfo_x86 *c = &cpu_data(cpu);
90 int i = 0; 77 u32 dummy;
91 78
92 if (!equiv_cpu_table) 79 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
93 return 0; 80 pr_warning("CPU%d: family %d not supported\n", cpu, c->x86);
81 return -1;
82 }
94 83
95 while (equiv_cpu_table[i].installed_cpu != 0) { 84 rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy);
96 if (uci->cpu_sig.sig == equiv_cpu_table[i].installed_cpu) 85 pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev);
97 return equiv_cpu_table[i].equiv_cpu;
98 86
99 i++;
100 }
101 return 0; 87 return 0;
102} 88}
103 89
104static u32 find_cpu_family_by_equiv_cpu(u16 equiv_cpu) 90static int get_matching_microcode(int cpu, struct microcode_header_amd *mc_hdr,
91 int rev)
105{ 92{
106 int i = 0; 93 unsigned int current_cpu_id;
94 u16 equiv_cpu_id = 0;
95 unsigned int i = 0;
107 96
108 BUG_ON(!equiv_cpu_table); 97 BUG_ON(equiv_cpu_table == NULL);
98 current_cpu_id = cpuid_eax(0x00000001);
109 99
110 while (equiv_cpu_table[i].equiv_cpu != 0) { 100 while (equiv_cpu_table[i].installed_cpu != 0) {
111 if (equiv_cpu == equiv_cpu_table[i].equiv_cpu) 101 if (current_cpu_id == equiv_cpu_table[i].installed_cpu) {
112 return equiv_cpu_table[i].installed_cpu; 102 equiv_cpu_id = equiv_cpu_table[i].equiv_cpu;
103 break;
104 }
113 i++; 105 i++;
114 } 106 }
115 return 0;
116}
117 107
118/* 108 if (!equiv_cpu_id)
119 * a small, trivial cache of per-family ucode patches 109 return 0;
120 */
121static struct ucode_patch *cache_find_patch(u16 equiv_cpu)
122{
123 struct ucode_patch *p;
124 110
125 list_for_each_entry(p, &pcache, plist) 111 if (mc_hdr->processor_rev_id != equiv_cpu_id)
126 if (p->equiv_cpu == equiv_cpu) 112 return 0;
127 return p;
128 return NULL;
129}
130 113
131static void update_cache(struct ucode_patch *new_patch) 114 /* ucode might be chipset specific -- currently we don't support this */
132{ 115 if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) {
133 struct ucode_patch *p; 116 pr_err("CPU%d: chipset specific code not yet supported\n",
134 117 cpu);
135 list_for_each_entry(p, &pcache, plist) { 118 return 0;
136 if (p->equiv_cpu == new_patch->equiv_cpu) {
137 if (p->patch_id >= new_patch->patch_id)
138 /* we already have the latest patch */
139 return;
140
141 list_replace(&p->plist, &new_patch->plist);
142 kfree(p->data);
143 kfree(p);
144 return;
145 }
146 } 119 }
147 /* no patch found, add it */
148 list_add_tail(&new_patch->plist, &pcache);
149}
150 120
151static void free_cache(void) 121 if (mc_hdr->patch_id <= rev)
152{ 122 return 0;
153 struct ucode_patch *p, *tmp;
154 123
155 list_for_each_entry_safe(p, tmp, &pcache, plist) { 124 return 1;
156 __list_del(p->plist.prev, p->plist.next);
157 kfree(p->data);
158 kfree(p);
159 }
160} 125}
161 126
162static struct ucode_patch *find_patch(unsigned int cpu) 127static int apply_microcode_amd(int cpu)
163{ 128{
164 u16 equiv_id; 129 u32 rev, dummy;
130 int cpu_num = raw_smp_processor_id();
131 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
132 struct microcode_amd *mc_amd = uci->mc;
165 133
166 equiv_id = find_equiv_id(cpu); 134 /* We should bind the task to the CPU */
167 if (!equiv_id) 135 BUG_ON(cpu_num != cpu);
168 return NULL;
169 136
170 return cache_find_patch(equiv_id); 137 if (mc_amd == NULL)
171} 138 return 0;
172 139
173static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) 140 wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
174{ 141 /* get patch id after patching */
175 struct cpuinfo_x86 *c = &cpu_data(cpu); 142 rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
176 143
177 csig->sig = cpuid_eax(0x00000001); 144 /* check current patch id and patch's id for match */
178 csig->rev = c->microcode; 145 if (rev != mc_amd->hdr.patch_id) {
179 pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev); 146 pr_err("CPU%d: update failed for patch_level=0x%08x\n",
147 cpu, mc_amd->hdr.patch_id);
148 return -1;
149 }
150
151 pr_info("CPU%d: new patch_level=0x%08x\n", cpu, rev);
152 uci->cpu_sig.rev = rev;
180 153
181 return 0; 154 return 0;
182} 155}
183 156
184static unsigned int verify_patch_size(int cpu, u32 patch_size, 157static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size)
185 unsigned int size)
186{ 158{
187 struct cpuinfo_x86 *c = &cpu_data(cpu); 159 struct cpuinfo_x86 *c = &cpu_data(cpu);
188 u32 max_size; 160 u32 max_size, actual_size;
189 161
190#define F1XH_MPB_MAX_SIZE 2048 162#define F1XH_MPB_MAX_SIZE 2048
191#define F14H_MPB_MAX_SIZE 1824 163#define F14H_MPB_MAX_SIZE 1824
192#define F15H_MPB_MAX_SIZE 4096 164#define F15H_MPB_MAX_SIZE 4096
193#define F16H_MPB_MAX_SIZE 3458
194 165
195 switch (c->x86) { 166 switch (c->x86) {
196 case 0x14: 167 case 0x14:
@@ -199,64 +170,45 @@ static unsigned int verify_patch_size(int cpu, u32 patch_size,
199 case 0x15: 170 case 0x15:
200 max_size = F15H_MPB_MAX_SIZE; 171 max_size = F15H_MPB_MAX_SIZE;
201 break; 172 break;
202 case 0x16:
203 max_size = F16H_MPB_MAX_SIZE;
204 break;
205 default: 173 default:
206 max_size = F1XH_MPB_MAX_SIZE; 174 max_size = F1XH_MPB_MAX_SIZE;
207 break; 175 break;
208 } 176 }
209 177
210 if (patch_size > min_t(u32, size, max_size)) { 178 actual_size = *(u32 *)(buf + 4);
211 pr_err("patch size mismatch\n"); 179
180 if (actual_size + SECTION_HDR_SIZE > size || actual_size > max_size) {
181 pr_err("section size mismatch\n");
212 return 0; 182 return 0;
213 } 183 }
214 184
215 return patch_size; 185 return actual_size;
216} 186}
217 187
218static int apply_microcode_amd(int cpu) 188static struct microcode_header_amd *
189get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size)
219{ 190{
220 struct cpuinfo_x86 *c = &cpu_data(cpu); 191 struct microcode_header_amd *mc = NULL;
221 struct microcode_amd *mc_amd; 192 unsigned int actual_size = 0;
222 struct ucode_cpu_info *uci;
223 struct ucode_patch *p;
224 u32 rev, dummy;
225
226 BUG_ON(raw_smp_processor_id() != cpu);
227
228 uci = ucode_cpu_info + cpu;
229
230 p = find_patch(cpu);
231 if (!p)
232 return 0;
233
234 mc_amd = p->data;
235 uci->mc = p->data;
236
237 rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
238 193
239 /* need to apply patch? */ 194 if (*(u32 *)buf != UCODE_UCODE_TYPE) {
240 if (rev >= mc_amd->hdr.patch_id) { 195 pr_err("invalid type field in container file section header\n");
241 c->microcode = rev; 196 goto out;
242 return 0;
243 } 197 }
244 198
245 wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); 199 actual_size = verify_ucode_size(cpu, buf, size);
200 if (!actual_size)
201 goto out;
246 202
247 /* verify patch application was successful */ 203 mc = vzalloc(actual_size);
248 rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); 204 if (!mc)
249 if (rev != mc_amd->hdr.patch_id) { 205 goto out;
250 pr_err("CPU%d: update failed for patch_level=0x%08x\n",
251 cpu, mc_amd->hdr.patch_id);
252 return -1;
253 }
254 206
255 pr_info("CPU%d: new patch_level=0x%08x\n", cpu, rev); 207 get_ucode_data(mc, buf + SECTION_HDR_SIZE, actual_size);
256 uci->cpu_sig.rev = rev; 208 *mc_size = actual_size + SECTION_HDR_SIZE;
257 c->microcode = rev;
258 209
259 return 0; 210out:
211 return mc;
260} 212}
261 213
262static int install_equiv_cpu_table(const u8 *buf) 214static int install_equiv_cpu_table(const u8 *buf)
@@ -277,7 +229,7 @@ static int install_equiv_cpu_table(const u8 *buf)
277 return -ENOMEM; 229 return -ENOMEM;
278 } 230 }
279 231
280 memcpy(equiv_cpu_table, buf + CONTAINER_HDR_SZ, size); 232 get_ucode_data(equiv_cpu_table, buf + CONTAINER_HDR_SZ, size);
281 233
282 /* add header length */ 234 /* add header length */
283 return size + CONTAINER_HDR_SZ; 235 return size + CONTAINER_HDR_SZ;
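The family check in verify_and_add_patch() further down relies on how CPUID packs the family bits: base family in bits 8-11, extended family in bits 20-27, added together (the extended bits are only non-zero when the base family reads 0xf). A standalone worked example with a made-up family-0x15 style signature, not anything taken from this patch:

#include <stdint.h>
#include <stdio.h>

/* effective x86 family from a CPUID-1 EAX signature, as in verify_and_add_patch() */
static unsigned int x86_family(uint32_t sig)
{
	return ((sig >> 8) & 0xf) + ((sig >> 20) & 0xff);
}

int main(void)
{
	uint32_t sig = 0x00600f12;	/* hypothetical family-0x15 signature */

	printf("signature 0x%08x -> family 0x%x\n", sig, x86_family(sig));
	return 0;
}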
@@ -289,147 +241,71 @@ static void free_equiv_cpu_table(void)
289 equiv_cpu_table = NULL; 241 equiv_cpu_table = NULL;
290} 242}
291 243
292static void cleanup(void) 244static enum ucode_state
293{ 245generic_load_microcode(int cpu, const u8 *data, size_t size)
294 free_equiv_cpu_table();
295 free_cache();
296}
297
298/*
299 * We return the current size even if some of the checks failed so that
300 * we can skip over the next patch. If we return a negative value, we
301 * signal a grave error like a memory allocation has failed and the
302 * driver cannot continue functioning normally. In such cases, we tear
303 * down everything we've used up so far and exit.
304 */
305static int verify_and_add_patch(unsigned int cpu, u8 *fw, unsigned int leftover)
306{
307 struct cpuinfo_x86 *c = &cpu_data(cpu);
308 struct microcode_header_amd *mc_hdr;
309 struct ucode_patch *patch;
310 unsigned int patch_size, crnt_size, ret;
311 u32 proc_fam;
312 u16 proc_id;
313
314 patch_size = *(u32 *)(fw + 4);
315 crnt_size = patch_size + SECTION_HDR_SIZE;
316 mc_hdr = (struct microcode_header_amd *)(fw + SECTION_HDR_SIZE);
317 proc_id = mc_hdr->processor_rev_id;
318
319 proc_fam = find_cpu_family_by_equiv_cpu(proc_id);
320 if (!proc_fam) {
321 pr_err("No patch family for equiv ID: 0x%04x\n", proc_id);
322 return crnt_size;
323 }
324
325 /* check if patch is for the current family */
326 proc_fam = ((proc_fam >> 8) & 0xf) + ((proc_fam >> 20) & 0xff);
327 if (proc_fam != c->x86)
328 return crnt_size;
329
330 if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) {
331 pr_err("Patch-ID 0x%08x: chipset-specific code unsupported.\n",
332 mc_hdr->patch_id);
333 return crnt_size;
334 }
335
336 ret = verify_patch_size(cpu, patch_size, leftover);
337 if (!ret) {
338 pr_err("Patch-ID 0x%08x: size mismatch.\n", mc_hdr->patch_id);
339 return crnt_size;
340 }
341
342 patch = kzalloc(sizeof(*patch), GFP_KERNEL);
343 if (!patch) {
344 pr_err("Patch allocation failure.\n");
345 return -EINVAL;
346 }
347
348 patch->data = kzalloc(patch_size, GFP_KERNEL);
349 if (!patch->data) {
350 pr_err("Patch data allocation failure.\n");
351 kfree(patch);
352 return -EINVAL;
353 }
354
355 /* All looks ok, copy patch... */
356 memcpy(patch->data, fw + SECTION_HDR_SIZE, patch_size);
357 INIT_LIST_HEAD(&patch->plist);
358 patch->patch_id = mc_hdr->patch_id;
359 patch->equiv_cpu = proc_id;
360
361 /* ... and add to cache. */
362 update_cache(patch);
363
364 return crnt_size;
365}
366
367static enum ucode_state load_microcode_amd(int cpu, const u8 *data, size_t size)
368{ 246{
369 enum ucode_state ret = UCODE_ERROR; 247 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
370 unsigned int leftover; 248 struct microcode_header_amd *mc_hdr = NULL;
371 u8 *fw = (u8 *)data; 249 unsigned int mc_size, leftover;
372 int crnt_size = 0;
373 int offset; 250 int offset;
251 const u8 *ucode_ptr = data;
252 void *new_mc = NULL;
253 unsigned int new_rev = uci->cpu_sig.rev;
254 enum ucode_state state = UCODE_OK;
374 255
375 offset = install_equiv_cpu_table(data); 256 offset = install_equiv_cpu_table(ucode_ptr);
376 if (offset < 0) { 257 if (offset < 0) {
377 pr_err("failed to create equivalent cpu table\n"); 258 pr_err("failed to create equivalent cpu table\n");
378 return ret; 259 return UCODE_ERROR;
379 } 260 }
380 fw += offset; 261
262 ucode_ptr += offset;
381 leftover = size - offset; 263 leftover = size - offset;
382 264
383 if (*(u32 *)fw != UCODE_UCODE_TYPE) { 265 while (leftover) {
384 pr_err("invalid type field in container file section header\n"); 266 mc_hdr = get_next_ucode(cpu, ucode_ptr, leftover, &mc_size);
385 free_equiv_cpu_table(); 267 if (!mc_hdr)
386 return ret; 268 break;
269
270 if (get_matching_microcode(cpu, mc_hdr, new_rev)) {
271 vfree(new_mc);
272 new_rev = mc_hdr->patch_id;
273 new_mc = mc_hdr;
274 } else
275 vfree(mc_hdr);
276
277 ucode_ptr += mc_size;
278 leftover -= mc_size;
387 } 279 }
388 280
389 while (leftover) { 281 if (!new_mc) {
390 crnt_size = verify_and_add_patch(cpu, fw, leftover); 282 state = UCODE_NFOUND;
391 if (crnt_size < 0) 283 goto free_table;
392 return ret; 284 }
393 285
394 fw += crnt_size; 286 if (!leftover) {
395 leftover -= crnt_size; 287 vfree(uci->mc);
288 uci->mc = new_mc;
289 pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n",
290 cpu, uci->cpu_sig.rev, new_rev);
291 } else {
292 vfree(new_mc);
293 state = UCODE_ERROR;
396 } 294 }
397 295
398 return UCODE_OK; 296free_table:
297 free_equiv_cpu_table();
298
299 return state;
399} 300}
400 301
401/* 302static enum ucode_state request_microcode_amd(int cpu, struct device *device)
402 * AMD microcode firmware naming convention, up to family 15h they are in
403 * the legacy file:
404 *
405 * amd-ucode/microcode_amd.bin
406 *
407 * This legacy file is always smaller than 2K in size.
408 *
409 * Beginning with family 15h, they are in family-specific firmware files:
410 *
411 * amd-ucode/microcode_amd_fam15h.bin
412 * amd-ucode/microcode_amd_fam16h.bin
413 * ...
414 *
415 * These might be larger than 2K.
416 */
417static enum ucode_state request_microcode_amd(int cpu, struct device *device,
418 bool refresh_fw)
419{ 303{
420 char fw_name[36] = "amd-ucode/microcode_amd.bin"; 304 const char *fw_name = "amd-ucode/microcode_amd.bin";
421 struct cpuinfo_x86 *c = &cpu_data(cpu);
422 enum ucode_state ret = UCODE_NFOUND;
423 const struct firmware *fw; 305 const struct firmware *fw;
306 enum ucode_state ret = UCODE_NFOUND;
424 307
425 /* reload ucode container only on the boot cpu */ 308 if (request_firmware(&fw, fw_name, device)) {
426 if (!refresh_fw || c->cpu_index != boot_cpu_data.cpu_index)
427 return UCODE_OK;
428
429 if (c->x86 >= 0x15)
430 snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86);
431
432 if (request_firmware(&fw, (const char *)fw_name, device)) {
433 pr_err("failed to load file %s\n", fw_name); 309 pr_err("failed to load file %s\n", fw_name);
434 goto out; 310 goto out;
435 } 311 }
@@ -440,23 +316,19 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device,
440 goto fw_release; 316 goto fw_release;
441 } 317 }
442 318
443 /* free old equiv table */ 319 ret = generic_load_microcode(cpu, fw->data, fw->size);
444 free_equiv_cpu_table();
445
446 ret = load_microcode_amd(cpu, fw->data, fw->size);
447 if (ret != UCODE_OK)
448 cleanup();
449 320
450 fw_release: 321fw_release:
451 release_firmware(fw); 322 release_firmware(fw);
452 323
453 out: 324out:
454 return ret; 325 return ret;
455} 326}
456 327
457static enum ucode_state 328static enum ucode_state
458request_microcode_user(int cpu, const void __user *buf, size_t size) 329request_microcode_user(int cpu, const void __user *buf, size_t size)
459{ 330{
331 pr_info("AMD microcode update via /dev/cpu/microcode not supported\n");
460 return UCODE_ERROR; 332 return UCODE_ERROR;
461} 333}
462 334
@@ -464,6 +336,7 @@ static void microcode_fini_cpu_amd(int cpu)
464{ 336{
465 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 337 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
466 338
339 vfree(uci->mc);
467 uci->mc = NULL; 340 uci->mc = NULL;
468} 341}
469 342
@@ -477,17 +350,5 @@ static struct microcode_ops microcode_amd_ops = {
477 350
478struct microcode_ops * __init init_amd_microcode(void) 351struct microcode_ops * __init init_amd_microcode(void)
479{ 352{
480 struct cpuinfo_x86 *c = &cpu_data(0);
481
482 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
483 pr_warning("AMD CPU family 0x%x not supported\n", c->x86);
484 return NULL;
485 }
486
487 return &microcode_amd_ops; 353 return &microcode_amd_ops;
488} 354}
489
490void __exit exit_amd_microcode(void)
491{
492 cleanup();
493}
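For orientation, the container layout that load_microcode_amd()/generic_load_microcode() above walk is: a small container header carrying the equivalence table, then a series of sections that each begin with a 4-byte type and a 4-byte size. The constants below (CONTAINER_HDR_SZ = 12, SECTION_HDR_SIZE = 8, UCODE_UCODE_TYPE = 1, equivalence-table size at byte offset 8) mirror what the file uses but are restated here as assumptions; this is a minimal userspace sketch, not kernel code:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define CONTAINER_HDR_SZ	12	/* magic + equiv-table type + equiv-table size */
#define SECTION_HDR_SIZE	8	/* section type + section size */
#define UCODE_UCODE_TYPE	1

static void walk_container(const uint8_t *buf, size_t size)
{
	uint32_t equiv_tbl_size;
	size_t off;

	if (size < CONTAINER_HDR_SZ)
		return;

	/* in this sketch, bytes 8..11 of the header hold the equivalence-table size */
	memcpy(&equiv_tbl_size, buf + 8, sizeof(equiv_tbl_size));
	off = CONTAINER_HDR_SZ + equiv_tbl_size;

	while (off + SECTION_HDR_SIZE <= size) {
		uint32_t type, patch_size;

		memcpy(&type, buf + off, sizeof(type));
		memcpy(&patch_size, buf + off + 4, sizeof(patch_size));
		if (type != UCODE_UCODE_TYPE)
			break;			/* unknown section, stop walking */

		printf("patch section at offset %zu, %u bytes\n",
		       off, (unsigned int)patch_size);
		off += SECTION_HDR_SIZE + patch_size;	/* step to the next section */
	}
}

int main(void)
{
	/* a container with an empty equivalence table and no patch sections */
	uint8_t buf[CONTAINER_HDR_SZ] = { 'A', 'M', 'D', 0 };

	walk_container(buf, sizeof(buf));
	return 0;
}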
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 3a04b224d0c..f9242800bc8 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -86,8 +86,6 @@
86 86
87#include <asm/microcode.h> 87#include <asm/microcode.h>
88#include <asm/processor.h> 88#include <asm/processor.h>
89#include <asm/cpu_device_id.h>
90#include <asm/perf_event.h>
91 89
92MODULE_DESCRIPTION("Microcode Update Driver"); 90MODULE_DESCRIPTION("Microcode Update Driver");
93MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); 91MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
@@ -225,9 +223,6 @@ static ssize_t microcode_write(struct file *file, const char __user *buf,
225 if (do_microcode_update(buf, len) == 0) 223 if (do_microcode_update(buf, len) == 0)
226 ret = (ssize_t)len; 224 ret = (ssize_t)len;
227 225
228 if (ret > 0)
229 perf_check_microcode();
230
231 mutex_unlock(&microcode_mutex); 226 mutex_unlock(&microcode_mutex);
232 put_online_cpus(); 227 put_online_cpus();
233 228
@@ -261,7 +256,7 @@ static int __init microcode_dev_init(void)
261 return 0; 256 return 0;
262} 257}
263 258
264static void __exit microcode_dev_exit(void) 259static void microcode_dev_exit(void)
265{ 260{
266 misc_deregister(&microcode_dev); 261 misc_deregister(&microcode_dev);
267} 262}
@@ -279,51 +274,43 @@ static struct platform_device *microcode_pdev;
279static int reload_for_cpu(int cpu) 274static int reload_for_cpu(int cpu)
280{ 275{
281 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 276 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
282 enum ucode_state ustate;
283 int err = 0; 277 int err = 0;
284 278
285 if (!uci->valid) 279 mutex_lock(&microcode_mutex);
286 return err; 280 if (uci->valid) {
281 enum ucode_state ustate;
282
283 ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
284 if (ustate == UCODE_OK)
285 apply_microcode_on_target(cpu);
286 else
287 if (ustate == UCODE_ERROR)
288 err = -EINVAL;
289 }
290 mutex_unlock(&microcode_mutex);
287 291
288 ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev, true);
289 if (ustate == UCODE_OK)
290 apply_microcode_on_target(cpu);
291 else
292 if (ustate == UCODE_ERROR)
293 err = -EINVAL;
294 return err; 292 return err;
295} 293}
296 294
297static ssize_t reload_store(struct device *dev, 295static ssize_t reload_store(struct sys_device *dev,
298 struct device_attribute *attr, 296 struct sysdev_attribute *attr,
299 const char *buf, size_t size) 297 const char *buf, size_t size)
300{ 298{
301 unsigned long val; 299 unsigned long val;
302 int cpu; 300 int cpu = dev->id;
303 ssize_t ret = 0, tmp_ret; 301 int ret = 0;
304 302 char *end;
305 ret = kstrtoul(buf, 0, &val);
306 if (ret)
307 return ret;
308
309 if (val != 1)
310 return size;
311 303
312 get_online_cpus(); 304 val = simple_strtoul(buf, &end, 0);
313 mutex_lock(&microcode_mutex); 305 if (end == buf)
314 for_each_online_cpu(cpu) { 306 return -EINVAL;
315 tmp_ret = reload_for_cpu(cpu);
316 if (tmp_ret != 0)
317 pr_warn("Error reloading microcode on CPU %d\n", cpu);
318 307
319 /* save retval of the first encountered reload error */ 308 if (val == 1) {
320 if (!ret) 309 get_online_cpus();
321 ret = tmp_ret; 310 if (cpu_online(cpu))
311 ret = reload_for_cpu(cpu);
312 put_online_cpus();
322 } 313 }
323 if (!ret)
324 perf_check_microcode();
325 mutex_unlock(&microcode_mutex);
326 put_online_cpus();
327 314
328 if (!ret) 315 if (!ret)
329 ret = size; 316 ret = size;
@@ -331,29 +318,30 @@ static ssize_t reload_store(struct device *dev,
331 return ret; 318 return ret;
332} 319}
333 320
334static ssize_t version_show(struct device *dev, 321static ssize_t version_show(struct sys_device *dev,
335 struct device_attribute *attr, char *buf) 322 struct sysdev_attribute *attr, char *buf)
336{ 323{
337 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; 324 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
338 325
339 return sprintf(buf, "0x%x\n", uci->cpu_sig.rev); 326 return sprintf(buf, "0x%x\n", uci->cpu_sig.rev);
340} 327}
341 328
342static ssize_t pf_show(struct device *dev, 329static ssize_t pf_show(struct sys_device *dev,
343 struct device_attribute *attr, char *buf) 330 struct sysdev_attribute *attr, char *buf)
344{ 331{
345 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; 332 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
346 333
347 return sprintf(buf, "0x%x\n", uci->cpu_sig.pf); 334 return sprintf(buf, "0x%x\n", uci->cpu_sig.pf);
348} 335}
349 336
350static DEVICE_ATTR(reload, 0200, NULL, reload_store); 337static SYSDEV_ATTR(reload, 0200, NULL, reload_store);
351static DEVICE_ATTR(version, 0400, version_show, NULL); 338static SYSDEV_ATTR(version, 0400, version_show, NULL);
352static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL); 339static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL);
353 340
354static struct attribute *mc_default_attrs[] = { 341static struct attribute *mc_default_attrs[] = {
355 &dev_attr_version.attr, 342 &attr_reload.attr,
356 &dev_attr_processor_flags.attr, 343 &attr_version.attr,
344 &attr_processor_flags.attr,
357 NULL 345 NULL
358}; 346};
359 347
@@ -372,15 +360,18 @@ static void microcode_fini_cpu(int cpu)
372 360
373static enum ucode_state microcode_resume_cpu(int cpu) 361static enum ucode_state microcode_resume_cpu(int cpu)
374{ 362{
375 pr_debug("CPU%d updated upon resume\n", cpu); 363 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
376 364
377 if (apply_microcode_on_target(cpu)) 365 if (!uci->mc)
378 return UCODE_ERROR; 366 return UCODE_NFOUND;
367
368 pr_debug("CPU%d updated upon resume\n", cpu);
369 apply_microcode_on_target(cpu);
379 370
380 return UCODE_OK; 371 return UCODE_OK;
381} 372}
382 373
383static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw) 374static enum ucode_state microcode_init_cpu(int cpu)
384{ 375{
385 enum ucode_state ustate; 376 enum ucode_state ustate;
386 377
@@ -391,8 +382,7 @@ static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw)
391 if (system_state != SYSTEM_RUNNING) 382 if (system_state != SYSTEM_RUNNING)
392 return UCODE_NFOUND; 383 return UCODE_NFOUND;
393 384
394 ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev, 385 ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
395 refresh_fw);
396 386
397 if (ustate == UCODE_OK) { 387 if (ustate == UCODE_OK) {
398 pr_debug("CPU%d updated upon init\n", cpu); 388 pr_debug("CPU%d updated upon init\n", cpu);
@@ -405,50 +395,53 @@ static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw)
405static enum ucode_state microcode_update_cpu(int cpu) 395static enum ucode_state microcode_update_cpu(int cpu)
406{ 396{
407 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 397 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
398 enum ucode_state ustate;
408 399
409 if (uci->valid) 400 if (uci->valid)
410 return microcode_resume_cpu(cpu); 401 ustate = microcode_resume_cpu(cpu);
402 else
403 ustate = microcode_init_cpu(cpu);
411 404
412 return microcode_init_cpu(cpu, false); 405 return ustate;
413} 406}
414 407
415static int mc_device_add(struct device *dev, struct subsys_interface *sif) 408static int mc_sysdev_add(struct sys_device *sys_dev)
416{ 409{
417 int err, cpu = dev->id; 410 int err, cpu = sys_dev->id;
418 411
419 if (!cpu_online(cpu)) 412 if (!cpu_online(cpu))
420 return 0; 413 return 0;
421 414
422 pr_debug("CPU%d added\n", cpu); 415 pr_debug("CPU%d added\n", cpu);
423 416
424 err = sysfs_create_group(&dev->kobj, &mc_attr_group); 417 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
425 if (err) 418 if (err)
426 return err; 419 return err;
427 420
428 if (microcode_init_cpu(cpu, true) == UCODE_ERROR) 421 if (microcode_init_cpu(cpu) == UCODE_ERROR) {
422 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
429 return -EINVAL; 423 return -EINVAL;
424 }
430 425
431 return err; 426 return err;
432} 427}
433 428
434static int mc_device_remove(struct device *dev, struct subsys_interface *sif) 429static int mc_sysdev_remove(struct sys_device *sys_dev)
435{ 430{
436 int cpu = dev->id; 431 int cpu = sys_dev->id;
437 432
438 if (!cpu_online(cpu)) 433 if (!cpu_online(cpu))
439 return 0; 434 return 0;
440 435
441 pr_debug("CPU%d removed\n", cpu); 436 pr_debug("CPU%d removed\n", cpu);
442 microcode_fini_cpu(cpu); 437 microcode_fini_cpu(cpu);
443 sysfs_remove_group(&dev->kobj, &mc_attr_group); 438 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
444 return 0; 439 return 0;
445} 440}
446 441
447static struct subsys_interface mc_cpu_interface = { 442static struct sysdev_driver mc_sysdev_driver = {
448 .name = "microcode", 443 .add = mc_sysdev_add,
449 .subsys = &cpu_subsys, 444 .remove = mc_sysdev_remove,
450 .add_dev = mc_device_add,
451 .remove_dev = mc_device_remove,
452}; 445};
453 446
454/** 447/**
@@ -471,44 +464,31 @@ static __cpuinit int
471mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) 464mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
472{ 465{
473 unsigned int cpu = (unsigned long)hcpu; 466 unsigned int cpu = (unsigned long)hcpu;
474 struct device *dev; 467 struct sys_device *sys_dev;
475 468
476 dev = get_cpu_device(cpu); 469 sys_dev = get_cpu_sysdev(cpu);
477 470 switch (action) {
478 switch (action & ~CPU_TASKS_FROZEN) {
479 case CPU_ONLINE: 471 case CPU_ONLINE:
472 case CPU_ONLINE_FROZEN:
480 microcode_update_cpu(cpu); 473 microcode_update_cpu(cpu);
481 pr_debug("CPU%d added\n", cpu);
482 /*
483 * "break" is missing on purpose here because we want to fall
484 * through in order to create the sysfs group.
485 */
486
487 case CPU_DOWN_FAILED: 474 case CPU_DOWN_FAILED:
488 if (sysfs_create_group(&dev->kobj, &mc_attr_group)) 475 case CPU_DOWN_FAILED_FROZEN:
476 pr_debug("CPU%d added\n", cpu);
477 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
489 pr_err("Failed to create group for CPU%d\n", cpu); 478 pr_err("Failed to create group for CPU%d\n", cpu);
490 break; 479 break;
491
492 case CPU_DOWN_PREPARE: 480 case CPU_DOWN_PREPARE:
481 case CPU_DOWN_PREPARE_FROZEN:
493 /* Suspend is in progress, only remove the interface */ 482 /* Suspend is in progress, only remove the interface */
494 sysfs_remove_group(&dev->kobj, &mc_attr_group); 483 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
495 pr_debug("CPU%d removed\n", cpu); 484 pr_debug("CPU%d removed\n", cpu);
496 break; 485 break;
497 486 case CPU_DEAD:
498 /* 487 case CPU_UP_CANCELED_FROZEN:
499 * case CPU_DEAD: 488 /* The CPU refused to come up during a system resume */
500 *
501 * When a CPU goes offline, don't free up or invalidate the copy of
502 * the microcode in kernel memory, so that we can reuse it when the
503 * CPU comes back online without unnecessarily requesting the userspace
504 * for it again.
505 */
506 }
507
508 /* The CPU refused to come up during a system resume */
509 if (action == CPU_UP_CANCELED_FROZEN)
510 microcode_fini_cpu(cpu); 489 microcode_fini_cpu(cpu);
511 490 break;
491 }
512 return NOTIFY_OK; 492 return NOTIFY_OK;
513} 493}
514 494
@@ -516,30 +496,6 @@ static struct notifier_block __refdata mc_cpu_notifier = {
516 .notifier_call = mc_cpu_callback, 496 .notifier_call = mc_cpu_callback,
517}; 497};
518 498
519#ifdef MODULE
520/* Autoload on Intel and AMD systems */
521static const struct x86_cpu_id __initconst microcode_id[] = {
522#ifdef CONFIG_MICROCODE_INTEL
523 { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, },
524#endif
525#ifdef CONFIG_MICROCODE_AMD
526 { X86_VENDOR_AMD, X86_FAMILY_ANY, X86_MODEL_ANY, },
527#endif
528 {}
529};
530MODULE_DEVICE_TABLE(x86cpu, microcode_id);
531#endif
532
533static struct attribute *cpu_root_microcode_attrs[] = {
534 &dev_attr_reload.attr,
535 NULL
536};
537
538static struct attribute_group cpu_root_microcode_group = {
539 .name = "microcode",
540 .attrs = cpu_root_microcode_attrs,
541};
542
543static int __init microcode_init(void) 499static int __init microcode_init(void)
544{ 500{
545 struct cpuinfo_x86 *c = &cpu_data(0); 501 struct cpuinfo_x86 *c = &cpu_data(0);
@@ -549,40 +505,35 @@ static int __init microcode_init(void)
549 microcode_ops = init_intel_microcode(); 505 microcode_ops = init_intel_microcode();
550 else if (c->x86_vendor == X86_VENDOR_AMD) 506 else if (c->x86_vendor == X86_VENDOR_AMD)
551 microcode_ops = init_amd_microcode(); 507 microcode_ops = init_amd_microcode();
552 else
553 pr_err("no support for this CPU vendor\n");
554 508
555 if (!microcode_ops) 509 if (!microcode_ops) {
510 pr_err("no support for this CPU vendor\n");
556 return -ENODEV; 511 return -ENODEV;
512 }
557 513
558 microcode_pdev = platform_device_register_simple("microcode", -1, 514 microcode_pdev = platform_device_register_simple("microcode", -1,
559 NULL, 0); 515 NULL, 0);
560 if (IS_ERR(microcode_pdev)) 516 if (IS_ERR(microcode_pdev)) {
517 microcode_dev_exit();
561 return PTR_ERR(microcode_pdev); 518 return PTR_ERR(microcode_pdev);
519 }
562 520
563 get_online_cpus(); 521 get_online_cpus();
564 mutex_lock(&microcode_mutex); 522 mutex_lock(&microcode_mutex);
565 523
566 error = subsys_interface_register(&mc_cpu_interface); 524 error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
567 if (!error) 525
568 perf_check_microcode();
569 mutex_unlock(&microcode_mutex); 526 mutex_unlock(&microcode_mutex);
570 put_online_cpus(); 527 put_online_cpus();
571 528
572 if (error)
573 goto out_pdev;
574
575 error = sysfs_create_group(&cpu_subsys.dev_root->kobj,
576 &cpu_root_microcode_group);
577
578 if (error) { 529 if (error) {
579 pr_err("Error creating microcode group!\n"); 530 platform_device_unregister(microcode_pdev);
580 goto out_driver; 531 return error;
581 } 532 }
582 533
583 error = microcode_dev_init(); 534 error = microcode_dev_init();
584 if (error) 535 if (error)
585 goto out_ucode_group; 536 return error;
586 537
587 register_syscore_ops(&mc_syscore_ops); 538 register_syscore_ops(&mc_syscore_ops);
588 register_hotcpu_notifier(&mc_cpu_notifier); 539 register_hotcpu_notifier(&mc_cpu_notifier);
@@ -591,43 +542,20 @@ static int __init microcode_init(void)
591 " <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n"); 542 " <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n");
592 543
593 return 0; 544 return 0;
594
595 out_ucode_group:
596 sysfs_remove_group(&cpu_subsys.dev_root->kobj,
597 &cpu_root_microcode_group);
598
599 out_driver:
600 get_online_cpus();
601 mutex_lock(&microcode_mutex);
602
603 subsys_interface_unregister(&mc_cpu_interface);
604
605 mutex_unlock(&microcode_mutex);
606 put_online_cpus();
607
608 out_pdev:
609 platform_device_unregister(microcode_pdev);
610 return error;
611
612} 545}
613module_init(microcode_init); 546module_init(microcode_init);
614 547
615static void __exit microcode_exit(void) 548static void __exit microcode_exit(void)
616{ 549{
617 struct cpuinfo_x86 *c = &cpu_data(0);
618
619 microcode_dev_exit(); 550 microcode_dev_exit();
620 551
621 unregister_hotcpu_notifier(&mc_cpu_notifier); 552 unregister_hotcpu_notifier(&mc_cpu_notifier);
622 unregister_syscore_ops(&mc_syscore_ops); 553 unregister_syscore_ops(&mc_syscore_ops);
623 554
624 sysfs_remove_group(&cpu_subsys.dev_root->kobj,
625 &cpu_root_microcode_group);
626
627 get_online_cpus(); 555 get_online_cpus();
628 mutex_lock(&microcode_mutex); 556 mutex_lock(&microcode_mutex);
629 557
630 subsys_interface_unregister(&mc_cpu_interface); 558 sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
631 559
632 mutex_unlock(&microcode_mutex); 560 mutex_unlock(&microcode_mutex);
633 put_online_cpus(); 561 put_online_cpus();
@@ -636,9 +564,6 @@ static void __exit microcode_exit(void)
636 564
637 microcode_ops = NULL; 565 microcode_ops = NULL;
638 566
639 if (c->x86_vendor == X86_VENDOR_AMD)
640 exit_amd_microcode();
641
642 pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); 567 pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
643} 568}
644module_exit(microcode_exit); 569module_exit(microcode_exit);
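The two versions of reload_store() above differ mainly in how they parse the value written to the sysfs reload file (kstrtoul() versus simple_strtoul() with an end-pointer check) and in whether one CPU or every online CPU is reloaded. A userspace stand-in for just the parsing step, using strtoul() in place of the kernel helpers:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* return 0 and set *val on success, -EINVAL on malformed input */
static int parse_reload_value(const char *buf, unsigned long *val)
{
	char *end;

	errno = 0;
	*val = strtoul(buf, &end, 0);
	if (end == buf || errno)
		return -EINVAL;
	return 0;
}

int main(void)
{
	unsigned long val;

	/* sysfs writes arrive with a trailing newline, which strtoul ignores */
	if (parse_reload_value("1\n", &val) == 0 && val == 1)
		printf("would trigger a microcode reload\n");
	return 0;
}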
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 3544aed3933..1a1b606d3e9 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -147,6 +147,12 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
147 147
148 memset(csig, 0, sizeof(*csig)); 148 memset(csig, 0, sizeof(*csig));
149 149
150 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
151 cpu_has(c, X86_FEATURE_IA64)) {
152 pr_err("CPU%d not a capable Intel processor\n", cpu_num);
153 return -1;
154 }
155
150 csig->sig = cpuid_eax(0x00000001); 156 csig->sig = cpuid_eax(0x00000001);
151 157
152 if ((c->x86_model >= 5) || (c->x86 > 6)) { 158 if ((c->x86_model >= 5) || (c->x86 > 6)) {
@@ -155,7 +161,12 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
155 csig->pf = 1 << ((val[1] >> 18) & 7); 161 csig->pf = 1 << ((val[1] >> 18) & 7);
156 } 162 }
157 163
158 csig->rev = c->microcode; 164 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
165 /* see notes above for revision 1.07. Apparent chip bug */
166 sync_core();
167 /* get the current revision from MSR 0x8B */
168 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev);
169
159 pr_info("CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n", 170 pr_info("CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n",
160 cpu_num, csig->sig, csig->pf, csig->rev); 171 cpu_num, csig->sig, csig->pf, csig->rev);
161 172
@@ -288,9 +299,9 @@ static int apply_microcode(int cpu)
288 struct microcode_intel *mc_intel; 299 struct microcode_intel *mc_intel;
289 struct ucode_cpu_info *uci; 300 struct ucode_cpu_info *uci;
290 unsigned int val[2]; 301 unsigned int val[2];
291 int cpu_num = raw_smp_processor_id(); 302 int cpu_num;
292 struct cpuinfo_x86 *c = &cpu_data(cpu_num);
293 303
304 cpu_num = raw_smp_processor_id();
294 uci = ucode_cpu_info + cpu; 305 uci = ucode_cpu_info + cpu;
295 mc_intel = uci->mc; 306 mc_intel = uci->mc;
296 307
@@ -306,7 +317,7 @@ static int apply_microcode(int cpu)
306 (unsigned long) mc_intel->bits >> 16 >> 16); 317 (unsigned long) mc_intel->bits >> 16 >> 16);
307 wrmsr(MSR_IA32_UCODE_REV, 0, 0); 318 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
308 319
309 /* As documented in the SDM: Do a CPUID 1 here */ 320 /* see notes above for revision 1.07. Apparent chip bug */
310 sync_core(); 321 sync_core();
311 322
312 /* get the current revision from MSR 0x8B */ 323 /* get the current revision from MSR 0x8B */
@@ -324,7 +335,6 @@ static int apply_microcode(int cpu)
324 (mc_intel->hdr.date >> 16) & 0xff); 335 (mc_intel->hdr.date >> 16) & 0xff);
325 336
326 uci->cpu_sig.rev = val[1]; 337 uci->cpu_sig.rev = val[1];
327 c->microcode = val[1];
328 338
329 return 0; 339 return 0;
330} 340}
@@ -405,8 +415,7 @@ static int get_ucode_fw(void *to, const void *from, size_t n)
405 return 0; 415 return 0;
406} 416}
407 417
408static enum ucode_state request_microcode_fw(int cpu, struct device *device, 418static enum ucode_state request_microcode_fw(int cpu, struct device *device)
409 bool refresh_fw)
410{ 419{
411 char name[30]; 420 char name[30];
412 struct cpuinfo_x86 *c = &cpu_data(cpu); 421 struct cpuinfo_x86 *c = &cpu_data(cpu);
@@ -458,14 +467,6 @@ static struct microcode_ops microcode_intel_ops = {
458 467
459struct microcode_ops * __init init_intel_microcode(void) 468struct microcode_ops * __init init_intel_microcode(void)
460{ 469{
461 struct cpuinfo_x86 *c = &cpu_data(0);
462
463 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
464 cpu_has(c, X86_FEATURE_IA64)) {
465 pr_err("Intel CPU family 0x%x not supported\n", c->x86);
466 return NULL;
467 }
468
469 return &microcode_intel_ops; 470 return &microcode_intel_ops;
470} 471}
471 472
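The revision read done in collect_cpu_info()/apply_microcode() above (write 0 to MSR 0x8B, serialize, read it back, take EDX) can only happen in the kernel, but the resulting revision is visible from userspace through the msr character device touched later in this patch. A small reader, assuming the usual /dev/cpu/<N>/msr layout with the file offset selecting the MSR and the revision in the high 32 bits:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define MSR_IA32_UCODE_REV	0x8b

int main(void)
{
	uint64_t val;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);

	if (fd < 0) {
		perror("open /dev/cpu/0/msr");
		return 1;
	}
	/* the msr device uses the read offset as the MSR index */
	if (pread(fd, &val, sizeof(val), MSR_IA32_UCODE_REV) != (ssize_t)sizeof(val)) {
		perror("pread");
		close(fd);
		return 1;
	}
	/* the microcode revision is reported in EDX, the high half of the MSR */
	printf("CPU0 microcode revision: 0x%x\n", (unsigned int)(val >> 32));
	close(fd);
	return 0;
}

Build with a plain cc invocation and run as root with the msr module loaded; it reports whatever revision the last kernel-side read sequence left in the MSR.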
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 216a4d754b0..925179f871d 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -15,9 +15,6 @@
15 along with this program; if not, write to the Free Software 15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/ 17*/
18
19#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20
21#include <linux/moduleloader.h> 18#include <linux/moduleloader.h>
22#include <linux/elf.h> 19#include <linux/elf.h>
23#include <linux/vmalloc.h> 20#include <linux/vmalloc.h>
@@ -29,18 +26,14 @@
29#include <linux/gfp.h> 26#include <linux/gfp.h>
30#include <linux/jump_label.h> 27#include <linux/jump_label.h>
31 28
29#include <asm/system.h>
32#include <asm/page.h> 30#include <asm/page.h>
33#include <asm/pgtable.h> 31#include <asm/pgtable.h>
34 32
35#if 0 33#if 0
36#define DEBUGP(fmt, ...) \ 34#define DEBUGP printk
37 printk(KERN_DEBUG fmt, ##__VA_ARGS__)
38#else 35#else
39#define DEBUGP(fmt, ...) \ 36#define DEBUGP(fmt...)
40do { \
41 if (0) \
42 printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
43} while (0)
44#endif 37#endif
45 38
46void *module_alloc(unsigned long size) 39void *module_alloc(unsigned long size)
@@ -64,8 +57,8 @@ int apply_relocate(Elf32_Shdr *sechdrs,
64 Elf32_Sym *sym; 57 Elf32_Sym *sym;
65 uint32_t *location; 58 uint32_t *location;
66 59
67 DEBUGP("Applying relocate section %u to %u\n", 60 DEBUGP("Applying relocate section %u to %u\n", relsec,
68 relsec, sechdrs[relsec].sh_info); 61 sechdrs[relsec].sh_info);
69 for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { 62 for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
70 /* This is where to make the change */ 63 /* This is where to make the change */
71 location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr 64 location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
@@ -81,11 +74,11 @@ int apply_relocate(Elf32_Shdr *sechdrs,
81 *location += sym->st_value; 74 *location += sym->st_value;
82 break; 75 break;
83 case R_386_PC32: 76 case R_386_PC32:
84 /* Add the value, subtract its position */ 77 /* Add the value, subtract its postition */
85 *location += sym->st_value - (uint32_t)location; 78 *location += sym->st_value - (uint32_t)location;
86 break; 79 break;
87 default: 80 default:
88 pr_err("%s: Unknown relocation: %u\n", 81 printk(KERN_ERR "module %s: Unknown relocation: %u\n",
89 me->name, ELF32_R_TYPE(rel[i].r_info)); 82 me->name, ELF32_R_TYPE(rel[i].r_info));
90 return -ENOEXEC; 83 return -ENOEXEC;
91 } 84 }
@@ -105,8 +98,8 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
105 void *loc; 98 void *loc;
106 u64 val; 99 u64 val;
107 100
108 DEBUGP("Applying relocate section %u to %u\n", 101 DEBUGP("Applying relocate section %u to %u\n", relsec,
109 relsec, sechdrs[relsec].sh_info); 102 sechdrs[relsec].sh_info);
110 for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { 103 for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
111 /* This is where to make the change */ 104 /* This is where to make the change */
112 loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr 105 loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr
@@ -118,8 +111,8 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
118 + ELF64_R_SYM(rel[i].r_info); 111 + ELF64_R_SYM(rel[i].r_info);
119 112
120 DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n", 113 DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n",
121 (int)ELF64_R_TYPE(rel[i].r_info), 114 (int)ELF64_R_TYPE(rel[i].r_info),
122 sym->st_value, rel[i].r_addend, (u64)loc); 115 sym->st_value, rel[i].r_addend, (u64)loc);
123 116
124 val = sym->st_value + rel[i].r_addend; 117 val = sym->st_value + rel[i].r_addend;
125 118
@@ -148,7 +141,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
148#endif 141#endif
149 break; 142 break;
150 default: 143 default:
151 pr_err("%s: Unknown rela relocation: %llu\n", 144 printk(KERN_ERR "module %s: Unknown rela relocation: %llu\n",
152 me->name, ELF64_R_TYPE(rel[i].r_info)); 145 me->name, ELF64_R_TYPE(rel[i].r_info));
153 return -ENOEXEC; 146 return -ENOEXEC;
154 } 147 }
@@ -156,9 +149,9 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
156 return 0; 149 return 0;
157 150
158overflow: 151overflow:
159 pr_err("overflow in relocation type %d val %Lx\n", 152 printk(KERN_ERR "overflow in relocation type %d val %Lx\n",
160 (int)ELF64_R_TYPE(rel[i].r_info), val); 153 (int)ELF64_R_TYPE(rel[i].r_info), val);
161 pr_err("`%s' likely not compiled with -mcmodel=kernel\n", 154 printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n",
162 me->name); 155 me->name);
163 return -ENOEXEC; 156 return -ENOEXEC;
164} 157}
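The two 32-bit relocation cases kept in apply_relocate() come down to simple arithmetic: R_386_32 stores S + A and R_386_PC32 stores S + A - P, where the addend A is whatever already sits at the fixup location and P is the location's own address. A toy calculation with made-up numbers:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t location_addr = 0x1000;	/* P: address being patched        */
	uint32_t sym_value     = 0x4000;	/* S: resolved symbol address      */
	uint32_t addend        = 0x10;		/* A: implicit addend at *location */

	uint32_t r_386_32   = sym_value + addend;			/* absolute    */
	uint32_t r_386_pc32 = sym_value + addend - location_addr;	/* PC-relative */

	printf("R_386_32   -> 0x%08x\n", (unsigned int)r_386_32);
	printf("R_386_PC32 -> 0x%08x\n", (unsigned int)r_386_pc32);
	return 0;
}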
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index d2b56489d70..0741b062a30 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -27,6 +27,7 @@
27#include <asm/proto.h> 27#include <asm/proto.h>
28#include <asm/bios_ebda.h> 28#include <asm/bios_ebda.h>
29#include <asm/e820.h> 29#include <asm/e820.h>
30#include <asm/trampoline.h>
30#include <asm/setup.h> 31#include <asm/setup.h>
31#include <asm/smp.h> 32#include <asm/smp.h>
32 33
@@ -96,7 +97,7 @@ static void __init MP_bus_info(struct mpc_bus *m)
96 97
97 set_bit(m->busid, mp_bus_not_pci); 98 set_bit(m->busid, mp_bus_not_pci);
98 if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) { 99 if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) {
99#ifdef CONFIG_EISA 100#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
100 mp_bus_id_to_type[m->busid] = MP_BUS_ISA; 101 mp_bus_id_to_type[m->busid] = MP_BUS_ISA;
101#endif 102#endif
102 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { 103 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
@@ -104,10 +105,12 @@ static void __init MP_bus_info(struct mpc_bus *m)
104 x86_init.mpparse.mpc_oem_pci_bus(m); 105 x86_init.mpparse.mpc_oem_pci_bus(m);
105 106
106 clear_bit(m->busid, mp_bus_not_pci); 107 clear_bit(m->busid, mp_bus_not_pci);
107#ifdef CONFIG_EISA 108#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
108 mp_bus_id_to_type[m->busid] = MP_BUS_PCI; 109 mp_bus_id_to_type[m->busid] = MP_BUS_PCI;
109 } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) { 110 } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
110 mp_bus_id_to_type[m->busid] = MP_BUS_EISA; 111 mp_bus_id_to_type[m->busid] = MP_BUS_EISA;
112 } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA) - 1) == 0) {
113 mp_bus_id_to_type[m->busid] = MP_BUS_MCA;
111#endif 114#endif
112 } else 115 } else
113 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); 116 printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
@@ -365,6 +368,9 @@ static void __init construct_ioapic_table(int mpc_default_type)
365 case 3: 368 case 3:
366 memcpy(bus.bustype, "EISA ", 6); 369 memcpy(bus.bustype, "EISA ", 6);
367 break; 370 break;
371 case 4:
372 case 7:
373 memcpy(bus.bustype, "MCA ", 6);
368 } 374 }
369 MP_bus_info(&bus); 375 MP_bus_info(&bus);
370 if (mpc_default_type > 4) { 376 if (mpc_default_type > 4) {
@@ -558,7 +564,9 @@ void __init default_get_smp_config(unsigned int early)
558 564
559static void __init smp_reserve_memory(struct mpf_intel *mpf) 565static void __init smp_reserve_memory(struct mpf_intel *mpf)
560{ 566{
561 memblock_reserve(mpf->physptr, get_mpc_size(mpf->physptr)); 567 unsigned long size = get_mpc_size(mpf->physptr);
568
569 memblock_x86_reserve_range(mpf->physptr, mpf->physptr+size, "* MP-table mpc");
562} 570}
563 571
564static int __init smp_scan_config(unsigned long base, unsigned long length) 572static int __init smp_scan_config(unsigned long base, unsigned long length)
@@ -567,8 +575,8 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
567 struct mpf_intel *mpf; 575 struct mpf_intel *mpf;
568 unsigned long mem; 576 unsigned long mem;
569 577
570 apic_printk(APIC_VERBOSE, "Scan for SMP in [mem %#010lx-%#010lx]\n", 578 apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
571 base, base + length - 1); 579 bp, length);
572 BUILD_BUG_ON(sizeof(*mpf) != 16); 580 BUILD_BUG_ON(sizeof(*mpf) != 16);
573 581
574 while (length > 0) { 582 while (length > 0) {
@@ -583,13 +591,11 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
583#endif 591#endif
584 mpf_found = mpf; 592 mpf_found = mpf;
585 593
586 printk(KERN_INFO "found SMP MP-table at [mem %#010llx-%#010llx] mapped at [%p]\n", 594 printk(KERN_INFO "found SMP MP-table at [%p] %llx\n",
587 (unsigned long long) virt_to_phys(mpf), 595 mpf, (u64)virt_to_phys(mpf));
588 (unsigned long long) virt_to_phys(mpf) +
589 sizeof(*mpf) - 1, mpf);
590 596
591 mem = virt_to_phys(mpf); 597 mem = virt_to_phys(mpf);
592 memblock_reserve(mem, sizeof(*mpf)); 598 memblock_x86_reserve_range(mem, mem + sizeof(*mpf), "* MP-table mpf");
593 if (mpf->physptr) 599 if (mpf->physptr)
594 smp_reserve_memory(mpf); 600 smp_reserve_memory(mpf);
595 601
@@ -619,7 +625,7 @@ void __init default_find_smp_config(void)
619 return; 625 return;
620 /* 626 /*
621 * If it is an SMP machine we should know now, unless the 627 * If it is an SMP machine we should know now, unless the
622 * configuration is in an EISA bus machine with an 628 * configuration is in an EISA/MCA bus machine with an
623 * extended bios data area. 629 * extended bios data area.
624 * 630 *
625 * there is a real-mode segmented pointer pointing to the 631 * there is a real-mode segmented pointer pointing to the
@@ -830,8 +836,10 @@ early_param("alloc_mptable", parse_alloc_mptable_opt);
830 836
831void __init early_reserve_e820_mpc_new(void) 837void __init early_reserve_e820_mpc_new(void)
832{ 838{
833 if (enable_update_mptable && alloc_mptable) 839 if (enable_update_mptable && alloc_mptable) {
834 mpc_new_phys = early_reserve_e820(mpc_new_length, 4); 840 u64 startt = 0;
841 mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4);
842 }
835} 843}
836 844
837static int __init update_mp_table(void) 845static int __init update_mp_table(void)
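smp_scan_config() above only accepts a candidate MP floating pointer that carries the "_MP_" signature and whose 16 bytes sum to zero modulo 256; the structure size and checksum rule come from the MP specification rather than from this hunk, so treat them as assumptions. A self-contained version of that check on a fabricated buffer:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int looks_like_mpf(const uint8_t *p)
{
	uint8_t sum = 0;
	int i;

	if (memcmp(p, "_MP_", 4) != 0)
		return 0;
	for (i = 0; i < 16; i++)
		sum += p[i];
	return sum == 0;	/* the checksum byte makes the total wrap to zero */
}

int main(void)
{
	uint8_t buf[16] = { '_', 'M', 'P', '_' };
	uint8_t sum = 0;
	int i;

	/* forge a valid checksum in the last byte for the demo */
	for (i = 0; i < 15; i++)
		sum += buf[i];
	buf[15] = (uint8_t)(0x100 - sum);

	printf("candidate %s\n", looks_like_mpf(buf) ? "matches" : "rejected");
	return 0;
}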
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index a7c5661f849..12fcbe2c143 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -40,6 +40,7 @@
40 40
41#include <asm/processor.h> 41#include <asm/processor.h>
42#include <asm/msr.h> 42#include <asm/msr.h>
43#include <asm/system.h>
43 44
44static struct class *msr_class; 45static struct class *msr_class;
45 46
@@ -235,7 +236,7 @@ static struct notifier_block __refdata msr_class_cpu_notifier = {
235 .notifier_call = msr_class_cpu_callback, 236 .notifier_call = msr_class_cpu_callback,
236}; 237};
237 238
238static char *msr_devnode(struct device *dev, umode_t *mode) 239static char *msr_devnode(struct device *dev, mode_t *mode)
239{ 240{
240 return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt)); 241 return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt));
241} 242}
@@ -257,14 +258,12 @@ static int __init msr_init(void)
257 goto out_chrdev; 258 goto out_chrdev;
258 } 259 }
259 msr_class->devnode = msr_devnode; 260 msr_class->devnode = msr_devnode;
260 get_online_cpus();
261 for_each_online_cpu(i) { 261 for_each_online_cpu(i) {
262 err = msr_device_create(i); 262 err = msr_device_create(i);
263 if (err != 0) 263 if (err != 0)
264 goto out_class; 264 goto out_class;
265 } 265 }
266 register_hotcpu_notifier(&msr_class_cpu_notifier); 266 register_hotcpu_notifier(&msr_class_cpu_notifier);
267 put_online_cpus();
268 267
269 err = 0; 268 err = 0;
270 goto out; 269 goto out;
@@ -273,7 +272,6 @@ out_class:
273 i = 0; 272 i = 0;
274 for_each_online_cpu(i) 273 for_each_online_cpu(i)
275 msr_device_destroy(i); 274 msr_device_destroy(i);
276 put_online_cpus();
277 class_destroy(msr_class); 275 class_destroy(msr_class);
278out_chrdev: 276out_chrdev:
279 __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); 277 __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
@@ -284,13 +282,11 @@ out:
284static void __exit msr_exit(void) 282static void __exit msr_exit(void)
285{ 283{
286 int cpu = 0; 284 int cpu = 0;
287 get_online_cpus();
288 for_each_online_cpu(cpu) 285 for_each_online_cpu(cpu)
289 msr_device_destroy(cpu); 286 msr_device_destroy(cpu);
290 class_destroy(msr_class); 287 class_destroy(msr_class);
291 __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); 288 __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
292 unregister_hotcpu_notifier(&msr_class_cpu_notifier); 289 unregister_hotcpu_notifier(&msr_class_cpu_notifier);
293 put_online_cpus();
294} 290}
295 291
296module_init(msr_init); 292module_init(msr_init);
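Because msr_devnode() names the nodes "cpu/%u/msr", every CPU ends up with its own /dev/cpu/<N>/msr, with the CPU number carried in the minor. A quick userspace check that the nodes line up with the online CPUs (only the path layout above is assumed):

#include <stdio.h>
#include <unistd.h>

int main(void)
{
	long n = sysconf(_SC_NPROCESSORS_ONLN);
	long cpu;

	for (cpu = 0; cpu < n; cpu++) {
		char path[64];

		snprintf(path, sizeof(path), "/dev/cpu/%ld/msr", cpu);
		printf("%s: %s\n", path,
		       access(path, F_OK) == 0 ? "present" : "missing");
	}
	return 0;
}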
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
deleted file mode 100644
index f84f5c57de3..00000000000
--- a/arch/x86/kernel/nmi.c
+++ /dev/null
@@ -1,511 +0,0 @@
1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
4 * Copyright (C) 2011 Don Zickus Red Hat, Inc.
5 *
6 * Pentium III FXSR, SSE support
7 * Gareth Hughes <gareth@valinux.com>, May 2000
8 */
9
10/*
11 * Handle hardware traps and faults.
12 */
13#include <linux/spinlock.h>
14#include <linux/kprobes.h>
15#include <linux/kdebug.h>
16#include <linux/nmi.h>
17#include <linux/delay.h>
18#include <linux/hardirq.h>
19#include <linux/slab.h>
20#include <linux/export.h>
21
22#if defined(CONFIG_EDAC)
23#include <linux/edac.h>
24#endif
25
26#include <linux/atomic.h>
27#include <asm/traps.h>
28#include <asm/mach_traps.h>
29#include <asm/nmi.h>
30#include <asm/x86_init.h>
31
32struct nmi_desc {
33 spinlock_t lock;
34 struct list_head head;
35};
36
37static struct nmi_desc nmi_desc[NMI_MAX] =
38{
39 {
40 .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock),
41 .head = LIST_HEAD_INIT(nmi_desc[0].head),
42 },
43 {
44 .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock),
45 .head = LIST_HEAD_INIT(nmi_desc[1].head),
46 },
47 {
48 .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock),
49 .head = LIST_HEAD_INIT(nmi_desc[2].head),
50 },
51 {
52 .lock = __SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock),
53 .head = LIST_HEAD_INIT(nmi_desc[3].head),
54 },
55
56};
57
58struct nmi_stats {
59 unsigned int normal;
60 unsigned int unknown;
61 unsigned int external;
62 unsigned int swallow;
63};
64
65static DEFINE_PER_CPU(struct nmi_stats, nmi_stats);
66
67static int ignore_nmis;
68
69int unknown_nmi_panic;
70/*
71 * Prevent NMI reason port (0x61) being accessed simultaneously, can
72 * only be used in NMI handler.
73 */
74static DEFINE_RAW_SPINLOCK(nmi_reason_lock);
75
76static int __init setup_unknown_nmi_panic(char *str)
77{
78 unknown_nmi_panic = 1;
79 return 1;
80}
81__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
82
83#define nmi_to_desc(type) (&nmi_desc[type])
84
85static int __kprobes nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
86{
87 struct nmi_desc *desc = nmi_to_desc(type);
88 struct nmiaction *a;
89 int handled=0;
90
91 rcu_read_lock();
92
93 /*
94 * NMIs are edge-triggered, which means if you have enough
95 * of them concurrently, you can lose some because only one
96 * can be latched at any given time. Walk the whole list
97 * to handle those situations.
98 */
99 list_for_each_entry_rcu(a, &desc->head, list)
100 handled += a->handler(type, regs);
101
102 rcu_read_unlock();
103
104 /* return total number of NMI events handled */
105 return handled;
106}
107
108int __register_nmi_handler(unsigned int type, struct nmiaction *action)
109{
110 struct nmi_desc *desc = nmi_to_desc(type);
111 unsigned long flags;
112
113 if (!action->handler)
114 return -EINVAL;
115
116 spin_lock_irqsave(&desc->lock, flags);
117
118 /*
119 * most handlers of type NMI_UNKNOWN never return because
120 * they just assume the NMI is theirs. Just a sanity check
121 * to manage expectations
122 */
123 WARN_ON_ONCE(type == NMI_UNKNOWN && !list_empty(&desc->head));
124 WARN_ON_ONCE(type == NMI_SERR && !list_empty(&desc->head));
125 WARN_ON_ONCE(type == NMI_IO_CHECK && !list_empty(&desc->head));
126
127 /*
128 * some handlers need to be executed first otherwise a fake
129 * event confuses some handlers (kdump uses this flag)
130 */
131 if (action->flags & NMI_FLAG_FIRST)
132 list_add_rcu(&action->list, &desc->head);
133 else
134 list_add_tail_rcu(&action->list, &desc->head);
135
136 spin_unlock_irqrestore(&desc->lock, flags);
137 return 0;
138}
139EXPORT_SYMBOL(__register_nmi_handler);
140
141void unregister_nmi_handler(unsigned int type, const char *name)
142{
143 struct nmi_desc *desc = nmi_to_desc(type);
144 struct nmiaction *n;
145 unsigned long flags;
146
147 spin_lock_irqsave(&desc->lock, flags);
148
149 list_for_each_entry_rcu(n, &desc->head, list) {
150 /*
151 * the name passed in to describe the nmi handler
152 * is used as the lookup key
153 */
154 if (!strcmp(n->name, name)) {
155 WARN(in_nmi(),
156 "Trying to free NMI (%s) from NMI context!\n", n->name);
157 list_del_rcu(&n->list);
158 break;
159 }
160 }
161
162 spin_unlock_irqrestore(&desc->lock, flags);
163 synchronize_rcu();
164}
165EXPORT_SYMBOL_GPL(unregister_nmi_handler);
166
167static __kprobes void
168pci_serr_error(unsigned char reason, struct pt_regs *regs)
169{
170 /* check to see if anyone registered against these types of errors */
171 if (nmi_handle(NMI_SERR, regs, false))
172 return;
173
174 pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
175 reason, smp_processor_id());
176
177 /*
178 * On some machines, PCI SERR line is used to report memory
179 * errors. EDAC makes use of it.
180 */
181#if defined(CONFIG_EDAC)
182 if (edac_handler_set()) {
183 edac_atomic_assert_error();
184 return;
185 }
186#endif
187
188 if (panic_on_unrecovered_nmi)
189 panic("NMI: Not continuing");
190
191 pr_emerg("Dazed and confused, but trying to continue\n");
192
193 /* Clear and disable the PCI SERR error line. */
194 reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR;
195 outb(reason, NMI_REASON_PORT);
196}
197
198static __kprobes void
199io_check_error(unsigned char reason, struct pt_regs *regs)
200{
201 unsigned long i;
202
203 /* check to see if anyone registered against these types of errors */
204 if (nmi_handle(NMI_IO_CHECK, regs, false))
205 return;
206
207 pr_emerg(
208 "NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n",
209 reason, smp_processor_id());
210 show_regs(regs);
211
212 if (panic_on_io_nmi)
213 panic("NMI IOCK error: Not continuing");
214
215 /* Re-enable the IOCK line, wait for a few seconds */
216 reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
217 outb(reason, NMI_REASON_PORT);
218
219 i = 20000;
220 while (--i) {
221 touch_nmi_watchdog();
222 udelay(100);
223 }
224
225 reason &= ~NMI_REASON_CLEAR_IOCHK;
226 outb(reason, NMI_REASON_PORT);
227}
228
229static __kprobes void
230unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
231{
232 int handled;
233
234 /*
235 * Use 'false' as back-to-back NMIs are dealt with one level up.
236 * Of course this makes having multiple 'unknown' handlers useless
237 * as only the first one is ever run (unless it can actually determine
238 * if it caused the NMI)
239 */
240 handled = nmi_handle(NMI_UNKNOWN, regs, false);
241 if (handled) {
242 __this_cpu_add(nmi_stats.unknown, handled);
243 return;
244 }
245
246 __this_cpu_add(nmi_stats.unknown, 1);
247
248 pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
249 reason, smp_processor_id());
250
251 pr_emerg("Do you have a strange power saving mode enabled?\n");
252 if (unknown_nmi_panic || panic_on_unrecovered_nmi)
253 panic("NMI: Not continuing");
254
255 pr_emerg("Dazed and confused, but trying to continue\n");
256}
257
258static DEFINE_PER_CPU(bool, swallow_nmi);
259static DEFINE_PER_CPU(unsigned long, last_nmi_rip);
260
261static __kprobes void default_do_nmi(struct pt_regs *regs)
262{
263 unsigned char reason = 0;
264 int handled;
265 bool b2b = false;
266
267 /*
268 * CPU-specific NMI must be processed before non-CPU-specific
269 * NMI, otherwise we may lose it, because the CPU-specific
270 * NMI can not be detected/processed on other CPUs.
271 */
272
273 /*
274 * Back-to-back NMIs are interesting because they can either
 275 * be two NMIs or more than two NMIs (anything over two is dropped
276 * due to NMI being edge-triggered). If this is the second half
277 * of the back-to-back NMI, assume we dropped things and process
278 * more handlers. Otherwise reset the 'swallow' NMI behaviour
279 */
280 if (regs->ip == __this_cpu_read(last_nmi_rip))
281 b2b = true;
282 else
283 __this_cpu_write(swallow_nmi, false);
284
285 __this_cpu_write(last_nmi_rip, regs->ip);
286
287 handled = nmi_handle(NMI_LOCAL, regs, b2b);
288 __this_cpu_add(nmi_stats.normal, handled);
289 if (handled) {
290 /*
291 * There are cases when a NMI handler handles multiple
292 * events in the current NMI. One of these events may
293 * be queued for in the next NMI. Because the event is
294 * already handled, the next NMI will result in an unknown
295 * NMI. Instead lets flag this for a potential NMI to
296 * swallow.
297 */
298 if (handled > 1)
299 __this_cpu_write(swallow_nmi, true);
300 return;
301 }
302
303 /* Non-CPU-specific NMI: NMI sources can be processed on any CPU */
304 raw_spin_lock(&nmi_reason_lock);
305 reason = x86_platform.get_nmi_reason();
306
307 if (reason & NMI_REASON_MASK) {
308 if (reason & NMI_REASON_SERR)
309 pci_serr_error(reason, regs);
310 else if (reason & NMI_REASON_IOCHK)
311 io_check_error(reason, regs);
312#ifdef CONFIG_X86_32
313 /*
314 * Reassert NMI in case it became active
315 * meanwhile as it's edge-triggered:
316 */
317 reassert_nmi();
318#endif
319 __this_cpu_add(nmi_stats.external, 1);
320 raw_spin_unlock(&nmi_reason_lock);
321 return;
322 }
323 raw_spin_unlock(&nmi_reason_lock);
324
325 /*
326 * Only one NMI can be latched at a time. To handle
327 * this we may process multiple nmi handlers at once to
328 * cover the case where an NMI is dropped. The downside
329 * to this approach is we may process an NMI prematurely,
330 * while its real NMI is sitting latched. This will cause
331 * an unknown NMI on the next run of the NMI processing.
332 *
333 * We tried to flag that condition above, by setting the
334 * swallow_nmi flag when we process more than one event.
335 * This condition is also only present on the second half
336 * of a back-to-back NMI, so we flag that condition too.
337 *
338 * If both are true, we assume we already processed this
339 * NMI previously and we swallow it. Otherwise we reset
340 * the logic.
341 *
342 * There are scenarios where we may accidentally swallow
343 * a 'real' unknown NMI. For example, while processing
344 * a perf NMI another perf NMI comes in along with a
345 * 'real' unknown NMI. These two NMIs get combined into
 346 * one (as described above). When the next NMI gets
347 * processed, it will be flagged by perf as handled, but
 348 * no one will know that there was a 'real' unknown NMI sent
349 * also. As a result it gets swallowed. Or if the first
350 * perf NMI returns two events handled then the second
351 * NMI will get eaten by the logic below, again losing a
352 * 'real' unknown NMI. But this is the best we can do
353 * for now.
354 */
355 if (b2b && __this_cpu_read(swallow_nmi))
356 __this_cpu_add(nmi_stats.swallow, 1);
357 else
358 unknown_nmi_error(reason, regs);
359}
360
361/*
362 * NMIs can hit breakpoints which will cause it to lose its
363 * NMI context with the CPU when the breakpoint does an iret.
364 */
365#ifdef CONFIG_X86_32
366/*
367 * For i386, NMIs use the same stack as the kernel, and we can
368 * add a workaround to the iret problem in C (preventing nested
369 * NMIs if an NMI takes a trap). Simply have 3 states the NMI
370 * can be in:
371 *
372 * 1) not running
373 * 2) executing
374 * 3) latched
375 *
376 * When no NMI is in progress, it is in the "not running" state.
377 * When an NMI comes in, it goes into the "executing" state.
378 * Normally, if another NMI is triggered, it does not interrupt
379 * the running NMI and the HW will simply latch it so that when
380 * the first NMI finishes, it will restart the second NMI.
381 * (Note: the latch is binary, so if multiple NMIs trigger while
382 * one is running, only a single one is latched and restarted.)
383 *
384 * If an NMI hits a breakpoint that executes an iret, another
385 * NMI can preempt it. We do not want to allow this new NMI
386 * to run, but we want to execute it when the first one finishes.
387 * We set the state to "latched", and the exit of the first NMI will
388 * perform a dec_return; if the result is zero (NOT_RUNNING), then
389 * it simply exits the NMI handler. If not, the dec_return
390 * has set the state to NMI_EXECUTING (what we want it to
391 * be when we are running). In that case, we simply jump back
392 * and rerun the NMI handler, restarting the 'latched' NMI.
393 *
394 * No trap (breakpoint or page fault) should be hit before nmi_restart,
395 * thus there is no race between the first check of state for NOT_RUNNING
396 * and setting it to NMI_EXECUTING. The HW will prevent nested NMIs
397 * at this point.
398 *
399 * In case the NMI takes a page fault, we need to save off the CR2,
400 * because the NMI could have preempted another page fault and would
401 * otherwise corrupt the CR2 that fault is about to read. As nested NMIs
402 * must be restarted and cannot take breakpoints or page faults, the
403 * CR2 must be restored before converting the nmi state back to NOT_RUNNING.
404 * Otherwise, there would be a race of another nested NMI coming in
405 * after setting state to NOT_RUNNING but before updating the nmi_cr2.
406 */
407enum nmi_states {
408 NMI_NOT_RUNNING = 0,
409 NMI_EXECUTING,
410 NMI_LATCHED,
411};
412static DEFINE_PER_CPU(enum nmi_states, nmi_state);
413static DEFINE_PER_CPU(unsigned long, nmi_cr2);
414
415#define nmi_nesting_preprocess(regs) \
416 do { \
417 if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) { \
418 this_cpu_write(nmi_state, NMI_LATCHED); \
419 return; \
420 } \
421 this_cpu_write(nmi_state, NMI_EXECUTING); \
422 this_cpu_write(nmi_cr2, read_cr2()); \
423 } while (0); \
424 nmi_restart:
425
426#define nmi_nesting_postprocess() \
427 do { \
428 if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) \
429 write_cr2(this_cpu_read(nmi_cr2)); \
430 if (this_cpu_dec_return(nmi_state)) \
431 goto nmi_restart; \
432 } while (0)
433#else /* x86_64 */
434/*
435 * In x86_64 things are a bit more difficult. This has the same problem
436 * where an NMI hitting a breakpoint that calls iret will remove the
437 * NMI context, allowing a nested NMI to enter. What makes this more
438 * difficult is that both NMIs and breakpoints have their own stack.
439 * When a new NMI or breakpoint is executed, the stack is set to a fixed
440 * point. If an NMI is nested, it will have its stack set at that same
441 * fixed address that the first NMI had, and will start corrupting the
442 * stack. This is handled in entry_64.S, but the same problem exists with
443 * the breakpoint stack.
444 *
445 * If a breakpoint is being processed on the debug stack and an NMI
446 * comes in that also hits a breakpoint, the new breakpoint's stack
447 * pointer will be set to the same fixed address as the breakpoint that
448 * was interrupted, corrupting that stack. To handle this case,
449 * check if the stack that was interrupted is the debug stack, and if
450 * so, change the IDT so that new breakpoints will use the current stack
451 * and not switch to the fixed address. On return from the NMI, switch
452 * back to the original IDT.
453 */
454static DEFINE_PER_CPU(int, update_debug_stack);
455
456static inline void nmi_nesting_preprocess(struct pt_regs *regs)
457{
458 /*
459 * If we interrupted a breakpoint, it is possible that
460 * the nmi handler will have breakpoints too. We need to
461 * change the IDT such that breakpoints that happen here
462 * continue to use the NMI stack.
463 */
464 if (unlikely(is_debug_stack(regs->sp))) {
465 debug_stack_set_zero();
466 this_cpu_write(update_debug_stack, 1);
467 }
468}
469
470static inline void nmi_nesting_postprocess(void)
471{
472 if (unlikely(this_cpu_read(update_debug_stack))) {
473 debug_stack_reset();
474 this_cpu_write(update_debug_stack, 0);
475 }
476}
477#endif
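/*
 * A runnable user-space model of the i386 nesting macros above: the same
 * NOT_RUNNING/EXECUTING/LATCHED states and the dec_return trick, with an
 * ordinary int instead of per-CPU data.  A nested call to model_nmi()
 * stands in for a second hardware NMI arriving; this is only a sketch of
 * the state transitions, not kernel code.
 */
#include <stdio.h>

enum { NOT_RUNNING = 0, EXECUTING = 1, LATCHED = 2 };
static int state = NOT_RUNNING;
static int runs;

static void model_nmi(void (*handler)(void))
{
	if (state != NOT_RUNNING) {	/* nested arrival: just latch it */
		state = LATCHED;
		return;
	}
	state = EXECUTING;
restart:
	handler();
	/*
	 * dec_return: LATCHED(2) -> EXECUTING(1) means an NMI arrived while
	 * we ran, so go around again; EXECUTING(1) -> NOT_RUNNING(0) means
	 * we are done.
	 */
	if (--state != NOT_RUNNING)
		goto restart;
}

static void handler(void)
{
	printf("handler run %d\n", ++runs);
	if (runs == 1)
		model_nmi(handler);	/* pretend a second NMI preempts the first */
}

int main(void)
{
	model_nmi(handler);		/* prints two runs: the latched NMI is replayed */
	return 0;
}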
478
479dotraplinkage notrace __kprobes void
480do_nmi(struct pt_regs *regs, long error_code)
481{
482 nmi_nesting_preprocess(regs);
483
484 nmi_enter();
485
486 inc_irq_stat(__nmi_count);
487
488 if (!ignore_nmis)
489 default_do_nmi(regs);
490
491 nmi_exit();
492
493 /* On i386, may loop back to preprocess */
494 nmi_nesting_postprocess();
495}
496
497void stop_nmi(void)
498{
499 ignore_nmis++;
500}
501
502void restart_nmi(void)
503{
504 ignore_nmis--;
505}
506
507/* reset the back-to-back NMI logic */
508void local_touch_nmi(void)
509{
510 __this_cpu_write(last_nmi_rip, 0);
511}
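/*
 * stop_nmi()/restart_nmi() above use ignore_nmis as a nesting counter, so
 * pairs of calls can nest safely.  A user-space sketch of the same pattern
 * (names are illustrative, not a kernel API):
 */
#include <stdio.h>

static int ignore_events;

static void stop_events(void)    { ignore_events++; }
static void restart_events(void) { ignore_events--; }

static void handle_event(void)
{
	if (ignore_events)
		return;			/* suppressed while any caller holds it off */
	puts("event handled");
}

int main(void)
{
	handle_event();			/* handled */
	stop_events();
	stop_events();			/* nested suppression */
	handle_event();			/* ignored */
	restart_events();
	handle_event();			/* still ignored: one stop remains */
	restart_events();
	handle_event();			/* handled again */
	return 0;
}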
diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c
deleted file mode 100644
index 6d9582ec032..00000000000
--- a/arch/x86/kernel/nmi_selftest.c
+++ /dev/null
@@ -1,183 +0,0 @@
1/*
2 * arch/x86/kernel/nmi-selftest.c
3 *
4 * Testsuite for NMI: IPIs
5 *
6 * Started by Don Zickus:
7 * (using lib/locking-selftest.c as a guide)
8 *
9 * Copyright (C) 2011 Red Hat, Inc., Don Zickus <dzickus@redhat.com>
10 */
11
12#include <linux/smp.h>
13#include <linux/cpumask.h>
14#include <linux/delay.h>
15#include <linux/init.h>
16#include <linux/percpu.h>
17
18#include <asm/apic.h>
19#include <asm/nmi.h>
20
21#define SUCCESS 0
22#define FAILURE 1
23#define TIMEOUT 2
24
25static int __initdata nmi_fail;
26
27/* check to see if NMI IPIs work on this machine */
28static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __initdata;
29
30static int __initdata testcase_total;
31static int __initdata testcase_successes;
32static int __initdata expected_testcase_failures;
33static int __initdata unexpected_testcase_failures;
34static int __initdata unexpected_testcase_unknowns;
35
36static int __init nmi_unk_cb(unsigned int val, struct pt_regs *regs)
37{
38 unexpected_testcase_unknowns++;
39 return NMI_HANDLED;
40}
41
42static void __init init_nmi_testsuite(void)
43{
44 /* trap all the unknown NMIs we may generate */
45 register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk",
46 __initdata);
47}
48
49static void __init cleanup_nmi_testsuite(void)
50{
51 unregister_nmi_handler(NMI_UNKNOWN, "nmi_selftest_unk");
52}
53
54static int __init test_nmi_ipi_callback(unsigned int val, struct pt_regs *regs)
55{
56 int cpu = raw_smp_processor_id();
57
58 if (cpumask_test_and_clear_cpu(cpu, to_cpumask(nmi_ipi_mask)))
59 return NMI_HANDLED;
60
61 return NMI_DONE;
62}
63
64static void __init test_nmi_ipi(struct cpumask *mask)
65{
66 unsigned long timeout;
67
68 if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback,
69 NMI_FLAG_FIRST, "nmi_selftest", __initdata)) {
70 nmi_fail = FAILURE;
71 return;
72 }
73
74 /* sync above data before sending NMI */
75 wmb();
76
77 apic->send_IPI_mask(mask, NMI_VECTOR);
78
79 /* Don't wait longer than a second */
80 timeout = USEC_PER_SEC;
81 while (!cpumask_empty(mask) && timeout--)
82 udelay(1);
83
84 /* What happens if we timeout, do we still unregister?? */
85 unregister_nmi_handler(NMI_LOCAL, "nmi_selftest");
86
87 if (!timeout)
88 nmi_fail = TIMEOUT;
89 return;
90}
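/*
 * test_nmi_ipi() above waits for the remote CPUs with a bounded busy-wait:
 * poll the condition, sleep a microsecond, and give up when the budget runs
 * out.  A user-space sketch of the same shape, with a pthread standing in
 * for the remote CPU (names and timings are only illustrative):
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static atomic_int done;

static void *remote(void *arg)
{
	usleep(10000);			/* the "other CPU" takes ~10ms to respond */
	atomic_store(&done, 1);
	return NULL;
}

int main(void)
{
	pthread_t t;
	long budget = 1000000;		/* ~1s of 1us polls, like USEC_PER_SEC */

	pthread_create(&t, NULL, remote, NULL);
	while (!atomic_load(&done) && budget--)
		usleep(1);

	puts(budget >= 0 ? "completed" : "timed out");
	pthread_join(t, NULL);
	return 0;
}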
91
92static void __init remote_ipi(void)
93{
94 cpumask_copy(to_cpumask(nmi_ipi_mask), cpu_online_mask);
95 cpumask_clear_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
96 if (!cpumask_empty(to_cpumask(nmi_ipi_mask)))
97 test_nmi_ipi(to_cpumask(nmi_ipi_mask));
98}
99
100static void __init local_ipi(void)
101{
102 cpumask_clear(to_cpumask(nmi_ipi_mask));
103 cpumask_set_cpu(smp_processor_id(), to_cpumask(nmi_ipi_mask));
104 test_nmi_ipi(to_cpumask(nmi_ipi_mask));
105}
106
107static void __init reset_nmi(void)
108{
109 nmi_fail = 0;
110}
111
112static void __init dotest(void (*testcase_fn)(void), int expected)
113{
114 testcase_fn();
115 /*
116 * Filter out expected failures:
117 */
118 if (nmi_fail != expected) {
119 unexpected_testcase_failures++;
120
121 if (nmi_fail == FAILURE)
122 printk(KERN_CONT "FAILED |");
123 else if (nmi_fail == TIMEOUT)
124 printk(KERN_CONT "TIMEOUT|");
125 else
126 printk(KERN_CONT "ERROR |");
127 dump_stack();
128 } else {
129 testcase_successes++;
130 printk(KERN_CONT " ok |");
131 }
132 testcase_total++;
133
134 reset_nmi();
135}
136
137static inline void __init print_testname(const char *testname)
138{
139 printk("%12s:", testname);
140}
141
142void __init nmi_selftest(void)
143{
144 init_nmi_testsuite();
145
146 /*
147 * Run the testsuite:
148 */
149 printk("----------------\n");
150 printk("| NMI testsuite:\n");
151 printk("--------------------\n");
152
153 print_testname("remote IPI");
154 dotest(remote_ipi, SUCCESS);
155 printk(KERN_CONT "\n");
156 print_testname("local IPI");
157 dotest(local_ipi, SUCCESS);
158 printk(KERN_CONT "\n");
159
160 cleanup_nmi_testsuite();
161
162 if (unexpected_testcase_failures) {
163 printk("--------------------\n");
164 printk("BUG: %3d unexpected failures (out of %3d) - debugging disabled! |\n",
165 unexpected_testcase_failures, testcase_total);
166 printk("-----------------------------------------------------------------\n");
167 } else if (expected_testcase_failures && testcase_successes) {
168 printk("--------------------\n");
169 printk("%3d out of %3d testcases failed, as expected. |\n",
170 expected_testcase_failures, testcase_total);
171 printk("----------------------------------------------------\n");
172 } else if (expected_testcase_failures && !testcase_successes) {
173 printk("--------------------\n");
174 printk("All %3d testcases failed, as expected. |\n",
175 expected_testcase_failures);
176 printk("----------------------------------------\n");
177 } else {
178 printk("--------------------\n");
179 printk("Good, all %3d testcases passed! |\n",
180 testcase_successes);
181 printk("---------------------------------\n");
182 }
183}
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 17fff18a103..d90272e6bc4 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -26,7 +26,6 @@
26 26
27#include <asm/bug.h> 27#include <asm/bug.h>
28#include <asm/paravirt.h> 28#include <asm/paravirt.h>
29#include <asm/debugreg.h>
30#include <asm/desc.h> 29#include <asm/desc.h>
31#include <asm/setup.h> 30#include <asm/setup.h>
32#include <asm/pgtable.h> 31#include <asm/pgtable.h>
@@ -38,7 +37,6 @@
38#include <asm/apic.h> 37#include <asm/apic.h>
39#include <asm/tlbflush.h> 38#include <asm/tlbflush.h>
40#include <asm/timer.h> 39#include <asm/timer.h>
41#include <asm/special_insns.h>
42 40
43/* nop stub */ 41/* nop stub */
44void _paravirt_nop(void) 42void _paravirt_nop(void)
@@ -204,8 +202,8 @@ static void native_flush_tlb_single(unsigned long addr)
204 __native_flush_tlb_single(addr); 202 __native_flush_tlb_single(addr);
205} 203}
206 204
207struct static_key paravirt_steal_enabled; 205struct jump_label_key paravirt_steal_enabled;
208struct static_key paravirt_steal_rq_enabled; 206struct jump_label_key paravirt_steal_rq_enabled;
209 207
210static u64 native_steal_clock(int cpu) 208static u64 native_steal_clock(int cpu)
211{ 209{
@@ -241,16 +239,16 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA
241 239
242static inline void enter_lazy(enum paravirt_lazy_mode mode) 240static inline void enter_lazy(enum paravirt_lazy_mode mode)
243{ 241{
244 BUG_ON(this_cpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); 242 BUG_ON(percpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
245 243
246 this_cpu_write(paravirt_lazy_mode, mode); 244 percpu_write(paravirt_lazy_mode, mode);
247} 245}
248 246
249static void leave_lazy(enum paravirt_lazy_mode mode) 247static void leave_lazy(enum paravirt_lazy_mode mode)
250{ 248{
251 BUG_ON(this_cpu_read(paravirt_lazy_mode) != mode); 249 BUG_ON(percpu_read(paravirt_lazy_mode) != mode);
252 250
253 this_cpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE); 251 percpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
254} 252}
255 253
256void paravirt_enter_lazy_mmu(void) 254void paravirt_enter_lazy_mmu(void)
@@ -267,7 +265,7 @@ void paravirt_start_context_switch(struct task_struct *prev)
267{ 265{
268 BUG_ON(preemptible()); 266 BUG_ON(preemptible());
269 267
270 if (this_cpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) { 268 if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {
271 arch_leave_lazy_mmu_mode(); 269 arch_leave_lazy_mmu_mode();
272 set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES); 270 set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES);
273 } 271 }
@@ -289,7 +287,7 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
289 if (in_interrupt()) 287 if (in_interrupt())
290 return PARAVIRT_LAZY_NONE; 288 return PARAVIRT_LAZY_NONE;
291 289
292 return this_cpu_read(paravirt_lazy_mode); 290 return percpu_read(paravirt_lazy_mode);
293} 291}
294 292
295void arch_flush_lazy_mmu_mode(void) 293void arch_flush_lazy_mmu_mode(void)
@@ -352,7 +350,9 @@ struct pv_cpu_ops pv_cpu_ops = {
352#endif 350#endif
353 .wbinvd = native_wbinvd, 351 .wbinvd = native_wbinvd,
354 .read_msr = native_read_msr_safe, 352 .read_msr = native_read_msr_safe,
353 .rdmsr_regs = native_rdmsr_safe_regs,
355 .write_msr = native_write_msr_safe, 354 .write_msr = native_write_msr_safe,
355 .wrmsr_regs = native_wrmsr_safe_regs,
356 .read_tsc = native_read_tsc, 356 .read_tsc = native_read_tsc,
357 .read_pmc = native_read_pmc, 357 .read_pmc = native_read_pmc,
358 .read_tscp = native_read_tscp, 358 .read_tscp = native_read_tscp,
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 299d49302e7..726494b5834 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -22,8 +22,6 @@
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 */ 23 */
24 24
25#define pr_fmt(fmt) "Calgary: " fmt
26
27#include <linux/kernel.h> 25#include <linux/kernel.h>
28#include <linux/init.h> 26#include <linux/init.h>
29#include <linux/types.h> 27#include <linux/types.h>
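/*
 * This hunk drops the pr_fmt() prefix macro and goes back to writing
 * "Calgary: " into every printk by hand.  The technique being removed can
 * be sketched in user space like this (pr_fmt/pr_warn here are local
 * macros standing in for the kernel ones):
 */
#include <stdio.h>

#define pr_fmt(fmt) "Calgary: " fmt
#define pr_warn(fmt, ...) fprintf(stderr, pr_fmt(fmt), ##__VA_ARGS__)

int main(void)
{
	pr_warn("IOMMU full\n");			/* -> "Calgary: IOMMU full" */
	pr_warn("failed to allocate %u pages\n", 16u);
	return 0;
}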
@@ -44,6 +42,7 @@
44#include <asm/calgary.h> 42#include <asm/calgary.h>
45#include <asm/tce.h> 43#include <asm/tce.h>
46#include <asm/pci-direct.h> 44#include <asm/pci-direct.h>
45#include <asm/system.h>
47#include <asm/dma.h> 46#include <asm/dma.h>
48#include <asm/rio.h> 47#include <asm/rio.h>
49#include <asm/bios_ebda.h> 48#include <asm/bios_ebda.h>
@@ -247,7 +246,7 @@ static unsigned long iommu_range_alloc(struct device *dev,
247 offset = iommu_area_alloc(tbl->it_map, tbl->it_size, 0, 246 offset = iommu_area_alloc(tbl->it_map, tbl->it_size, 0,
248 npages, 0, boundary_size, 0); 247 npages, 0, boundary_size, 0);
249 if (offset == ~0UL) { 248 if (offset == ~0UL) {
250 pr_warn("IOMMU full\n"); 249 printk(KERN_WARNING "Calgary: IOMMU full.\n");
251 spin_unlock_irqrestore(&tbl->it_lock, flags); 250 spin_unlock_irqrestore(&tbl->it_lock, flags);
252 if (panic_on_overflow) 251 if (panic_on_overflow)
253 panic("Calgary: fix the allocator.\n"); 252 panic("Calgary: fix the allocator.\n");
@@ -273,8 +272,8 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
273 entry = iommu_range_alloc(dev, tbl, npages); 272 entry = iommu_range_alloc(dev, tbl, npages);
274 273
275 if (unlikely(entry == DMA_ERROR_CODE)) { 274 if (unlikely(entry == DMA_ERROR_CODE)) {
276 pr_warn("failed to allocate %u pages in iommu %p\n", 275 printk(KERN_WARNING "Calgary: failed to allocate %u pages in "
277 npages, tbl); 276 "iommu %p\n", npages, tbl);
278 return DMA_ERROR_CODE; 277 return DMA_ERROR_CODE;
279 } 278 }
280 279
@@ -432,7 +431,7 @@ static void calgary_unmap_page(struct device *dev, dma_addr_t dma_addr,
432} 431}
433 432
434static void* calgary_alloc_coherent(struct device *dev, size_t size, 433static void* calgary_alloc_coherent(struct device *dev, size_t size,
435 dma_addr_t *dma_handle, gfp_t flag, struct dma_attrs *attrs) 434 dma_addr_t *dma_handle, gfp_t flag)
436{ 435{
437 void *ret = NULL; 436 void *ret = NULL;
438 dma_addr_t mapping; 437 dma_addr_t mapping;
@@ -465,8 +464,7 @@ error:
465} 464}
466 465
467static void calgary_free_coherent(struct device *dev, size_t size, 466static void calgary_free_coherent(struct device *dev, size_t size,
468 void *vaddr, dma_addr_t dma_handle, 467 void *vaddr, dma_addr_t dma_handle)
469 struct dma_attrs *attrs)
470{ 468{
471 unsigned int npages; 469 unsigned int npages;
472 struct iommu_table *tbl = find_iommu_table(dev); 470 struct iommu_table *tbl = find_iommu_table(dev);
@@ -479,8 +477,8 @@ static void calgary_free_coherent(struct device *dev, size_t size,
479} 477}
480 478
481static struct dma_map_ops calgary_dma_ops = { 479static struct dma_map_ops calgary_dma_ops = {
482 .alloc = calgary_alloc_coherent, 480 .alloc_coherent = calgary_alloc_coherent,
483 .free = calgary_free_coherent, 481 .free_coherent = calgary_free_coherent,
484 .map_sg = calgary_map_sg, 482 .map_sg = calgary_map_sg,
485 .unmap_sg = calgary_unmap_sg, 483 .unmap_sg = calgary_unmap_sg,
486 .map_page = calgary_map_page, 484 .map_page = calgary_map_page,
@@ -563,7 +561,8 @@ static void calgary_tce_cache_blast(struct iommu_table *tbl)
563 i++; 561 i++;
564 } while ((val & 0xff) != 0xff && i < 100); 562 } while ((val & 0xff) != 0xff && i < 100);
565 if (i == 100) 563 if (i == 100)
566 pr_warn("PCI bus not quiesced, continuing anyway\n"); 564 printk(KERN_WARNING "Calgary: PCI bus not quiesced, "
565 "continuing anyway\n");
567 566
568 /* invalidate TCE cache */ 567 /* invalidate TCE cache */
569 target = calgary_reg(bbar, tar_offset(tbl->it_busno)); 568 target = calgary_reg(bbar, tar_offset(tbl->it_busno));
@@ -605,7 +604,8 @@ begin:
605 i++; 604 i++;
606 } while ((val64 & 0xff) != 0xff && i < 100); 605 } while ((val64 & 0xff) != 0xff && i < 100);
607 if (i == 100) 606 if (i == 100)
608 pr_warn("CalIOC2: PCI bus not quiesced, continuing anyway\n"); 607 printk(KERN_WARNING "CalIOC2: PCI bus not quiesced, "
608 "continuing anyway\n");
609 609
610 /* 3. poll Page Migration DEBUG for SoftStopFault */ 610 /* 3. poll Page Migration DEBUG for SoftStopFault */
611 target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG); 611 target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG);
@@ -617,7 +617,8 @@ begin:
617 if (++count < 100) 617 if (++count < 100)
618 goto begin; 618 goto begin;
619 else { 619 else {
620 pr_warn("CalIOC2: too many SoftStopFaults, aborting TCE cache flush sequence!\n"); 620 printk(KERN_WARNING "CalIOC2: too many SoftStopFaults, "
621 "aborting TCE cache flush sequence!\n");
621 return; /* pray for the best */ 622 return; /* pray for the best */
622 } 623 }
623 } 624 }
@@ -839,8 +840,8 @@ static void calgary_dump_error_regs(struct iommu_table *tbl)
839 plssr = be32_to_cpu(readl(target)); 840 plssr = be32_to_cpu(readl(target));
840 841
841 /* If no error, the agent ID in the CSR is not valid */ 842 /* If no error, the agent ID in the CSR is not valid */
842 pr_emerg("DMA error on Calgary PHB 0x%x, 0x%08x@CSR 0x%08x@PLSSR\n", 843 printk(KERN_EMERG "Calgary: DMA error on Calgary PHB 0x%x, "
843 tbl->it_busno, csr, plssr); 844 "0x%08x@CSR 0x%08x@PLSSR\n", tbl->it_busno, csr, plssr);
844} 845}
845 846
846static void calioc2_dump_error_regs(struct iommu_table *tbl) 847static void calioc2_dump_error_regs(struct iommu_table *tbl)
@@ -866,21 +867,22 @@ static void calioc2_dump_error_regs(struct iommu_table *tbl)
866 target = calgary_reg(bbar, phboff | 0x800); 867 target = calgary_reg(bbar, phboff | 0x800);
867 mck = be32_to_cpu(readl(target)); 868 mck = be32_to_cpu(readl(target));
868 869
869 pr_emerg("DMA error on CalIOC2 PHB 0x%x\n", tbl->it_busno); 870 printk(KERN_EMERG "Calgary: DMA error on CalIOC2 PHB 0x%x\n",
871 tbl->it_busno);
870 872
871 pr_emerg("0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n", 873 printk(KERN_EMERG "Calgary: 0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n",
872 csr, plssr, csmr, mck); 874 csr, plssr, csmr, mck);
873 875
874 /* dump rest of error regs */ 876 /* dump rest of error regs */
875 pr_emerg(""); 877 printk(KERN_EMERG "Calgary: ");
876 for (i = 0; i < ARRAY_SIZE(errregs); i++) { 878 for (i = 0; i < ARRAY_SIZE(errregs); i++) {
877 /* err regs are at 0x810 - 0x870 */ 879 /* err regs are at 0x810 - 0x870 */
878 erroff = (0x810 + (i * 0x10)); 880 erroff = (0x810 + (i * 0x10));
879 target = calgary_reg(bbar, phboff | erroff); 881 target = calgary_reg(bbar, phboff | erroff);
880 errregs[i] = be32_to_cpu(readl(target)); 882 errregs[i] = be32_to_cpu(readl(target));
881 pr_cont("0x%08x@0x%lx ", errregs[i], erroff); 883 printk("0x%08x@0x%lx ", errregs[i], erroff);
882 } 884 }
883 pr_cont("\n"); 885 printk("\n");
884 886
885 /* root complex status */ 887 /* root complex status */
886 target = calgary_reg(bbar, phboff | PHB_ROOT_COMPLEX_STATUS); 888 target = calgary_reg(bbar, phboff | PHB_ROOT_COMPLEX_STATUS);
@@ -1478,9 +1480,8 @@ cleanup:
1478static int __init calgary_parse_options(char *p) 1480static int __init calgary_parse_options(char *p)
1479{ 1481{
1480 unsigned int bridge; 1482 unsigned int bridge;
1481 unsigned long val;
1482 size_t len; 1483 size_t len;
1483 ssize_t ret; 1484 char* endp;
1484 1485
1485 while (*p) { 1486 while (*p) {
1486 if (!strncmp(p, "64k", 3)) 1487 if (!strncmp(p, "64k", 3))
@@ -1511,11 +1512,10 @@ static int __init calgary_parse_options(char *p)
1511 ++p; 1512 ++p;
1512 if (*p == '\0') 1513 if (*p == '\0')
1513 break; 1514 break;
1514 ret = kstrtoul(p, 0, &val); 1515 bridge = simple_strtoul(p, &endp, 0);
1515 if (ret) 1516 if (p == endp)
1516 break; 1517 break;
1517 1518
1518 bridge = val;
1519 if (bridge < MAX_PHB_BUS_NUM) { 1519 if (bridge < MAX_PHB_BUS_NUM) {
1520 printk(KERN_INFO "Calgary: disabling " 1520 printk(KERN_INFO "Calgary: disabling "
1521 "translation for PHB %#x\n", bridge); 1521 "translation for PHB %#x\n", bridge);
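/*
 * This hunk trades kstrtoul(), which parses a whole token and returns an
 * error code, for simple_strtoul(), which stops at the first non-digit and
 * reports where it stopped.  Both styles can be sketched with standard
 * strtoul(); the input string below is only an example.
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* kstrtoul-style: the whole token must be a number */
static int parse_strict(const char *s, unsigned long *out)
{
	char *end;

	errno = 0;
	*out = strtoul(s, &end, 0);
	if (errno || end == s || *end != '\0')
		return -1;
	return 0;
}

int main(void)
{
	unsigned long val;
	char *end;

	/* simple_strtoul-style: take the leading number, keep the tail */
	val = strtoul("3,calgary=off", &end, 0);
	printf("prefix value %lu, rest \"%s\"\n", val, end);

	/* the strict parse rejects the same input because of the trailing text */
	printf("strict parse: %s\n",
	       parse_strict("3,calgary=off", &val) ? "rejected" : "ok");
	return 0;
}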
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 0f5dec5c80e..3b730fb1385 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -1,7 +1,6 @@
1#include <linux/dma-mapping.h> 1#include <linux/dma-mapping.h>
2#include <linux/dma-debug.h> 2#include <linux/dma-debug.h>
3#include <linux/dmar.h> 3#include <linux/dmar.h>
4#include <linux/export.h>
5#include <linux/bootmem.h> 4#include <linux/bootmem.h>
6#include <linux/gfp.h> 5#include <linux/gfp.h>
7#include <linux/pci.h> 6#include <linux/pci.h>
@@ -45,6 +44,15 @@ int iommu_detected __read_mostly = 0;
45 */ 44 */
46int iommu_pass_through __read_mostly; 45int iommu_pass_through __read_mostly;
47 46
47/*
48 * Group multi-function PCI devices into a single device-group for the
49 * iommu_device_group interface. This tells the iommu driver to pretend
50 * it cannot distinguish between functions of a device, exposing only one
51 * group for the device. Useful for disallowing use of individual PCI
52 * functions from userspace drivers.
53 */
54int iommu_group_mf __read_mostly;
55
48extern struct iommu_table_entry __iommu_table[], __iommu_table_end[]; 56extern struct iommu_table_entry __iommu_table[], __iommu_table_end[];
49 57
50/* Dummy device used for NULL arguments (normally ISA). */ 58/* Dummy device used for NULL arguments (normally ISA). */
@@ -87,23 +95,17 @@ void __init pci_iommu_alloc(void)
87 } 95 }
88} 96}
89void *dma_generic_alloc_coherent(struct device *dev, size_t size, 97void *dma_generic_alloc_coherent(struct device *dev, size_t size,
90 dma_addr_t *dma_addr, gfp_t flag, 98 dma_addr_t *dma_addr, gfp_t flag)
91 struct dma_attrs *attrs)
92{ 99{
93 unsigned long dma_mask; 100 unsigned long dma_mask;
94 struct page *page; 101 struct page *page;
95 unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
96 dma_addr_t addr; 102 dma_addr_t addr;
97 103
98 dma_mask = dma_alloc_coherent_mask(dev, flag); 104 dma_mask = dma_alloc_coherent_mask(dev, flag);
99 105
100 flag |= __GFP_ZERO; 106 flag |= __GFP_ZERO;
101again: 107again:
102 page = NULL; 108 page = alloc_pages_node(dev_to_node(dev), flag, get_order(size));
103 if (!(flag & GFP_ATOMIC))
104 page = dma_alloc_from_contiguous(dev, count, get_order(size));
105 if (!page)
106 page = alloc_pages_node(dev_to_node(dev), flag, get_order(size));
107 if (!page) 109 if (!page)
108 return NULL; 110 return NULL;
109 111
@@ -123,19 +125,9 @@ again:
123 return page_address(page); 125 return page_address(page);
124} 126}
125 127
126void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr,
127 dma_addr_t dma_addr, struct dma_attrs *attrs)
128{
129 unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
130 struct page *page = virt_to_page(vaddr);
131
132 if (!dma_release_from_contiguous(dev, page, count))
133 free_pages((unsigned long)vaddr, get_order(size));
134}
135
136/* 128/*
137 * See <Documentation/x86/x86_64/boot-options.txt> for the iommu kernel 129 * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
138 * parameter documentation. 130 * documentation.
139 */ 131 */
140static __init int iommu_setup(char *p) 132static __init int iommu_setup(char *p)
141{ 133{
@@ -185,6 +177,8 @@ static __init int iommu_setup(char *p)
185#endif 177#endif
186 if (!strncmp(p, "pt", 2)) 178 if (!strncmp(p, "pt", 2))
187 iommu_pass_through = 1; 179 iommu_pass_through = 1;
180 if (!strncmp(p, "group_mf", 8))
181 iommu_group_mf = 1;
188 182
189 gart_parse_options(p); 183 gart_parse_options(p);
190 184
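/*
 * iommu_setup() above recognizes options by prefix-matching keywords in a
 * comma-separated string with strncmp().  A self-contained sketch of that
 * parsing style (the flag names here are just examples):
 */
#include <stdio.h>
#include <string.h>

static int pass_through, group_mf;

static void parse_opts(const char *p)
{
	while (*p) {
		if (!strncmp(p, "pt", 2))
			pass_through = 1;
		if (!strncmp(p, "group_mf", 8))
			group_mf = 1;

		p = strchr(p, ',');	/* advance to the next option, if any */
		if (!p)
			break;
		p++;
	}
}

int main(void)
{
	parse_opts("pt,group_mf");
	printf("pt=%d group_mf=%d\n", pass_through, group_mf);
	return 0;
}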
@@ -265,13 +259,12 @@ rootfs_initcall(pci_iommu_init);
265#ifdef CONFIG_PCI 259#ifdef CONFIG_PCI
266/* Many VIA bridges seem to corrupt data for DAC. Disable it here */ 260/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
267 261
268static void via_no_dac(struct pci_dev *dev) 262static __devinit void via_no_dac(struct pci_dev *dev)
269{ 263{
270 if (forbid_dac == 0) { 264 if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
271 dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n"); 265 dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n");
272 forbid_dac = 1; 266 forbid_dac = 1;
273 } 267 }
274} 268}
275DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, 269DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
276 PCI_CLASS_BRIDGE_PCI, 8, via_no_dac);
277#endif 270#endif
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index 871be4a84c7..3af4af810c0 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -74,6 +74,12 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
74 return nents; 74 return nents;
75} 75}
76 76
77static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr,
78 dma_addr_t dma_addr)
79{
80 free_pages((unsigned long)vaddr, get_order(size));
81}
82
77static void nommu_sync_single_for_device(struct device *dev, 83static void nommu_sync_single_for_device(struct device *dev,
78 dma_addr_t addr, size_t size, 84 dma_addr_t addr, size_t size,
79 enum dma_data_direction dir) 85 enum dma_data_direction dir)
@@ -90,8 +96,8 @@ static void nommu_sync_sg_for_device(struct device *dev,
90} 96}
91 97
92struct dma_map_ops nommu_dma_ops = { 98struct dma_map_ops nommu_dma_ops = {
93 .alloc = dma_generic_alloc_coherent, 99 .alloc_coherent = dma_generic_alloc_coherent,
94 .free = dma_generic_free_coherent, 100 .free_coherent = nommu_free_coherent,
95 .map_sg = nommu_map_sg, 101 .map_sg = nommu_map_sg,
96 .map_page = nommu_map_page, 102 .map_page = nommu_map_page,
97 .sync_single_for_device = nommu_sync_single_for_device, 103 .sync_single_for_device = nommu_sync_single_for_device,
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 6c483ba98b9..8f972cbddef 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -15,30 +15,21 @@
15int swiotlb __read_mostly; 15int swiotlb __read_mostly;
16 16
17static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, 17static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
18 dma_addr_t *dma_handle, gfp_t flags, 18 dma_addr_t *dma_handle, gfp_t flags)
19 struct dma_attrs *attrs)
20{ 19{
21 void *vaddr; 20 void *vaddr;
22 21
23 vaddr = dma_generic_alloc_coherent(hwdev, size, dma_handle, flags, 22 vaddr = dma_generic_alloc_coherent(hwdev, size, dma_handle, flags);
24 attrs);
25 if (vaddr) 23 if (vaddr)
26 return vaddr; 24 return vaddr;
27 25
28 return swiotlb_alloc_coherent(hwdev, size, dma_handle, flags); 26 return swiotlb_alloc_coherent(hwdev, size, dma_handle, flags);
29} 27}
30 28
31static void x86_swiotlb_free_coherent(struct device *dev, size_t size,
32 void *vaddr, dma_addr_t dma_addr,
33 struct dma_attrs *attrs)
34{
35 swiotlb_free_coherent(dev, size, vaddr, dma_addr);
36}
37
38static struct dma_map_ops swiotlb_dma_ops = { 29static struct dma_map_ops swiotlb_dma_ops = {
39 .mapping_error = swiotlb_dma_mapping_error, 30 .mapping_error = swiotlb_dma_mapping_error,
40 .alloc = x86_swiotlb_alloc_coherent, 31 .alloc_coherent = x86_swiotlb_alloc_coherent,
41 .free = x86_swiotlb_free_coherent, 32 .free_coherent = swiotlb_free_coherent,
42 .sync_single_for_cpu = swiotlb_sync_single_for_cpu, 33 .sync_single_for_cpu = swiotlb_sync_single_for_cpu,
43 .sync_single_for_device = swiotlb_sync_single_for_device, 34 .sync_single_for_device = swiotlb_sync_single_for_device,
44 .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, 35 .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
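/*
 * x86_swiotlb_alloc_coherent() above tries the generic coherent allocator
 * first and only falls back to the swiotlb bounce buffers when that fails.
 * The shape of that fallback, sketched in plain user-space C (malloc stands
 * in for both allocators; primary_alloc/bounce_alloc are made-up names):
 */
#include <stdio.h>
#include <stdlib.h>

static void *primary_alloc(size_t size)
{
	if (size > 4096)	/* pretend the fast path only covers small sizes */
		return NULL;
	return malloc(size);
}

static void *bounce_alloc(size_t size)
{
	return malloc(size);	/* stand-in for swiotlb_alloc_coherent() */
}

static void *alloc_coherent(size_t size)
{
	void *p = primary_alloc(size);

	return p ? p : bounce_alloc(size);
}

int main(void)
{
	void *small = alloc_coherent(512);
	void *big   = alloc_coherent(1 << 20);

	printf("small=%p big=%p\n", small, big);
	free(small);
	free(big);
	return 0;
}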
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
deleted file mode 100644
index e309cc5c276..00000000000
--- a/arch/x86/kernel/perf_regs.c
+++ /dev/null
@@ -1,105 +0,0 @@
1#include <linux/errno.h>
2#include <linux/kernel.h>
3#include <linux/sched.h>
4#include <linux/perf_event.h>
5#include <linux/bug.h>
6#include <linux/stddef.h>
7#include <asm/perf_regs.h>
8#include <asm/ptrace.h>
9
10#ifdef CONFIG_X86_32
11#define PERF_REG_X86_MAX PERF_REG_X86_32_MAX
12#else
13#define PERF_REG_X86_MAX PERF_REG_X86_64_MAX
14#endif
15
16#define PT_REGS_OFFSET(id, r) [id] = offsetof(struct pt_regs, r)
17
18static unsigned int pt_regs_offset[PERF_REG_X86_MAX] = {
19 PT_REGS_OFFSET(PERF_REG_X86_AX, ax),
20 PT_REGS_OFFSET(PERF_REG_X86_BX, bx),
21 PT_REGS_OFFSET(PERF_REG_X86_CX, cx),
22 PT_REGS_OFFSET(PERF_REG_X86_DX, dx),
23 PT_REGS_OFFSET(PERF_REG_X86_SI, si),
24 PT_REGS_OFFSET(PERF_REG_X86_DI, di),
25 PT_REGS_OFFSET(PERF_REG_X86_BP, bp),
26 PT_REGS_OFFSET(PERF_REG_X86_SP, sp),
27 PT_REGS_OFFSET(PERF_REG_X86_IP, ip),
28 PT_REGS_OFFSET(PERF_REG_X86_FLAGS, flags),
29 PT_REGS_OFFSET(PERF_REG_X86_CS, cs),
30 PT_REGS_OFFSET(PERF_REG_X86_SS, ss),
31#ifdef CONFIG_X86_32
32 PT_REGS_OFFSET(PERF_REG_X86_DS, ds),
33 PT_REGS_OFFSET(PERF_REG_X86_ES, es),
34 PT_REGS_OFFSET(PERF_REG_X86_FS, fs),
35 PT_REGS_OFFSET(PERF_REG_X86_GS, gs),
36#else
37 /*
38 * The pt_regs struct does not store
39 * ds, es, fs, gs in 64 bit mode.
40 */
41 (unsigned int) -1,
42 (unsigned int) -1,
43 (unsigned int) -1,
44 (unsigned int) -1,
45#endif
46#ifdef CONFIG_X86_64
47 PT_REGS_OFFSET(PERF_REG_X86_R8, r8),
48 PT_REGS_OFFSET(PERF_REG_X86_R9, r9),
49 PT_REGS_OFFSET(PERF_REG_X86_R10, r10),
50 PT_REGS_OFFSET(PERF_REG_X86_R11, r11),
51 PT_REGS_OFFSET(PERF_REG_X86_R12, r12),
52 PT_REGS_OFFSET(PERF_REG_X86_R13, r13),
53 PT_REGS_OFFSET(PERF_REG_X86_R14, r14),
54 PT_REGS_OFFSET(PERF_REG_X86_R15, r15),
55#endif
56};
57
58u64 perf_reg_value(struct pt_regs *regs, int idx)
59{
60 if (WARN_ON_ONCE(idx >= ARRAY_SIZE(pt_regs_offset)))
61 return 0;
62
63 return regs_get_register(regs, pt_regs_offset[idx]);
64}
65
66#define REG_RESERVED (~((1ULL << PERF_REG_X86_MAX) - 1ULL))
67
68#ifdef CONFIG_X86_32
69int perf_reg_validate(u64 mask)
70{
71 if (!mask || mask & REG_RESERVED)
72 return -EINVAL;
73
74 return 0;
75}
76
77u64 perf_reg_abi(struct task_struct *task)
78{
79 return PERF_SAMPLE_REGS_ABI_32;
80}
81#else /* CONFIG_X86_64 */
82#define REG_NOSUPPORT ((1ULL << PERF_REG_X86_DS) | \
83 (1ULL << PERF_REG_X86_ES) | \
84 (1ULL << PERF_REG_X86_FS) | \
85 (1ULL << PERF_REG_X86_GS))
86
87int perf_reg_validate(u64 mask)
88{
89 if (!mask || mask & REG_RESERVED)
90 return -EINVAL;
91
92 if (mask & REG_NOSUPPORT)
93 return -EINVAL;
94
95 return 0;
96}
97
98u64 perf_reg_abi(struct task_struct *task)
99{
100 if (test_tsk_thread_flag(task, TIF_IA32))
101 return PERF_SAMPLE_REGS_ABI_32;
102 else
103 return PERF_SAMPLE_REGS_ABI_64;
104}
105#endif /* CONFIG_X86_32 */
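/*
 * The deleted perf_regs.c above maps a register index to a field of
 * struct pt_regs through a table of offsetof() values, and validates
 * request masks against a reserved-bits mask.  A runnable user-space model
 * of both techniques (the struct, enum and names are invented for
 * illustration):
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct fake_regs { unsigned long ax, bx, cx, ip; };

enum { REG_AX, REG_BX, REG_CX, REG_IP, REG_MAX };

#define REG_OFFSET(id, r) [id] = offsetof(struct fake_regs, r)

static const unsigned int reg_offset[REG_MAX] = {
	REG_OFFSET(REG_AX, ax),
	REG_OFFSET(REG_BX, bx),
	REG_OFFSET(REG_CX, cx),
	REG_OFFSET(REG_IP, ip),
};

static unsigned long reg_value(const struct fake_regs *regs, int idx)
{
	unsigned long val;

	if (idx < 0 || idx >= REG_MAX)
		return 0;
	memcpy(&val, (const char *)regs + reg_offset[idx], sizeof(val));
	return val;
}

/* perf_reg_validate()-style check: reject empty or out-of-range masks */
static int validate_mask(uint64_t mask)
{
	const uint64_t reserved = ~((1ULL << REG_MAX) - 1);

	return (!mask || (mask & reserved)) ? -1 : 0;
}

int main(void)
{
	struct fake_regs regs = { .ax = 1, .bx = 2, .cx = 3, .ip = 0xfeed };

	printf("ip = %#lx\n", reg_value(&regs, REG_IP));
	printf("mask 0x3:  %s\n", validate_mask(0x3) ? "rejected" : "ok");
	printf("mask 0x30: %s\n", validate_mask(0x30) ? "rejected" : "ok");
	return 0;
}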
diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c
index d5f15c3f7b2..63228035f9d 100644
--- a/arch/x86/kernel/probe_roms.c
+++ b/arch/x86/kernel/probe_roms.c
@@ -10,10 +10,9 @@
10#include <linux/dmi.h> 10#include <linux/dmi.h>
11#include <linux/pfn.h> 11#include <linux/pfn.h>
12#include <linux/pci.h> 12#include <linux/pci.h>
13#include <linux/export.h>
14
15#include <asm/probe_roms.h>
16#include <asm/pci-direct.h> 13#include <asm/pci-direct.h>
14
15
17#include <asm/e820.h> 16#include <asm/e820.h>
18#include <asm/mmzone.h> 17#include <asm/mmzone.h>
19#include <asm/setup.h> 18#include <asm/setup.h>
@@ -150,7 +149,7 @@ static struct resource *find_oprom(struct pci_dev *pdev)
150 return oprom; 149 return oprom;
151} 150}
152 151
153void __iomem *pci_map_biosrom(struct pci_dev *pdev) 152void *pci_map_biosrom(struct pci_dev *pdev)
154{ 153{
155 struct resource *oprom = find_oprom(pdev); 154 struct resource *oprom = find_oprom(pdev);
156 155
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 2ed787f15bf..30eb651d1fa 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -1,5 +1,3 @@
1#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
2
3#include <linux/errno.h> 1#include <linux/errno.h>
4#include <linux/kernel.h> 2#include <linux/kernel.h>
5#include <linux/mm.h> 3#include <linux/mm.h>
@@ -14,54 +12,20 @@
14#include <linux/user-return-notifier.h> 12#include <linux/user-return-notifier.h>
15#include <linux/dmi.h> 13#include <linux/dmi.h>
16#include <linux/utsname.h> 14#include <linux/utsname.h>
17#include <linux/stackprotector.h>
18#include <linux/tick.h>
19#include <linux/cpuidle.h>
20#include <trace/events/power.h> 15#include <trace/events/power.h>
21#include <linux/hw_breakpoint.h> 16#include <linux/hw_breakpoint.h>
22#include <asm/cpu.h> 17#include <asm/cpu.h>
18#include <asm/system.h>
23#include <asm/apic.h> 19#include <asm/apic.h>
24#include <asm/syscalls.h> 20#include <asm/syscalls.h>
25#include <asm/idle.h> 21#include <asm/idle.h>
26#include <asm/uaccess.h> 22#include <asm/uaccess.h>
27#include <asm/i387.h> 23#include <asm/i387.h>
28#include <asm/fpu-internal.h>
29#include <asm/debugreg.h> 24#include <asm/debugreg.h>
30#include <asm/nmi.h>
31
32/*
33 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
34 * no more per-task TSS's. The TSS size is kept cacheline-aligned
35 * so they are allowed to end up in the .data..cacheline_aligned
36 * section. Since TSS's are completely CPU-local, we want them
37 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
38 */
39DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
40
41#ifdef CONFIG_X86_64
42static DEFINE_PER_CPU(unsigned char, is_idle);
43static ATOMIC_NOTIFIER_HEAD(idle_notifier);
44
45void idle_notifier_register(struct notifier_block *n)
46{
47 atomic_notifier_chain_register(&idle_notifier, n);
48}
49EXPORT_SYMBOL_GPL(idle_notifier_register);
50
51void idle_notifier_unregister(struct notifier_block *n)
52{
53 atomic_notifier_chain_unregister(&idle_notifier, n);
54}
55EXPORT_SYMBOL_GPL(idle_notifier_unregister);
56#endif
57 25
58struct kmem_cache *task_xstate_cachep; 26struct kmem_cache *task_xstate_cachep;
59EXPORT_SYMBOL_GPL(task_xstate_cachep); 27EXPORT_SYMBOL_GPL(task_xstate_cachep);
60 28
61/*
62 * this gets called so that we can store lazy state into memory and copy the
63 * current task into the new thread.
64 */
65int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) 29int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
66{ 30{
67 int ret; 31 int ret;
@@ -72,7 +36,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
72 ret = fpu_alloc(&dst->thread.fpu); 36 ret = fpu_alloc(&dst->thread.fpu);
73 if (ret) 37 if (ret)
74 return ret; 38 return ret;
75 fpu_copy(dst, src); 39 fpu_copy(&dst->thread.fpu, &src->thread.fpu);
76 } 40 }
77 return 0; 41 return 0;
78} 42}
@@ -82,9 +46,10 @@ void free_thread_xstate(struct task_struct *tsk)
82 fpu_free(&tsk->thread.fpu); 46 fpu_free(&tsk->thread.fpu);
83} 47}
84 48
85void arch_release_task_struct(struct task_struct *tsk) 49void free_thread_info(struct thread_info *ti)
86{ 50{
87 free_thread_xstate(tsk); 51 free_thread_xstate(ti->task);
52 free_pages((unsigned long)ti, get_order(THREAD_SIZE));
88} 53}
89 54
90void arch_task_cache_init(void) 55void arch_task_cache_init(void)
@@ -117,8 +82,12 @@ void exit_thread(void)
117 put_cpu(); 82 put_cpu();
118 kfree(bp); 83 kfree(bp);
119 } 84 }
85}
120 86
121 drop_fpu(me); 87void show_regs(struct pt_regs *regs)
88{
89 show_registers(regs);
90 show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs), 0);
122} 91}
123 92
124void show_regs_common(void) 93void show_regs_common(void)
@@ -135,14 +104,16 @@ void show_regs_common(void)
135 /* Board Name is optional */ 104 /* Board Name is optional */
136 board = dmi_get_system_info(DMI_BOARD_NAME); 105 board = dmi_get_system_info(DMI_BOARD_NAME);
137 106
138 printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s %s %s%s%s\n", 107 printk(KERN_CONT "\n");
139 current->pid, current->comm, print_tainted(), 108 printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s",
140 init_utsname()->release, 109 current->pid, current->comm, print_tainted(),
141 (int)strcspn(init_utsname()->version, " "), 110 init_utsname()->release,
142 init_utsname()->version, 111 (int)strcspn(init_utsname()->version, " "),
143 vendor, product, 112 init_utsname()->version);
144 board ? "/" : "", 113 printk(KERN_CONT " %s %s", vendor, product);
145 board ? board : ""); 114 if (board)
115 printk(KERN_CONT "/%s", board);
116 printk(KERN_CONT "\n");
146} 117}
147 118
148void flush_thread(void) 119void flush_thread(void)
@@ -151,13 +122,12 @@ void flush_thread(void)
151 122
152 flush_ptrace_hw_breakpoint(tsk); 123 flush_ptrace_hw_breakpoint(tsk);
153 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); 124 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
154 drop_init_fpu(tsk);
155 /* 125 /*
156 * Free the FPU state for non xsave platforms. They get reallocated 126 * Forget coprocessor state..
157 * lazily at the first use.
158 */ 127 */
159 if (!use_eager_fpu()) 128 tsk->fpu_counter = 0;
160 free_thread_xstate(tsk); 129 clear_fpu(tsk);
130 clear_used_math();
161} 131}
162 132
163static void hard_disable_TSC(void) 133static void hard_disable_TSC(void)
@@ -262,112 +232,143 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
262 propagate_user_return_notify(prev_p, next_p); 232 propagate_user_return_notify(prev_p, next_p);
263} 233}
264 234
265/* 235int sys_fork(struct pt_regs *regs)
266 * Idle related variables and functions
267 */
268unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
269EXPORT_SYMBOL(boot_option_idle_override);
270
271/*
272 * Powermanagement idle function, if any..
273 */
274void (*pm_idle)(void);
275#ifdef CONFIG_APM_MODULE
276EXPORT_SYMBOL(pm_idle);
277#endif
278
279#ifndef CONFIG_SMP
280static inline void play_dead(void)
281{ 236{
282 BUG(); 237 return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
283} 238}
284#endif
285 239
286#ifdef CONFIG_X86_64 240/*
287void enter_idle(void) 241 * This is trivial, and on the face of it looks like it
242 * could equally well be done in user mode.
243 *
244 * Not so, for quite unobvious reasons - register pressure.
245 * In user mode vfork() cannot have a stack frame, and if
246 * done by calling the "clone()" system call directly, you
247 * do not have enough call-clobbered registers to hold all
248 * the information you need.
249 */
250int sys_vfork(struct pt_regs *regs)
288{ 251{
289 this_cpu_write(is_idle, 1); 252 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
290 atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); 253 NULL, NULL);
291} 254}
292 255
293static void __exit_idle(void) 256long
257sys_clone(unsigned long clone_flags, unsigned long newsp,
258 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
294{ 259{
295 if (x86_test_and_clear_bit_percpu(0, is_idle) == 0) 260 if (!newsp)
296 return; 261 newsp = regs->sp;
297 atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); 262 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
298} 263}
299 264
300/* Called from interrupts to signify idle end */ 265/*
301void exit_idle(void) 266 * This gets run with %si containing the
302{ 267 * function to call, and %di containing
303 /* idle loop has pid 0 */ 268 * the "args".
304 if (current->pid) 269 */
305 return; 270extern void kernel_thread_helper(void);
306 __exit_idle();
307}
308#endif
309 271
310/* 272/*
311 * The idle thread. There's no useful work to be 273 * Create a kernel thread
312 * done, so just try to conserve power and have a
313 * low exit latency (ie sit in a loop waiting for
314 * somebody to say that they'd like to reschedule)
315 */ 274 */
316void cpu_idle(void) 275int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
317{ 276{
318 /* 277 struct pt_regs regs;
319 * If we're the non-boot CPU, nothing set the stack canary up
320 * for us. CPU0 already has it initialized but no harm in
321 * doing it again. This is a good place for updating it, as
322 * we wont ever return from this function (so the invalid
323 * canaries already on the stack wont ever trigger).
324 */
325 boot_init_stack_canary();
326 current_thread_info()->status |= TS_POLLING;
327 278
328 while (1) { 279 memset(&regs, 0, sizeof(regs));
329 tick_nohz_idle_enter();
330 280
331 while (!need_resched()) { 281 regs.si = (unsigned long) fn;
332 rmb(); 282 regs.di = (unsigned long) arg;
333 283
334 if (cpu_is_offline(smp_processor_id())) 284#ifdef CONFIG_X86_32
335 play_dead(); 285 regs.ds = __USER_DS;
286 regs.es = __USER_DS;
287 regs.fs = __KERNEL_PERCPU;
288 regs.gs = __KERNEL_STACK_CANARY;
289#else
290 regs.ss = __KERNEL_DS;
291#endif
336 292
337 /* 293 regs.orig_ax = -1;
338 * Idle routines should keep interrupts disabled 294 regs.ip = (unsigned long) kernel_thread_helper;
339 * from here on, until they go to idle. 295 regs.cs = __KERNEL_CS | get_kernel_rpl();
340 * Otherwise, idle callbacks can misfire. 296 regs.flags = X86_EFLAGS_IF | 0x2;
341 */ 297
342 local_touch_nmi(); 298 /* Ok, create the new process.. */
343 local_irq_disable(); 299 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
300}
301EXPORT_SYMBOL(kernel_thread);
344 302
345 enter_idle(); 303/*
304 * sys_execve() executes a new program.
305 */
306long sys_execve(const char __user *name,
307 const char __user *const __user *argv,
308 const char __user *const __user *envp, struct pt_regs *regs)
309{
310 long error;
311 char *filename;
312
313 filename = getname(name);
314 error = PTR_ERR(filename);
315 if (IS_ERR(filename))
316 return error;
317 error = do_execve(filename, argv, envp, regs);
318
319#ifdef CONFIG_X86_32
320 if (error == 0) {
321 /* Make sure we don't return using sysenter.. */
322 set_thread_flag(TIF_IRET);
323 }
324#endif
346 325
347 /* Don't trace irqs off for idle */ 326 putname(filename);
348 stop_critical_timings(); 327 return error;
328}
349 329
350 /* enter_idle() needs rcu for notifiers */ 330/*
351 rcu_idle_enter(); 331 * Idle related variables and functions
332 */
333unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
334EXPORT_SYMBOL(boot_option_idle_override);
352 335
353 if (cpuidle_idle_call()) 336/*
354 pm_idle(); 337 * Powermanagement idle function, if any..
338 */
339void (*pm_idle)(void);
340#ifdef CONFIG_APM_MODULE
341EXPORT_SYMBOL(pm_idle);
342#endif
355 343
356 rcu_idle_exit(); 344#ifdef CONFIG_X86_32
357 start_critical_timings(); 345/*
346 * This halt magic was a workaround for ancient floppy DMA
347 * wreckage. It should be safe to remove.
348 */
349static int hlt_counter;
350void disable_hlt(void)
351{
352 hlt_counter++;
353}
354EXPORT_SYMBOL(disable_hlt);
358 355
359 /* In many cases the interrupt that ended idle 356void enable_hlt(void)
360 has already called exit_idle. But some idle 357{
361 loops can be woken up without interrupt. */ 358 hlt_counter--;
362 __exit_idle(); 359}
363 } 360EXPORT_SYMBOL(enable_hlt);
364 361
365 tick_nohz_idle_exit(); 362static inline int hlt_use_halt(void)
366 preempt_enable_no_resched(); 363{
367 schedule(); 364 return (!hlt_counter && boot_cpu_data.hlt_works_ok);
368 preempt_disable(); 365}
369 } 366#else
367static inline int hlt_use_halt(void)
368{
369 return 1;
370} 370}
371#endif
371 372
372/* 373/*
373 * We use this if we don't have any better 374 * We use this if we don't have any better
@@ -375,22 +376,28 @@ void cpu_idle(void)
375 */ 376 */
376void default_idle(void) 377void default_idle(void)
377{ 378{
378 trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id()); 379 if (hlt_use_halt()) {
379 trace_cpu_idle_rcuidle(1, smp_processor_id()); 380 trace_power_start(POWER_CSTATE, 1, smp_processor_id());
380 current_thread_info()->status &= ~TS_POLLING; 381 trace_cpu_idle(1, smp_processor_id());
381 /* 382 current_thread_info()->status &= ~TS_POLLING;
382 * TS_POLLING-cleared state must be visible before we 383 /*
383 * test NEED_RESCHED: 384 * TS_POLLING-cleared state must be visible before we
384 */ 385 * test NEED_RESCHED:
385 smp_mb(); 386 */
387 smp_mb();
386 388
387 if (!need_resched()) 389 if (!need_resched())
388 safe_halt(); /* enables interrupts racelessly */ 390 safe_halt(); /* enables interrupts racelessly */
389 else 391 else
392 local_irq_enable();
393 current_thread_info()->status |= TS_POLLING;
394 trace_power_end(smp_processor_id());
395 trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
396 } else {
390 local_irq_enable(); 397 local_irq_enable();
391 current_thread_info()->status |= TS_POLLING; 398 /* loop is done by the caller */
392 trace_power_end_rcuidle(smp_processor_id()); 399 cpu_relax();
393 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); 400 }
394} 401}
395#ifdef CONFIG_APM_MODULE 402#ifdef CONFIG_APM_MODULE
396EXPORT_SYMBOL(default_idle); 403EXPORT_SYMBOL(default_idle);
@@ -419,12 +426,32 @@ void stop_this_cpu(void *dummy)
419 } 426 }
420} 427}
421 428
429static void do_nothing(void *unused)
430{
431}
432
433/*
434 * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
435 * pm_idle and update to new pm_idle value. Required while changing pm_idle
436 * handler on SMP systems.
437 *
438 * Caller must have changed pm_idle to the new value before the call. Old
439 * pm_idle value will not be used by any CPU after the return of this function.
440 */
441void cpu_idle_wait(void)
442{
443 smp_mb();
444 /* kick all the CPUs so that they exit out of pm_idle */
445 smp_call_function(do_nothing, NULL, 1);
446}
447EXPORT_SYMBOL_GPL(cpu_idle_wait);
448
422/* Default MONITOR/MWAIT with no hints, used for default C1 state */ 449/* Default MONITOR/MWAIT with no hints, used for default C1 state */
423static void mwait_idle(void) 450static void mwait_idle(void)
424{ 451{
425 if (!need_resched()) { 452 if (!need_resched()) {
426 trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id()); 453 trace_power_start(POWER_CSTATE, 1, smp_processor_id());
427 trace_cpu_idle_rcuidle(1, smp_processor_id()); 454 trace_cpu_idle(1, smp_processor_id());
428 if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR)) 455 if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
429 clflush((void *)&current_thread_info()->flags); 456 clflush((void *)&current_thread_info()->flags);
430 457
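/*
 * cpu_idle_wait() in the hunk above publishes a new pm_idle, issues a
 * barrier, and then kicks every CPU so none can still be running the old
 * routine once it returns.  A user-space model of that protocol with
 * pthreads: the "kick" becomes waiting for each worker's iteration counter
 * to move past the value seen when the pointer was swapped.  Structure and
 * names are illustrative, not kernel code.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

#define NWORKERS 2

static void idle_a(void) { usleep(100); }
static void idle_b(void) { usleep(100); }

static _Atomic(void (*)(void)) idle_fn;
static atomic_ulong iterations[NWORKERS];
static atomic_int stop;

static void *worker(void *arg)
{
	long id = (long)arg;

	while (!atomic_load(&stop)) {
		void (*fn)(void) = atomic_load(&idle_fn);

		fn();
		atomic_fetch_add(&iterations[id], 1);
	}
	return NULL;
}

/* cpu_idle_wait() analogue: returns once no worker can still be in idle_a() */
static void idle_fn_wait(void)
{
	unsigned long seen[NWORKERS];
	int i;

	for (i = 0; i < NWORKERS; i++)
		seen[i] = atomic_load(&iterations[i]);
	for (i = 0; i < NWORKERS; i++)
		while (atomic_load(&iterations[i]) == seen[i])
			usleep(10);
}

int main(void)
{
	pthread_t t[NWORKERS];
	long i;

	atomic_store(&idle_fn, idle_a);
	for (i = 0; i < NWORKERS; i++)
		pthread_create(&t[i], NULL, worker, (void *)i);

	usleep(1000);
	atomic_store(&idle_fn, idle_b);	/* caller changes the pointer first ... */
	idle_fn_wait();			/* ... then waits, like cpu_idle_wait() */
	puts("old idle routine is no longer in use");

	atomic_store(&stop, 1);
	for (i = 0; i < NWORKERS; i++)
		pthread_join(t[i], NULL);
	return 0;
}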
@@ -434,8 +461,8 @@ static void mwait_idle(void)
434 __sti_mwait(0, 0); 461 __sti_mwait(0, 0);
435 else 462 else
436 local_irq_enable(); 463 local_irq_enable();
437 trace_power_end_rcuidle(smp_processor_id()); 464 trace_power_end(smp_processor_id());
438 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); 465 trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
439 } else 466 } else
440 local_irq_enable(); 467 local_irq_enable();
441} 468}
@@ -447,13 +474,13 @@ static void mwait_idle(void)
447 */ 474 */
448static void poll_idle(void) 475static void poll_idle(void)
449{ 476{
450 trace_power_start_rcuidle(POWER_CSTATE, 0, smp_processor_id()); 477 trace_power_start(POWER_CSTATE, 0, smp_processor_id());
451 trace_cpu_idle_rcuidle(0, smp_processor_id()); 478 trace_cpu_idle(0, smp_processor_id());
452 local_irq_enable(); 479 local_irq_enable();
453 while (!need_resched()) 480 while (!need_resched())
454 cpu_relax(); 481 cpu_relax();
455 trace_power_end_rcuidle(smp_processor_id()); 482 trace_power_end(smp_processor_id());
456 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); 483 trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
457} 484}
458 485
459/* 486/*
@@ -477,17 +504,9 @@ int mwait_usable(const struct cpuinfo_x86 *c)
477{ 504{
478 u32 eax, ebx, ecx, edx; 505 u32 eax, ebx, ecx, edx;
479 506
480 /* Use mwait if idle=mwait boot option is given */
481 if (boot_option_idle_override == IDLE_FORCE_MWAIT) 507 if (boot_option_idle_override == IDLE_FORCE_MWAIT)
482 return 1; 508 return 1;
483 509
484 /*
485 * Any idle= boot option other than idle=mwait means that we must not
486 * use mwait. Eg: idle=halt or idle=poll or idle=nomwait
487 */
488 if (boot_option_idle_override != IDLE_NO_OVERRIDE)
489 return 0;
490
491 if (c->cpuid_level < MWAIT_INFO) 510 if (c->cpuid_level < MWAIT_INFO)
492 return 0; 511 return 0;
493 512
@@ -533,7 +552,7 @@ static void amd_e400_idle(void)
533 amd_e400_c1e_detected = true; 552 amd_e400_c1e_detected = true;
534 if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) 553 if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
535 mark_tsc_unstable("TSC halt in AMD C1E"); 554 mark_tsc_unstable("TSC halt in AMD C1E");
536 pr_info("System has AMD C1E enabled\n"); 555 printk(KERN_INFO "System has AMD C1E enabled\n");
537 } 556 }
538 } 557 }
539 558
@@ -547,7 +566,8 @@ static void amd_e400_idle(void)
547 */ 566 */
548 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, 567 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
549 &cpu); 568 &cpu);
550 pr_info("Switch to broadcast mode on CPU%d\n", cpu); 569 printk(KERN_INFO "Switch to broadcast mode on CPU%d\n",
570 cpu);
551 } 571 }
552 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); 572 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
553 573
@@ -568,7 +588,8 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
568{ 588{
569#ifdef CONFIG_SMP 589#ifdef CONFIG_SMP
570 if (pm_idle == poll_idle && smp_num_siblings > 1) { 590 if (pm_idle == poll_idle && smp_num_siblings > 1) {
571 pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n"); 591 printk_once(KERN_WARNING "WARNING: polling idle and HT enabled,"
592 " performance may degrade.\n");
572 } 593 }
573#endif 594#endif
574 if (pm_idle) 595 if (pm_idle)
@@ -578,11 +599,11 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
578 /* 599 /*
579 * One CPU supports mwait => All CPUs supports mwait 600 * One CPU supports mwait => All CPUs supports mwait
580 */ 601 */
581 pr_info("using mwait in idle threads\n"); 602 printk(KERN_INFO "using mwait in idle threads.\n");
582 pm_idle = mwait_idle; 603 pm_idle = mwait_idle;
583 } else if (cpu_has_amd_erratum(amd_erratum_400)) { 604 } else if (cpu_has_amd_erratum(amd_erratum_400)) {
584 /* E400: APIC timer interrupt does not wake up CPU from C1e */ 605 /* E400: APIC timer interrupt does not wake up CPU from C1e */
585 pr_info("using AMD E400 aware idle routine\n"); 606 printk(KERN_INFO "using AMD E400 aware idle routine\n");
586 pm_idle = amd_e400_idle; 607 pm_idle = amd_e400_idle;
587 } else 608 } else
588 pm_idle = default_idle; 609 pm_idle = default_idle;
@@ -601,7 +622,7 @@ static int __init idle_setup(char *str)
601 return -EINVAL; 622 return -EINVAL;
602 623
603 if (!strcmp(str, "poll")) { 624 if (!strcmp(str, "poll")) {
604 pr_info("using polling idle threads\n"); 625 printk("using polling idle threads.\n");
605 pm_idle = poll_idle; 626 pm_idle = poll_idle;
606 boot_option_idle_override = IDLE_POLL; 627 boot_option_idle_override = IDLE_POLL;
607 } else if (!strcmp(str, "mwait")) { 628 } else if (!strcmp(str, "mwait")) {
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index b5a8905785e..7a3b65107a2 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -9,6 +9,7 @@
9 * This file handles the architecture-dependent parts of process handling.. 9 * This file handles the architecture-dependent parts of process handling..
10 */ 10 */
11 11
12#include <linux/stackprotector.h>
12#include <linux/cpu.h> 13#include <linux/cpu.h>
13#include <linux/errno.h> 14#include <linux/errno.h>
14#include <linux/sched.h> 15#include <linux/sched.h>
@@ -30,18 +31,20 @@
30#include <linux/kallsyms.h> 31#include <linux/kallsyms.h>
31#include <linux/ptrace.h> 32#include <linux/ptrace.h>
32#include <linux/personality.h> 33#include <linux/personality.h>
34#include <linux/tick.h>
33#include <linux/percpu.h> 35#include <linux/percpu.h>
34#include <linux/prctl.h> 36#include <linux/prctl.h>
35#include <linux/ftrace.h> 37#include <linux/ftrace.h>
36#include <linux/uaccess.h> 38#include <linux/uaccess.h>
37#include <linux/io.h> 39#include <linux/io.h>
38#include <linux/kdebug.h> 40#include <linux/kdebug.h>
41#include <linux/cpuidle.h>
39 42
40#include <asm/pgtable.h> 43#include <asm/pgtable.h>
44#include <asm/system.h>
41#include <asm/ldt.h> 45#include <asm/ldt.h>
42#include <asm/processor.h> 46#include <asm/processor.h>
43#include <asm/i387.h> 47#include <asm/i387.h>
44#include <asm/fpu-internal.h>
45#include <asm/desc.h> 48#include <asm/desc.h>
46#ifdef CONFIG_MATH_EMULATION 49#ifdef CONFIG_MATH_EMULATION
47#include <asm/math_emu.h> 50#include <asm/math_emu.h>
@@ -54,10 +57,8 @@
54#include <asm/idle.h> 57#include <asm/idle.h>
55#include <asm/syscalls.h> 58#include <asm/syscalls.h>
56#include <asm/debugreg.h> 59#include <asm/debugreg.h>
57#include <asm/switch_to.h>
58 60
59asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 61asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
60asmlinkage void ret_from_kernel_thread(void) __asm__("ret_from_kernel_thread");
61 62
62/* 63/*
63 * Return saved PC of a blocked thread. 64 * Return saved PC of a blocked thread.
@@ -67,6 +68,59 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
67 return ((unsigned long *)tsk->thread.sp)[3]; 68 return ((unsigned long *)tsk->thread.sp)[3];
68} 69}
69 70
71#ifndef CONFIG_SMP
72static inline void play_dead(void)
73{
74 BUG();
75}
76#endif
77
78/*
79 * The idle thread. There's no useful work to be
80 * done, so just try to conserve power and have a
81 * low exit latency (ie sit in a loop waiting for
82 * somebody to say that they'd like to reschedule)
83 */
84void cpu_idle(void)
85{
86 int cpu = smp_processor_id();
87
88 /*
89 * If we're the non-boot CPU, nothing set the stack canary up
90 * for us. CPU0 already has it initialized but no harm in
91 * doing it again. This is a good place for updating it, as
92 * we wont ever return from this function (so the invalid
93 * canaries already on the stack wont ever trigger).
94 */
95 boot_init_stack_canary();
96
97 current_thread_info()->status |= TS_POLLING;
98
99 /* endless idle loop with no priority at all */
100 while (1) {
101 tick_nohz_stop_sched_tick(1);
102 while (!need_resched()) {
103
104 check_pgt_cache();
105 rmb();
106
107 if (cpu_is_offline(cpu))
108 play_dead();
109
110 local_irq_disable();
111 /* Don't trace irqs off for idle */
112 stop_critical_timings();
113 if (cpuidle_idle_call())
114 pm_idle();
115 start_critical_timings();
116 }
117 tick_nohz_restart_sched_tick();
118 preempt_enable_no_resched();
119 schedule();
120 preempt_disable();
121 }
122}
123
70void __show_regs(struct pt_regs *regs, int all) 124void __show_regs(struct pt_regs *regs, int all)
71{ 125{
72 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; 126 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
@@ -127,43 +181,35 @@ void release_thread(struct task_struct *dead_task)
127 release_vm86_irqs(dead_task); 181 release_vm86_irqs(dead_task);
128} 182}
129 183
184/*
185 * This gets called before we allocate a new thread and copy
186 * the current task into it.
187 */
188void prepare_to_copy(struct task_struct *tsk)
189{
190 unlazy_fpu(tsk);
191}
192
130int copy_thread(unsigned long clone_flags, unsigned long sp, 193int copy_thread(unsigned long clone_flags, unsigned long sp,
131 unsigned long arg, struct task_struct *p) 194 unsigned long unused,
195 struct task_struct *p, struct pt_regs *regs)
132{ 196{
133 struct pt_regs *childregs = task_pt_regs(p); 197 struct pt_regs *childregs;
134 struct task_struct *tsk; 198 struct task_struct *tsk;
135 int err; 199 int err;
136 200
201 childregs = task_pt_regs(p);
202 *childregs = *regs;
203 childregs->ax = 0;
204 childregs->sp = sp;
205
137 p->thread.sp = (unsigned long) childregs; 206 p->thread.sp = (unsigned long) childregs;
138 p->thread.sp0 = (unsigned long) (childregs+1); 207 p->thread.sp0 = (unsigned long) (childregs+1);
139 208
140 if (unlikely(p->flags & PF_KTHREAD)) {
141 /* kernel thread */
142 memset(childregs, 0, sizeof(struct pt_regs));
143 p->thread.ip = (unsigned long) ret_from_kernel_thread;
144 task_user_gs(p) = __KERNEL_STACK_CANARY;
145 childregs->ds = __USER_DS;
146 childregs->es = __USER_DS;
147 childregs->fs = __KERNEL_PERCPU;
148 childregs->bx = sp; /* function */
149 childregs->bp = arg;
150 childregs->orig_ax = -1;
151 childregs->cs = __KERNEL_CS | get_kernel_rpl();
152 childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1;
153 p->fpu_counter = 0;
154 p->thread.io_bitmap_ptr = NULL;
155 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
156 return 0;
157 }
158 *childregs = *current_pt_regs();
159 childregs->ax = 0;
160 if (sp)
161 childregs->sp = sp;
162
163 p->thread.ip = (unsigned long) ret_from_fork; 209 p->thread.ip = (unsigned long) ret_from_fork;
164 task_user_gs(p) = get_user_gs(current_pt_regs());
165 210
166 p->fpu_counter = 0; 211 task_user_gs(p) = get_user_gs(regs);
212
167 p->thread.io_bitmap_ptr = NULL; 213 p->thread.io_bitmap_ptr = NULL;
168 tsk = current; 214 tsk = current;
169 err = -ENOMEM; 215 err = -ENOMEM;
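The childregs->ax = 0 assignment restored above is what makes fork() appear to return 0 in the child while the parent gets the child's PID. A plain POSIX illustration of that userspace-visible contract (independent of the kernel internals in this hunk):

/* Illustrates the effect of zeroing the child's saved ax: fork()
 * returns 0 in the child and the child's PID in the parent. */
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        pid_t pid = fork();

        if (pid < 0) {
                perror("fork");
                return 1;
        }
        if (pid == 0) {                 /* child: saved ax was set to 0 */
                printf("child: fork() returned 0\n");
                _exit(0);
        }
        printf("parent: fork() returned %d\n", (int)pid);
        waitpid(pid, NULL, 0);
        return 0;
}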
@@ -207,18 +253,16 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
207 regs->cs = __USER_CS; 253 regs->cs = __USER_CS;
208 regs->ip = new_ip; 254 regs->ip = new_ip;
209 regs->sp = new_sp; 255 regs->sp = new_sp;
210 regs->flags = X86_EFLAGS_IF;
211 /* 256 /*
212 * force it to the iret return path by making it look as if there was 257 * Free the old FP and other extended state
213 * some work pending.
214 */ 258 */
215 set_thread_flag(TIF_NOTIFY_RESUME); 259 free_thread_xstate(current);
216} 260}
217EXPORT_SYMBOL_GPL(start_thread); 261EXPORT_SYMBOL_GPL(start_thread);
218 262
219 263
220/* 264/*
221 * switch_to(x,y) should switch tasks from x to y. 265 * switch_to(x,yn) should switch tasks from x to y.
222 * 266 *
223 * We fsave/fwait so that an exception goes off at the right time 267 * We fsave/fwait so that an exception goes off at the right time
224 * (as a call from the fsave or fwait in effect) rather than to 268 * (as a call from the fsave or fwait in effect) rather than to
@@ -251,11 +295,22 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
251 *next = &next_p->thread; 295 *next = &next_p->thread;
252 int cpu = smp_processor_id(); 296 int cpu = smp_processor_id();
253 struct tss_struct *tss = &per_cpu(init_tss, cpu); 297 struct tss_struct *tss = &per_cpu(init_tss, cpu);
254 fpu_switch_t fpu; 298 bool preload_fpu;
255 299
256 /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ 300 /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
257 301
258 fpu = switch_fpu_prepare(prev_p, next_p, cpu); 302 /*
303 * If the task has used fpu the last 5 timeslices, just do a full
304 * restore of the math state immediately to avoid the trap; the
305 * chances of needing FPU soon are obviously high now
306 */
307 preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;
308
309 __unlazy_fpu(prev_p);
310
311 /* we're going to use this soon, after a few expensive things */
312 if (preload_fpu)
313 prefetch(next->fpu.state);
259 314
260 /* 315 /*
261 * Reload esp0. 316 * Reload esp0.
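The preload_fpu test reintroduced here is a simple recency heuristic: eagerly restore the math state only for tasks that used the FPU in more than five consecutive timeslices. A self-contained sketch of that decision, with an illustrative task struct rather than the kernel's:

/* Sketch of the "used the FPU recently" heuristic: a per-task counter
 * is bumped on each timeslice that touched the FPU, and the context
 * switch preloads the state once the counter exceeds 5.  The struct
 * and threshold below are illustrative, not kernel ABI. */
#include <stdbool.h>
#include <stdio.h>

struct task {
        bool used_math;         /* task has valid FPU state */
        unsigned fpu_counter;   /* consecutive FPU-using timeslices */
};

static bool should_preload_fpu(const struct task *next)
{
        return next->used_math && next->fpu_counter > 5;
}

int main(void)
{
        struct task light = { .used_math = true, .fpu_counter = 1 };
        struct task heavy = { .used_math = true, .fpu_counter = 9 };

        printf("light task preload: %d\n", should_preload_fpu(&light));
        printf("heavy task preload: %d\n", should_preload_fpu(&heavy));
        return 0;
}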
@@ -295,6 +350,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
295 task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) 350 task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
296 __switch_to_xtra(prev_p, next_p, tss); 351 __switch_to_xtra(prev_p, next_p, tss);
297 352
353 /* If we're going to preload the fpu context, make sure clts
354 is run while we're batching the cpu state updates. */
355 if (preload_fpu)
356 clts();
357
298 /* 358 /*
299 * Leave lazy mode, flushing any hypercalls made here. 359 * Leave lazy mode, flushing any hypercalls made here.
300 * This must be done before restoring TLS segments so 360 * This must be done before restoring TLS segments so
@@ -304,15 +364,16 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
304 */ 364 */
305 arch_end_context_switch(next_p); 365 arch_end_context_switch(next_p);
306 366
367 if (preload_fpu)
368 __math_state_restore();
369
307 /* 370 /*
308 * Restore %gs if needed (which is common) 371 * Restore %gs if needed (which is common)
309 */ 372 */
310 if (prev->gs | next->gs) 373 if (prev->gs | next->gs)
311 lazy_load_gs(next->gs); 374 lazy_load_gs(next->gs);
312 375
313 switch_fpu_finish(next_p, fpu); 376 percpu_write(current_task, next_p);
314
315 this_cpu_write(current_task, next_p);
316 377
317 return prev_p; 378 return prev_p;
318} 379}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 6e68a619496..cbd26458911 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -14,6 +14,7 @@
14 * This file handles the architecture-dependent parts of process handling.. 14 * This file handles the architecture-dependent parts of process handling..
15 */ 15 */
16 16
17#include <linux/stackprotector.h>
17#include <linux/cpu.h> 18#include <linux/cpu.h>
18#include <linux/errno.h> 19#include <linux/errno.h>
19#include <linux/sched.h> 20#include <linux/sched.h>
@@ -31,15 +32,17 @@
31#include <linux/notifier.h> 32#include <linux/notifier.h>
32#include <linux/kprobes.h> 33#include <linux/kprobes.h>
33#include <linux/kdebug.h> 34#include <linux/kdebug.h>
35#include <linux/tick.h>
34#include <linux/prctl.h> 36#include <linux/prctl.h>
35#include <linux/uaccess.h> 37#include <linux/uaccess.h>
36#include <linux/io.h> 38#include <linux/io.h>
37#include <linux/ftrace.h> 39#include <linux/ftrace.h>
40#include <linux/cpuidle.h>
38 41
39#include <asm/pgtable.h> 42#include <asm/pgtable.h>
43#include <asm/system.h>
40#include <asm/processor.h> 44#include <asm/processor.h>
41#include <asm/i387.h> 45#include <asm/i387.h>
42#include <asm/fpu-internal.h>
43#include <asm/mmu_context.h> 46#include <asm/mmu_context.h>
44#include <asm/prctl.h> 47#include <asm/prctl.h>
45#include <asm/desc.h> 48#include <asm/desc.h>
@@ -48,11 +51,94 @@
48#include <asm/idle.h> 51#include <asm/idle.h>
49#include <asm/syscalls.h> 52#include <asm/syscalls.h>
50#include <asm/debugreg.h> 53#include <asm/debugreg.h>
51#include <asm/switch_to.h>
52 54
53asmlinkage extern void ret_from_fork(void); 55asmlinkage extern void ret_from_fork(void);
54 56
55DEFINE_PER_CPU(unsigned long, old_rsp); 57DEFINE_PER_CPU(unsigned long, old_rsp);
58static DEFINE_PER_CPU(unsigned char, is_idle);
59
60void enter_idle(void)
61{
62 percpu_write(is_idle, 1);
63 idle_notifier_call_chain(IDLE_START);
64}
65
66static void __exit_idle(void)
67{
68 if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
69 return;
70 idle_notifier_call_chain(IDLE_END);
71}
72
73/* Called from interrupts to signify idle end */
74void exit_idle(void)
75{
76 /* idle loop has pid 0 */
77 if (current->pid)
78 return;
79 __exit_idle();
80}
81
82#ifndef CONFIG_SMP
83static inline void play_dead(void)
84{
85 BUG();
86}
87#endif
88
89/*
90 * The idle thread. There's no useful work to be
91 * done, so just try to conserve power and have a
92 * low exit latency (ie sit in a loop waiting for
93 * somebody to say that they'd like to reschedule)
94 */
95void cpu_idle(void)
96{
97 current_thread_info()->status |= TS_POLLING;
98
99 /*
100 * If we're the non-boot CPU, nothing set the stack canary up
101 * for us. CPU0 already has it initialized but no harm in
102 * doing it again. This is a good place for updating it, as
 103 * we won't ever return from this function (so the invalid
 104 * canaries already on the stack won't ever trigger).
105 */
106 boot_init_stack_canary();
107
108 /* endless idle loop with no priority at all */
109 while (1) {
110 tick_nohz_stop_sched_tick(1);
111 while (!need_resched()) {
112
113 rmb();
114
115 if (cpu_is_offline(smp_processor_id()))
116 play_dead();
117 /*
118 * Idle routines should keep interrupts disabled
119 * from here on, until they go to idle.
120 * Otherwise, idle callbacks can misfire.
121 */
122 local_irq_disable();
123 enter_idle();
124 /* Don't trace irqs off for idle */
125 stop_critical_timings();
126 if (cpuidle_idle_call())
127 pm_idle();
128 start_critical_timings();
129
130 /* In many cases the interrupt that ended idle
131 has already called exit_idle. But some idle
132 loops can be woken up without interrupt. */
133 __exit_idle();
134 }
135
136 tick_nohz_restart_sched_tick();
137 preempt_enable_no_resched();
138 schedule();
139 preempt_disable();
140 }
141}
56 142
57/* Prints also some state that isn't saved in the pt_regs */ 143/* Prints also some state that isn't saved in the pt_regs */
58void __show_regs(struct pt_regs *regs, int all) 144void __show_regs(struct pt_regs *regs, int all)
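__exit_idle() above depends on an atomic test-and-clear so that the IDLE_END notification is delivered at most once per idle period, even when both the waking interrupt and the idle loop try to end it. A userspace sketch of that guard, using C11 atomics and an invented notifier:

/* Guard pattern from __exit_idle(): atomically clear the "is idle"
 * flag and notify only if we were the one to clear it.  C11 atomics;
 * notify_idle_end() is a stand-in for the idle notifier chain. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int is_idle;

static void notify_idle_end(void)
{
        puts("IDLE_END delivered exactly once");
}

static void exit_idle(void)
{
        if (atomic_exchange(&is_idle, 0) == 0)
                return;                 /* someone else already ended idle */
        notify_idle_end();
}

int main(void)
{
        atomic_store(&is_idle, 1);      /* enter_idle() */
        exit_idle();                    /* e.g. from the interrupt */
        exit_idle();                    /* and again from the idle loop */
        return 0;
}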
@@ -117,10 +203,10 @@ void release_thread(struct task_struct *dead_task)
117{ 203{
118 if (dead_task->mm) { 204 if (dead_task->mm) {
119 if (dead_task->mm->context.size) { 205 if (dead_task->mm->context.size) {
120 pr_warn("WARNING: dead process %8s still has LDT? <%p/%d>\n", 206 printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
121 dead_task->comm, 207 dead_task->comm,
122 dead_task->mm->context.ldt, 208 dead_task->mm->context.ldt,
123 dead_task->mm->context.size); 209 dead_task->mm->context.size);
124 BUG(); 210 BUG();
125 } 211 }
126 } 212 }
@@ -145,19 +231,39 @@ static inline u32 read_32bit_tls(struct task_struct *t, int tls)
145 return get_desc_base(&t->thread.tls_array[tls]); 231 return get_desc_base(&t->thread.tls_array[tls]);
146} 232}
147 233
234/*
235 * This gets called before we allocate a new thread and copy
236 * the current task into it.
237 */
238void prepare_to_copy(struct task_struct *tsk)
239{
240 unlazy_fpu(tsk);
241}
242
148int copy_thread(unsigned long clone_flags, unsigned long sp, 243int copy_thread(unsigned long clone_flags, unsigned long sp,
149 unsigned long arg, struct task_struct *p) 244 unsigned long unused,
245 struct task_struct *p, struct pt_regs *regs)
150{ 246{
151 int err; 247 int err;
152 struct pt_regs *childregs; 248 struct pt_regs *childregs;
153 struct task_struct *me = current; 249 struct task_struct *me = current;
154 250
155 p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE; 251 childregs = ((struct pt_regs *)
156 childregs = task_pt_regs(p); 252 (THREAD_SIZE + task_stack_page(p))) - 1;
253 *childregs = *regs;
254
255 childregs->ax = 0;
256 if (user_mode(regs))
257 childregs->sp = sp;
258 else
259 childregs->sp = (unsigned long)childregs;
260
157 p->thread.sp = (unsigned long) childregs; 261 p->thread.sp = (unsigned long) childregs;
262 p->thread.sp0 = (unsigned long) (childregs+1);
158 p->thread.usersp = me->thread.usersp; 263 p->thread.usersp = me->thread.usersp;
264
159 set_tsk_thread_flag(p, TIF_FORK); 265 set_tsk_thread_flag(p, TIF_FORK);
160 p->fpu_counter = 0; 266
161 p->thread.io_bitmap_ptr = NULL; 267 p->thread.io_bitmap_ptr = NULL;
162 268
163 savesegment(gs, p->thread.gsindex); 269 savesegment(gs, p->thread.gsindex);
@@ -166,36 +272,18 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
166 p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs; 272 p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
167 savesegment(es, p->thread.es); 273 savesegment(es, p->thread.es);
168 savesegment(ds, p->thread.ds); 274 savesegment(ds, p->thread.ds);
169 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
170
171 if (unlikely(p->flags & PF_KTHREAD)) {
172 /* kernel thread */
173 memset(childregs, 0, sizeof(struct pt_regs));
174 childregs->sp = (unsigned long)childregs;
175 childregs->ss = __KERNEL_DS;
176 childregs->bx = sp; /* function */
177 childregs->bp = arg;
178 childregs->orig_ax = -1;
179 childregs->cs = __KERNEL_CS | get_kernel_rpl();
180 childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1;
181 return 0;
182 }
183 *childregs = *current_pt_regs();
184
185 childregs->ax = 0;
186 if (sp)
187 childregs->sp = sp;
188 275
189 err = -ENOMEM; 276 err = -ENOMEM;
190 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); 277 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
191 278
192 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { 279 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
193 p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr, 280 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
194 IO_BITMAP_BYTES, GFP_KERNEL);
195 if (!p->thread.io_bitmap_ptr) { 281 if (!p->thread.io_bitmap_ptr) {
196 p->thread.io_bitmap_max = 0; 282 p->thread.io_bitmap_max = 0;
197 return -ENOMEM; 283 return -ENOMEM;
198 } 284 }
285 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
286 IO_BITMAP_BYTES);
199 set_tsk_thread_flag(p, TIF_IO_BITMAP); 287 set_tsk_thread_flag(p, TIF_IO_BITMAP);
200 } 288 }
201 289
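This hunk swaps kmemdup() for an explicit allocate-then-copy of the I/O bitmap; the two are equivalent, as the following userspace analogue (malloc-based, illustration only) shows:

/* Userspace analogue of kmemdup(): allocate a buffer and copy the
 * source into it, which is what the open-coded kmalloc()+memcpy()
 * pair above does for the I/O bitmap. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void *memdup(const void *src, size_t len)
{
        void *p = malloc(len);

        if (p)
                memcpy(p, src, len);
        return p;
}

int main(void)
{
        const char bitmap[8] = { 0x01, 0x02, 0x04, 0x08 };
        char *copy = memdup(bitmap, sizeof(bitmap));

        if (!copy)
                return 1;               /* mirrors the -ENOMEM path */
        printf("copy[3] = 0x%02x\n", copy[3]);
        free(copy);
        return 0;
}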
@@ -232,13 +320,16 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
232 loadsegment(es, _ds); 320 loadsegment(es, _ds);
233 loadsegment(ds, _ds); 321 loadsegment(ds, _ds);
234 load_gs_index(0); 322 load_gs_index(0);
235 current->thread.usersp = new_sp;
236 regs->ip = new_ip; 323 regs->ip = new_ip;
237 regs->sp = new_sp; 324 regs->sp = new_sp;
238 this_cpu_write(old_rsp, new_sp); 325 percpu_write(old_rsp, new_sp);
239 regs->cs = _cs; 326 regs->cs = _cs;
240 regs->ss = _ss; 327 regs->ss = _ss;
241 regs->flags = X86_EFLAGS_IF; 328 regs->flags = X86_EFLAGS_IF;
329 /*
330 * Free the old FP and other extended state
331 */
332 free_thread_xstate(current);
242} 333}
243 334
244void 335void
@@ -252,9 +343,7 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
252void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp) 343void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
253{ 344{
254 start_thread_common(regs, new_ip, new_sp, 345 start_thread_common(regs, new_ip, new_sp,
255 test_thread_flag(TIF_X32) 346 __USER32_CS, __USER32_DS, __USER32_DS);
256 ? __USER_CS : __USER32_CS,
257 __USER_DS, __USER_DS);
258} 347}
259#endif 348#endif
260 349
@@ -276,9 +365,18 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
276 int cpu = smp_processor_id(); 365 int cpu = smp_processor_id();
277 struct tss_struct *tss = &per_cpu(init_tss, cpu); 366 struct tss_struct *tss = &per_cpu(init_tss, cpu);
278 unsigned fsindex, gsindex; 367 unsigned fsindex, gsindex;
279 fpu_switch_t fpu; 368 bool preload_fpu;
369
370 /*
371 * If the task has used fpu the last 5 timeslices, just do a full
372 * restore of the math state immediately to avoid the trap; the
373 * chances of needing FPU soon are obviously high now
374 */
375 preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;
280 376
281 fpu = switch_fpu_prepare(prev_p, next_p, cpu); 377 /* we're going to use this soon, after a few expensive things */
378 if (preload_fpu)
379 prefetch(next->fpu.state);
282 380
283 /* 381 /*
284 * Reload esp0, LDT and the page table pointer: 382 * Reload esp0, LDT and the page table pointer:
@@ -308,6 +406,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
308 406
309 load_TLS(next, cpu); 407 load_TLS(next, cpu);
310 408
409 /* Must be after DS reload */
410 __unlazy_fpu(prev_p);
411
412 /* Make sure cpu is ready for new context */
413 if (preload_fpu)
414 clts();
415
311 /* 416 /*
312 * Leave lazy mode, flushing any hypercalls made here. 417 * Leave lazy mode, flushing any hypercalls made here.
313 * This must be done before restoring TLS segments so 418 * This must be done before restoring TLS segments so
@@ -348,16 +453,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
348 wrmsrl(MSR_KERNEL_GS_BASE, next->gs); 453 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
349 prev->gsindex = gsindex; 454 prev->gsindex = gsindex;
350 455
351 switch_fpu_finish(next_p, fpu);
352
353 /* 456 /*
354 * Switch the PDA and FPU contexts. 457 * Switch the PDA and FPU contexts.
355 */ 458 */
356 prev->usersp = this_cpu_read(old_rsp); 459 prev->usersp = percpu_read(old_rsp);
357 this_cpu_write(old_rsp, next->usersp); 460 percpu_write(old_rsp, next->usersp);
358 this_cpu_write(current_task, next_p); 461 percpu_write(current_task, next_p);
359 462
360 this_cpu_write(kernel_stack, 463 percpu_write(kernel_stack,
361 (unsigned long)task_stack_page(next_p) + 464 (unsigned long)task_stack_page(next_p) +
362 THREAD_SIZE - KERNEL_STACK_OFFSET); 465 THREAD_SIZE - KERNEL_STACK_OFFSET);
363 466
@@ -368,6 +471,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
368 task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) 471 task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
369 __switch_to_xtra(prev_p, next_p, tss); 472 __switch_to_xtra(prev_p, next_p, tss);
370 473
474 /*
475 * Preload the FPU context, now that we've determined that the
476 * task is likely to be using it.
477 */
478 if (preload_fpu)
479 __math_state_restore();
480
371 return prev_p; 481 return prev_p;
372} 482}
373 483
@@ -377,8 +487,6 @@ void set_personality_64bit(void)
377 487
378 /* Make sure to be in 64bit mode */ 488 /* Make sure to be in 64bit mode */
379 clear_thread_flag(TIF_IA32); 489 clear_thread_flag(TIF_IA32);
380 clear_thread_flag(TIF_ADDR32);
381 clear_thread_flag(TIF_X32);
382 490
383 /* Ensure the corresponding mm is not marked. */ 491 /* Ensure the corresponding mm is not marked. */
384 if (current->mm) 492 if (current->mm)
@@ -391,33 +499,21 @@ void set_personality_64bit(void)
391 current->personality &= ~READ_IMPLIES_EXEC; 499 current->personality &= ~READ_IMPLIES_EXEC;
392} 500}
393 501
394void set_personality_ia32(bool x32) 502void set_personality_ia32(void)
395{ 503{
396 /* inherit personality from parent */ 504 /* inherit personality from parent */
397 505
398 /* Make sure to be in 32bit mode */ 506 /* Make sure to be in 32bit mode */
399 set_thread_flag(TIF_ADDR32); 507 set_thread_flag(TIF_IA32);
508 current->personality |= force_personality32;
400 509
401 /* Mark the associated mm as containing 32-bit tasks. */ 510 /* Mark the associated mm as containing 32-bit tasks. */
402 if (current->mm) 511 if (current->mm)
403 current->mm->context.ia32_compat = 1; 512 current->mm->context.ia32_compat = 1;
404 513
405 if (x32) { 514 /* Prepare the first "return" to user space */
406 clear_thread_flag(TIF_IA32); 515 current_thread_info()->status |= TS_COMPAT;
407 set_thread_flag(TIF_X32);
408 current->personality &= ~READ_IMPLIES_EXEC;
409 /* is_compat_task() uses the presence of the x32
410 syscall bit flag to determine compat status */
411 current_thread_info()->status &= ~TS_COMPAT;
412 } else {
413 set_thread_flag(TIF_IA32);
414 clear_thread_flag(TIF_X32);
415 current->personality |= force_personality32;
416 /* Prepare the first "return" to user space */
417 current_thread_info()->status |= TS_COMPAT;
418 }
419} 516}
420EXPORT_SYMBOL_GPL(set_personality_ia32);
421 517
422unsigned long get_wchan(struct task_struct *p) 518unsigned long get_wchan(struct task_struct *p)
423{ 519{
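The 32-bit personality configured by set_personality_ia32() is the same machinery userspace can exercise through personality(2). A short Linux-specific sketch; on an x86-64 kernel, uname() reports an i686 machine after switching to PER_LINUX32:

/* Shows the userspace-visible side of the 32-bit personality that
 * set_personality_ia32() establishes for compat tasks.  Linux-specific;
 * uses the personality(2) syscall wrapper from <sys/personality.h>. */
#include <stdio.h>
#include <sys/personality.h>
#include <sys/utsname.h>

static void print_machine(const char *when)
{
        struct utsname u;

        if (uname(&u) == 0)
                printf("%s: machine = %s\n", when, u.machine);
}

int main(void)
{
        print_machine("before");
        if (personality(PER_LINUX32) == -1)
                perror("personality");
        print_machine("after");
        return 0;
}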
@@ -469,7 +565,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
469 task->thread.gs = addr; 565 task->thread.gs = addr;
470 if (doit) { 566 if (doit) {
471 load_gs_index(0); 567 load_gs_index(0);
472 ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr); 568 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
473 } 569 }
474 } 570 }
475 put_cpu(); 571 put_cpu();
@@ -497,7 +593,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
497 /* set the selector to 0 to not confuse 593 /* set the selector to 0 to not confuse
498 __switch_to */ 594 __switch_to */
499 loadsegment(fs, 0); 595 loadsegment(fs, 0);
500 ret = wrmsrl_safe(MSR_FS_BASE, addr); 596 ret = checking_wrmsrl(MSR_FS_BASE, addr);
501 } 597 }
502 } 598 }
503 put_cpu(); 599 put_cpu();
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index b629bbe0d9b..82528799c5d 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -21,22 +21,18 @@
21#include <linux/signal.h> 21#include <linux/signal.h>
22#include <linux/perf_event.h> 22#include <linux/perf_event.h>
23#include <linux/hw_breakpoint.h> 23#include <linux/hw_breakpoint.h>
24#include <linux/rcupdate.h>
25#include <linux/module.h>
26#include <linux/context_tracking.h>
27 24
28#include <asm/uaccess.h> 25#include <asm/uaccess.h>
29#include <asm/pgtable.h> 26#include <asm/pgtable.h>
27#include <asm/system.h>
30#include <asm/processor.h> 28#include <asm/processor.h>
31#include <asm/i387.h> 29#include <asm/i387.h>
32#include <asm/fpu-internal.h>
33#include <asm/debugreg.h> 30#include <asm/debugreg.h>
34#include <asm/ldt.h> 31#include <asm/ldt.h>
35#include <asm/desc.h> 32#include <asm/desc.h>
36#include <asm/prctl.h> 33#include <asm/prctl.h>
37#include <asm/proto.h> 34#include <asm/proto.h>
38#include <asm/hw_breakpoint.h> 35#include <asm/hw_breakpoint.h>
39#include <asm/traps.h>
40 36
41#include "tls.h" 37#include "tls.h"
42 38
@@ -168,35 +164,6 @@ static inline bool invalid_selector(u16 value)
168 164
169#define FLAG_MASK FLAG_MASK_32 165#define FLAG_MASK FLAG_MASK_32
170 166
171/*
172 * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode
173 * when it traps. The previous stack will be directly underneath the saved
174 * registers, and 'sp/ss' won't even have been saved. Thus the '&regs->sp'.
175 *
176 * Now, if the stack is empty, '&regs->sp' is out of range. In this
177 * case we try to take the previous stack. To always return a non-null
178 * stack pointer we fall back to regs as stack if no previous stack
179 * exists.
180 *
181 * This is valid only for kernel mode traps.
182 */
183unsigned long kernel_stack_pointer(struct pt_regs *regs)
184{
185 unsigned long context = (unsigned long)regs & ~(THREAD_SIZE - 1);
186 unsigned long sp = (unsigned long)&regs->sp;
187 struct thread_info *tinfo;
188
189 if (context == (sp & ~(THREAD_SIZE - 1)))
190 return sp;
191
192 tinfo = (struct thread_info *)context;
193 if (tinfo->previous_esp)
194 return tinfo->previous_esp;
195
196 return (unsigned long)regs;
197}
198EXPORT_SYMBOL_GPL(kernel_stack_pointer);
199
200static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno) 167static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno)
201{ 168{
202 BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0); 169 BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0);
@@ -782,8 +749,7 @@ put:
782/* 749/*
783 * Handle PTRACE_POKEUSR calls for the debug register area. 750 * Handle PTRACE_POKEUSR calls for the debug register area.
784 */ 751 */
785static int ptrace_set_debugreg(struct task_struct *tsk, int n, 752int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val)
786 unsigned long val)
787{ 753{
788 struct thread_struct *thread = &(tsk->thread); 754 struct thread_struct *thread = &(tsk->thread);
789 int rc = 0; 755 int rc = 0;
@@ -1163,94 +1129,6 @@ static int genregs32_set(struct task_struct *target,
1163 return ret; 1129 return ret;
1164} 1130}
1165 1131
1166#ifdef CONFIG_X86_X32_ABI
1167static long x32_arch_ptrace(struct task_struct *child,
1168 compat_long_t request, compat_ulong_t caddr,
1169 compat_ulong_t cdata)
1170{
1171 unsigned long addr = caddr;
1172 unsigned long data = cdata;
1173 void __user *datap = compat_ptr(data);
1174 int ret;
1175
1176 switch (request) {
1177 /* Read 32bits at location addr in the USER area. Only allow
1178 to return the lower 32bits of segment and debug registers. */
1179 case PTRACE_PEEKUSR: {
1180 u32 tmp;
1181
1182 ret = -EIO;
1183 if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user) ||
1184 addr < offsetof(struct user_regs_struct, cs))
1185 break;
1186
1187 tmp = 0; /* Default return condition */
1188 if (addr < sizeof(struct user_regs_struct))
1189 tmp = getreg(child, addr);
1190 else if (addr >= offsetof(struct user, u_debugreg[0]) &&
1191 addr <= offsetof(struct user, u_debugreg[7])) {
1192 addr -= offsetof(struct user, u_debugreg[0]);
1193 tmp = ptrace_get_debugreg(child, addr / sizeof(data));
1194 }
1195 ret = put_user(tmp, (__u32 __user *)datap);
1196 break;
1197 }
1198
1199 /* Write the word at location addr in the USER area. Only allow
1200 to update segment and debug registers with the upper 32bits
1201 zero-extended. */
1202 case PTRACE_POKEUSR:
1203 ret = -EIO;
1204 if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user) ||
1205 addr < offsetof(struct user_regs_struct, cs))
1206 break;
1207
1208 if (addr < sizeof(struct user_regs_struct))
1209 ret = putreg(child, addr, data);
1210 else if (addr >= offsetof(struct user, u_debugreg[0]) &&
1211 addr <= offsetof(struct user, u_debugreg[7])) {
1212 addr -= offsetof(struct user, u_debugreg[0]);
1213 ret = ptrace_set_debugreg(child,
1214 addr / sizeof(data), data);
1215 }
1216 break;
1217
1218 case PTRACE_GETREGS: /* Get all gp regs from the child. */
1219 return copy_regset_to_user(child,
1220 task_user_regset_view(current),
1221 REGSET_GENERAL,
1222 0, sizeof(struct user_regs_struct),
1223 datap);
1224
1225 case PTRACE_SETREGS: /* Set all gp regs in the child. */
1226 return copy_regset_from_user(child,
1227 task_user_regset_view(current),
1228 REGSET_GENERAL,
1229 0, sizeof(struct user_regs_struct),
1230 datap);
1231
1232 case PTRACE_GETFPREGS: /* Get the child FPU state. */
1233 return copy_regset_to_user(child,
1234 task_user_regset_view(current),
1235 REGSET_FP,
1236 0, sizeof(struct user_i387_struct),
1237 datap);
1238
1239 case PTRACE_SETFPREGS: /* Set the child FPU state. */
1240 return copy_regset_from_user(child,
1241 task_user_regset_view(current),
1242 REGSET_FP,
1243 0, sizeof(struct user_i387_struct),
1244 datap);
1245
1246 default:
1247 return compat_ptrace_request(child, request, addr, data);
1248 }
1249
1250 return ret;
1251}
1252#endif
1253
1254long compat_arch_ptrace(struct task_struct *child, compat_long_t request, 1132long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
1255 compat_ulong_t caddr, compat_ulong_t cdata) 1133 compat_ulong_t caddr, compat_ulong_t cdata)
1256{ 1134{
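The removed x32_arch_ptrace() served PTRACE_PEEKUSR and friends for x32 tracees; the request itself is ordinary ptrace usage. A minimal Linux/x86-64 userspace example of PTRACE_PEEKUSER reading a stopped child's saved instruction pointer (offsets come from <sys/user.h>):

/* Linux/x86-64 example of PTRACE_PEEKUSER: read one word from the
 * traced child's USER area (the saved rip) while it is stopped.
 * Error handling is minimal; this only illustrates the request. */
#include <errno.h>
#include <stddef.h>
#include <stdio.h>
#include <signal.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/user.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
        pid_t child = fork();

        if (child == 0) {
                ptrace(PTRACE_TRACEME, 0, NULL, NULL);
                raise(SIGSTOP);                 /* let the parent peek */
                _exit(0);
        }

        waitpid(child, NULL, 0);                /* child is now stopped */
        errno = 0;
        long ip = ptrace(PTRACE_PEEKUSER, child,
                         offsetof(struct user, regs.rip), NULL);
        if (ip == -1 && errno)
                perror("PTRACE_PEEKUSER");
        else
                printf("child rip = %#lx\n", ip);

        ptrace(PTRACE_CONT, child, NULL, NULL);
        waitpid(child, NULL, 0);
        return 0;
}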
@@ -1260,11 +1138,6 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
1260 int ret; 1138 int ret;
1261 __u32 val; 1139 __u32 val;
1262 1140
1263#ifdef CONFIG_X86_X32_ABI
1264 if (!is_ia32_task())
1265 return x32_arch_ptrace(child, request, caddr, cdata);
1266#endif
1267
1268 switch (request) { 1141 switch (request) {
1269 case PTRACE_PEEKUSR: 1142 case PTRACE_PEEKUSR:
1270 ret = getreg32(child, addr, &val); 1143 ret = getreg32(child, addr, &val);
@@ -1364,6 +1237,9 @@ static const struct user_regset_view user_x86_64_view = {
1364#define genregs32_get genregs_get 1237#define genregs32_get genregs_get
1365#define genregs32_set genregs_set 1238#define genregs32_set genregs_set
1366 1239
1240#define user_i387_ia32_struct user_i387_struct
1241#define user32_fxsr_struct user_fxsr_struct
1242
1367#endif /* CONFIG_X86_64 */ 1243#endif /* CONFIG_X86_64 */
1368 1244
1369#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 1245#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
@@ -1449,7 +1325,7 @@ static void fill_sigtrap_info(struct task_struct *tsk,
1449 int error_code, int si_code, 1325 int error_code, int si_code,
1450 struct siginfo *info) 1326 struct siginfo *info)
1451{ 1327{
1452 tsk->thread.trap_nr = X86_TRAP_DB; 1328 tsk->thread.trap_no = 1;
1453 tsk->thread.error_code = error_code; 1329 tsk->thread.error_code = error_code;
1454 1330
1455 memset(info, 0, sizeof(*info)); 1331 memset(info, 0, sizeof(*info));
@@ -1492,8 +1368,6 @@ long syscall_trace_enter(struct pt_regs *regs)
1492{ 1368{
1493 long ret = 0; 1369 long ret = 0;
1494 1370
1495 user_exit();
1496
1497 /* 1371 /*
1498 * If we stepped into a sysenter/syscall insn, it trapped in 1372 * If we stepped into a sysenter/syscall insn, it trapped in
1499 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. 1373 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
@@ -1505,11 +1379,7 @@ long syscall_trace_enter(struct pt_regs *regs)
1505 regs->flags |= X86_EFLAGS_TF; 1379 regs->flags |= X86_EFLAGS_TF;
1506 1380
1507 /* do the secure computing check first */ 1381 /* do the secure computing check first */
1508 if (secure_computing(regs->orig_ax)) { 1382 secure_computing(regs->orig_ax);
1509 /* seccomp failures shouldn't expose any additional code. */
1510 ret = -1L;
1511 goto out;
1512 }
1513 1383
1514 if (unlikely(test_thread_flag(TIF_SYSCALL_EMU))) 1384 if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
1515 ret = -1L; 1385 ret = -1L;
@@ -1521,20 +1391,21 @@ long syscall_trace_enter(struct pt_regs *regs)
1521 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) 1391 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
1522 trace_sys_enter(regs, regs->orig_ax); 1392 trace_sys_enter(regs, regs->orig_ax);
1523 1393
1524 if (IS_IA32) 1394 if (unlikely(current->audit_context)) {
1525 audit_syscall_entry(AUDIT_ARCH_I386, 1395 if (IS_IA32)
1526 regs->orig_ax, 1396 audit_syscall_entry(AUDIT_ARCH_I386,
1527 regs->bx, regs->cx, 1397 regs->orig_ax,
1528 regs->dx, regs->si); 1398 regs->bx, regs->cx,
1399 regs->dx, regs->si);
1529#ifdef CONFIG_X86_64 1400#ifdef CONFIG_X86_64
1530 else 1401 else
1531 audit_syscall_entry(AUDIT_ARCH_X86_64, 1402 audit_syscall_entry(AUDIT_ARCH_X86_64,
1532 regs->orig_ax, 1403 regs->orig_ax,
1533 regs->di, regs->si, 1404 regs->di, regs->si,
1534 regs->dx, regs->r10); 1405 regs->dx, regs->r10);
1535#endif 1406#endif
1407 }
1536 1408
1537out:
1538 return ret ?: regs->orig_ax; 1409 return ret ?: regs->orig_ax;
1539} 1410}
1540 1411
@@ -1542,14 +1413,8 @@ void syscall_trace_leave(struct pt_regs *regs)
1542{ 1413{
1543 bool step; 1414 bool step;
1544 1415
1545 /* 1416 if (unlikely(current->audit_context))
1546 * We may come here right after calling schedule_user() 1417 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1547 * or do_notify_resume(), in which case we can be in RCU
1548 * user mode.
1549 */
1550 user_exit();
1551
1552 audit_syscall_exit(regs);
1553 1418
1554 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) 1419 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
1555 trace_sys_exit(regs, regs->ax); 1420 trace_sys_exit(regs, regs->ax);
@@ -1564,6 +1429,4 @@ void syscall_trace_leave(struct pt_regs *regs)
1564 !test_thread_flag(TIF_SYSCALL_EMU); 1429 !test_thread_flag(TIF_SYSCALL_EMU);
1565 if (step || test_thread_flag(TIF_SYSCALL_TRACE)) 1430 if (step || test_thread_flag(TIF_SYSCALL_TRACE))
1566 tracehook_report_syscall_exit(regs, step); 1431 tracehook_report_syscall_exit(regs, step);
1567
1568 user_enter();
1569} 1432}
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 85c39590c1a..42eb3300dfc 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -17,13 +17,23 @@
17 17
18#include <linux/kernel.h> 18#include <linux/kernel.h>
19#include <linux/percpu.h> 19#include <linux/percpu.h>
20#include <linux/notifier.h>
21#include <linux/sched.h>
22#include <linux/gfp.h>
23#include <linux/bootmem.h>
24#include <asm/fixmap.h>
25#include <asm/pvclock.h> 20#include <asm/pvclock.h>
26 21
22/*
23 * These are periodically updated
24 * xen: magic shared_info page
25 * kvm: gpa registered via msr
26 * and then copied here.
27 */
28struct pvclock_shadow_time {
29 u64 tsc_timestamp; /* TSC at last update of time vals. */
30 u64 system_timestamp; /* Time, in nanosecs, since boot. */
31 u32 tsc_to_nsec_mul;
32 int tsc_shift;
33 u32 version;
34 u8 flags;
35};
36
27static u8 valid_flags __read_mostly = 0; 37static u8 valid_flags __read_mostly = 0;
28 38
29void pvclock_set_flags(u8 flags) 39void pvclock_set_flags(u8 flags)
@@ -31,6 +41,34 @@ void pvclock_set_flags(u8 flags)
31 valid_flags = flags; 41 valid_flags = flags;
32} 42}
33 43
44static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
45{
46 u64 delta = native_read_tsc() - shadow->tsc_timestamp;
47 return pvclock_scale_delta(delta, shadow->tsc_to_nsec_mul,
48 shadow->tsc_shift);
49}
50
51/*
52 * Reads a consistent set of time-base values from hypervisor,
53 * into a shadow data area.
54 */
55static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
56 struct pvclock_vcpu_time_info *src)
57{
58 do {
59 dst->version = src->version;
60 rmb(); /* fetch version before data */
61 dst->tsc_timestamp = src->tsc_timestamp;
62 dst->system_timestamp = src->system_time;
63 dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
64 dst->tsc_shift = src->tsc_shift;
65 dst->flags = src->flags;
66 rmb(); /* test version after fetching data */
67 } while ((src->version & 1) || (dst->version != src->version));
68
69 return dst->version;
70}
71
34unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src) 72unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
35{ 73{
36 u64 pv_tsc_khz = 1000000ULL << 32; 74 u64 pv_tsc_khz = 1000000ULL << 32;
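pvclock_get_time_values() is a lock-free, seqcount-style snapshot: retry while the producer's version is odd (update in progress) or changes across the copy. A self-contained sketch of the retry loop, with C11 acquire fences standing in for rmb() and made-up field values:

/* Seqcount-style consistent snapshot, as in pvclock_get_time_values():
 * the producer makes 'version' odd while updating and even when done;
 * the reader retries until it sees a stable, even version. */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct time_src {
        _Atomic uint32_t version;
        uint64_t tsc_timestamp;
        uint64_t system_time;
};

struct time_snap {
        uint64_t tsc_timestamp;
        uint64_t system_time;
};

static void read_consistent(struct time_src *src, struct time_snap *dst)
{
        uint32_t ver;

        do {
                ver = atomic_load(&src->version);
                atomic_thread_fence(memory_order_acquire);   /* rmb() */
                dst->tsc_timestamp = src->tsc_timestamp;
                dst->system_time   = src->system_time;
                atomic_thread_fence(memory_order_acquire);   /* rmb() */
        } while ((ver & 1) || ver != atomic_load(&src->version));
}

int main(void)
{
        struct time_src src = { .version = 2,
                                .tsc_timestamp = 1234,
                                .system_time = 5678 };
        struct time_snap snap;

        read_consistent(&src, &snap);
        printf("system_time snapshot = %llu\n",
               (unsigned long long)snap.system_time);
        return 0;
}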
@@ -50,32 +88,23 @@ void pvclock_resume(void)
50 atomic64_set(&last_value, 0); 88 atomic64_set(&last_value, 0);
51} 89}
52 90
53u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src)
54{
55 unsigned version;
56 cycle_t ret;
57 u8 flags;
58
59 do {
60 version = __pvclock_read_cycles(src, &ret, &flags);
61 } while ((src->version & 1) || version != src->version);
62
63 return flags & valid_flags;
64}
65
66cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) 91cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
67{ 92{
93 struct pvclock_shadow_time shadow;
68 unsigned version; 94 unsigned version;
69 cycle_t ret; 95 cycle_t ret, offset;
70 u64 last; 96 u64 last;
71 u8 flags;
72 97
73 do { 98 do {
74 version = __pvclock_read_cycles(src, &ret, &flags); 99 version = pvclock_get_time_values(&shadow, src);
75 } while ((src->version & 1) || version != src->version); 100 barrier();
101 offset = pvclock_get_nsec_offset(&shadow);
102 ret = shadow.system_timestamp + offset;
103 barrier();
104 } while (version != src->version);
76 105
77 if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) && 106 if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
78 (flags & PVCLOCK_TSC_STABLE_BIT)) 107 (shadow.flags & PVCLOCK_TSC_STABLE_BIT))
79 return ret; 108 return ret;
80 109
81 /* 110 /*
@@ -127,71 +156,3 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
127 156
128 set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); 157 set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
129} 158}
130
131static struct pvclock_vsyscall_time_info *pvclock_vdso_info;
132
133static struct pvclock_vsyscall_time_info *
134pvclock_get_vsyscall_user_time_info(int cpu)
135{
136 if (!pvclock_vdso_info) {
137 BUG();
138 return NULL;
139 }
140
141 return &pvclock_vdso_info[cpu];
142}
143
144struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu)
145{
146 return &pvclock_get_vsyscall_user_time_info(cpu)->pvti;
147}
148
149#ifdef CONFIG_X86_64
150static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l,
151 void *v)
152{
153 struct task_migration_notifier *mn = v;
154 struct pvclock_vsyscall_time_info *pvti;
155
156 pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu);
157
158 /* this is NULL when pvclock vsyscall is not initialized */
159 if (unlikely(pvti == NULL))
160 return NOTIFY_DONE;
161
162 pvti->migrate_count++;
163
164 return NOTIFY_DONE;
165}
166
167static struct notifier_block pvclock_migrate = {
168 .notifier_call = pvclock_task_migrate,
169};
170
171/*
172 * Initialize the generic pvclock vsyscall state. This will allocate
173 * a/some page(s) for the per-vcpu pvclock information, set up a
174 * fixmap mapping for the page(s)
175 */
176
177int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
178 int size)
179{
180 int idx;
181
182 WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE);
183
184 pvclock_vdso_info = i;
185
186 for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
187 __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
188 __pa_symbol(i) + (idx*PAGE_SIZE),
189 PAGE_KERNEL_VVAR);
190 }
191
192
193 register_task_migration_notifier(&pvclock_migrate);
194
195 return 0;
196}
197#endif
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 26ee48a33dc..b78643d0f9a 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -8,7 +8,7 @@
8 8
9#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) 9#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
10 10
11static void quirk_intel_irqbalance(struct pci_dev *dev) 11static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
12{ 12{
13 u8 config; 13 u8 config;
14 u16 word; 14 u16 word;
@@ -512,7 +512,7 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
512 512
513#if defined(CONFIG_PCI) && defined(CONFIG_NUMA) 513#if defined(CONFIG_PCI) && defined(CONFIG_NUMA)
514/* Set correct numa_node information for AMD NB functions */ 514/* Set correct numa_node information for AMD NB functions */
515static void quirk_amd_nb_node(struct pci_dev *dev) 515static void __init quirk_amd_nb_node(struct pci_dev *dev)
516{ 516{
517 struct pci_dev *nb_ht; 517 struct pci_dev *nb_ht;
518 unsigned int devfn; 518 unsigned int devfn;
@@ -553,17 +553,4 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC,
553 quirk_amd_nb_node); 553 quirk_amd_nb_node);
554DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK, 554DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK,
555 quirk_amd_nb_node); 555 quirk_amd_nb_node);
556DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F0,
557 quirk_amd_nb_node);
558DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F1,
559 quirk_amd_nb_node);
560DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F2,
561 quirk_amd_nb_node);
562DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3,
563 quirk_amd_nb_node);
564DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4,
565 quirk_amd_nb_node);
566DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F5,
567 quirk_amd_nb_node);
568
569#endif 556#endif
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 4e8ba39eaf0..d4a705f2283 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -1,5 +1,3 @@
1#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
2
3#include <linux/module.h> 1#include <linux/module.h>
4#include <linux/reboot.h> 2#include <linux/reboot.h>
5#include <linux/init.h> 3#include <linux/init.h>
@@ -22,12 +20,13 @@
22#include <asm/virtext.h> 20#include <asm/virtext.h>
23#include <asm/cpu.h> 21#include <asm/cpu.h>
24#include <asm/nmi.h> 22#include <asm/nmi.h>
25#include <asm/smp.h>
26 23
27#include <linux/ctype.h> 24#ifdef CONFIG_X86_32
28#include <linux/mc146818rtc.h> 25# include <linux/ctype.h>
29#include <asm/realmode.h> 26# include <linux/mc146818rtc.h>
30#include <asm/x86_init.h> 27#else
28# include <asm/x86_init.h>
29#endif
31 30
32/* 31/*
33 * Power off function, if any 32 * Power off function, if any
@@ -40,21 +39,11 @@ static int reboot_mode;
40enum reboot_type reboot_type = BOOT_ACPI; 39enum reboot_type reboot_type = BOOT_ACPI;
41int reboot_force; 40int reboot_force;
42 41
43/* 42#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
44 * This variable is used privately to keep track of whether or not
45 * reboot_type is still set to its default value (i.e., reboot= hasn't
46 * been set on the command line). This is needed so that we can
47 * suppress DMI scanning for reboot quirks. Without it, it's
48 * impossible to override a faulty reboot quirk without recompiling.
49 */
50static int reboot_default = 1;
51
52#ifdef CONFIG_SMP
53static int reboot_cpu = -1; 43static int reboot_cpu = -1;
54#endif 44#endif
55 45
56/* 46/* This is set if we need to go through the 'emergency' path.
57 * This is set if we need to go through the 'emergency' path.
58 * When machine_emergency_restart() is called, we may be on 47 * When machine_emergency_restart() is called, we may be on
59 * an inconsistent state and won't be able to do a clean cleanup 48 * an inconsistent state and won't be able to do a clean cleanup
60 */ 49 */
@@ -63,29 +52,21 @@ static int reboot_emergency;
63/* This is set by the PCI code if either type 1 or type 2 PCI is detected */ 52/* This is set by the PCI code if either type 1 or type 2 PCI is detected */
64bool port_cf9_safe = false; 53bool port_cf9_safe = false;
65 54
66/* 55/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci]
67 * reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci] 56 warm Don't set the cold reboot flag
68 * warm Don't set the cold reboot flag 57 cold Set the cold reboot flag
69 * cold Set the cold reboot flag 58 bios Reboot by jumping through the BIOS (only for X86_32)
70 * bios Reboot by jumping through the BIOS 59 smp Reboot by executing reset on BSP or other CPU (only for X86_32)
71 * smp Reboot by executing reset on BSP or other CPU 60 triple Force a triple fault (init)
72 * triple Force a triple fault (init) 61 kbd Use the keyboard controller. cold reset (default)
73 * kbd Use the keyboard controller. cold reset (default) 62 acpi Use the RESET_REG in the FADT
74 * acpi Use the RESET_REG in the FADT 63 efi Use efi reset_system runtime service
75 * efi Use efi reset_system runtime service 64 pci Use the so-called "PCI reset register", CF9
76 * pci Use the so-called "PCI reset register", CF9 65 force Avoid anything that could hang.
77 * force Avoid anything that could hang.
78 */ 66 */
79static int __init reboot_setup(char *str) 67static int __init reboot_setup(char *str)
80{ 68{
81 for (;;) { 69 for (;;) {
82 /*
83 * Having anything passed on the command line via
84 * reboot= will cause us to disable DMI checking
85 * below.
86 */
87 reboot_default = 0;
88
89 switch (*str) { 70 switch (*str) {
90 case 'w': 71 case 'w':
91 reboot_mode = 0x1234; 72 reboot_mode = 0x1234;
@@ -95,6 +76,7 @@ static int __init reboot_setup(char *str)
95 reboot_mode = 0; 76 reboot_mode = 0;
96 break; 77 break;
97 78
79#ifdef CONFIG_X86_32
98#ifdef CONFIG_SMP 80#ifdef CONFIG_SMP
99 case 's': 81 case 's':
100 if (isdigit(*(str+1))) { 82 if (isdigit(*(str+1))) {
@@ -102,15 +84,14 @@ static int __init reboot_setup(char *str)
102 if (isdigit(*(str+2))) 84 if (isdigit(*(str+2)))
103 reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0'); 85 reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0');
104 } 86 }
105 /* 87 /* we will leave sorting out the final value
106 * We will leave sorting out the final value 88 when we are ready to reboot, since we might not
107 * when we are ready to reboot, since we might not 89 have detected BSP APIC ID or smp_num_cpu */
108 * have detected BSP APIC ID or smp_num_cpu
109 */
110 break; 90 break;
111#endif /* CONFIG_SMP */ 91#endif /* CONFIG_SMP */
112 92
113 case 'b': 93 case 'b':
94#endif
114 case 'a': 95 case 'a':
115 case 'k': 96 case 'k':
116 case 't': 97 case 't':
@@ -136,6 +117,7 @@ static int __init reboot_setup(char *str)
136__setup("reboot=", reboot_setup); 117__setup("reboot=", reboot_setup);
137 118
138 119
120#ifdef CONFIG_X86_32
139/* 121/*
140 * Reboot options and system auto-detection code provided by 122 * Reboot options and system auto-detection code provided by
141 * Dell Inc. so their systems "just work". :-) 123 * Dell Inc. so their systems "just work". :-)
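reboot_setup() walks the comma-separated reboot= string documented earlier in this file and dispatches on the leading letter of each token. A rough userspace sketch of that style of parsing (option names and the enum are illustrative only):

/* Sketch of reboot= style option parsing: step through comma-separated
 * tokens and switch on the first letter of each, as reboot_setup()
 * does.  The enum values and flags are made up for the example. */
#include <stdio.h>
#include <string.h>

enum boot_type { BOOT_BIOS, BOOT_TRIPLE, BOOT_KBD, BOOT_ACPI, BOOT_CF9 };

static enum boot_type reboot_type = BOOT_ACPI;
static int warm_reboot;

static void parse_reboot(const char *str)
{
        for (;;) {
                switch (*str) {
                case 'w': warm_reboot = 1;           break;
                case 'c': warm_reboot = 0;           break;
                case 'b': reboot_type = BOOT_BIOS;   break;
                case 't': reboot_type = BOOT_TRIPLE; break;
                case 'k': reboot_type = BOOT_KBD;    break;
                case 'p': reboot_type = BOOT_CF9;    break;
                }
                str = strchr(str, ',');
                if (!str)
                        break;
                str++;                  /* skip the comma */
        }
}

int main(void)
{
        parse_reboot("kbd,warm");
        printf("type=%d warm=%d\n", reboot_type, warm_reboot);
        return 0;
}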
@@ -149,64 +131,7 @@ static int __init set_bios_reboot(const struct dmi_system_id *d)
149{ 131{
150 if (reboot_type != BOOT_BIOS) { 132 if (reboot_type != BOOT_BIOS) {
151 reboot_type = BOOT_BIOS; 133 reboot_type = BOOT_BIOS;
152 pr_info("%s series board detected. Selecting %s-method for reboots.\n", 134 printk(KERN_INFO "%s series board detected. Selecting BIOS-method for reboots.\n", d->ident);
153 "BIOS", d->ident);
154 }
155 return 0;
156}
157
158void __noreturn machine_real_restart(unsigned int type)
159{
160 local_irq_disable();
161
162 /*
163 * Write zero to CMOS register number 0x0f, which the BIOS POST
164 * routine will recognize as telling it to do a proper reboot. (Well
165 * that's what this book in front of me says -- it may only apply to
166 * the Phoenix BIOS though, it's not clear). At the same time,
167 * disable NMIs by setting the top bit in the CMOS address register,
168 * as we're about to do peculiar things to the CPU. I'm not sure if
169 * `outb_p' is needed instead of just `outb'. Use it to be on the
170 * safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.)
171 */
172 spin_lock(&rtc_lock);
173 CMOS_WRITE(0x00, 0x8f);
174 spin_unlock(&rtc_lock);
175
176 /*
177 * Switch back to the initial page table.
178 */
179#ifdef CONFIG_X86_32
180 load_cr3(initial_page_table);
181#else
182 write_cr3(real_mode_header->trampoline_pgd);
183#endif
184
185 /* Jump to the identity-mapped low memory code */
186#ifdef CONFIG_X86_32
187 asm volatile("jmpl *%0" : :
188 "rm" (real_mode_header->machine_real_restart_asm),
189 "a" (type));
190#else
191 asm volatile("ljmpl *%0" : :
192 "m" (real_mode_header->machine_real_restart_asm),
193 "D" (type));
194#endif
195 unreachable();
196}
197#ifdef CONFIG_APM_MODULE
198EXPORT_SYMBOL(machine_real_restart);
199#endif
200
201/*
202 * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot
203 */
204static int __init set_pci_reboot(const struct dmi_system_id *d)
205{
206 if (reboot_type != BOOT_CF9) {
207 reboot_type = BOOT_CF9;
208 pr_info("%s series board detected. Selecting %s-method for reboots.\n",
209 "PCI", d->ident);
210 } 135 }
211 return 0; 136 return 0;
212} 137}
@@ -215,15 +140,11 @@ static int __init set_kbd_reboot(const struct dmi_system_id *d)
215{ 140{
216 if (reboot_type != BOOT_KBD) { 141 if (reboot_type != BOOT_KBD) {
217 reboot_type = BOOT_KBD; 142 reboot_type = BOOT_KBD;
218 pr_info("%s series board detected. Selecting %s-method for reboot.\n", 143 printk(KERN_INFO "%s series board detected. Selecting KBD-method for reboot.\n", d->ident);
219 "KBD", d->ident);
220 } 144 }
221 return 0; 145 return 0;
222} 146}
223 147
224/*
225 * This is a single dmi_table handling all reboot quirks.
226 */
227static struct dmi_system_id __initdata reboot_dmi_table[] = { 148static struct dmi_system_id __initdata reboot_dmi_table[] = {
228 { /* Handle problems with rebooting on Dell E520's */ 149 { /* Handle problems with rebooting on Dell E520's */
229 .callback = set_bios_reboot, 150 .callback = set_bios_reboot,
@@ -249,7 +170,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
249 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"), 170 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"),
250 }, 171 },
251 }, 172 },
252 { /* Handle problems with rebooting on Dell Optiplex 745's SFF */ 173 { /* Handle problems with rebooting on Dell Optiplex 745's SFF*/
253 .callback = set_bios_reboot, 174 .callback = set_bios_reboot,
254 .ident = "Dell OptiPlex 745", 175 .ident = "Dell OptiPlex 745",
255 .matches = { 176 .matches = {
@@ -257,7 +178,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
257 DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"), 178 DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"),
258 }, 179 },
259 }, 180 },
260 { /* Handle problems with rebooting on Dell Optiplex 745's DFF */ 181 { /* Handle problems with rebooting on Dell Optiplex 745's DFF*/
261 .callback = set_bios_reboot, 182 .callback = set_bios_reboot,
262 .ident = "Dell OptiPlex 745", 183 .ident = "Dell OptiPlex 745",
263 .matches = { 184 .matches = {
@@ -266,7 +187,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
266 DMI_MATCH(DMI_BOARD_NAME, "0MM599"), 187 DMI_MATCH(DMI_BOARD_NAME, "0MM599"),
267 }, 188 },
268 }, 189 },
269 { /* Handle problems with rebooting on Dell Optiplex 745 with 0KW626 */ 190 { /* Handle problems with rebooting on Dell Optiplex 745 with 0KW626 */
270 .callback = set_bios_reboot, 191 .callback = set_bios_reboot,
271 .ident = "Dell OptiPlex 745", 192 .ident = "Dell OptiPlex 745",
272 .matches = { 193 .matches = {
@@ -275,7 +196,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
275 DMI_MATCH(DMI_BOARD_NAME, "0KW626"), 196 DMI_MATCH(DMI_BOARD_NAME, "0KW626"),
276 }, 197 },
277 }, 198 },
278 { /* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */ 199 { /* Handle problems with rebooting on Dell Optiplex 330 with 0KP561 */
279 .callback = set_bios_reboot, 200 .callback = set_bios_reboot,
280 .ident = "Dell OptiPlex 330", 201 .ident = "Dell OptiPlex 330",
281 .matches = { 202 .matches = {
@@ -284,7 +205,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
284 DMI_MATCH(DMI_BOARD_NAME, "0KP561"), 205 DMI_MATCH(DMI_BOARD_NAME, "0KP561"),
285 }, 206 },
286 }, 207 },
287 { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */ 208 { /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */
288 .callback = set_bios_reboot, 209 .callback = set_bios_reboot,
289 .ident = "Dell OptiPlex 360", 210 .ident = "Dell OptiPlex 360",
290 .matches = { 211 .matches = {
@@ -293,7 +214,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
293 DMI_MATCH(DMI_BOARD_NAME, "0T656F"), 214 DMI_MATCH(DMI_BOARD_NAME, "0T656F"),
294 }, 215 },
295 }, 216 },
296 { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G */ 217 { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G*/
297 .callback = set_bios_reboot, 218 .callback = set_bios_reboot,
298 .ident = "Dell OptiPlex 760", 219 .ident = "Dell OptiPlex 760",
299 .matches = { 220 .matches = {
@@ -358,7 +279,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
358 DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"), 279 DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"),
359 }, 280 },
360 }, 281 },
361 { /* Handle problems with rebooting on ASUS P4S800 */ 282 { /* Handle problems with rebooting on CompuLab SBC-FITPC2 */
283 .callback = set_bios_reboot,
284 .ident = "CompuLab SBC-FITPC2",
285 .matches = {
286 DMI_MATCH(DMI_SYS_VENDOR, "CompuLab"),
287 DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"),
288 },
289 },
290 { /* Handle problems with rebooting on ASUS P4S800 */
362 .callback = set_bios_reboot, 291 .callback = set_bios_reboot,
363 .ident = "ASUS P4S800", 292 .ident = "ASUS P4S800",
364 .matches = { 293 .matches = {
@@ -366,8 +295,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
366 DMI_MATCH(DMI_BOARD_NAME, "P4S800"), 295 DMI_MATCH(DMI_BOARD_NAME, "P4S800"),
367 }, 296 },
368 }, 297 },
369 298 { /* Handle problems with rebooting on VersaLogic Menlow boards */
370 { /* Handle reboot issue on Acer Aspire one */ 299 .callback = set_bios_reboot,
300 .ident = "VersaLogic Menlow based board",
301 .matches = {
302 DMI_MATCH(DMI_BOARD_VENDOR, "VersaLogic Corporation"),
303 DMI_MATCH(DMI_BOARD_NAME, "VersaLogic Menlow board"),
304 },
305 },
306 { /* Handle reboot issue on Acer Aspire one */
371 .callback = set_kbd_reboot, 307 .callback = set_kbd_reboot,
372 .ident = "Acer Aspire One A110", 308 .ident = "Acer Aspire One A110",
373 .matches = { 309 .matches = {
@@ -375,6 +311,91 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
375 DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"), 311 DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"),
376 }, 312 },
377 }, 313 },
314 { }
315};
316
317static int __init reboot_init(void)
318{
319 dmi_check_system(reboot_dmi_table);
320 return 0;
321}
322core_initcall(reboot_init);
323
324extern const unsigned char machine_real_restart_asm[];
325extern const u64 machine_real_restart_gdt[3];
326
327void machine_real_restart(unsigned int type)
328{
329 void *restart_va;
330 unsigned long restart_pa;
331 void (*restart_lowmem)(unsigned int);
332 u64 *lowmem_gdt;
333
334 local_irq_disable();
335
336 /* Write zero to CMOS register number 0x0f, which the BIOS POST
337 routine will recognize as telling it to do a proper reboot. (Well
338 that's what this book in front of me says -- it may only apply to
339 the Phoenix BIOS though, it's not clear). At the same time,
340 disable NMIs by setting the top bit in the CMOS address register,
341 as we're about to do peculiar things to the CPU. I'm not sure if
342 `outb_p' is needed instead of just `outb'. Use it to be on the
343 safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.)
344 */
345 spin_lock(&rtc_lock);
346 CMOS_WRITE(0x00, 0x8f);
347 spin_unlock(&rtc_lock);
348
349 /*
350 * Switch back to the initial page table.
351 */
352 load_cr3(initial_page_table);
353
354 /* Write 0x1234 to absolute memory location 0x472. The BIOS reads
355 this on booting to tell it to "Bypass memory test (also warm
356 boot)". This seems like a fairly standard thing that gets set by
357 REBOOT.COM programs, and the previous reset routine did this
358 too. */
359 *((unsigned short *)0x472) = reboot_mode;
360
361 /* Patch the GDT in the low memory trampoline */
362 lowmem_gdt = TRAMPOLINE_SYM(machine_real_restart_gdt);
363
364 restart_va = TRAMPOLINE_SYM(machine_real_restart_asm);
365 restart_pa = virt_to_phys(restart_va);
366 restart_lowmem = (void (*)(unsigned int))restart_pa;
367
368 /* GDT[0]: GDT self-pointer */
369 lowmem_gdt[0] =
370 (u64)(sizeof(machine_real_restart_gdt) - 1) +
371 ((u64)virt_to_phys(lowmem_gdt) << 16);
372 /* GDT[1]: 64K real mode code segment */
373 lowmem_gdt[1] =
374 GDT_ENTRY(0x009b, restart_pa, 0xffff);
375
376 /* Jump to the identity-mapped low memory code */
377 restart_lowmem(type);
378}
379#ifdef CONFIG_APM_MODULE
380EXPORT_SYMBOL(machine_real_restart);
381#endif
382
383#endif /* CONFIG_X86_32 */
384
385/*
386 * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot
387 */
388static int __init set_pci_reboot(const struct dmi_system_id *d)
389{
390 if (reboot_type != BOOT_CF9) {
391 reboot_type = BOOT_CF9;
392 printk(KERN_INFO "%s series board detected. "
393 "Selecting PCI-method for reboots.\n", d->ident);
394 }
395 return 0;
396}
397
398static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
378 { /* Handle problems with rebooting on Apple MacBook5 */ 399 { /* Handle problems with rebooting on Apple MacBook5 */
379 .callback = set_pci_reboot, 400 .callback = set_pci_reboot,
380 .ident = "Apple MacBook5", 401 .ident = "Apple MacBook5",
@@ -431,36 +452,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
431 DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"), 452 DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"),
432 }, 453 },
433 }, 454 },
434 { /* Handle problems with rebooting on the OptiPlex 990. */
435 .callback = set_pci_reboot,
436 .ident = "Dell OptiPlex 990",
437 .matches = {
438 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
439 DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"),
440 },
441 },
442 { /* Handle problems with rebooting on the Precision M6600. */
443 .callback = set_pci_reboot,
444 .ident = "Dell OptiPlex 990",
445 .matches = {
446 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
447 DMI_MATCH(DMI_PRODUCT_NAME, "Precision M6600"),
448 },
449 },
450 { } 455 { }
451}; 456};
452 457
453static int __init reboot_init(void) 458static int __init pci_reboot_init(void)
454{ 459{
455 /* 460 dmi_check_system(pci_reboot_dmi_table);
456 * Only do the DMI check if reboot_type hasn't been overridden
457 * on the command line
458 */
459 if (reboot_default)
460 dmi_check_system(reboot_dmi_table);
461 return 0; 461 return 0;
462} 462}
463core_initcall(reboot_init); 463core_initcall(pci_reboot_init);
464 464
465static inline void kb_wait(void) 465static inline void kb_wait(void)
466{ 466{
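Both DMI tables here follow the same quirk-table pattern: static match entries plus a callback, scanned once from an initcall via dmi_check_system(). A compact userspace sketch of that pattern with fabricated board strings:

/* Sketch of the DMI quirk-table pattern used by reboot_dmi_table and
 * pci_reboot_dmi_table: an array of match strings plus a callback,
 * scanned once at startup.  Board names and the callback are made up. */
#include <stdio.h>
#include <string.h>

struct dmi_quirk {
        const char *ident;
        const char *sys_vendor;
        const char *product;
        int (*callback)(const struct dmi_quirk *d);
};

static int set_pci_reboot_demo(const struct dmi_quirk *d)
{
        printf("%s detected: selecting PCI-method for reboots\n", d->ident);
        return 0;
}

static const struct dmi_quirk quirk_table[] = {
        { "Example Board X", "Example Vendor", "Board X", set_pci_reboot_demo },
        { NULL, NULL, NULL, NULL },
};

static void dmi_check_demo(const char *vendor, const char *product)
{
        const struct dmi_quirk *d;

        for (d = quirk_table; d->ident; d++)
                if (!strcmp(d->sys_vendor, vendor) &&
                    !strcmp(d->product, product))
                        d->callback(d);
}

int main(void)
{
        dmi_check_demo("Example Vendor", "Board X");
        return 0;
}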
@@ -473,19 +473,19 @@ static inline void kb_wait(void)
473 } 473 }
474} 474}
475 475
476static void vmxoff_nmi(int cpu, struct pt_regs *regs) 476static void vmxoff_nmi(int cpu, struct die_args *args)
477{ 477{
478 cpu_emergency_vmxoff(); 478 cpu_emergency_vmxoff();
479} 479}
480 480
481/* Use NMIs as IPIs to tell all CPUs to disable virtualization */ 481/* Use NMIs as IPIs to tell all CPUs to disable virtualization
482 */
482static void emergency_vmx_disable_all(void) 483static void emergency_vmx_disable_all(void)
483{ 484{
484 /* Just make sure we won't change CPUs while doing this */ 485 /* Just make sure we won't change CPUs while doing this */
485 local_irq_disable(); 486 local_irq_disable();
486 487
487 /* 488 /* We need to disable VMX on all CPUs before rebooting, otherwise
488 * We need to disable VMX on all CPUs before rebooting, otherwise
489 * we risk hanging up the machine, because the CPU ignore INIT 489 * we risk hanging up the machine, because the CPU ignore INIT
490 * signals when VMX is enabled. 490 * signals when VMX is enabled.
491 * 491 *
@@ -504,7 +504,8 @@ static void emergency_vmx_disable_all(void)
504 * is still enabling VMX. 504 * is still enabling VMX.
505 */ 505 */
506 if (cpu_has_vmx() && cpu_vmx_enabled()) { 506 if (cpu_has_vmx() && cpu_vmx_enabled()) {
507 /* Disable VMX on this CPU. */ 507 /* Disable VMX on this CPU.
508 */
508 cpu_vmxoff(); 509 cpu_vmxoff();
509 510
510 /* Halt and disable VMX on the other CPUs */ 511 /* Halt and disable VMX on the other CPUs */
@@ -549,12 +550,12 @@ static void native_machine_emergency_restart(void)
549 /* Could also try the reset bit in the Hammer NB */ 550 /* Could also try the reset bit in the Hammer NB */
550 switch (reboot_type) { 551 switch (reboot_type) {
551 case BOOT_KBD: 552 case BOOT_KBD:
552 mach_reboot_fixups(); /* For board specific fixups */ 553 mach_reboot_fixups(); /* for board specific fixups */
553 554
554 for (i = 0; i < 10; i++) { 555 for (i = 0; i < 10; i++) {
555 kb_wait(); 556 kb_wait();
556 udelay(50); 557 udelay(50);
557 outb(0xfe, 0x64); /* Pulse reset low */ 558 outb(0xfe, 0x64); /* pulse reset low */
558 udelay(50); 559 udelay(50);
559 } 560 }
560 if (attempt == 0 && orig_reboot_type == BOOT_ACPI) { 561 if (attempt == 0 && orig_reboot_type == BOOT_ACPI) {
@@ -572,11 +573,13 @@ static void native_machine_emergency_restart(void)
572 reboot_type = BOOT_KBD; 573 reboot_type = BOOT_KBD;
573 break; 574 break;
574 575
576#ifdef CONFIG_X86_32
575 case BOOT_BIOS: 577 case BOOT_BIOS:
576 machine_real_restart(MRR_BIOS); 578 machine_real_restart(MRR_BIOS);
577 579
578 reboot_type = BOOT_KBD; 580 reboot_type = BOOT_KBD;
579 break; 581 break;
582#endif
580 583
581 case BOOT_ACPI: 584 case BOOT_ACPI:
582 acpi_reboot(); 585 acpi_reboot();
@@ -594,7 +597,7 @@ static void native_machine_emergency_restart(void)
594 597
595 case BOOT_CF9: 598 case BOOT_CF9:
596 port_cf9_safe = true; 599 port_cf9_safe = true;
597 /* Fall through */ 600 /* fall through */
598 601
599 case BOOT_CF9_COND: 602 case BOOT_CF9_COND:
600 if (port_cf9_safe) { 603 if (port_cf9_safe) {
@@ -618,10 +621,12 @@ void native_machine_shutdown(void)
618 /* The boot cpu is always logical cpu 0 */ 621 /* The boot cpu is always logical cpu 0 */
619 int reboot_cpu_id = 0; 622 int reboot_cpu_id = 0;
620 623
624#ifdef CONFIG_X86_32
621 /* See if there has been given a command line override */ 625 /* See if there has been given a command line override */
622 if ((reboot_cpu != -1) && (reboot_cpu < nr_cpu_ids) && 626 if ((reboot_cpu != -1) && (reboot_cpu < nr_cpu_ids) &&
623 cpu_online(reboot_cpu)) 627 cpu_online(reboot_cpu))
624 reboot_cpu_id = reboot_cpu; 628 reboot_cpu_id = reboot_cpu;
629#endif
625 630
626 /* Make certain the cpu I'm about to reboot on is online */ 631 /* Make certain the cpu I'm about to reboot on is online */
627 if (!cpu_online(reboot_cpu_id)) 632 if (!cpu_online(reboot_cpu_id))
@@ -630,12 +635,9 @@ void native_machine_shutdown(void)
630 /* Make certain I only run on the appropriate processor */ 635 /* Make certain I only run on the appropriate processor */
631 set_cpus_allowed_ptr(current, cpumask_of(reboot_cpu_id)); 636 set_cpus_allowed_ptr(current, cpumask_of(reboot_cpu_id));
632 637
633 /* 638 /* O.K Now that I'm on the appropriate processor,
634 * O.K Now that I'm on the appropriate processor, stop all of the 639 * stop all of the others.
635 * others. Also disable the local irq to not receive the per-cpu
636 * timer interrupt which may trigger scheduler's load balance.
637 */ 640 */
638 local_irq_disable();
639 stop_other_cpus(); 641 stop_other_cpus();
640#endif 642#endif
641 643
@@ -662,7 +664,7 @@ static void __machine_emergency_restart(int emergency)
662 664
663static void native_machine_restart(char *__unused) 665static void native_machine_restart(char *__unused)
664{ 666{
665 pr_notice("machine restart\n"); 667 printk("machine restart\n");
666 668
667 if (!reboot_force) 669 if (!reboot_force)
668 machine_shutdown(); 670 machine_shutdown();
@@ -671,11 +673,12 @@ static void native_machine_restart(char *__unused)
671 673
672static void native_machine_halt(void) 674static void native_machine_halt(void)
673{ 675{
674 /* Stop other cpus and apics */ 676 /* stop other cpus and apics */
675 machine_shutdown(); 677 machine_shutdown();
676 678
677 tboot_shutdown(TB_SHUTDOWN_HALT); 679 tboot_shutdown(TB_SHUTDOWN_HALT);
678 680
681 /* stop this cpu */
679 stop_this_cpu(NULL); 682 stop_this_cpu(NULL);
680} 683}
681 684
@@ -686,7 +689,7 @@ static void native_machine_power_off(void)
686 machine_shutdown(); 689 machine_shutdown();
687 pm_power_off(); 690 pm_power_off();
688 } 691 }
689 /* A fallback in case there is no PM info available */ 692 /* a fallback in case there is no PM info available */
690 tboot_shutdown(TB_SHUTDOWN_HALT); 693 tboot_shutdown(TB_SHUTDOWN_HALT);
691} 694}
692 695
@@ -742,22 +745,25 @@ static nmi_shootdown_cb shootdown_callback;
742 745
743static atomic_t waiting_for_crash_ipi; 746static atomic_t waiting_for_crash_ipi;
744 747
745static int crash_nmi_callback(unsigned int val, struct pt_regs *regs) 748static int crash_nmi_callback(struct notifier_block *self,
749 unsigned long val, void *data)
746{ 750{
747 int cpu; 751 int cpu;
748 752
753 if (val != DIE_NMI)
754 return NOTIFY_OK;
755
749 cpu = raw_smp_processor_id(); 756 cpu = raw_smp_processor_id();
750 757
751 /* 758 /* Don't do anything if this handler is invoked on crashing cpu.
752 * Don't do anything if this handler is invoked on crashing cpu.
753 * Otherwise, system will completely hang. Crashing cpu can get 759 * Otherwise, system will completely hang. Crashing cpu can get
754 * an NMI if system was initially booted with nmi_watchdog parameter. 760 * an NMI if system was initially booted with nmi_watchdog parameter.
755 */ 761 */
756 if (cpu == crashing_cpu) 762 if (cpu == crashing_cpu)
757 return NMI_HANDLED; 763 return NOTIFY_STOP;
758 local_irq_disable(); 764 local_irq_disable();
759 765
760 shootdown_callback(cpu, regs); 766 shootdown_callback(cpu, (struct die_args *)data);
761 767
762 atomic_dec(&waiting_for_crash_ipi); 768 atomic_dec(&waiting_for_crash_ipi);
763 /* Assume hlt works */ 769 /* Assume hlt works */
@@ -765,7 +771,7 @@ static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
765 for (;;) 771 for (;;)
766 cpu_relax(); 772 cpu_relax();
767 773
768 return NMI_HANDLED; 774 return 1;
769} 775}
770 776
771static void smp_send_nmi_allbutself(void) 777static void smp_send_nmi_allbutself(void)
@@ -773,8 +779,13 @@ static void smp_send_nmi_allbutself(void)
773 apic->send_IPI_allbutself(NMI_VECTOR); 779 apic->send_IPI_allbutself(NMI_VECTOR);
774} 780}
775 781
776/* 782static struct notifier_block crash_nmi_nb = {
777 * Halt all other CPUs, calling the specified function on each of them 783 .notifier_call = crash_nmi_callback,
784 /* we want to be the first one called */
785 .priority = NMI_LOCAL_HIGH_PRIOR+1,
786};
787
788/* Halt all other CPUs, calling the specified function on each of them
778 * 789 *
779 * This function can be used to halt all other CPUs on crash 790 * This function can be used to halt all other CPUs on crash
780 * or emergency reboot time. The function passed as parameter 791 * or emergency reboot time. The function passed as parameter
@@ -785,18 +796,16 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
785 unsigned long msecs; 796 unsigned long msecs;
786 local_irq_disable(); 797 local_irq_disable();
787 798
788 /* Make a note of crashing cpu. Will be used in NMI callback. */ 799 /* Make a note of crashing cpu. Will be used in NMI callback.*/
789 crashing_cpu = safe_smp_processor_id(); 800 crashing_cpu = safe_smp_processor_id();
790 801
791 shootdown_callback = callback; 802 shootdown_callback = callback;
792 803
793 atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); 804 atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
794 /* Would it be better to replace the trap vector here? */ 805 /* Would it be better to replace the trap vector here? */
795 if (register_nmi_handler(NMI_LOCAL, crash_nmi_callback, 806 if (register_die_notifier(&crash_nmi_nb))
796 NMI_FLAG_FIRST, "crash")) 807 return; /* return what? */
797 return; /* Return what? */ 808 /* Ensure the new callback function is set before sending
798 /*
799 * Ensure the new callback function is set before sending
800 * out the NMI 809 * out the NMI
801 */ 810 */
802 wmb(); 811 wmb();
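
Note on the reboot.c hunks above: the crash shootdown path is moved off the newer register_nmi_handler() interface and back onto the die-notifier chain (crash_nmi_callback now takes a notifier_block and returns NOTIFY_STOP instead of NMI_HANDLED). A minimal sketch of that older registration pattern follows; every "example_*" name is illustrative, not the kernel's own.

#include <linux/kdebug.h>
#include <linux/notifier.h>
#include <asm/nmi.h>

static int example_nmi_notify(struct notifier_block *self,
                              unsigned long val, void *data)
{
        if (val != DIE_NMI)
                return NOTIFY_OK;       /* not an NMI event, let other notifiers run */

        /* per-CPU shutdown work would go here */

        return NOTIFY_STOP;             /* this NMI has been consumed */
}

static struct notifier_block example_nmi_nb = {
        .notifier_call = example_nmi_notify,
        .priority      = NMI_LOCAL_HIGH_PRIOR + 1,  /* run ahead of other NMI users */
};

static int __init example_register(void)
{
        /* register_die_notifier() returns 0 on success */
        return register_die_notifier(&example_nmi_nb);
}
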
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 801602b5d74..ccdbc16b894 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -5,14 +5,12 @@
5#include <linux/mc146818rtc.h> 5#include <linux/mc146818rtc.h>
6#include <linux/acpi.h> 6#include <linux/acpi.h>
7#include <linux/bcd.h> 7#include <linux/bcd.h>
8#include <linux/export.h>
9#include <linux/pnp.h> 8#include <linux/pnp.h>
10#include <linux/of.h> 9#include <linux/of.h>
11 10
12#include <asm/vsyscall.h> 11#include <asm/vsyscall.h>
13#include <asm/x86_init.h> 12#include <asm/x86_init.h>
14#include <asm/time.h> 13#include <asm/time.h>
15#include <asm/mrst.h>
16 14
17#ifdef CONFIG_X86_32 15#ifdef CONFIG_X86_32
18/* 16/*
@@ -195,6 +193,12 @@ void read_persistent_clock(struct timespec *ts)
195 ts->tv_nsec = 0; 193 ts->tv_nsec = 0;
196} 194}
197 195
196unsigned long long native_read_tsc(void)
197{
198 return __native_read_tsc();
199}
200EXPORT_SYMBOL(native_read_tsc);
201
198 202
199static struct resource rtc_resources[] = { 203static struct resource rtc_resources[] = {
200 [0] = { 204 [0] = {
@@ -219,7 +223,7 @@ static struct platform_device rtc_device = {
219static __init int add_rtc_cmos(void) 223static __init int add_rtc_cmos(void)
220{ 224{
221#ifdef CONFIG_PNP 225#ifdef CONFIG_PNP
222 static const char * const const ids[] __initconst = 226 static const char *ids[] __initconst =
223 { "PNP0b00", "PNP0b01", "PNP0b02", }; 227 { "PNP0b00", "PNP0b01", "PNP0b02", };
224 struct pnp_dev *dev; 228 struct pnp_dev *dev;
225 struct pnp_id *id; 229 struct pnp_id *id;
@@ -237,10 +241,6 @@ static __init int add_rtc_cmos(void)
237 if (of_have_populated_dt()) 241 if (of_have_populated_dt())
238 return 0; 242 return 0;
239 243
240 /* Intel MID platforms don't have ioport rtc */
241 if (mrst_identify_cpu())
242 return -ENODEV;
243
244 platform_device_register(&rtc_device); 244 platform_device_register(&rtc_device);
245 dev_info(&rtc_device.dev, 245 dev_info(&rtc_device.dev,
246 "registered platform RTC device (no PNP device found)\n"); 246 "registered platform RTC device (no PNP device found)\n");
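
The rtc.c hunk reintroduces an exported native_read_tsc() that simply returns __native_read_tsc(). Purely for illustration (this is not the kernel's actual inline, which lives in the MSR headers), a free-standing TSC read looks like this:

/* Illustrative only: RDTSC leaves the low 32 bits in EAX and the high 32 in EDX. */
static inline unsigned long long example_rdtsc(void)
{
        unsigned int lo, hi;

        asm volatile("rdtsc" : "=a" (lo), "=d" (hi));
        return ((unsigned long long)hi << 32) | lo;
}
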
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 23ddd558fbd..afaf38447ef 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -34,6 +34,7 @@
34#include <linux/memblock.h> 34#include <linux/memblock.h>
35#include <linux/seq_file.h> 35#include <linux/seq_file.h>
36#include <linux/console.h> 36#include <linux/console.h>
37#include <linux/mca.h>
37#include <linux/root_dev.h> 38#include <linux/root_dev.h>
38#include <linux/highmem.h> 39#include <linux/highmem.h>
39#include <linux/module.h> 40#include <linux/module.h>
@@ -49,7 +50,6 @@
49#include <asm/pci-direct.h> 50#include <asm/pci-direct.h>
50#include <linux/init_ohci1394_dma.h> 51#include <linux/init_ohci1394_dma.h>
51#include <linux/kvm_para.h> 52#include <linux/kvm_para.h>
52#include <linux/dma-contiguous.h>
53 53
54#include <linux/errno.h> 54#include <linux/errno.h>
55#include <linux/kernel.h> 55#include <linux/kernel.h>
@@ -68,13 +68,12 @@
68#include <linux/percpu.h> 68#include <linux/percpu.h>
69#include <linux/crash_dump.h> 69#include <linux/crash_dump.h>
70#include <linux/tboot.h> 70#include <linux/tboot.h>
71#include <linux/jiffies.h>
72 71
73#include <video/edid.h> 72#include <video/edid.h>
74 73
75#include <asm/mtrr.h> 74#include <asm/mtrr.h>
76#include <asm/apic.h> 75#include <asm/apic.h>
77#include <asm/realmode.h> 76#include <asm/trampoline.h>
78#include <asm/e820.h> 77#include <asm/e820.h>
79#include <asm/mpspec.h> 78#include <asm/mpspec.h>
80#include <asm/setup.h> 79#include <asm/setup.h>
@@ -91,6 +90,7 @@
91#include <asm/processor.h> 90#include <asm/processor.h>
92#include <asm/bugs.h> 91#include <asm/bugs.h>
93 92
93#include <asm/system.h>
94#include <asm/vsyscall.h> 94#include <asm/vsyscall.h>
95#include <asm/cpu.h> 95#include <asm/cpu.h>
96#include <asm/desc.h> 96#include <asm/desc.h>
@@ -143,7 +143,11 @@ int default_check_phys_apicid_present(int phys_apicid)
143} 143}
144#endif 144#endif
145 145
146#ifndef CONFIG_DEBUG_BOOT_PARAMS
147struct boot_params __initdata boot_params;
148#else
146struct boot_params boot_params; 149struct boot_params boot_params;
150#endif
147 151
148/* 152/*
149 * Machine setup.. 153 * Machine setup..
@@ -176,6 +180,12 @@ struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1};
176/* common cpu data for all cpus */ 180/* common cpu data for all cpus */
177struct cpuinfo_x86 boot_cpu_data __read_mostly = {0, 0, 0, 0, -1, 1, 0, 0, -1}; 181struct cpuinfo_x86 boot_cpu_data __read_mostly = {0, 0, 0, 0, -1, 1, 0, 0, -1};
178EXPORT_SYMBOL(boot_cpu_data); 182EXPORT_SYMBOL(boot_cpu_data);
183static void set_mca_bus(int x)
184{
185#ifdef CONFIG_MCA
186 MCA_bus = x;
187#endif
188}
179 189
180unsigned int def_to_bigsmp; 190unsigned int def_to_bigsmp;
181 191
@@ -296,8 +306,7 @@ static void __init cleanup_highmap(void)
296static void __init reserve_brk(void) 306static void __init reserve_brk(void)
297{ 307{
298 if (_brk_end > _brk_start) 308 if (_brk_end > _brk_start)
299 memblock_reserve(__pa(_brk_start), 309 memblock_x86_reserve_range(__pa(_brk_start), __pa(_brk_end), "BRK");
300 __pa(_brk_end) - __pa(_brk_start));
301 310
302 /* Mark brk area as locked down and no longer taking any 311 /* Mark brk area as locked down and no longer taking any
303 new allocations */ 312 new allocations */
@@ -322,17 +331,17 @@ static void __init relocate_initrd(void)
322 ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size, 331 ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size,
323 PAGE_SIZE); 332 PAGE_SIZE);
324 333
325 if (!ramdisk_here) 334 if (ramdisk_here == MEMBLOCK_ERROR)
326 panic("Cannot find place for new RAMDISK of size %lld\n", 335 panic("Cannot find place for new RAMDISK of size %lld\n",
327 ramdisk_size); 336 ramdisk_size);
328 337
329 /* Note: this includes all the lowmem currently occupied by 338 /* Note: this includes all the lowmem currently occupied by
330 the initrd, we rely on that fact to keep the data intact. */ 339 the initrd, we rely on that fact to keep the data intact. */
331 memblock_reserve(ramdisk_here, area_size); 340 memblock_x86_reserve_range(ramdisk_here, ramdisk_here + area_size, "NEW RAMDISK");
332 initrd_start = ramdisk_here + PAGE_OFFSET; 341 initrd_start = ramdisk_here + PAGE_OFFSET;
333 initrd_end = initrd_start + ramdisk_size; 342 initrd_end = initrd_start + ramdisk_size;
334 printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n", 343 printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
335 ramdisk_here, ramdisk_here + ramdisk_size - 1); 344 ramdisk_here, ramdisk_here + ramdisk_size);
336 345
337 q = (char *)initrd_start; 346 q = (char *)initrd_start;
338 347
@@ -363,8 +372,8 @@ static void __init relocate_initrd(void)
363 /* high pages is not converted by early_res_to_bootmem */ 372 /* high pages is not converted by early_res_to_bootmem */
364 ramdisk_image = boot_params.hdr.ramdisk_image; 373 ramdisk_image = boot_params.hdr.ramdisk_image;
365 ramdisk_size = boot_params.hdr.ramdisk_size; 374 ramdisk_size = boot_params.hdr.ramdisk_size;
366 printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to" 375 printk(KERN_INFO "Move RAMDISK from %016llx - %016llx to"
367 " [mem %#010llx-%#010llx]\n", 376 " %08llx - %08llx\n",
368 ramdisk_image, ramdisk_image + ramdisk_size - 1, 377 ramdisk_image, ramdisk_image + ramdisk_size - 1,
369 ramdisk_here, ramdisk_here + ramdisk_size - 1); 378 ramdisk_here, ramdisk_here + ramdisk_size - 1);
370} 379}
@@ -384,13 +393,14 @@ static void __init reserve_initrd(void)
384 initrd_start = 0; 393 initrd_start = 0;
385 394
386 if (ramdisk_size >= (end_of_lowmem>>1)) { 395 if (ramdisk_size >= (end_of_lowmem>>1)) {
387 panic("initrd too large to handle, " 396 memblock_x86_free_range(ramdisk_image, ramdisk_end);
388 "disabling initrd (%lld needed, %lld available)\n", 397 printk(KERN_ERR "initrd too large to handle, "
389 ramdisk_size, end_of_lowmem>>1); 398 "disabling initrd\n");
399 return;
390 } 400 }
391 401
392 printk(KERN_INFO "RAMDISK: [mem %#010llx-%#010llx]\n", ramdisk_image, 402 printk(KERN_INFO "RAMDISK: %08llx - %08llx\n", ramdisk_image,
393 ramdisk_end - 1); 403 ramdisk_end);
394 404
395 405
396 if (ramdisk_end <= end_of_lowmem) { 406 if (ramdisk_end <= end_of_lowmem) {
@@ -406,7 +416,7 @@ static void __init reserve_initrd(void)
406 416
407 relocate_initrd(); 417 relocate_initrd();
408 418
409 memblock_free(ramdisk_image, ramdisk_end - ramdisk_image); 419 memblock_x86_free_range(ramdisk_image, ramdisk_end);
410} 420}
411#else 421#else
412static void __init reserve_initrd(void) 422static void __init reserve_initrd(void)
@@ -480,13 +490,15 @@ static void __init memblock_x86_reserve_range_setup_data(void)
480{ 490{
481 struct setup_data *data; 491 struct setup_data *data;
482 u64 pa_data; 492 u64 pa_data;
493 char buf[32];
483 494
484 if (boot_params.hdr.version < 0x0209) 495 if (boot_params.hdr.version < 0x0209)
485 return; 496 return;
486 pa_data = boot_params.hdr.setup_data; 497 pa_data = boot_params.hdr.setup_data;
487 while (pa_data) { 498 while (pa_data) {
488 data = early_memremap(pa_data, sizeof(*data)); 499 data = early_memremap(pa_data, sizeof(*data));
489 memblock_reserve(pa_data, sizeof(*data) + data->len); 500 sprintf(buf, "setup data %x", data->type);
501 memblock_x86_reserve_range(pa_data, pa_data+sizeof(*data)+data->len, buf);
490 pa_data = data->next; 502 pa_data = data->next;
491 early_iounmap(data, sizeof(*data)); 503 early_iounmap(data, sizeof(*data));
492 } 504 }
@@ -498,6 +510,15 @@ static void __init memblock_x86_reserve_range_setup_data(void)
498 510
499#ifdef CONFIG_KEXEC 511#ifdef CONFIG_KEXEC
500 512
513static inline unsigned long long get_total_mem(void)
514{
515 unsigned long long total;
516
517 total = max_pfn - min_low_pfn;
518
519 return total << PAGE_SHIFT;
520}
521
501/* 522/*
502 * Keep the crash kernel below this limit. On 32 bits earlier kernels 523 * Keep the crash kernel below this limit. On 32 bits earlier kernels
503 * would limit the kernel to the low 512 MiB due to mapping restrictions. 524 * would limit the kernel to the low 512 MiB due to mapping restrictions.
@@ -516,7 +537,7 @@ static void __init reserve_crashkernel(void)
516 unsigned long long crash_size, crash_base; 537 unsigned long long crash_size, crash_base;
517 int ret; 538 int ret;
518 539
519 total_mem = memblock_phys_mem_size(); 540 total_mem = get_total_mem();
520 541
521 ret = parse_crashkernel(boot_command_line, total_mem, 542 ret = parse_crashkernel(boot_command_line, total_mem,
522 &crash_size, &crash_base); 543 &crash_size, &crash_base);
@@ -533,7 +554,7 @@ static void __init reserve_crashkernel(void)
533 crash_base = memblock_find_in_range(alignment, 554 crash_base = memblock_find_in_range(alignment,
534 CRASH_KERNEL_ADDR_MAX, crash_size, alignment); 555 CRASH_KERNEL_ADDR_MAX, crash_size, alignment);
535 556
536 if (!crash_base) { 557 if (crash_base == MEMBLOCK_ERROR) {
537 pr_info("crashkernel reservation failed - No suitable area found.\n"); 558 pr_info("crashkernel reservation failed - No suitable area found.\n");
538 return; 559 return;
539 } 560 }
@@ -547,7 +568,7 @@ static void __init reserve_crashkernel(void)
547 return; 568 return;
548 } 569 }
549 } 570 }
550 memblock_reserve(crash_base, crash_size); 571 memblock_x86_reserve_range(crash_base, crash_base + crash_size, "CRASH KERNEL");
551 572
552 printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " 573 printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
553 "for crashkernel (System RAM: %ldMB)\n", 574 "for crashkernel (System RAM: %ldMB)\n",
@@ -605,7 +626,7 @@ static __init void reserve_ibft_region(void)
605 addr = find_ibft_region(&size); 626 addr = find_ibft_region(&size);
606 627
607 if (size) 628 if (size)
608 memblock_reserve(addr, size); 629 memblock_x86_reserve_range(addr, addr + size, "* ibft");
609} 630}
610 631
611static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10; 632static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
@@ -707,6 +728,7 @@ void __init setup_arch(char **cmdline_p)
707 apm_info.bios = boot_params.apm_bios_info; 728 apm_info.bios = boot_params.apm_bios_info;
708 ist_info = boot_params.ist_info; 729 ist_info = boot_params.ist_info;
709 if (boot_params.sys_desc_table.length != 0) { 730 if (boot_params.sys_desc_table.length != 0) {
731 set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
710 machine_id = boot_params.sys_desc_table.table[0]; 732 machine_id = boot_params.sys_desc_table.table[0];
711 machine_submodel_id = boot_params.sys_desc_table.table[1]; 733 machine_submodel_id = boot_params.sys_desc_table.table[1];
712 BIOS_revision = boot_params.sys_desc_table.table[2]; 734 BIOS_revision = boot_params.sys_desc_table.table[2];
@@ -728,16 +750,15 @@ void __init setup_arch(char **cmdline_p)
728#endif 750#endif
729#ifdef CONFIG_EFI 751#ifdef CONFIG_EFI
730 if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, 752 if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature,
731 "EL32", 4)) { 753#ifdef CONFIG_X86_32
732 efi_enabled = 1; 754 "EL32",
733 efi_64bit = false; 755#else
734 } else if (!strncmp((char *)&boot_params.efi_info.efi_loader_signature, 756 "EL64",
735 "EL64", 4)) { 757#endif
758 4)) {
736 efi_enabled = 1; 759 efi_enabled = 1;
737 efi_64bit = true; 760 efi_memblock_x86_reserve_range();
738 } 761 }
739 if (efi_enabled && efi_memblock_x86_reserve_range())
740 efi_enabled = 0;
741#endif 762#endif
742 763
743 x86_init.oem.arch_setup(); 764 x86_init.oem.arch_setup();
@@ -903,10 +924,10 @@ void __init setup_arch(char **cmdline_p)
903 setup_bios_corruption_check(); 924 setup_bios_corruption_check();
904#endif 925#endif
905 926
906 printk(KERN_DEBUG "initial memory mapped: [mem 0x00000000-%#010lx]\n", 927 printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
907 (max_pfn_mapped<<PAGE_SHIFT) - 1); 928 max_pfn_mapped<<PAGE_SHIFT);
908 929
909 setup_real_mode(); 930 setup_trampolines();
910 931
911 init_gbpages(); 932 init_gbpages();
912 933
@@ -916,28 +937,13 @@ void __init setup_arch(char **cmdline_p)
916 937
917#ifdef CONFIG_X86_64 938#ifdef CONFIG_X86_64
918 if (max_pfn > max_low_pfn) { 939 if (max_pfn > max_low_pfn) {
919 int i; 940 max_pfn_mapped = init_memory_mapping(1UL<<32,
920 unsigned long start, end; 941 max_pfn<<PAGE_SHIFT);
921 unsigned long start_pfn, end_pfn;
922
923 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn,
924 NULL) {
925
926 end = PFN_PHYS(end_pfn);
927 if (end <= (1UL<<32))
928 continue;
929
930 start = PFN_PHYS(start_pfn);
931 max_pfn_mapped = init_memory_mapping(
932 max((1UL<<32), start), end);
933 }
934
935 /* can we preseve max_low_pfn ?*/ 942 /* can we preseve max_low_pfn ?*/
936 max_low_pfn = max_pfn; 943 max_low_pfn = max_pfn;
937 } 944 }
938#endif 945#endif
939 memblock.current_limit = get_max_mapped(); 946 memblock.current_limit = get_max_mapped();
940 dma_contiguous_reserve(0);
941 947
942 /* 948 /*
943 * NOTE: On x86-32, only from this point on, fixmaps are ready for use. 949 * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
@@ -952,10 +958,6 @@ void __init setup_arch(char **cmdline_p)
952 958
953 reserve_initrd(); 959 reserve_initrd();
954 960
955#if defined(CONFIG_ACPI) && defined(CONFIG_BLK_DEV_INITRD)
956 acpi_initrd_override((void *)initrd_start, initrd_end - initrd_start);
957#endif
958
959 reserve_crashkernel(); 961 reserve_crashkernel();
960 962
961 vsmp_init(); 963 vsmp_init();
@@ -972,17 +974,17 @@ void __init setup_arch(char **cmdline_p)
972 initmem_init(); 974 initmem_init();
973 memblock_find_dma_reserve(); 975 memblock_find_dma_reserve();
974 976
975#ifdef CONFIG_KVM_GUEST 977#ifdef CONFIG_KVM_CLOCK
976 kvmclock_init(); 978 kvmclock_init();
977#endif 979#endif
978 980
979 x86_init.paging.pagetable_init(); 981 x86_init.paging.pagetable_setup_start(swapper_pg_dir);
982 paging_init();
983 x86_init.paging.pagetable_setup_done(swapper_pg_dir);
980 984
981 if (boot_cpu_data.cpuid_level >= 0) { 985 if (boot_cpu_data.cpuid_level >= 0) {
982 /* A CPU has %cr4 if and only if it has CPUID */ 986 /* A CPU has %cr4 if and only if it has CPUID */
983 mmu_cr4_features = read_cr4(); 987 mmu_cr4_features = read_cr4();
984 if (trampoline_cr4_features)
985 *trampoline_cr4_features = mmu_cr4_features;
986 } 988 }
987 989
988#ifdef CONFIG_X86_32 990#ifdef CONFIG_X86_32
@@ -1020,8 +1022,7 @@ void __init setup_arch(char **cmdline_p)
1020 init_cpu_to_node(); 1022 init_cpu_to_node();
1021 1023
1022 init_apic_mappings(); 1024 init_apic_mappings();
1023 if (x86_io_apic_ops.init) 1025 ioapic_and_gsi_init();
1024 x86_io_apic_ops.init();
1025 1026
1026 kvm_guest_init(); 1027 kvm_guest_init();
1027 1028
@@ -1047,20 +1048,6 @@ void __init setup_arch(char **cmdline_p)
1047 mcheck_init(); 1048 mcheck_init();
1048 1049
1049 arch_init_ideal_nops(); 1050 arch_init_ideal_nops();
1050
1051 register_refined_jiffies(CLOCK_TICK_RATE);
1052
1053#ifdef CONFIG_EFI
1054 /* Once setup is done above, disable efi_enabled on mismatched
1055 * firmware/kernel archtectures since there is no support for
1056 * runtime services.
1057 */
1058 if (efi_enabled && IS_ENABLED(CONFIG_X86_64) != efi_64bit) {
1059 pr_info("efi: Setup done, disabling due to 32/64-bit mismatch\n");
1060 efi_unmap_memmap();
1061 efi_enabled = 0;
1062 }
1063#endif
1064} 1051}
1065 1052
1066#ifdef CONFIG_X86_32 1053#ifdef CONFIG_X86_32
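
Several setup.c hunks above swap memblock_reserve(base, size) back to the older memblock_x86_reserve_range(start, end, name) interface, and swap the !addr failure checks back to MEMBLOCK_ERROR. A hedged sketch of the two call conventions; the EXAMPLE_OLD_MEMBLOCK_API switch is purely illustrative, and in the older tree the x86 helper was declared in asm/memblock.h rather than linux/memblock.h.

#include <linux/memblock.h>

static void __init example_reserve(u64 base, u64 size)
{
#ifdef EXAMPLE_OLD_MEMBLOCK_API
        /* older interface: half-open [start, end) range plus a debug label */
        memblock_x86_reserve_range(base, base + size, "EXAMPLE");
#else
        /* newer interface: base + size, no label; find_in_range returns 0 on failure */
        memblock_reserve(base, size);
#endif
}
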
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 5cdff035774..71f4727da37 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -21,7 +21,7 @@
21#include <asm/cpu.h> 21#include <asm/cpu.h>
22#include <asm/stackprotector.h> 22#include <asm/stackprotector.h>
23 23
24DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number); 24DEFINE_PER_CPU(int, cpu_number);
25EXPORT_PER_CPU_SYMBOL(cpu_number); 25EXPORT_PER_CPU_SYMBOL(cpu_number);
26 26
27#ifdef CONFIG_X86_64 27#ifdef CONFIG_X86_64
@@ -185,22 +185,10 @@ void __init setup_per_cpu_areas(void)
185#endif 185#endif
186 rc = -EINVAL; 186 rc = -EINVAL;
187 if (pcpu_chosen_fc != PCPU_FC_PAGE) { 187 if (pcpu_chosen_fc != PCPU_FC_PAGE) {
188 const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE;
188 const size_t dyn_size = PERCPU_MODULE_RESERVE + 189 const size_t dyn_size = PERCPU_MODULE_RESERVE +
189 PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE; 190 PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;
190 size_t atom_size;
191 191
192 /*
193 * On 64bit, use PMD_SIZE for atom_size so that embedded
194 * percpu areas are aligned to PMD. This, in the future,
195 * can also allow using PMD mappings in vmalloc area. Use
196 * PAGE_SIZE on 32bit as vmalloc space is highly contended
197 * and large vmalloc area allocs can easily fail.
198 */
199#ifdef CONFIG_X86_64
200 atom_size = PMD_SIZE;
201#else
202 atom_size = PAGE_SIZE;
203#endif
204 rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, 192 rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
205 dyn_size, atom_size, 193 dyn_size, atom_size,
206 pcpu_cpu_distance, 194 pcpu_cpu_distance,
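
The setup_percpu.c hunk drops the left-hand side's explicit atom_size selection (PMD_SIZE on 64-bit so embedded per-cpu chunks are PMD-aligned, PAGE_SIZE on 32-bit where vmalloc space is contended) in favour of the older cpu_has_pse test. The removed logic, reshaped into a stand-alone helper purely for illustration:

static size_t __init example_percpu_atom_size(void)
{
#ifdef CONFIG_X86_64
        /* PMD-align embedded per-cpu areas; leaves room for PMD mappings later */
        return PMD_SIZE;
#else
        /* 32-bit vmalloc space is scarce, so stay with single pages */
        return PAGE_SIZE;
#endif
}
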
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index d6bf1f34a6e..54ddaeb221c 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -6,36 +6,30 @@
6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes 6 * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
7 * 2000-2002 x86-64 support by Andi Kleen 7 * 2000-2002 x86-64 support by Andi Kleen
8 */ 8 */
9
10#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
11
12#include <linux/sched.h> 9#include <linux/sched.h>
13#include <linux/mm.h> 10#include <linux/mm.h>
14#include <linux/smp.h> 11#include <linux/smp.h>
15#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/signal.h>
16#include <linux/errno.h> 14#include <linux/errno.h>
17#include <linux/wait.h> 15#include <linux/wait.h>
16#include <linux/ptrace.h>
18#include <linux/tracehook.h> 17#include <linux/tracehook.h>
19#include <linux/unistd.h> 18#include <linux/unistd.h>
20#include <linux/stddef.h> 19#include <linux/stddef.h>
21#include <linux/personality.h> 20#include <linux/personality.h>
22#include <linux/uaccess.h> 21#include <linux/uaccess.h>
23#include <linux/user-return-notifier.h> 22#include <linux/user-return-notifier.h>
24#include <linux/uprobes.h>
25#include <linux/context_tracking.h>
26 23
27#include <asm/processor.h> 24#include <asm/processor.h>
28#include <asm/ucontext.h> 25#include <asm/ucontext.h>
29#include <asm/i387.h> 26#include <asm/i387.h>
30#include <asm/fpu-internal.h>
31#include <asm/vdso.h> 27#include <asm/vdso.h>
32#include <asm/mce.h> 28#include <asm/mce.h>
33#include <asm/sighandling.h>
34 29
35#ifdef CONFIG_X86_64 30#ifdef CONFIG_X86_64
36#include <asm/proto.h> 31#include <asm/proto.h>
37#include <asm/ia32_unistd.h> 32#include <asm/ia32_unistd.h>
38#include <asm/sys_ia32.h>
39#endif /* CONFIG_X86_64 */ 33#endif /* CONFIG_X86_64 */
40 34
41#include <asm/syscall.h> 35#include <asm/syscall.h>
@@ -43,6 +37,13 @@
43 37
44#include <asm/sigframe.h> 38#include <asm/sigframe.h>
45 39
40#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
41
42#define __FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \
43 X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \
44 X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \
45 X86_EFLAGS_CF)
46
46#ifdef CONFIG_X86_32 47#ifdef CONFIG_X86_32
47# define FIX_EFLAGS (__FIX_EFLAGS | X86_EFLAGS_RF) 48# define FIX_EFLAGS (__FIX_EFLAGS | X86_EFLAGS_RF)
48#else 49#else
@@ -67,8 +68,9 @@
67 regs->seg = GET_SEG(seg) | 3; \ 68 regs->seg = GET_SEG(seg) | 3; \
68} while (0) 69} while (0)
69 70
70int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, 71static int
71 unsigned long *pax) 72restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
73 unsigned long *pax)
72{ 74{
73 void __user *buf; 75 void __user *buf;
74 unsigned int tmpflags; 76 unsigned int tmpflags;
@@ -115,17 +117,17 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
115 regs->orig_ax = -1; /* disable syscall checks */ 117 regs->orig_ax = -1; /* disable syscall checks */
116 118
117 get_user_ex(buf, &sc->fpstate); 119 get_user_ex(buf, &sc->fpstate);
120 err |= restore_i387_xstate(buf);
118 121
119 get_user_ex(*pax, &sc->ax); 122 get_user_ex(*pax, &sc->ax);
120 } get_user_catch(err); 123 } get_user_catch(err);
121 124
122 err |= restore_xstate_sig(buf, config_enabled(CONFIG_X86_32));
123
124 return err; 125 return err;
125} 126}
126 127
127int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, 128static int
128 struct pt_regs *regs, unsigned long mask) 129setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
130 struct pt_regs *regs, unsigned long mask)
129{ 131{
130 int err = 0; 132 int err = 0;
131 133
@@ -157,7 +159,7 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
157 put_user_ex(regs->r15, &sc->r15); 159 put_user_ex(regs->r15, &sc->r15);
158#endif /* CONFIG_X86_64 */ 160#endif /* CONFIG_X86_64 */
159 161
160 put_user_ex(current->thread.trap_nr, &sc->trapno); 162 put_user_ex(current->thread.trap_no, &sc->trapno);
161 put_user_ex(current->thread.error_code, &sc->err); 163 put_user_ex(current->thread.error_code, &sc->err);
162 put_user_ex(regs->ip, &sc->ip); 164 put_user_ex(regs->ip, &sc->ip);
163#ifdef CONFIG_X86_32 165#ifdef CONFIG_X86_32
@@ -208,32 +210,35 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
208 void __user **fpstate) 210 void __user **fpstate)
209{ 211{
210 /* Default to using normal stack */ 212 /* Default to using normal stack */
211 unsigned long math_size = 0;
212 unsigned long sp = regs->sp; 213 unsigned long sp = regs->sp;
213 unsigned long buf_fx = 0;
214 int onsigstack = on_sig_stack(sp); 214 int onsigstack = on_sig_stack(sp);
215 215
216#ifdef CONFIG_X86_64
216 /* redzone */ 217 /* redzone */
217 if (config_enabled(CONFIG_X86_64)) 218 sp -= 128;
218 sp -= 128; 219#endif /* CONFIG_X86_64 */
219 220
220 if (!onsigstack) { 221 if (!onsigstack) {
221 /* This is the X/Open sanctioned signal stack switching. */ 222 /* This is the X/Open sanctioned signal stack switching. */
222 if (ka->sa.sa_flags & SA_ONSTACK) { 223 if (ka->sa.sa_flags & SA_ONSTACK) {
223 if (current->sas_ss_size) 224 if (current->sas_ss_size)
224 sp = current->sas_ss_sp + current->sas_ss_size; 225 sp = current->sas_ss_sp + current->sas_ss_size;
225 } else if (config_enabled(CONFIG_X86_32) && 226 } else {
226 (regs->ss & 0xffff) != __USER_DS && 227#ifdef CONFIG_X86_32
227 !(ka->sa.sa_flags & SA_RESTORER) && 228 /* This is the legacy signal stack switching. */
228 ka->sa.sa_restorer) { 229 if ((regs->ss & 0xffff) != __USER_DS &&
229 /* This is the legacy signal stack switching. */ 230 !(ka->sa.sa_flags & SA_RESTORER) &&
231 ka->sa.sa_restorer)
230 sp = (unsigned long) ka->sa.sa_restorer; 232 sp = (unsigned long) ka->sa.sa_restorer;
233#endif /* CONFIG_X86_32 */
231 } 234 }
232 } 235 }
233 236
234 if (used_math()) { 237 if (used_math()) {
235 sp = alloc_mathframe(sp, config_enabled(CONFIG_X86_32), 238 sp -= sig_xstate_size;
236 &buf_fx, &math_size); 239#ifdef CONFIG_X86_64
240 sp = round_down(sp, 64);
241#endif /* CONFIG_X86_64 */
237 *fpstate = (void __user *)sp; 242 *fpstate = (void __user *)sp;
238 } 243 }
239 244
@@ -246,9 +251,8 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
246 if (onsigstack && !likely(on_sig_stack(sp))) 251 if (onsigstack && !likely(on_sig_stack(sp)))
247 return (void __user *)-1L; 252 return (void __user *)-1L;
248 253
249 /* save i387 and extended state */ 254 /* save i387 state */
250 if (used_math() && 255 if (used_math() && save_i387_xstate(*fpstate) < 0)
251 save_xstate_sig(*fpstate, (void __user *)buf_fx, math_size) < 0)
252 return (void __user *)-1L; 256 return (void __user *)-1L;
253 257
254 return (void __user *)sp; 258 return (void __user *)sp;
@@ -357,6 +361,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
357 put_user_ex(sig, &frame->sig); 361 put_user_ex(sig, &frame->sig);
358 put_user_ex(&frame->info, &frame->pinfo); 362 put_user_ex(&frame->info, &frame->pinfo);
359 put_user_ex(&frame->uc, &frame->puc); 363 put_user_ex(&frame->uc, &frame->puc);
364 err |= copy_siginfo_to_user(&frame->info, info);
360 365
361 /* Create the ucontext. */ 366 /* Create the ucontext. */
362 if (cpu_has_xsave) 367 if (cpu_has_xsave)
@@ -364,7 +369,13 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
364 else 369 else
365 put_user_ex(0, &frame->uc.uc_flags); 370 put_user_ex(0, &frame->uc.uc_flags);
366 put_user_ex(0, &frame->uc.uc_link); 371 put_user_ex(0, &frame->uc.uc_link);
367 err |= __save_altstack(&frame->uc.uc_stack, regs->sp); 372 put_user_ex(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
373 put_user_ex(sas_ss_flags(regs->sp),
374 &frame->uc.uc_stack.ss_flags);
375 put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
376 err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
377 regs, set->sig[0]);
378 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
368 379
369 /* Set up to return from userspace. */ 380 /* Set up to return from userspace. */
370 restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); 381 restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn);
@@ -381,11 +392,6 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
381 */ 392 */
382 put_user_ex(*((u64 *)&rt_retcode), (u64 *)frame->retcode); 393 put_user_ex(*((u64 *)&rt_retcode), (u64 *)frame->retcode);
383 } put_user_catch(err); 394 } put_user_catch(err);
384
385 err |= copy_siginfo_to_user(&frame->info, info);
386 err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
387 regs, set->sig[0]);
388 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
389 395
390 if (err) 396 if (err)
391 return -EFAULT; 397 return -EFAULT;
@@ -411,6 +417,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
411 struct rt_sigframe __user *frame; 417 struct rt_sigframe __user *frame;
412 void __user *fp = NULL; 418 void __user *fp = NULL;
413 int err = 0; 419 int err = 0;
420 struct task_struct *me = current;
414 421
415 frame = get_sigframe(ka, regs, sizeof(struct rt_sigframe), &fp); 422 frame = get_sigframe(ka, regs, sizeof(struct rt_sigframe), &fp);
416 423
@@ -429,7 +436,12 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
429 else 436 else
430 put_user_ex(0, &frame->uc.uc_flags); 437 put_user_ex(0, &frame->uc.uc_flags);
431 put_user_ex(0, &frame->uc.uc_link); 438 put_user_ex(0, &frame->uc.uc_link);
432 err |= __save_altstack(&frame->uc.uc_stack, regs->sp); 439 put_user_ex(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
440 put_user_ex(sas_ss_flags(regs->sp),
441 &frame->uc.uc_stack.ss_flags);
442 put_user_ex(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
443 err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]);
444 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
433 445
434 /* Set up to return from userspace. If provided, use a stub 446 /* Set up to return from userspace. If provided, use a stub
435 already in userspace. */ 447 already in userspace. */
@@ -442,9 +454,6 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
442 } 454 }
443 } put_user_catch(err); 455 } put_user_catch(err);
444 456
445 err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]);
446 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
447
448 if (err) 457 if (err)
449 return -EFAULT; 458 return -EFAULT;
450 459
@@ -469,72 +478,6 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
469} 478}
470#endif /* CONFIG_X86_32 */ 479#endif /* CONFIG_X86_32 */
471 480
472static int x32_setup_rt_frame(int sig, struct k_sigaction *ka,
473 siginfo_t *info, compat_sigset_t *set,
474 struct pt_regs *regs)
475{
476#ifdef CONFIG_X86_X32_ABI
477 struct rt_sigframe_x32 __user *frame;
478 void __user *restorer;
479 int err = 0;
480 void __user *fpstate = NULL;
481
482 frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
483
484 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
485 return -EFAULT;
486
487 if (ka->sa.sa_flags & SA_SIGINFO) {
488 if (copy_siginfo_to_user32(&frame->info, info))
489 return -EFAULT;
490 }
491
492 put_user_try {
493 /* Create the ucontext. */
494 if (cpu_has_xsave)
495 put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags);
496 else
497 put_user_ex(0, &frame->uc.uc_flags);
498 put_user_ex(0, &frame->uc.uc_link);
499 err |= __compat_save_altstack(&frame->uc.uc_stack, regs->sp);
500 put_user_ex(0, &frame->uc.uc__pad0);
501
502 if (ka->sa.sa_flags & SA_RESTORER) {
503 restorer = ka->sa.sa_restorer;
504 } else {
505 /* could use a vstub here */
506 restorer = NULL;
507 err |= -EFAULT;
508 }
509 put_user_ex(restorer, &frame->pretcode);
510 } put_user_catch(err);
511
512 err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
513 regs, set->sig[0]);
514 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
515
516 if (err)
517 return -EFAULT;
518
519 /* Set up registers for signal handler */
520 regs->sp = (unsigned long) frame;
521 regs->ip = (unsigned long) ka->sa.sa_handler;
522
523 /* We use the x32 calling convention here... */
524 regs->di = sig;
525 regs->si = (unsigned long) &frame->info;
526 regs->dx = (unsigned long) &frame->uc;
527
528 loadsegment(ds, __USER_DS);
529 loadsegment(es, __USER_DS);
530
531 regs->cs = __USER_CS;
532 regs->ss = __USER_DS;
533#endif /* CONFIG_X86_X32_ABI */
534
535 return 0;
536}
537
538#ifdef CONFIG_X86_32 481#ifdef CONFIG_X86_32
539/* 482/*
540 * Atomically swap in the new signal mask, and wait for a signal. 483 * Atomically swap in the new signal mask, and wait for a signal.
@@ -543,8 +486,18 @@ asmlinkage int
543sys_sigsuspend(int history0, int history1, old_sigset_t mask) 486sys_sigsuspend(int history0, int history1, old_sigset_t mask)
544{ 487{
545 sigset_t blocked; 488 sigset_t blocked;
489
490 current->saved_sigmask = current->blocked;
491
492 mask &= _BLOCKABLE;
546 siginitset(&blocked, mask); 493 siginitset(&blocked, mask);
547 return sigsuspend(&blocked); 494 set_current_blocked(&blocked);
495
496 current->state = TASK_INTERRUPTIBLE;
497 schedule();
498
499 set_restore_sigmask();
500 return -ERESTARTNOHAND;
548} 501}
549 502
550asmlinkage int 503asmlinkage int
@@ -593,6 +546,13 @@ sys_sigaction(int sig, const struct old_sigaction __user *act,
593} 546}
594#endif /* CONFIG_X86_32 */ 547#endif /* CONFIG_X86_32 */
595 548
549long
550sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
551 struct pt_regs *regs)
552{
553 return do_sigaltstack(uss, uoss, regs->sp);
554}
555
596/* 556/*
597 * Do a signal return; undo the signal stack. 557 * Do a signal return; undo the signal stack.
598 */ 558 */
@@ -612,6 +572,7 @@ unsigned long sys_sigreturn(struct pt_regs *regs)
612 sizeof(frame->extramask)))) 572 sizeof(frame->extramask))))
613 goto badframe; 573 goto badframe;
614 574
575 sigdelsetmask(&set, ~_BLOCKABLE);
615 set_current_blocked(&set); 576 set_current_blocked(&set);
616 577
617 if (restore_sigcontext(regs, &frame->sc, &ax)) 578 if (restore_sigcontext(regs, &frame->sc, &ax))
@@ -637,12 +598,13 @@ long sys_rt_sigreturn(struct pt_regs *regs)
637 if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) 598 if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
638 goto badframe; 599 goto badframe;
639 600
601 sigdelsetmask(&set, ~_BLOCKABLE);
640 set_current_blocked(&set); 602 set_current_blocked(&set);
641 603
642 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) 604 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
643 goto badframe; 605 goto badframe;
644 606
645 if (restore_altstack(&frame->uc.uc_stack)) 607 if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT)
646 goto badframe; 608 goto badframe;
647 609
648 return ax; 610 return ax;
@@ -666,31 +628,63 @@ static int signr_convert(int sig)
666 return sig; 628 return sig;
667} 629}
668 630
631#ifdef CONFIG_X86_32
632
633#define is_ia32 1
634#define ia32_setup_frame __setup_frame
635#define ia32_setup_rt_frame __setup_rt_frame
636
637#else /* !CONFIG_X86_32 */
638
639#ifdef CONFIG_IA32_EMULATION
640#define is_ia32 test_thread_flag(TIF_IA32)
641#else /* !CONFIG_IA32_EMULATION */
642#define is_ia32 0
643#endif /* CONFIG_IA32_EMULATION */
644
645int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
646 sigset_t *set, struct pt_regs *regs);
647int ia32_setup_frame(int sig, struct k_sigaction *ka,
648 sigset_t *set, struct pt_regs *regs);
649
650#endif /* CONFIG_X86_32 */
651
669static int 652static int
670setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 653setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
671 struct pt_regs *regs) 654 struct pt_regs *regs)
672{ 655{
673 int usig = signr_convert(sig); 656 int usig = signr_convert(sig);
674 sigset_t *set = sigmask_to_save(); 657 sigset_t *set = &current->blocked;
675 compat_sigset_t *cset = (compat_sigset_t *) set; 658 int ret;
659
660 if (current_thread_info()->status & TS_RESTORE_SIGMASK)
661 set = &current->saved_sigmask;
676 662
677 /* Set up the stack frame */ 663 /* Set up the stack frame */
678 if (is_ia32_frame()) { 664 if (is_ia32) {
679 if (ka->sa.sa_flags & SA_SIGINFO) 665 if (ka->sa.sa_flags & SA_SIGINFO)
680 return ia32_setup_rt_frame(usig, ka, info, cset, regs); 666 ret = ia32_setup_rt_frame(usig, ka, info, set, regs);
681 else 667 else
682 return ia32_setup_frame(usig, ka, cset, regs); 668 ret = ia32_setup_frame(usig, ka, set, regs);
683 } else if (is_x32_frame()) { 669 } else
684 return x32_setup_rt_frame(usig, ka, info, cset, regs); 670 ret = __setup_rt_frame(sig, ka, info, set, regs);
685 } else { 671
686 return __setup_rt_frame(sig, ka, info, set, regs); 672 if (ret) {
673 force_sigsegv(sig, current);
674 return -EFAULT;
687 } 675 }
676
677 current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
678 return ret;
688} 679}
689 680
690static void 681static int
691handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, 682handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
692 struct pt_regs *regs) 683 struct pt_regs *regs)
693{ 684{
685 sigset_t blocked;
686 int ret;
687
694 /* Are we from a system call? */ 688 /* Are we from a system call? */
695 if (syscall_get_nr(current, regs) >= 0) { 689 if (syscall_get_nr(current, regs) >= 0) {
696 /* If so, check system call restarting.. */ 690 /* If so, check system call restarting.. */
@@ -721,10 +715,10 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
721 likely(test_and_clear_thread_flag(TIF_FORCED_TF))) 715 likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
722 regs->flags &= ~X86_EFLAGS_TF; 716 regs->flags &= ~X86_EFLAGS_TF;
723 717
724 if (setup_rt_frame(sig, ka, info, regs) < 0) { 718 ret = setup_rt_frame(sig, ka, info, regs);
725 force_sigsegv(sig, current); 719
726 return; 720 if (ret)
727 } 721 return ret;
728 722
729 /* 723 /*
730 * Clear the direction flag as per the ABI for function entry. 724 * Clear the direction flag as per the ABI for function entry.
@@ -739,8 +733,15 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
739 */ 733 */
740 regs->flags &= ~X86_EFLAGS_TF; 734 regs->flags &= ~X86_EFLAGS_TF;
741 735
742 signal_delivered(sig, info, ka, regs, 736 sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
743 test_thread_flag(TIF_SINGLESTEP)); 737 if (!(ka->sa.sa_flags & SA_NODEFER))
738 sigaddset(&blocked, sig);
739 set_current_blocked(&blocked);
740
741 tracehook_signal_handler(sig, info, ka, regs,
742 test_thread_flag(TIF_SINGLESTEP));
743
744 return 0;
744} 745}
745 746
746#ifdef CONFIG_X86_32 747#ifdef CONFIG_X86_32
@@ -761,6 +762,16 @@ static void do_signal(struct pt_regs *regs)
761 siginfo_t info; 762 siginfo_t info;
762 int signr; 763 int signr;
763 764
765 /*
766 * We want the common case to go fast, which is why we may in certain
767 * cases get here from kernel mode. Just return without doing anything
768 * if so.
769 * X86_32: vm86 regs switched out by assembly code before reaching
770 * here, so testing against kernel CS suffices.
771 */
772 if (!user_mode(regs))
773 return;
774
764 signr = get_signal_to_deliver(&info, &ka, regs, NULL); 775 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
765 if (signr > 0) { 776 if (signr > 0) {
766 /* Whee! Actually deliver the signal. */ 777 /* Whee! Actually deliver the signal. */
@@ -790,7 +801,10 @@ static void do_signal(struct pt_regs *regs)
790 * If there's no signal to deliver, we just put the saved sigmask 801 * If there's no signal to deliver, we just put the saved sigmask
791 * back. 802 * back.
792 */ 803 */
793 restore_saved_sigmask(); 804 if (current_thread_info()->status & TS_RESTORE_SIGMASK) {
805 current_thread_info()->status &= ~TS_RESTORE_SIGMASK;
806 set_current_blocked(&current->saved_sigmask);
807 }
794} 808}
795 809
796/* 810/*
@@ -800,17 +814,12 @@ static void do_signal(struct pt_regs *regs)
800void 814void
801do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) 815do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
802{ 816{
803 user_exit();
804
805#ifdef CONFIG_X86_MCE 817#ifdef CONFIG_X86_MCE
806 /* notify userspace of pending MCEs */ 818 /* notify userspace of pending MCEs */
807 if (thread_info_flags & _TIF_MCE_NOTIFY) 819 if (thread_info_flags & _TIF_MCE_NOTIFY)
808 mce_notify_process(); 820 mce_notify_process();
809#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ 821#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
810 822
811 if (thread_info_flags & _TIF_UPROBE)
812 uprobe_notify_resume(regs);
813
814 /* deal with pending signal delivery */ 823 /* deal with pending signal delivery */
815 if (thread_info_flags & _TIF_SIGPENDING) 824 if (thread_info_flags & _TIF_SIGPENDING)
816 do_signal(regs); 825 do_signal(regs);
@@ -818,11 +827,15 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
818 if (thread_info_flags & _TIF_NOTIFY_RESUME) { 827 if (thread_info_flags & _TIF_NOTIFY_RESUME) {
819 clear_thread_flag(TIF_NOTIFY_RESUME); 828 clear_thread_flag(TIF_NOTIFY_RESUME);
820 tracehook_notify_resume(regs); 829 tracehook_notify_resume(regs);
830 if (current->replacement_session_keyring)
831 key_replace_session_keyring();
821 } 832 }
822 if (thread_info_flags & _TIF_USER_RETURN_NOTIFY) 833 if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
823 fire_user_return_notifiers(); 834 fire_user_return_notifiers();
824 835
825 user_enter(); 836#ifdef CONFIG_X86_32
837 clear_thread_flag(TIF_IRET);
838#endif /* CONFIG_X86_32 */
826} 839}
827 840
828void signal_fault(struct pt_regs *regs, void __user *frame, char *where) 841void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
@@ -836,38 +849,8 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
836 me->comm, me->pid, where, frame, 849 me->comm, me->pid, where, frame,
837 regs->ip, regs->sp, regs->orig_ax); 850 regs->ip, regs->sp, regs->orig_ax);
838 print_vma_addr(" in ", regs->ip); 851 print_vma_addr(" in ", regs->ip);
839 pr_cont("\n"); 852 printk(KERN_CONT "\n");
840 } 853 }
841 854
842 force_sig(SIGSEGV, me); 855 force_sig(SIGSEGV, me);
843} 856}
844
845#ifdef CONFIG_X86_X32_ABI
846asmlinkage long sys32_x32_rt_sigreturn(struct pt_regs *regs)
847{
848 struct rt_sigframe_x32 __user *frame;
849 sigset_t set;
850 unsigned long ax;
851
852 frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8);
853
854 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
855 goto badframe;
856 if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
857 goto badframe;
858
859 set_current_blocked(&set);
860
861 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
862 goto badframe;
863
864 if (compat_restore_altstack(&frame->uc.uc_stack))
865 goto badframe;
866
867 return ax;
868
869badframe:
870 signal_fault(regs, frame, "x32 rt_sigreturn");
871 return 0;
872}
873#endif
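
In the signal.c hunks, handle_signal() goes back to updating the blocked mask by hand after a successful delivery rather than relying on signal_delivered(). That pattern, isolated into a sketch (the helper name is hypothetical; the real code sits inline in handle_signal()):

#include <linux/sched.h>
#include <linux/signal.h>

static void example_block_after_delivery(struct k_sigaction *ka, int sig)
{
        sigset_t blocked;

        /* block everything in the handler's sa_mask ... */
        sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
        /* ... plus the delivered signal itself, unless SA_NODEFER was set */
        if (!(ka->sa.sa_flags & SA_NODEFER))
                sigaddset(&blocked, sig);
        set_current_blocked(&blocked);
}
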
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 48d2b7ded42..013e7eba83b 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -16,7 +16,6 @@
16#include <linux/mm.h> 16#include <linux/mm.h>
17#include <linux/delay.h> 17#include <linux/delay.h>
18#include <linux/spinlock.h> 18#include <linux/spinlock.h>
19#include <linux/export.h>
20#include <linux/kernel_stat.h> 19#include <linux/kernel_stat.h>
21#include <linux/mc146818rtc.h> 20#include <linux/mc146818rtc.h>
22#include <linux/cache.h> 21#include <linux/cache.h>
@@ -29,7 +28,6 @@
29#include <asm/mmu_context.h> 28#include <asm/mmu_context.h>
30#include <asm/proto.h> 29#include <asm/proto.h>
31#include <asm/apic.h> 30#include <asm/apic.h>
32#include <asm/nmi.h>
33/* 31/*
34 * Some notes on x86 processor bugs affecting SMP operation: 32 * Some notes on x86 processor bugs affecting SMP operation:
35 * 33 *
@@ -109,9 +107,6 @@
109 * about nothing of note with C stepping upwards. 107 * about nothing of note with C stepping upwards.
110 */ 108 */
111 109
112static atomic_t stopping_cpu = ATOMIC_INIT(-1);
113static bool smp_no_nmi_ipi = false;
114
115/* 110/*
116 * this function sends a 'reschedule' IPI to another CPU. 111 * this function sends a 'reschedule' IPI to another CPU.
117 * it goes straight through and wastes no time serializing 112 * it goes straight through and wastes no time serializing
@@ -152,17 +147,6 @@ void native_send_call_func_ipi(const struct cpumask *mask)
152 free_cpumask_var(allbutself); 147 free_cpumask_var(allbutself);
153} 148}
154 149
155static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
156{
157 /* We are registered on stopping cpu too, avoid spurious NMI */
158 if (raw_smp_processor_id() == atomic_read(&stopping_cpu))
159 return NMI_HANDLED;
160
161 stop_this_cpu(NULL);
162
163 return NMI_HANDLED;
164}
165
166/* 150/*
167 * this function calls the 'stop' function on all other CPUs in the system. 151 * this function calls the 'stop' function on all other CPUs in the system.
168 */ 152 */
@@ -186,25 +170,13 @@ static void native_stop_other_cpus(int wait)
186 /* 170 /*
187 * Use an own vector here because smp_call_function 171 * Use an own vector here because smp_call_function
188 * does lots of things not suitable in a panic situation. 172 * does lots of things not suitable in a panic situation.
189 */ 173 * On most systems we could also use an NMI here,
190 174 * but there are a few systems around where NMI
191 /* 175 * is problematic so stay with an non NMI for now
192 * We start by using the REBOOT_VECTOR irq. 176 * (this implies we cannot stop CPUs spinning with irq off
193 * The irq is treated as a sync point to allow critical 177 * currently)
194 * regions of code on other cpus to release their spin locks
195 * and re-enable irqs. Jumping straight to an NMI might
196 * accidentally cause deadlocks with further shutdown/panic
197 * code. By syncing, we give the cpus up to one second to
198 * finish their work before we force them off with the NMI.
199 */ 178 */
200 if (num_online_cpus() > 1) { 179 if (num_online_cpus() > 1) {
201 /* did someone beat us here? */
202 if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1)
203 return;
204
205 /* sync above data before sending IRQ */
206 wmb();
207
208 apic->send_IPI_allbutself(REBOOT_VECTOR); 180 apic->send_IPI_allbutself(REBOOT_VECTOR);
209 181
210 /* 182 /*
@@ -215,32 +187,7 @@ static void native_stop_other_cpus(int wait)
215 while (num_online_cpus() > 1 && (wait || timeout--)) 187 while (num_online_cpus() > 1 && (wait || timeout--))
216 udelay(1); 188 udelay(1);
217 } 189 }
218
219 /* if the REBOOT_VECTOR didn't work, try with the NMI */
220 if ((num_online_cpus() > 1) && (!smp_no_nmi_ipi)) {
221 if (register_nmi_handler(NMI_LOCAL, smp_stop_nmi_callback,
222 NMI_FLAG_FIRST, "smp_stop"))
223 /* Note: we ignore failures here */
224 /* Hope the REBOOT_IRQ is good enough */
225 goto finish;
226
227 /* sync above data before sending IRQ */
228 wmb();
229
230 pr_emerg("Shutting down cpus with NMI\n");
231
232 apic->send_IPI_allbutself(NMI_VECTOR);
233
234 /*
235 * Don't wait longer than a 10 ms if the caller
236 * didn't ask us to wait.
237 */
238 timeout = USEC_PER_MSEC * 10;
239 while (num_online_cpus() > 1 && (wait || timeout--))
240 udelay(1);
241 }
242 190
243finish:
244 local_irq_save(flags); 191 local_irq_save(flags);
245 disable_local_APIC(); 192 disable_local_APIC();
246 local_irq_restore(flags); 193 local_irq_restore(flags);
@@ -277,14 +224,6 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
277 irq_exit(); 224 irq_exit();
278} 225}
279 226
280static int __init nonmi_ipi_setup(char *str)
281{
282 smp_no_nmi_ipi = true;
283 return 1;
284}
285
286__setup("nonmi_ipi", nonmi_ipi_setup);
287
288struct smp_ops smp_ops = { 227struct smp_ops smp_ops = {
289 .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, 228 .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
290 .smp_prepare_cpus = native_smp_prepare_cpus, 229 .smp_prepare_cpus = native_smp_prepare_cpus,
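
The smp.c hunk removes the NMI fallback (and the nonmi_ipi= parameter) from native_stop_other_cpus(), leaving only the REBOOT_VECTOR IPI plus a bounded spin. The shared core of both versions, as a sketch; the one-second bound matches the comment removed above, so treat the constant as illustrative rather than authoritative.

static void example_stop_and_wait(int wait)
{
        /* give the other CPUs up to ~1s to park themselves */
        unsigned long timeout = USEC_PER_SEC;

        if (num_online_cpus() > 1) {
                apic->send_IPI_allbutself(REBOOT_VECTOR);

                while (num_online_cpus() > 1 && (wait || timeout--))
                        udelay(1);
        }
}
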
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index ed0fe385289..39e11500b9b 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1,4 +1,4 @@
1 /* 1/*
2 * x86 SMP booting functions 2 * x86 SMP booting functions
3 * 3 *
4 * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk> 4 * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk>
@@ -39,8 +39,6 @@
39 * Glauber Costa : i386 and x86_64 integration 39 * Glauber Costa : i386 and x86_64 integration
40 */ 40 */
41 41
42#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
43
44#include <linux/init.h> 42#include <linux/init.h>
45#include <linux/smp.h> 43#include <linux/smp.h>
46#include <linux/module.h> 44#include <linux/module.h>
@@ -52,14 +50,13 @@
52#include <linux/tboot.h> 50#include <linux/tboot.h>
53#include <linux/stackprotector.h> 51#include <linux/stackprotector.h>
54#include <linux/gfp.h> 52#include <linux/gfp.h>
55#include <linux/cpuidle.h>
56 53
57#include <asm/acpi.h> 54#include <asm/acpi.h>
58#include <asm/desc.h> 55#include <asm/desc.h>
59#include <asm/nmi.h> 56#include <asm/nmi.h>
60#include <asm/irq.h> 57#include <asm/irq.h>
61#include <asm/idle.h> 58#include <asm/idle.h>
62#include <asm/realmode.h> 59#include <asm/trampoline.h>
63#include <asm/cpu.h> 60#include <asm/cpu.h>
64#include <asm/numa.h> 61#include <asm/numa.h>
65#include <asm/pgtable.h> 62#include <asm/pgtable.h>
@@ -68,8 +65,6 @@
68#include <asm/mwait.h> 65#include <asm/mwait.h>
69#include <asm/apic.h> 66#include <asm/apic.h>
70#include <asm/io_apic.h> 67#include <asm/io_apic.h>
71#include <asm/i387.h>
72#include <asm/fpu-internal.h>
73#include <asm/setup.h> 68#include <asm/setup.h>
74#include <asm/uv/uv.h> 69#include <asm/uv/uv.h>
75#include <linux/mc146818rtc.h> 70#include <linux/mc146818rtc.h>
@@ -77,13 +72,23 @@
77#include <asm/smpboot_hooks.h> 72#include <asm/smpboot_hooks.h>
78#include <asm/i8259.h> 73#include <asm/i8259.h>
79 74
80#include <asm/realmode.h>
81
82/* State of each CPU */ 75/* State of each CPU */
83DEFINE_PER_CPU(int, cpu_state) = { 0 }; 76DEFINE_PER_CPU(int, cpu_state) = { 0 };
84 77
78/* Store all idle threads, this can be reused instead of creating
79* a new thread. Also avoids complicated thread destroy functionality
80* for idle threads.
81*/
85#ifdef CONFIG_HOTPLUG_CPU 82#ifdef CONFIG_HOTPLUG_CPU
86/* 83/*
84 * Needed only for CONFIG_HOTPLUG_CPU because __cpuinitdata is
85 * removed after init for !CONFIG_HOTPLUG_CPU.
86 */
87static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
88#define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x))
89#define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p))
90
91/*
87 * We need this for trampoline_base protection from concurrent accesses when 92 * We need this for trampoline_base protection from concurrent accesses when
88 * off- and onlining cores wildly. 93 * off- and onlining cores wildly.
89 */ 94 */
@@ -91,16 +96,20 @@ static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex);
91 96
92void cpu_hotplug_driver_lock(void) 97void cpu_hotplug_driver_lock(void)
93{ 98{
94 mutex_lock(&x86_cpu_hotplug_driver_mutex); 99 mutex_lock(&x86_cpu_hotplug_driver_mutex);
95} 100}
96 101
97void cpu_hotplug_driver_unlock(void) 102void cpu_hotplug_driver_unlock(void)
98{ 103{
99 mutex_unlock(&x86_cpu_hotplug_driver_mutex); 104 mutex_unlock(&x86_cpu_hotplug_driver_mutex);
100} 105}
101 106
102ssize_t arch_cpu_probe(const char *buf, size_t count) { return -1; } 107ssize_t arch_cpu_probe(const char *buf, size_t count) { return -1; }
103ssize_t arch_cpu_release(const char *buf, size_t count) { return -1; } 108ssize_t arch_cpu_release(const char *buf, size_t count) { return -1; }
109#else
110static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
111#define get_idle_for_cpu(x) (idle_thread_array[(x)])
112#define set_idle_for_cpu(x, p) (idle_thread_array[(x)] = (p))
104#endif 113#endif
105 114
106/* Number of siblings per CPU package */ 115/* Number of siblings per CPU package */
@@ -108,17 +117,17 @@ int smp_num_siblings = 1;
108EXPORT_SYMBOL(smp_num_siblings); 117EXPORT_SYMBOL(smp_num_siblings);
109 118
110/* Last level cache ID of each logical CPU */ 119/* Last level cache ID of each logical CPU */
111DEFINE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id) = BAD_APICID; 120DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID;
112 121
113/* representing HT siblings of each logical CPU */ 122/* representing HT siblings of each logical CPU */
114DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_map); 123DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
115EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); 124EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
116 125
117/* representing HT and core siblings of each logical CPU */ 126/* representing HT and core siblings of each logical CPU */
118DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_map); 127DEFINE_PER_CPU(cpumask_var_t, cpu_core_map);
119EXPORT_PER_CPU_SYMBOL(cpu_core_map); 128EXPORT_PER_CPU_SYMBOL(cpu_core_map);
120 129
121DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); 130DEFINE_PER_CPU(cpumask_var_t, cpu_llc_shared_map);
122 131
123/* Per CPU bogomips and other parameters */ 132/* Per CPU bogomips and other parameters */
124DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); 133DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
@@ -127,8 +136,8 @@ EXPORT_PER_CPU_SYMBOL(cpu_info);
127atomic_t init_deasserted; 136atomic_t init_deasserted;
128 137
129/* 138/*
130 * Report back to the Boot Processor during boot time or to the caller processor 139 * Report back to the Boot Processor.
131 * during CPU online. 140 * Running on AP.
132 */ 141 */
133static void __cpuinit smp_callin(void) 142static void __cpuinit smp_callin(void)
134{ 143{
@@ -140,17 +149,15 @@ static void __cpuinit smp_callin(void)
140 * we may get here before an INIT-deassert IPI reaches 149 * we may get here before an INIT-deassert IPI reaches
141 * our local APIC. We have to wait for the IPI or we'll 150 * our local APIC. We have to wait for the IPI or we'll
142 * lock up on an APIC access. 151 * lock up on an APIC access.
143 *
144 * Since CPU0 is not wakened up by INIT, it doesn't wait for the IPI.
145 */ 152 */
146 cpuid = smp_processor_id(); 153 if (apic->wait_for_init_deassert)
147 if (apic->wait_for_init_deassert && cpuid != 0)
148 apic->wait_for_init_deassert(&init_deasserted); 154 apic->wait_for_init_deassert(&init_deasserted);
149 155
150 /* 156 /*
151 * (This works even if the APIC is not enabled.) 157 * (This works even if the APIC is not enabled.)
152 */ 158 */
153 phys_id = read_apic_id(); 159 phys_id = read_apic_id();
160 cpuid = smp_processor_id();
154 if (cpumask_test_cpu(cpuid, cpu_callin_mask)) { 161 if (cpumask_test_cpu(cpuid, cpu_callin_mask)) {
155 panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__, 162 panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__,
156 phys_id, cpuid); 163 phys_id, cpuid);
@@ -190,7 +197,7 @@ static void __cpuinit smp_callin(void)
190 * boards) 197 * boards)
191 */ 198 */
192 199
193 pr_debug("CALLIN, before setup_local_APIC()\n"); 200 pr_debug("CALLIN, before setup_local_APIC().\n");
194 if (apic->smp_callin_clear_local_apic) 201 if (apic->smp_callin_clear_local_apic)
195 apic->smp_callin_clear_local_apic(); 202 apic->smp_callin_clear_local_apic();
196 setup_local_APIC(); 203 setup_local_APIC();
@@ -200,24 +207,23 @@ static void __cpuinit smp_callin(void)
200 * Need to setup vector mappings before we enable interrupts. 207 * Need to setup vector mappings before we enable interrupts.
201 */ 208 */
202 setup_vector_irq(smp_processor_id()); 209 setup_vector_irq(smp_processor_id());
203
204 /*
205 * Save our processor parameters. Note: this information
206 * is needed for clock calibration.
207 */
208 smp_store_cpu_info(cpuid);
209
210 /* 210 /*
211 * Get our bogomips. 211 * Get our bogomips.
212 * Update loops_per_jiffy in cpu_data. Previous call to 212 *
213 * smp_store_cpu_info() stored a value that is close but not as 213 * Need to enable IRQs because it can take longer and then
214 * accurate as the value just calculated. 214 * the NMI watchdog might kill us.
215 */ 215 */
216 local_irq_enable();
216 calibrate_delay(); 217 calibrate_delay();
217 cpu_data(cpuid).loops_per_jiffy = loops_per_jiffy; 218 local_irq_disable();
218 pr_debug("Stack at about %p\n", &cpuid); 219 pr_debug("Stack at about %p\n", &cpuid);
219 220
220 /* 221 /*
222 * Save our processor parameters
223 */
224 smp_store_cpu_info(cpuid);
225
226 /*
221 * This must be done before setting cpu_online_mask 227 * This must be done before setting cpu_online_mask
222 * or calling notify_cpu_starting. 228 * or calling notify_cpu_starting.
223 */ 229 */
@@ -232,8 +238,6 @@ static void __cpuinit smp_callin(void)
232 cpumask_set_cpu(cpuid, cpu_callin_mask); 238 cpumask_set_cpu(cpuid, cpu_callin_mask);
233} 239}
234 240
235static int cpu0_logical_apicid;
236static int enable_start_cpu0;
237/* 241/*
238 * Activate a secondary processor. 242 * Activate a secondary processor.
239 */ 243 */
@@ -245,12 +249,9 @@ notrace static void __cpuinit start_secondary(void *unused)
245 * most necessary things. 249 * most necessary things.
246 */ 250 */
247 cpu_init(); 251 cpu_init();
248 x86_cpuinit.early_percpu_clock_init();
249 preempt_disable(); 252 preempt_disable();
250 smp_callin(); 253 smp_callin();
251 254
252 enable_start_cpu0 = 0;
253
254#ifdef CONFIG_X86_32 255#ifdef CONFIG_X86_32
255 /* switch away from the initial page table */ 256 /* switch away from the initial page table */
256 load_cr3(swapper_pg_dir); 257 load_cr3(swapper_pg_dir);
@@ -265,13 +266,22 @@ notrace static void __cpuinit start_secondary(void *unused)
265 check_tsc_sync_target(); 266 check_tsc_sync_target();
266 267
267 /* 268 /*
269 * We need to hold call_lock, so there is no inconsistency
270 * between the time smp_call_function() determines number of
271 * IPI recipients, and the time when the determination is made
272 * for which cpus receive the IPI. Holding this
273 * lock helps us to not include this cpu in a currently in progress
274 * smp_call_function().
275 *
268 * We need to hold vector_lock so there the set of online cpus 276 * We need to hold vector_lock so there the set of online cpus
269 * does not change while we are assigning vectors to cpus. Holding 277 * does not change while we are assigning vectors to cpus. Holding
270 * this lock ensures we don't half assign or remove an irq from a cpu. 278 * this lock ensures we don't half assign or remove an irq from a cpu.
271 */ 279 */
280 ipi_call_lock();
272 lock_vector_lock(); 281 lock_vector_lock();
273 set_cpu_online(smp_processor_id(), true); 282 set_cpu_online(smp_processor_id(), true);
274 unlock_vector_lock(); 283 unlock_vector_lock();
284 ipi_call_unlock();
275 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; 285 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
276 x86_platform.nmi_init(); 286 x86_platform.nmi_init();
277 287
@@ -287,128 +297,74 @@ notrace static void __cpuinit start_secondary(void *unused)
287 cpu_idle(); 297 cpu_idle();
288} 298}
289 299
290void __init smp_store_boot_cpu_info(void)
291{
292 int id = 0; /* CPU 0 */
293 struct cpuinfo_x86 *c = &cpu_data(id);
294
295 *c = boot_cpu_data;
296 c->cpu_index = id;
297}
298
299/* 300/*
300 * The bootstrap kernel entry code has set these up. Save them for 301 * The bootstrap kernel entry code has set these up. Save them for
301 * a given CPU 302 * a given CPU
302 */ 303 */
304
303void __cpuinit smp_store_cpu_info(int id) 305void __cpuinit smp_store_cpu_info(int id)
304{ 306{
305 struct cpuinfo_x86 *c = &cpu_data(id); 307 struct cpuinfo_x86 *c = &cpu_data(id);
306 308
307 *c = boot_cpu_data; 309 *c = boot_cpu_data;
308 c->cpu_index = id; 310 c->cpu_index = id;
309 /* 311 if (id != 0)
310 * During boot time, CPU0 has this setup already. Save the info when 312 identify_secondary_cpu(c);
311 * bringing up AP or offlined CPU0.
312 */
313 identify_secondary_cpu(c);
314} 313}
315 314
316static bool __cpuinit 315static void __cpuinit link_thread_siblings(int cpu1, int cpu2)
317topology_sane(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o, const char *name)
318{ 316{
319 int cpu1 = c->cpu_index, cpu2 = o->cpu_index; 317 cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2));
320 318 cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1));
321 return !WARN_ONCE(cpu_to_node(cpu1) != cpu_to_node(cpu2), 319 cpumask_set_cpu(cpu1, cpu_core_mask(cpu2));
322 "sched: CPU #%d's %s-sibling CPU #%d is not on the same node! " 320 cpumask_set_cpu(cpu2, cpu_core_mask(cpu1));
323 "[node: %d != %d]. Ignoring dependency.\n", 321 cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2));
324 cpu1, name, cpu2, cpu_to_node(cpu1), cpu_to_node(cpu2)); 322 cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1));
325} 323}
326 324
327#define link_mask(_m, c1, c2) \
328do { \
329 cpumask_set_cpu((c1), cpu_##_m##_mask(c2)); \
330 cpumask_set_cpu((c2), cpu_##_m##_mask(c1)); \
331} while (0)
332
333static bool __cpuinit match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
334{
335 if (cpu_has_topoext) {
336 int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
337
338 if (c->phys_proc_id == o->phys_proc_id &&
339 per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2) &&
340 c->compute_unit_id == o->compute_unit_id)
341 return topology_sane(c, o, "smt");
342
343 } else if (c->phys_proc_id == o->phys_proc_id &&
344 c->cpu_core_id == o->cpu_core_id) {
345 return topology_sane(c, o, "smt");
346 }
347
348 return false;
349}
350
351static bool __cpuinit match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
352{
353 int cpu1 = c->cpu_index, cpu2 = o->cpu_index;
354
355 if (per_cpu(cpu_llc_id, cpu1) != BAD_APICID &&
356 per_cpu(cpu_llc_id, cpu1) == per_cpu(cpu_llc_id, cpu2))
357 return topology_sane(c, o, "llc");
358
359 return false;
360}
361
362static bool __cpuinit match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
363{
364 if (c->phys_proc_id == o->phys_proc_id) {
365 if (cpu_has(c, X86_FEATURE_AMD_DCM))
366 return true;
367
368 return topology_sane(c, o, "mc");
369 }
370 return false;
371}
372 325
373void __cpuinit set_cpu_sibling_map(int cpu) 326void __cpuinit set_cpu_sibling_map(int cpu)
374{ 327{
375 bool has_mc = boot_cpu_data.x86_max_cores > 1;
376 bool has_smt = smp_num_siblings > 1;
377 struct cpuinfo_x86 *c = &cpu_data(cpu);
378 struct cpuinfo_x86 *o;
379 int i; 328 int i;
329 struct cpuinfo_x86 *c = &cpu_data(cpu);
380 330
381 cpumask_set_cpu(cpu, cpu_sibling_setup_mask); 331 cpumask_set_cpu(cpu, cpu_sibling_setup_mask);
382 332
383 if (!has_smt && !has_mc) { 333 if (smp_num_siblings > 1) {
334 for_each_cpu(i, cpu_sibling_setup_mask) {
335 struct cpuinfo_x86 *o = &cpu_data(i);
336
337 if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
338 if (c->phys_proc_id == o->phys_proc_id &&
339 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i) &&
340 c->compute_unit_id == o->compute_unit_id)
341 link_thread_siblings(cpu, i);
342 } else if (c->phys_proc_id == o->phys_proc_id &&
343 c->cpu_core_id == o->cpu_core_id) {
344 link_thread_siblings(cpu, i);
345 }
346 }
347 } else {
384 cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); 348 cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
385 cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
386 cpumask_set_cpu(cpu, cpu_core_mask(cpu));
387 c->booted_cores = 1;
388 return;
389 } 349 }
390 350
391 for_each_cpu(i, cpu_sibling_setup_mask) { 351 cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
392 o = &cpu_data(i);
393
394 if ((i == cpu) || (has_smt && match_smt(c, o)))
395 link_mask(sibling, cpu, i);
396
397 if ((i == cpu) || (has_mc && match_llc(c, o)))
398 link_mask(llc_shared, cpu, i);
399 352
353 if (__this_cpu_read(cpu_info.x86_max_cores) == 1) {
354 cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu));
355 c->booted_cores = 1;
356 return;
400 } 357 }
401 358
402 /*
403 * This needs a separate iteration over the cpus because we rely on all
404 * cpu_sibling_mask links to be set-up.
405 */
406 for_each_cpu(i, cpu_sibling_setup_mask) { 359 for_each_cpu(i, cpu_sibling_setup_mask) {
407 o = &cpu_data(i); 360 if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
408 361 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
409 if ((i == cpu) || (has_mc && match_mc(c, o))) { 362 cpumask_set_cpu(i, cpu_llc_shared_mask(cpu));
410 link_mask(core, cpu, i); 363 cpumask_set_cpu(cpu, cpu_llc_shared_mask(i));
411 364 }
365 if (c->phys_proc_id == cpu_data(i).phys_proc_id) {
366 cpumask_set_cpu(i, cpu_core_mask(cpu));
367 cpumask_set_cpu(cpu, cpu_core_mask(i));
412 /* 368 /*
413 * Does this new cpu bringup a new core? 369 * Does this new cpu bringup a new core?
414 */ 370 */
@@ -434,7 +390,16 @@ void __cpuinit set_cpu_sibling_map(int cpu)
434/* maps the cpu to the sched domain representing multi-core */ 390/* maps the cpu to the sched domain representing multi-core */
435const struct cpumask *cpu_coregroup_mask(int cpu) 391const struct cpumask *cpu_coregroup_mask(int cpu)
436{ 392{
437 return cpu_llc_shared_mask(cpu); 393 struct cpuinfo_x86 *c = &cpu_data(cpu);
394 /*
395 * For perf, we return last level cache shared map.
396 * And for power savings, we return cpu_core_map
397 */
398 if ((sched_mc_power_savings || sched_smt_power_savings) &&
399 !(cpu_has(c, X86_FEATURE_AMD_DCM)))
400 return cpu_core_mask(cpu);
401 else
402 return cpu_llc_shared_mask(cpu);
438} 403}
439 404
440static void impress_friends(void) 405static void impress_friends(void)
@@ -444,16 +409,17 @@ static void impress_friends(void)
444 /* 409 /*
445 * Allow the user to impress friends. 410 * Allow the user to impress friends.
446 */ 411 */
447 pr_debug("Before bogomips\n"); 412 pr_debug("Before bogomips.\n");
448 for_each_possible_cpu(cpu) 413 for_each_possible_cpu(cpu)
449 if (cpumask_test_cpu(cpu, cpu_callout_mask)) 414 if (cpumask_test_cpu(cpu, cpu_callout_mask))
450 bogosum += cpu_data(cpu).loops_per_jiffy; 415 bogosum += cpu_data(cpu).loops_per_jiffy;
451 pr_info("Total of %d processors activated (%lu.%02lu BogoMIPS)\n", 416 printk(KERN_INFO
417 "Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
452 num_online_cpus(), 418 num_online_cpus(),
453 bogosum/(500000/HZ), 419 bogosum/(500000/HZ),
454 (bogosum/(5000/HZ))%100); 420 (bogosum/(5000/HZ))%100);
455 421
456 pr_debug("Before bogocount - setting activated=1\n"); 422 pr_debug("Before bogocount - setting activated=1.\n");
457} 423}
458 424
459void __inquire_remote_apic(int apicid) 425void __inquire_remote_apic(int apicid)
@@ -463,17 +429,18 @@ void __inquire_remote_apic(int apicid)
463 int timeout; 429 int timeout;
464 u32 status; 430 u32 status;
465 431
466 pr_info("Inquiring remote APIC 0x%x...\n", apicid); 432 printk(KERN_INFO "Inquiring remote APIC 0x%x...\n", apicid);
467 433
468 for (i = 0; i < ARRAY_SIZE(regs); i++) { 434 for (i = 0; i < ARRAY_SIZE(regs); i++) {
469 pr_info("... APIC 0x%x %s: ", apicid, names[i]); 435 printk(KERN_INFO "... APIC 0x%x %s: ", apicid, names[i]);
470 436
471 /* 437 /*
472 * Wait for idle. 438 * Wait for idle.
473 */ 439 */
474 status = safe_apic_wait_icr_idle(); 440 status = safe_apic_wait_icr_idle();
475 if (status) 441 if (status)
476 pr_cont("a previous APIC delivery may have failed\n"); 442 printk(KERN_CONT
443 "a previous APIC delivery may have failed\n");
477 444
478 apic_icr_write(APIC_DM_REMRD | regs[i], apicid); 445 apic_icr_write(APIC_DM_REMRD | regs[i], apicid);
479 446
@@ -486,10 +453,10 @@ void __inquire_remote_apic(int apicid)
486 switch (status) { 453 switch (status) {
487 case APIC_ICR_RR_VALID: 454 case APIC_ICR_RR_VALID:
488 status = apic_read(APIC_RRR); 455 status = apic_read(APIC_RRR);
489 pr_cont("%08x\n", status); 456 printk(KERN_CONT "%08x\n", status);
490 break; 457 break;
491 default: 458 default:
492 pr_cont("failed\n"); 459 printk(KERN_CONT "failed\n");
493 } 460 }
494 } 461 }
495} 462}
@@ -500,7 +467,7 @@ void __inquire_remote_apic(int apicid)
500 * won't ... remember to clear down the APIC, etc later. 467 * won't ... remember to clear down the APIC, etc later.
501 */ 468 */
502int __cpuinit 469int __cpuinit
503wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip) 470wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
504{ 471{
505 unsigned long send_status, accept_status = 0; 472 unsigned long send_status, accept_status = 0;
506 int maxlvt; 473 int maxlvt;
@@ -508,7 +475,7 @@ wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip)
508 /* Target chip */ 475 /* Target chip */
509 /* Boot on the stack */ 476 /* Boot on the stack */
510 /* Kick the second */ 477 /* Kick the second */
511 apic_icr_write(APIC_DM_NMI | apic->dest_logical, apicid); 478 apic_icr_write(APIC_DM_NMI | apic->dest_logical, logical_apicid);
512 479
513 pr_debug("Waiting for send to finish...\n"); 480 pr_debug("Waiting for send to finish...\n");
514 send_status = safe_apic_wait_icr_idle(); 481 send_status = safe_apic_wait_icr_idle();
@@ -523,12 +490,12 @@ wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip)
523 apic_write(APIC_ESR, 0); 490 apic_write(APIC_ESR, 0);
524 accept_status = (apic_read(APIC_ESR) & 0xEF); 491 accept_status = (apic_read(APIC_ESR) & 0xEF);
525 } 492 }
526 pr_debug("NMI sent\n"); 493 pr_debug("NMI sent.\n");
527 494
528 if (send_status) 495 if (send_status)
529 pr_err("APIC never delivered???\n"); 496 printk(KERN_ERR "APIC never delivered???\n");
530 if (accept_status) 497 if (accept_status)
531 pr_err("APIC delivery error (%lx)\n", accept_status); 498 printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status);
532 499
533 return (send_status | accept_status); 500 return (send_status | accept_status);
534} 501}
@@ -550,7 +517,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
550 apic_read(APIC_ESR); 517 apic_read(APIC_ESR);
551 } 518 }
552 519
553 pr_debug("Asserting INIT\n"); 520 pr_debug("Asserting INIT.\n");
554 521
555 /* 522 /*
556 * Turn INIT on target chip 523 * Turn INIT on target chip
@@ -566,7 +533,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
566 533
567 mdelay(10); 534 mdelay(10);
568 535
569 pr_debug("Deasserting INIT\n"); 536 pr_debug("Deasserting INIT.\n");
570 537
571 /* Target chip */ 538 /* Target chip */
572 /* Send IPI */ 539 /* Send IPI */
@@ -599,14 +566,14 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
599 /* 566 /*
600 * Run STARTUP IPI loop. 567 * Run STARTUP IPI loop.
601 */ 568 */
602 pr_debug("#startup loops: %d\n", num_starts); 569 pr_debug("#startup loops: %d.\n", num_starts);
603 570
604 for (j = 1; j <= num_starts; j++) { 571 for (j = 1; j <= num_starts; j++) {
605 pr_debug("Sending STARTUP #%d\n", j); 572 pr_debug("Sending STARTUP #%d.\n", j);
606 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ 573 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
607 apic_write(APIC_ESR, 0); 574 apic_write(APIC_ESR, 0);
608 apic_read(APIC_ESR); 575 apic_read(APIC_ESR);
609 pr_debug("After apic_write\n"); 576 pr_debug("After apic_write.\n");
610 577
611 /* 578 /*
612 * STARTUP IPI 579 * STARTUP IPI
@@ -623,7 +590,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
623 */ 590 */
624 udelay(300); 591 udelay(300);
625 592
626 pr_debug("Startup point 1\n"); 593 pr_debug("Startup point 1.\n");
627 594
628 pr_debug("Waiting for send to finish...\n"); 595 pr_debug("Waiting for send to finish...\n");
629 send_status = safe_apic_wait_icr_idle(); 596 send_status = safe_apic_wait_icr_idle();
@@ -638,16 +605,32 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
638 if (send_status || accept_status) 605 if (send_status || accept_status)
639 break; 606 break;
640 } 607 }
641 pr_debug("After Startup\n"); 608 pr_debug("After Startup.\n");
642 609
643 if (send_status) 610 if (send_status)
644 pr_err("APIC never delivered???\n"); 611 printk(KERN_ERR "APIC never delivered???\n");
645 if (accept_status) 612 if (accept_status)
646 pr_err("APIC delivery error (%lx)\n", accept_status); 613 printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status);
647 614
648 return (send_status | accept_status); 615 return (send_status | accept_status);
649} 616}
650 617
618struct create_idle {
619 struct work_struct work;
620 struct task_struct *idle;
621 struct completion done;
622 int cpu;
623};
624
625static void __cpuinit do_fork_idle(struct work_struct *work)
626{
627 struct create_idle *c_idle =
628 container_of(work, struct create_idle, work);
629
630 c_idle->idle = fork_idle(c_idle->cpu);
631 complete(&c_idle->done);
632}
633
651/* reduce the number of lines printed when booting a large cpu count system */ 634/* reduce the number of lines printed when booting a large cpu count system */
652static void __cpuinit announce_cpu(int cpu, int apicid) 635static void __cpuinit announce_cpu(int cpu, int apicid)
653{ 636{
@@ -657,111 +640,78 @@ static void __cpuinit announce_cpu(int cpu, int apicid)
657 if (system_state == SYSTEM_BOOTING) { 640 if (system_state == SYSTEM_BOOTING) {
658 if (node != current_node) { 641 if (node != current_node) {
659 if (current_node > (-1)) 642 if (current_node > (-1))
660 pr_cont(" OK\n"); 643 pr_cont(" Ok.\n");
661 current_node = node; 644 current_node = node;
662 pr_info("Booting Node %3d, Processors ", node); 645 pr_info("Booting Node %3d, Processors ", node);
663 } 646 }
664 pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " OK\n" : ""); 647 pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " Ok.\n" : "");
665 return; 648 return;
666 } else 649 } else
667 pr_info("Booting Node %d Processor %d APIC 0x%x\n", 650 pr_info("Booting Node %d Processor %d APIC 0x%x\n",
668 node, cpu, apicid); 651 node, cpu, apicid);
669} 652}
670 653
671static int wakeup_cpu0_nmi(unsigned int cmd, struct pt_regs *regs)
672{
673 int cpu;
674
675 cpu = smp_processor_id();
676 if (cpu == 0 && !cpu_online(cpu) && enable_start_cpu0)
677 return NMI_HANDLED;
678
679 return NMI_DONE;
680}
681
682/*
683 * Wake up AP by INIT, INIT, STARTUP sequence.
684 *
685 * Instead of waiting for STARTUP after INITs, BSP will execute the BIOS
686 * boot-strap code which is not a desired behavior for waking up BSP. To
687 * avoid the boot-strap code, wake up CPU0 by NMI instead.
688 *
689 * This works to wake up soft offlined CPU0 only. If CPU0 is hard offlined
690 * (i.e. physically hot removed and then hot added), NMI won't wake it up.
691 * We'll change this code in the future to wake up hard offlined CPU0 if
692 * real platform and request are available.
693 */
694static int __cpuinit
695wakeup_cpu_via_init_nmi(int cpu, unsigned long start_ip, int apicid,
696 int *cpu0_nmi_registered)
697{
698 int id;
699 int boot_error;
700
701 /*
702 * Wake up AP by INIT, INIT, STARTUP sequence.
703 */
704 if (cpu)
705 return wakeup_secondary_cpu_via_init(apicid, start_ip);
706
707 /*
708 * Wake up BSP by nmi.
709 *
710 * Register a NMI handler to help wake up CPU0.
711 */
712 boot_error = register_nmi_handler(NMI_LOCAL,
713 wakeup_cpu0_nmi, 0, "wake_cpu0");
714
715 if (!boot_error) {
716 enable_start_cpu0 = 1;
717 *cpu0_nmi_registered = 1;
718 if (apic->dest_logical == APIC_DEST_LOGICAL)
719 id = cpu0_logical_apicid;
720 else
721 id = apicid;
722 boot_error = wakeup_secondary_cpu_via_nmi(id, start_ip);
723 }
724
725 return boot_error;
726}
727
728/* 654/*
729 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad 655 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
730 * (ie clustered apic addressing mode), this is a LOGICAL apic ID. 656 * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
731 * Returns zero if CPU booted OK, else error code from 657 * Returns zero if CPU booted OK, else error code from
732 * ->wakeup_secondary_cpu. 658 * ->wakeup_secondary_cpu.
733 */ 659 */
734static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle) 660static int __cpuinit do_boot_cpu(int apicid, int cpu)
735{ 661{
736 volatile u32 *trampoline_status =
737 (volatile u32 *) __va(real_mode_header->trampoline_status);
738 /* start_ip had better be page-aligned! */
739 unsigned long start_ip = real_mode_header->trampoline_start;
740
741 unsigned long boot_error = 0; 662 unsigned long boot_error = 0;
663 unsigned long start_ip;
742 int timeout; 664 int timeout;
743 int cpu0_nmi_registered = 0; 665 struct create_idle c_idle = {
666 .cpu = cpu,
667 .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
668 };
669
670 INIT_WORK_ONSTACK(&c_idle.work, do_fork_idle);
744 671
745 /* Just in case we booted with a single CPU. */ 672 alternatives_smp_switch(1);
746 alternatives_enable_smp();
747 673
748 idle->thread.sp = (unsigned long) (((struct pt_regs *) 674 c_idle.idle = get_idle_for_cpu(cpu);
749 (THREAD_SIZE + task_stack_page(idle))) - 1);
750 per_cpu(current_task, cpu) = idle;
751 675
676 /*
677 * We can't use kernel_thread since we must avoid to
678 * reschedule the child.
679 */
680 if (c_idle.idle) {
681 c_idle.idle->thread.sp = (unsigned long) (((struct pt_regs *)
682 (THREAD_SIZE + task_stack_page(c_idle.idle))) - 1);
683 init_idle(c_idle.idle, cpu);
684 goto do_rest;
685 }
686
687 schedule_work(&c_idle.work);
688 wait_for_completion(&c_idle.done);
689
690 if (IS_ERR(c_idle.idle)) {
691 printk("failed fork for CPU %d\n", cpu);
692 destroy_work_on_stack(&c_idle.work);
693 return PTR_ERR(c_idle.idle);
694 }
695
696 set_idle_for_cpu(cpu, c_idle.idle);
697do_rest:
698 per_cpu(current_task, cpu) = c_idle.idle;
752#ifdef CONFIG_X86_32 699#ifdef CONFIG_X86_32
753 /* Stack for startup_32 can be just as for start_secondary onwards */ 700 /* Stack for startup_32 can be just as for start_secondary onwards */
754 irq_ctx_init(cpu); 701 irq_ctx_init(cpu);
755#else 702#else
756 clear_tsk_thread_flag(idle, TIF_FORK); 703 clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
757 initial_gs = per_cpu_offset(cpu); 704 initial_gs = per_cpu_offset(cpu);
758 per_cpu(kernel_stack, cpu) = 705 per_cpu(kernel_stack, cpu) =
759 (unsigned long)task_stack_page(idle) - 706 (unsigned long)task_stack_page(c_idle.idle) -
760 KERNEL_STACK_OFFSET + THREAD_SIZE; 707 KERNEL_STACK_OFFSET + THREAD_SIZE;
761#endif 708#endif
762 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); 709 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
763 initial_code = (unsigned long)start_secondary; 710 initial_code = (unsigned long)start_secondary;
764 stack_start = idle->thread.sp; 711 stack_start = c_idle.idle->thread.sp;
712
713 /* start_ip had better be page-aligned! */
714 start_ip = trampoline_address();
765 715
766 /* So we see what's up */ 716 /* So we see what's up */
767 announce_cpu(cpu, apicid); 717 announce_cpu(cpu, apicid);
@@ -771,6 +721,8 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
771 * the targeted processor. 721 * the targeted processor.
772 */ 722 */
773 723
724 printk(KERN_DEBUG "smpboot cpu %d: start_ip = %lx\n", cpu, start_ip);
725
774 atomic_set(&init_deasserted, 0); 726 atomic_set(&init_deasserted, 0);
775 727
776 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { 728 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
@@ -788,24 +740,21 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
788 } 740 }
789 741
790 /* 742 /*
791 * Wake up a CPU in different cases:                          743 * Kick the secondary CPU. Use the method in the APIC driver
792 * - Use the method in the APIC driver if it's defined 744 * if it's defined - or use an INIT boot APIC message otherwise:
793 * Otherwise,
794 * - Use an INIT boot APIC message for APs or NMI for BSP.
795 */ 745 */
796 if (apic->wakeup_secondary_cpu) 746 if (apic->wakeup_secondary_cpu)
797 boot_error = apic->wakeup_secondary_cpu(apicid, start_ip); 747 boot_error = apic->wakeup_secondary_cpu(apicid, start_ip);
798 else 748 else
799 boot_error = wakeup_cpu_via_init_nmi(cpu, start_ip, apicid, 749 boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip);
800 &cpu0_nmi_registered);
801 750
802 if (!boot_error) { 751 if (!boot_error) {
803 /* 752 /*
804 * allow APs to start initializing. 753 * allow APs to start initializing.
805 */ 754 */
806 pr_debug("Before Callout %d\n", cpu); 755 pr_debug("Before Callout %d.\n", cpu);
807 cpumask_set_cpu(cpu, cpu_callout_mask); 756 cpumask_set_cpu(cpu, cpu_callout_mask);
808 pr_debug("After Callout %d\n", cpu); 757 pr_debug("After Callout %d.\n", cpu);
809 758
810 /* 759 /*
811 * Wait 5s total for a response 760 * Wait 5s total for a response
@@ -823,17 +772,17 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
823 schedule(); 772 schedule();
824 } 773 }
825 774
826 if (cpumask_test_cpu(cpu, cpu_callin_mask)) { 775 if (cpumask_test_cpu(cpu, cpu_callin_mask))
827 print_cpu_msr(&cpu_data(cpu));
828 pr_debug("CPU%d: has booted.\n", cpu); 776 pr_debug("CPU%d: has booted.\n", cpu);
829 } else { 777 else {
830 boot_error = 1; 778 boot_error = 1;
831 if (*trampoline_status == 0xA5A5A5A5) 779 if (*(volatile u32 *)TRAMPOLINE_SYM(trampoline_status)
780 == 0xA5A5A5A5)
832 /* trampoline started but...? */ 781 /* trampoline started but...? */
833 pr_err("CPU%d: Stuck ??\n", cpu); 782 pr_err("CPU%d: Stuck ??\n", cpu);
834 else 783 else
835 /* trampoline code not run */ 784 /* trampoline code not run */
836 pr_err("CPU%d: Not responding\n", cpu); 785 pr_err("CPU%d: Not responding.\n", cpu);
837 if (apic->inquire_remote_apic) 786 if (apic->inquire_remote_apic)
838 apic->inquire_remote_apic(apicid); 787 apic->inquire_remote_apic(apicid);
839 } 788 }
@@ -854,7 +803,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
854 } 803 }
855 804
856 /* mark "stuck" area as not stuck */ 805 /* mark "stuck" area as not stuck */
857 *trampoline_status = 0; 806 *(volatile u32 *)TRAMPOLINE_SYM(trampoline_status) = 0;
858 807
859 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { 808 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
860 /* 809 /*
@@ -862,17 +811,12 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
862 */ 811 */
863 smpboot_restore_warm_reset_vector(); 812 smpboot_restore_warm_reset_vector();
864 } 813 }
865 /*
866 * Clean up the nmi handler. Do this after the callin and callout sync
867 * to avoid impact of possible long unregister time.
868 */
869 if (cpu0_nmi_registered)
870 unregister_nmi_handler(NMI_LOCAL, "wake_cpu0");
871 814
815 destroy_work_on_stack(&c_idle.work);
872 return boot_error; 816 return boot_error;
873} 817}
874 818
875int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle) 819int __cpuinit native_cpu_up(unsigned int cpu)
876{ 820{
877 int apicid = apic->cpu_present_to_apicid(cpu); 821 int apicid = apic->cpu_present_to_apicid(cpu);
878 unsigned long flags; 822 unsigned long flags;
@@ -882,10 +826,9 @@ int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle)
882 826
883 pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu); 827 pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu);
884 828
885 if (apicid == BAD_APICID || 829 if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid ||
886 !physid_isset(apicid, phys_cpu_present_map) || 830 !physid_isset(apicid, phys_cpu_present_map)) {
887 !apic->apic_id_valid(apicid)) { 831 printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu);
888 pr_err("%s: bad cpu %d\n", __func__, cpu);
889 return -EINVAL; 832 return -EINVAL;
890 } 833 }
891 834
@@ -905,10 +848,7 @@ int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle)
905 848
906 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; 849 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
907 850
908 /* the FPU context is blank, nobody can own it */ 851 err = do_boot_cpu(apicid, cpu);
909 __cpu_disable_lazy_restore(cpu);
910
911 err = do_boot_cpu(apicid, cpu, tidle);
912 if (err) { 852 if (err) {
913 pr_debug("do_boot_cpu failed %d\n", err); 853 pr_debug("do_boot_cpu failed %d\n", err);
914 return -EIO; 854 return -EIO;
@@ -969,8 +909,9 @@ static int __init smp_sanity_check(unsigned max_cpus)
969 unsigned int cpu; 909 unsigned int cpu;
970 unsigned nr; 910 unsigned nr;
971 911
972 pr_warn("More than 8 CPUs detected - skipping them\n" 912 printk(KERN_WARNING
973 "Use CONFIG_X86_BIGSMP\n"); 913 "More than 8 CPUs detected - skipping them.\n"
914 "Use CONFIG_X86_BIGSMP.\n");
974 915
975 nr = 0; 916 nr = 0;
976 for_each_present_cpu(cpu) { 917 for_each_present_cpu(cpu) {
@@ -991,7 +932,8 @@ static int __init smp_sanity_check(unsigned max_cpus)
991#endif 932#endif
992 933
993 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { 934 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
994 pr_warn("weird, boot CPU (#%d) not listed by the BIOS\n", 935 printk(KERN_WARNING
936 "weird, boot CPU (#%d) not listed by the BIOS.\n",
995 hard_smp_processor_id()); 937 hard_smp_processor_id());
996 938
997 physid_set(hard_smp_processor_id(), phys_cpu_present_map); 939 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
@@ -1003,10 +945,11 @@ static int __init smp_sanity_check(unsigned max_cpus)
1003 */ 945 */
1004 if (!smp_found_config && !acpi_lapic) { 946 if (!smp_found_config && !acpi_lapic) {
1005 preempt_enable(); 947 preempt_enable();
1006 pr_notice("SMP motherboard not detected\n"); 948 printk(KERN_NOTICE "SMP motherboard not detected.\n");
1007 disable_smp(); 949 disable_smp();
1008 if (APIC_init_uniprocessor()) 950 if (APIC_init_uniprocessor())
1009 pr_notice("Local APIC not detected. Using dummy APIC emulation.\n"); 951 printk(KERN_NOTICE "Local APIC not detected."
952 " Using dummy APIC emulation.\n");
1010 return -1; 953 return -1;
1011 } 954 }
1012 955
@@ -1015,8 +958,9 @@ static int __init smp_sanity_check(unsigned max_cpus)
1015 * CPU too, but we do it for the sake of robustness anyway. 958 * CPU too, but we do it for the sake of robustness anyway.
1016 */ 959 */
1017 if (!apic->check_phys_apicid_present(boot_cpu_physical_apicid)) { 960 if (!apic->check_phys_apicid_present(boot_cpu_physical_apicid)) {
1018 pr_notice("weird, boot CPU (#%d) not listed by the BIOS\n", 961 printk(KERN_NOTICE
1019 boot_cpu_physical_apicid); 962 "weird, boot CPU (#%d) not listed by the BIOS.\n",
963 boot_cpu_physical_apicid);
1020 physid_set(hard_smp_processor_id(), phys_cpu_present_map); 964 physid_set(hard_smp_processor_id(), phys_cpu_present_map);
1021 } 965 }
1022 preempt_enable(); 966 preempt_enable();
@@ -1029,7 +973,8 @@ static int __init smp_sanity_check(unsigned max_cpus)
1029 if (!disable_apic) { 973 if (!disable_apic) {
1030 pr_err("BIOS bug, local APIC #%d not detected!...\n", 974 pr_err("BIOS bug, local APIC #%d not detected!...\n",
1031 boot_cpu_physical_apicid); 975 boot_cpu_physical_apicid);
1032 pr_err("... forcing use of dummy APIC emulation (tell your hw vendor)\n"); 976 pr_err("... forcing use of dummy APIC emulation."
977 "(tell your hw vendor)\n");
1033 } 978 }
1034 smpboot_clear_io_apic(); 979 smpboot_clear_io_apic();
1035 disable_ioapic_support(); 980 disable_ioapic_support();
@@ -1042,7 +987,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
1042 * If SMP should be disabled, then really disable it! 987 * If SMP should be disabled, then really disable it!
1043 */ 988 */
1044 if (!max_cpus) { 989 if (!max_cpus) {
1045 pr_info("SMP mode deactivated\n"); 990 printk(KERN_INFO "SMP mode deactivated.\n");
1046 smpboot_clear_io_apic(); 991 smpboot_clear_io_apic();
1047 992
1048 connect_bsp_APIC(); 993 connect_bsp_APIC();
@@ -1080,7 +1025,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1080 /* 1025 /*
1081 * Setup boot CPU information 1026 * Setup boot CPU information
1082 */ 1027 */
1083 smp_store_boot_cpu_info(); /* Final full version of the data */ 1028 smp_store_cpu_info(0); /* Final full version of the data */
1084 cpumask_copy(cpu_callin_mask, cpumask_of(0)); 1029 cpumask_copy(cpu_callin_mask, cpumask_of(0));
1085 mb(); 1030 mb();
1086 1031
@@ -1094,7 +1039,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1094 1039
1095 1040
1096 if (smp_sanity_check(max_cpus) < 0) { 1041 if (smp_sanity_check(max_cpus) < 0) {
1097 pr_info("SMP disabled\n"); 1042 printk(KERN_INFO "SMP disabled\n");
1098 disable_smp(); 1043 disable_smp();
1099 goto out; 1044 goto out;
1100 } 1045 }
@@ -1116,11 +1061,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1116 */ 1061 */
1117 setup_local_APIC(); 1062 setup_local_APIC();
1118 1063
1119 if (x2apic_mode)
1120 cpu0_logical_apicid = apic_read(APIC_LDR);
1121 else
1122 cpu0_logical_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
1123
1124 /* 1064 /*
1125 * Enable IO APIC before setting up error vector 1065 * Enable IO APIC before setting up error vector
1126 */ 1066 */
@@ -1137,7 +1077,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1137 * Set up local APIC timer on boot CPU. 1077 * Set up local APIC timer on boot CPU.
1138 */ 1078 */
1139 1079
1140 pr_info("CPU%d: ", 0); 1080 printk(KERN_INFO "CPU%d: ", 0);
1141 print_cpu_info(&cpu_data(0)); 1081 print_cpu_info(&cpu_data(0));
1142 x86_init.timers.setup_percpu_clockev(); 1082 x86_init.timers.setup_percpu_clockev();
1143 1083
@@ -1149,6 +1089,20 @@ out:
1149 preempt_enable(); 1089 preempt_enable();
1150} 1090}
1151 1091
1092void arch_disable_nonboot_cpus_begin(void)
1093{
1094 /*
1095 * Avoid the smp alternatives switch during the disable_nonboot_cpus().
1096 * In the suspend path, we will be back in the SMP mode shortly anyways.
1097 */
1098 skip_smp_alternatives = true;
1099}
1100
1101void arch_disable_nonboot_cpus_end(void)
1102{
1103 skip_smp_alternatives = false;
1104}
1105
1152void arch_enable_nonboot_cpus_begin(void) 1106void arch_enable_nonboot_cpus_begin(void)
1153{ 1107{
1154 set_mtrr_aps_delayed_init(); 1108 set_mtrr_aps_delayed_init();
@@ -1173,9 +1127,8 @@ void __init native_smp_prepare_boot_cpu(void)
1173 1127
1174void __init native_smp_cpus_done(unsigned int max_cpus) 1128void __init native_smp_cpus_done(unsigned int max_cpus)
1175{ 1129{
1176 pr_debug("Boot done\n"); 1130 pr_debug("Boot done.\n");
1177 1131
1178 nmi_selftest();
1179 impress_friends(); 1132 impress_friends();
1180#ifdef CONFIG_X86_IO_APIC 1133#ifdef CONFIG_X86_IO_APIC
1181 setup_ioapic_dest(); 1134 setup_ioapic_dest();
@@ -1234,7 +1187,8 @@ __init void prefill_possible_map(void)
1234 1187
1235 /* nr_cpu_ids could be reduced via nr_cpus= */ 1188 /* nr_cpu_ids could be reduced via nr_cpus= */
1236 if (possible > nr_cpu_ids) { 1189 if (possible > nr_cpu_ids) {
1237 pr_warn("%d Processors exceeds NR_CPUS limit of %d\n", 1190 printk(KERN_WARNING
1191 "%d Processors exceeds NR_CPUS limit of %d\n",
1238 possible, nr_cpu_ids); 1192 possible, nr_cpu_ids);
1239 possible = nr_cpu_ids; 1193 possible = nr_cpu_ids;
1240 } 1194 }
@@ -1243,12 +1197,13 @@ __init void prefill_possible_map(void)
1243 if (!setup_max_cpus) 1197 if (!setup_max_cpus)
1244#endif 1198#endif
1245 if (possible > i) { 1199 if (possible > i) {
1246 pr_warn("%d Processors exceeds max_cpus limit of %u\n", 1200 printk(KERN_WARNING
1201 "%d Processors exceeds max_cpus limit of %u\n",
1247 possible, setup_max_cpus); 1202 possible, setup_max_cpus);
1248 possible = i; 1203 possible = i;
1249 } 1204 }
1250 1205
1251 pr_info("Allowing %d CPUs, %d hotplug CPUs\n", 1206 printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
1252 possible, max_t(int, possible - num_processors, 0)); 1207 possible, max_t(int, possible - num_processors, 0));
1253 1208
1254 for (i = 0; i < possible; i++) 1209 for (i = 0; i < possible; i++)
@@ -1309,6 +1264,19 @@ void cpu_disable_common(void)
1309 1264
1310int native_cpu_disable(void) 1265int native_cpu_disable(void)
1311{ 1266{
1267 int cpu = smp_processor_id();
1268
1269 /*
1270 * Perhaps use cpufreq to drop frequency, but that could go
1271 * into generic code.
1272 *
1273 * We won't take down the boot processor on i386 due to some
1274 * interrupts only being able to be serviced by the BSP.
1275 * Especially so if we're not using an IOAPIC -zwane
1276 */
1277 if (cpu == 0)
1278 return -EBUSY;
1279
1312 clear_local_APIC(); 1280 clear_local_APIC();
1313 1281
1314 cpu_disable_common(); 1282 cpu_disable_common();
@@ -1325,6 +1293,9 @@ void native_cpu_die(unsigned int cpu)
1325 if (per_cpu(cpu_state, cpu) == CPU_DEAD) { 1293 if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
1326 if (system_state == SYSTEM_RUNNING) 1294 if (system_state == SYSTEM_RUNNING)
1327 pr_info("CPU %u is now offline\n", cpu); 1295 pr_info("CPU %u is now offline\n", cpu);
1296
1297 if (1 == num_online_cpus())
1298 alternatives_smp_switch(0);
1328 return; 1299 return;
1329 } 1300 }
1330 msleep(100); 1301 msleep(100);
@@ -1348,14 +1319,6 @@ void play_dead_common(void)
1348 local_irq_disable(); 1319 local_irq_disable();
1349} 1320}
1350 1321
1351static bool wakeup_cpu0(void)
1352{
1353 if (smp_processor_id() == 0 && enable_start_cpu0)
1354 return true;
1355
1356 return false;
1357}
1358
1359/* 1322/*
1360 * We need to flush the caches before going to sleep, lest we have 1323 * We need to flush the caches before going to sleep, lest we have
1361 * dirty data in our caches when we come back up. 1324 * dirty data in our caches when we come back up.
@@ -1419,11 +1382,6 @@ static inline void mwait_play_dead(void)
1419 __monitor(mwait_ptr, 0, 0); 1382 __monitor(mwait_ptr, 0, 0);
1420 mb(); 1383 mb();
1421 __mwait(eax, 0); 1384 __mwait(eax, 0);
1422 /*
1423 * If NMI wants to wake up CPU0, start CPU0.
1424 */
1425 if (wakeup_cpu0())
1426 start_cpu0();
1427 } 1385 }
1428} 1386}
1429 1387
@@ -1434,11 +1392,6 @@ static inline void hlt_play_dead(void)
1434 1392
1435 while (1) { 1393 while (1) {
1436 native_halt(); 1394 native_halt();
1437 /*
1438 * If NMI wants to wake up CPU0, start CPU0.
1439 */
1440 if (wakeup_cpu0())
1441 start_cpu0();
1442 } 1395 }
1443} 1396}
1444 1397
@@ -1448,8 +1401,7 @@ void native_play_dead(void)
1448 tboot_shutdown(TB_SHUTDOWN_WFS); 1401 tboot_shutdown(TB_SHUTDOWN_WFS);
1449 1402
1450 mwait_play_dead(); /* Only returns on failure */ 1403 mwait_play_dead(); /* Only returns on failure */
1451 if (cpuidle_play_dead()) 1404 hlt_play_dead();
1452 hlt_play_dead();
1453} 1405}
1454 1406
1455#else /* ... !CONFIG_HOTPLUG_CPU */ 1407#else /* ... !CONFIG_HOTPLUG_CPU */
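The smpboot.c changes above revert to the older set_cpu_sibling_map()/link_thread_siblings() bookkeeping: each CPU that comes online is compared against every CPU already set up, and the sibling and core masks are linked in both directions whenever the package and core identifiers match. A self-contained sketch of that pairwise linking, with plain uint64_t bitmasks standing in for cpumask_t and an invented topology table:

/*
 * Sketch of the sibling-map bookkeeping restored above.  The topology
 * values are made up for illustration; this is not kernel code.
 */
#include <stdint.h>
#include <stdio.h>

#define MAX_CPUS 8

struct cpu_topo { int pkg; int core; };

static const struct cpu_topo topo[MAX_CPUS] = {
    {0, 0}, {0, 0}, {0, 1}, {0, 1},     /* two HT pairs in package 0 */
    {1, 0}, {1, 0}, {1, 1}, {1, 1},     /* two HT pairs in package 1 */
};

static uint64_t sibling_mask[MAX_CPUS]; /* HT siblings: same pkg + core */
static uint64_t core_mask[MAX_CPUS];    /* core siblings: same package  */
static uint64_t setup_mask;             /* CPUs brought up so far       */

static void link_masks(uint64_t *mask, int a, int b)
{
    mask[a] |= UINT64_C(1) << b;
    mask[b] |= UINT64_C(1) << a;
}

static void set_cpu_sibling_map(int cpu)
{
    setup_mask |= UINT64_C(1) << cpu;

    for (int i = 0; i < MAX_CPUS; i++) {
        if (!(setup_mask & (UINT64_C(1) << i)))
            continue;                   /* not brought up yet */
        if (topo[cpu].pkg == topo[i].pkg && topo[cpu].core == topo[i].core)
            link_masks(sibling_mask, cpu, i);
        if (topo[cpu].pkg == topo[i].pkg)
            link_masks(core_mask, cpu, i);
    }
}

int main(void)
{
    for (int cpu = 0; cpu < MAX_CPUS; cpu++)
        set_cpu_sibling_map(cpu);
    for (int cpu = 0; cpu < MAX_CPUS; cpu++)
        printf("cpu%d: ht=0x%02llx core=0x%02llx\n", cpu,
               (unsigned long long)sibling_mask[cpu],
               (unsigned long long)core_mask[cpu]);
    return 0;
}

Linking both directions on every pass keeps the masks symmetric regardless of the order in which CPUs are brought up.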
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index cd3b2438a98..c346d116148 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -157,33 +157,6 @@ static int enable_single_step(struct task_struct *child)
157 return 1; 157 return 1;
158} 158}
159 159
160void set_task_blockstep(struct task_struct *task, bool on)
161{
162 unsigned long debugctl;
163
164 /*
165 * Ensure irq/preemption can't change debugctl in between.
166 * Note also that both TIF_BLOCKSTEP and debugctl should
167 * be changed atomically wrt preemption.
168 * FIXME: this means that set/clear TIF_BLOCKSTEP is simply
169 * wrong if task != current, SIGKILL can wakeup the stopped
170 * tracee and set/clear can play with the running task, this
171 * can confuse the next __switch_to_xtra().
172 */
173 local_irq_disable();
174 debugctl = get_debugctlmsr();
175 if (on) {
176 debugctl |= DEBUGCTLMSR_BTF;
177 set_tsk_thread_flag(task, TIF_BLOCKSTEP);
178 } else {
179 debugctl &= ~DEBUGCTLMSR_BTF;
180 clear_tsk_thread_flag(task, TIF_BLOCKSTEP);
181 }
182 if (task == current)
183 update_debugctlmsr(debugctl);
184 local_irq_enable();
185}
186
187/* 160/*
188 * Enable single or block step. 161 * Enable single or block step.
189 */ 162 */
@@ -196,10 +169,19 @@ static void enable_step(struct task_struct *child, bool block)
196 * So no one should try to use debugger block stepping in a program 169 * So no one should try to use debugger block stepping in a program
197 * that uses user-mode single stepping itself. 170 * that uses user-mode single stepping itself.
198 */ 171 */
199 if (enable_single_step(child) && block) 172 if (enable_single_step(child) && block) {
200 set_task_blockstep(child, true); 173 unsigned long debugctl = get_debugctlmsr();
201 else if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) 174
202 set_task_blockstep(child, false); 175 debugctl |= DEBUGCTLMSR_BTF;
176 update_debugctlmsr(debugctl);
177 set_tsk_thread_flag(child, TIF_BLOCKSTEP);
178 } else if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) {
179 unsigned long debugctl = get_debugctlmsr();
180
181 debugctl &= ~DEBUGCTLMSR_BTF;
182 update_debugctlmsr(debugctl);
183 clear_tsk_thread_flag(child, TIF_BLOCKSTEP);
184 }
203} 185}
204 186
205void user_enable_single_step(struct task_struct *child) 187void user_enable_single_step(struct task_struct *child)
@@ -217,8 +199,13 @@ void user_disable_single_step(struct task_struct *child)
217 /* 199 /*
218 * Make sure block stepping (BTF) is disabled. 200 * Make sure block stepping (BTF) is disabled.
219 */ 201 */
220 if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) 202 if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) {
221 set_task_blockstep(child, false); 203 unsigned long debugctl = get_debugctlmsr();
204
205 debugctl &= ~DEBUGCTLMSR_BTF;
206 update_debugctlmsr(debugctl);
207 clear_tsk_thread_flag(child, TIF_BLOCKSTEP);
208 }
222 209
223 /* Always clear TIF_SINGLESTEP... */ 210 /* Always clear TIF_SINGLESTEP... */
224 clear_tsk_thread_flag(child, TIF_SINGLESTEP); 211 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
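The step.c hunk above removes the set_task_blockstep() helper and open-codes its body at each call site; either way, the point is that the BTF bit in the DEBUGCTL MSR and the per-task TIF_BLOCKSTEP flag must be flipped together so the next __switch_to_xtra() sees a consistent pair. A stand-alone sketch of that toggle, with a plain variable in place of the MSR and an invented task structure:

/*
 * Illustration only, not kernel code: keep a hardware control bit and a
 * per-task software flag in sync by updating them in one helper.
 */
#include <stdbool.h>
#include <stdio.h>

#define DEBUGCTL_BTF  (1u << 1)   /* trap on branches instead of every insn */
#define TIF_BLOCKSTEP (1u << 0)

struct task { unsigned flags; };

static unsigned debugctl;         /* stand-in for the DEBUGCTL MSR */

static void set_task_blockstep(struct task *t, bool on)
{
    if (on) {
        debugctl |= DEBUGCTL_BTF;
        t->flags |= TIF_BLOCKSTEP;
    } else {
        debugctl &= ~DEBUGCTL_BTF;
        t->flags &= ~TIF_BLOCKSTEP;
    }
}

int main(void)
{
    struct task child = { 0 };

    set_task_blockstep(&child, true);
    printf("on:  debugctl=%#x flags=%#x\n", debugctl, child.flags);
    set_task_blockstep(&child, false);
    printf("off: debugctl=%#x flags=%#x\n", debugctl, child.flags);
    return 0;
}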
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 97ef74b88e0..ff14a5044ce 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -14,59 +14,10 @@
14#include <linux/personality.h> 14#include <linux/personality.h>
15#include <linux/random.h> 15#include <linux/random.h>
16#include <linux/uaccess.h> 16#include <linux/uaccess.h>
17#include <linux/elf.h>
18 17
19#include <asm/ia32.h> 18#include <asm/ia32.h>
20#include <asm/syscalls.h> 19#include <asm/syscalls.h>
21 20
22/*
23 * Align a virtual address to avoid aliasing in the I$ on AMD F15h.
24 */
25static unsigned long get_align_mask(void)
26{
27 /* handle 32- and 64-bit case with a single conditional */
28 if (va_align.flags < 0 || !(va_align.flags & (2 - mmap_is_ia32())))
29 return 0;
30
31 if (!(current->flags & PF_RANDOMIZE))
32 return 0;
33
34 return va_align.mask;
35}
36
37unsigned long align_vdso_addr(unsigned long addr)
38{
39 unsigned long align_mask = get_align_mask();
40 return (addr + align_mask) & ~align_mask;
41}
42
43static int __init control_va_addr_alignment(char *str)
44{
45 /* guard against enabling this on other CPU families */
46 if (va_align.flags < 0)
47 return 1;
48
49 if (*str == 0)
50 return 1;
51
52 if (*str == '=')
53 str++;
54
55 if (!strcmp(str, "32"))
56 va_align.flags = ALIGN_VA_32;
57 else if (!strcmp(str, "64"))
58 va_align.flags = ALIGN_VA_64;
59 else if (!strcmp(str, "off"))
60 va_align.flags = 0;
61 else if (!strcmp(str, "on"))
62 va_align.flags = ALIGN_VA_32 | ALIGN_VA_64;
63 else
64 return 0;
65
66 return 1;
67}
68__setup("align_va_addr", control_va_addr_alignment);
69
70SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, 21SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
71 unsigned long, prot, unsigned long, flags, 22 unsigned long, prot, unsigned long, flags,
72 unsigned long, fd, unsigned long, off) 23 unsigned long, fd, unsigned long, off)
@@ -84,7 +35,7 @@ out:
84static void find_start_end(unsigned long flags, unsigned long *begin, 35static void find_start_end(unsigned long flags, unsigned long *begin,
85 unsigned long *end) 36 unsigned long *end)
86{ 37{
87 if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT)) { 38 if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT)) {
88 unsigned long new_begin; 39 unsigned long new_begin;
89 /* This is usually used needed to map code in small 40 /* This is usually used needed to map code in small
90 model, so it needs to be in the first 31bit. Limit 41 model, so it needs to be in the first 31bit. Limit
@@ -112,7 +63,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
112{ 63{
113 struct mm_struct *mm = current->mm; 64 struct mm_struct *mm = current->mm;
114 struct vm_area_struct *vma; 65 struct vm_area_struct *vma;
115 struct vm_unmapped_area_info info; 66 unsigned long start_addr;
116 unsigned long begin, end; 67 unsigned long begin, end;
117 68
118 if (flags & MAP_FIXED) 69 if (flags & MAP_FIXED)
@@ -130,16 +81,46 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
130 (!vma || addr + len <= vma->vm_start)) 81 (!vma || addr + len <= vma->vm_start))
131 return addr; 82 return addr;
132 } 83 }
84 if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32))
85 && len <= mm->cached_hole_size) {
86 mm->cached_hole_size = 0;
87 mm->free_area_cache = begin;
88 }
89 addr = mm->free_area_cache;
90 if (addr < begin)
91 addr = begin;
92 start_addr = addr;
93
94full_search:
95 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
96 /* At this point: (!vma || addr < vma->vm_end). */
97 if (end - len < addr) {
98 /*
99 * Start a new search - just in case we missed
100 * some holes.
101 */
102 if (start_addr != begin) {
103 start_addr = addr = begin;
104 mm->cached_hole_size = 0;
105 goto full_search;
106 }
107 return -ENOMEM;
108 }
109 if (!vma || addr + len <= vma->vm_start) {
110 /*
111 * Remember the place where we stopped the search:
112 */
113 mm->free_area_cache = addr + len;
114 return addr;
115 }
116 if (addr + mm->cached_hole_size < vma->vm_start)
117 mm->cached_hole_size = vma->vm_start - addr;
133 118
134 info.flags = 0; 119 addr = vma->vm_end;
135 info.length = len; 120 }
136 info.low_limit = begin;
137 info.high_limit = end;
138 info.align_mask = filp ? get_align_mask() : 0;
139 info.align_offset = pgoff << PAGE_SHIFT;
140 return vm_unmapped_area(&info);
141} 121}
142 122
123
143unsigned long 124unsigned long
144arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, 125arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
145 const unsigned long len, const unsigned long pgoff, 126 const unsigned long len, const unsigned long pgoff,
@@ -148,7 +129,6 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
148 struct vm_area_struct *vma; 129 struct vm_area_struct *vma;
149 struct mm_struct *mm = current->mm; 130 struct mm_struct *mm = current->mm;
150 unsigned long addr = addr0; 131 unsigned long addr = addr0;
151 struct vm_unmapped_area_info info;
152 132
153 /* requested length too big for entire address space */ 133 /* requested length too big for entire address space */
154 if (len > TASK_SIZE) 134 if (len > TASK_SIZE)
@@ -158,7 +138,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
158 return addr; 138 return addr;
159 139
160 /* for MAP_32BIT mappings we force the legacy mmap base */ 140 /* for MAP_32BIT mappings we force the legacy mmap base */
161 if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT)) 141 if (!test_thread_flag(TIF_IA32) && (flags & MAP_32BIT))
162 goto bottomup; 142 goto bottomup;
163 143
164 /* requesting a specific address */ 144 /* requesting a specific address */
@@ -170,16 +150,46 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
170 return addr; 150 return addr;
171 } 151 }
172 152
173 info.flags = VM_UNMAPPED_AREA_TOPDOWN; 153 /* check if free_area_cache is useful for us */
174 info.length = len; 154 if (len <= mm->cached_hole_size) {
175 info.low_limit = PAGE_SIZE; 155 mm->cached_hole_size = 0;
176 info.high_limit = mm->mmap_base; 156 mm->free_area_cache = mm->mmap_base;
177 info.align_mask = filp ? get_align_mask() : 0; 157 }
178 info.align_offset = pgoff << PAGE_SHIFT; 158
179 addr = vm_unmapped_area(&info); 159 /* either no address requested or can't fit in requested address hole */
180 if (!(addr & ~PAGE_MASK)) 160 addr = mm->free_area_cache;
181 return addr; 161
182 VM_BUG_ON(addr != -ENOMEM); 162 /* make sure it can fit in the remaining address space */
163 if (addr > len) {
164 vma = find_vma(mm, addr-len);
165 if (!vma || addr <= vma->vm_start)
166 /* remember the address as a hint for next time */
167 return mm->free_area_cache = addr-len;
168 }
169
170 if (mm->mmap_base < len)
171 goto bottomup;
172
173 addr = mm->mmap_base-len;
174
175 do {
176 /*
177 * Lookup failure means no vma is above this address,
178 * else if new region fits below vma->vm_start,
179 * return with success:
180 */
181 vma = find_vma(mm, addr);
182 if (!vma || addr+len <= vma->vm_start)
183 /* remember the address as a hint for next time */
184 return mm->free_area_cache = addr;
185
186 /* remember the largest hole we saw so far */
187 if (addr + mm->cached_hole_size < vma->vm_start)
188 mm->cached_hole_size = vma->vm_start - addr;
189
190 /* try just below the current vma->vm_start */
191 addr = vma->vm_start-len;
192 } while (len < vma->vm_start);
183 193
184bottomup: 194bottomup:
185 /* 195 /*
@@ -188,5 +198,14 @@ bottomup:
188 * can happen with large stack limits and large mmap() 198 * can happen with large stack limits and large mmap()
189 * allocations. 199 * allocations.
190 */ 200 */
191 return arch_get_unmapped_area(filp, addr0, len, pgoff, flags); 201 mm->cached_hole_size = ~0UL;
202 mm->free_area_cache = TASK_UNMAPPED_BASE;
203 addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
204 /*
205 * Restore the topdown base:
206 */
207 mm->free_area_cache = mm->mmap_base;
208 mm->cached_hole_size = ~0UL;
209
210 return addr;
192} 211}
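The sys_x86_64.c hunk above restores the older free_area_cache allocator for arch_get_unmapped_area(): start a linear walk at a cached hint, return the first gap that is large enough, and restart once from the bottom of the range in case the hint made the walk skip earlier holes. A compact sketch of that search over a small, sorted array of invented mappings (the cached_hole_size optimisation and real error codes are left out):

/*
 * Illustration only: first-fit gap search with a cached starting hint.
 */
#include <stdio.h>

struct vma { unsigned long start, end; };

/* Mappings inside a [0x1000, 0x100000) window, sorted by address. */
static const struct vma vmas[] = {
    { 0x1000, 0x3000 },
    { 0x5000, 0x6000 },
    { 0x9000, 0xa000 },
};
#define NR_VMAS (sizeof(vmas) / sizeof(vmas[0]))

static unsigned long free_area_cache = 0x8000;      /* cached search hint */

static unsigned long get_unmapped_area(unsigned long len,
                                       unsigned long begin, unsigned long end)
{
    unsigned long addr = free_area_cache > begin ? free_area_cache : begin;
    unsigned long start_addr = addr;

    for (;;) {
        const struct vma *vma = NULL;

        /* first mapping that ends above addr, if any (array is sorted) */
        for (unsigned long i = 0; i < NR_VMAS; i++)
            if (vmas[i].end > addr) { vma = &vmas[i]; break; }

        if (end - len < addr) {
            /* Ran out of room: retry once from the bottom, then give up. */
            if (start_addr != begin) {
                start_addr = addr = begin;
                continue;               /* "goto full_search" in the original */
            }
            return 0;                   /* the kernel returns -ENOMEM */
        }
        if (!vma || addr + len <= vma->start) {
            free_area_cache = addr + len;   /* remember where we stopped */
            return addr;
        }
        addr = vma->end;                /* try just above this mapping */
    }
}

int main(void)
{
    printf("gap at %#lx\n", get_unmapped_area(0x2000, 0x1000, 0x100000));
    return 0;
}

The hint trades best-fit placement for speed: repeated mmap() calls keep extending from where the last search stopped instead of rescanning the whole list.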
diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c
deleted file mode 100644
index 147fcd4941c..00000000000
--- a/arch/x86/kernel/syscall_32.c
+++ /dev/null
@@ -1,25 +0,0 @@
1/* System call table for i386. */
2
3#include <linux/linkage.h>
4#include <linux/sys.h>
5#include <linux/cache.h>
6#include <asm/asm-offsets.h>
7
8#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ;
9#include <asm/syscalls_32.h>
10#undef __SYSCALL_I386
11
12#define __SYSCALL_I386(nr, sym, compat) [nr] = sym,
13
14typedef asmlinkage void (*sys_call_ptr_t)(void);
15
16extern asmlinkage void sys_ni_syscall(void);
17
18const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
19 /*
20 * Smells like a compiler bug -- it doesn't work
21 * when the & below is removed.
22 */
23 [0 ... __NR_syscall_max] = &sys_ni_syscall,
24#include <asm/syscalls_32.h>
25};
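The deleted syscall_32.c above (and syscall_64.c below) builds its table with an X-macro trick: one list of numbered entries is expanded twice under different definitions of the __SYSCALL macro, first to declare the handlers and then to emit designated initializers, while a GNU range initializer fills every unlisted slot with sys_ni_syscall. A self-contained sketch of the same technique with an invented three-entry list:

/*
 * Illustration only: the entry list, NR_MAX and the handlers are made up.
 */
#include <stdio.h>

#define NR_MAX 7

/* In the kernel this list lives in a generated header (asm/syscalls_32.h). */
#define SYSCALL_LIST      \
    SYSCALL(0, sys_read)  \
    SYSCALL(1, sys_write) \
    SYSCALL(5, sys_open)

typedef void (*sys_call_ptr_t)(void);

static void sys_ni_syscall(void) { puts("ENOSYS"); }

/* First expansion: one stub body per listed entry. */
#define SYSCALL(nr, sym) static void sym(void) { puts(#sym); }
SYSCALL_LIST
#undef SYSCALL

/* Second expansion: designated initializers over a default-filled table. */
#define SYSCALL(nr, sym) [nr] = sym,
static const sys_call_ptr_t sys_call_table[NR_MAX + 1] = {
    [0 ... NR_MAX] = &sys_ni_syscall,   /* GNU range initializer */
    SYSCALL_LIST                        /* later entries override the default */
};
#undef SYSCALL

int main(void)
{
    for (int nr = 0; nr <= NR_MAX; nr++)
        sys_call_table[nr]();
    return 0;
}

The range initializer has to come first so the specific entries expanded after it override the default; the '&' in '&sys_ni_syscall' is redundant (a function name already decays to a pointer) and is kept only because of the old compiler quirk the comment mentions.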
diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c
index 5c7f8c20da7..de87d600829 100644
--- a/arch/x86/kernel/syscall_64.c
+++ b/arch/x86/kernel/syscall_64.c
@@ -5,19 +5,15 @@
5#include <linux/cache.h> 5#include <linux/cache.h>
6#include <asm/asm-offsets.h> 6#include <asm/asm-offsets.h>
7 7
8#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) 8#define __NO_STUBS
9 9
10#ifdef CONFIG_X86_X32_ABI 10#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ;
11# define __SYSCALL_X32(nr, sym, compat) __SYSCALL_64(nr, sym, compat) 11#undef _ASM_X86_UNISTD_64_H
12#else 12#include <asm/unistd_64.h>
13# define __SYSCALL_X32(nr, sym, compat) /* nothing */
14#endif
15 13
16#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ; 14#undef __SYSCALL
17#include <asm/syscalls_64.h> 15#define __SYSCALL(nr, sym) [nr] = sym,
18#undef __SYSCALL_64 16#undef _ASM_X86_UNISTD_64_H
19
20#define __SYSCALL_64(nr, sym, compat) [nr] = sym,
21 17
22typedef void (*sys_call_ptr_t)(void); 18typedef void (*sys_call_ptr_t)(void);
23 19
@@ -25,9 +21,9 @@ extern void sys_ni_syscall(void);
25 21
26const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { 22const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
27 /* 23 /*
28 * Smells like a compiler bug -- it doesn't work 24 *Smells like a like a compiler bug -- it doesn't work
29 * when the & below is removed. 25 *when the & below is removed.
30 */ 26 */
31 [0 ... __NR_syscall_max] = &sys_ni_syscall, 27 [0 ... __NR_syscall_max] = &sys_ni_syscall,
32#include <asm/syscalls_64.h> 28#include <asm/unistd_64.h>
33}; 29};
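The remaining 64-bit table uses the classic include-twice pattern: asm/unistd_64.h is pulled in once with __SYSCALL defined to emit extern declarations, then again (after defeating its include guard) with __SYSCALL redefined to emit "[nr] = sym," initializers. The sketch below shows the same two-pass idea with an inline X-macro list instead of a re-included header, so it compiles on its own:

#include <stdio.h>

/* A plain X-macro list standing in for the re-included unistd header. */
#define SYSCALL_LIST(X) \
	X(0, sys_reboot)   \
	X(1, sys_getpid)

/* Pass 1: emit the symbols (definitions here, to keep it standalone). */
#define DECLARE(nr, sym) static void sym(void) { puts(#sym); }
SYSCALL_LIST(DECLARE)
#undef DECLARE

/* Pass 2: emit "[nr] = sym," table entries. */
#define ENTRY(nr, sym) [nr] = sym,
static void (*const call_table[])(void) = {
	SYSCALL_LIST(ENTRY)
};
#undef ENTRY

int main(void)
{
	call_table[1]();	/* prints "sys_getpid" */
	return 0;
}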
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index f84fe00fad4..e07a2fc876b 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -22,7 +22,6 @@
22#include <linux/dma_remapping.h> 22#include <linux/dma_remapping.h>
23#include <linux/init_task.h> 23#include <linux/init_task.h>
24#include <linux/spinlock.h> 24#include <linux/spinlock.h>
25#include <linux/export.h>
26#include <linux/delay.h> 25#include <linux/delay.h>
27#include <linux/sched.h> 26#include <linux/sched.h>
28#include <linux/init.h> 27#include <linux/init.h>
@@ -32,7 +31,7 @@
32#include <linux/mm.h> 31#include <linux/mm.h>
33#include <linux/tboot.h> 32#include <linux/tboot.h>
34 33
35#include <asm/realmode.h> 34#include <asm/trampoline.h>
36#include <asm/processor.h> 35#include <asm/processor.h>
37#include <asm/bootparam.h> 36#include <asm/bootparam.h>
38#include <asm/pgtable.h> 37#include <asm/pgtable.h>
@@ -44,7 +43,7 @@
44#include <asm/e820.h> 43#include <asm/e820.h>
45#include <asm/io.h> 44#include <asm/io.h>
46 45
47#include "../realmode/rm/wakeup.h" 46#include "acpi/realmode/wakeup.h"
48 47
49/* Global pointer to shared data; NULL means no measured launch. */ 48/* Global pointer to shared data; NULL means no measured launch. */
50struct tboot *tboot __read_mostly; 49struct tboot *tboot __read_mostly;
@@ -201,8 +200,7 @@ static int tboot_setup_sleep(void)
201 add_mac_region(e820.map[i].addr, e820.map[i].size); 200 add_mac_region(e820.map[i].addr, e820.map[i].size);
202 } 201 }
203 202
204 tboot->acpi_sinfo.kernel_s3_resume_vector = 203 tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address;
205 real_mode_header->wakeup_start;
206 204
207 return 0; 205 return 0;
208} 206}
@@ -273,7 +271,7 @@ static void tboot_copy_fadt(const struct acpi_table_fadt *fadt)
273 offsetof(struct acpi_table_facs, firmware_waking_vector); 271 offsetof(struct acpi_table_facs, firmware_waking_vector);
274} 272}
275 273
276static int tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control) 274void tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
277{ 275{
278 static u32 acpi_shutdown_map[ACPI_S_STATE_COUNT] = { 276 static u32 acpi_shutdown_map[ACPI_S_STATE_COUNT] = {
279 /* S0,1,2: */ -1, -1, -1, 277 /* S0,1,2: */ -1, -1, -1,
@@ -282,7 +280,7 @@ static int tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
282 /* S5: */ TB_SHUTDOWN_S5 }; 280 /* S5: */ TB_SHUTDOWN_S5 };
283 281
284 if (!tboot_enabled()) 282 if (!tboot_enabled())
285 return 0; 283 return;
286 284
287 tboot_copy_fadt(&acpi_gbl_FADT); 285 tboot_copy_fadt(&acpi_gbl_FADT);
288 tboot->acpi_sinfo.pm1a_cnt_val = pm1a_control; 286 tboot->acpi_sinfo.pm1a_cnt_val = pm1a_control;
@@ -293,11 +291,10 @@ static int tboot_sleep(u8 sleep_state, u32 pm1a_control, u32 pm1b_control)
293 if (sleep_state >= ACPI_S_STATE_COUNT || 291 if (sleep_state >= ACPI_S_STATE_COUNT ||
294 acpi_shutdown_map[sleep_state] == -1) { 292 acpi_shutdown_map[sleep_state] == -1) {
295 pr_warning("unsupported sleep state 0x%x\n", sleep_state); 293 pr_warning("unsupported sleep state 0x%x\n", sleep_state);
296 return -1; 294 return;
297 } 295 }
298 296
299 tboot_shutdown(acpi_shutdown_map[sleep_state]); 297 tboot_shutdown(acpi_shutdown_map[sleep_state]);
300 return 0;
301} 298}
302 299
303static atomic_t ap_wfs_count; 300static atomic_t ap_wfs_count;
@@ -347,8 +344,6 @@ static __init int tboot_late_init(void)
347 344
348 atomic_set(&ap_wfs_count, 0); 345 atomic_set(&ap_wfs_count, 0);
349 register_hotcpu_notifier(&tboot_cpu_notifier); 346 register_hotcpu_notifier(&tboot_cpu_notifier);
350
351 acpi_os_set_prepare_sleep(&tboot_sleep);
352 return 0; 347 return 0;
353} 348}
354 349
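In the tboot.c hunks the sleep hook goes back to a directly-called void tboot_sleep(): the acpi_os_set_prepare_sleep() registration disappears, so unsupported states can only be logged, not reported to the caller. The shutdown-type lookup itself is a small table with -1 as the "unsupported" sentinel; a standalone sketch (the constants are illustrative, not the real tboot ABI):

#include <stdio.h>

#define ACPI_S_STATE_COUNT 6
enum { TB_SHUTDOWN_S3 = 3, TB_SHUTDOWN_S4 = 4, TB_SHUTDOWN_S5 = 5 };

static const int shutdown_map[ACPI_S_STATE_COUNT] = {
	/* S0, S1, S2: */ -1, -1, -1,
	/* S3: */ TB_SHUTDOWN_S3,
	/* S4: */ TB_SHUTDOWN_S4,
	/* S5: */ TB_SHUTDOWN_S5,
};

static void request_sleep(unsigned int state)
{
	if (state >= ACPI_S_STATE_COUNT || shutdown_map[state] == -1) {
		printf("unsupported sleep state 0x%x\n", state);
		return;		/* the void variant cannot report failure */
	}
	printf("shutdown type %d\n", shutdown_map[state]);
}

int main(void)
{
	request_sleep(3);	/* shutdown type 3 */
	request_sleep(1);	/* unsupported */
	return 0;
}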
diff --git a/arch/x86/kernel/tce_64.c b/arch/x86/kernel/tce_64.c
index ab40954e113..9e540fee700 100644
--- a/arch/x86/kernel/tce_64.c
+++ b/arch/x86/kernel/tce_64.c
@@ -34,7 +34,6 @@
34#include <asm/tce.h> 34#include <asm/tce.h>
35#include <asm/calgary.h> 35#include <asm/calgary.h>
36#include <asm/proto.h> 36#include <asm/proto.h>
37#include <asm/cacheflush.h>
38 37
39/* flush a tce at 'tceaddr' to main memory */ 38/* flush a tce at 'tceaddr' to main memory */
40static inline void flush_tce(void* tceaddr) 39static inline void flush_tce(void* tceaddr)
diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c
index b79133abda4..c29e235792a 100644
--- a/arch/x86/kernel/test_rodata.c
+++ b/arch/x86/kernel/test_rodata.c
@@ -12,7 +12,6 @@
12#include <linux/module.h> 12#include <linux/module.h>
13#include <asm/cacheflush.h> 13#include <asm/cacheflush.h>
14#include <asm/sections.h> 14#include <asm/sections.h>
15#include <asm/asm.h>
16 15
17int rodata_test(void) 16int rodata_test(void)
18{ 17{
@@ -43,7 +42,14 @@ int rodata_test(void)
43 ".section .fixup,\"ax\"\n" 42 ".section .fixup,\"ax\"\n"
44 "2: jmp 1b\n" 43 "2: jmp 1b\n"
45 ".previous\n" 44 ".previous\n"
46 _ASM_EXTABLE(0b,2b) 45 ".section __ex_table,\"a\"\n"
46 " .align 16\n"
47#ifdef CONFIG_X86_32
48 " .long 0b,2b\n"
49#else
50 " .quad 0b,2b\n"
51#endif
52 ".previous"
47 : [rslt] "=r" (result) 53 : [rslt] "=r" (result)
48 : [rodata_test] "r" (&rodata_test_data), [zero] "r" (0UL) 54 : [rodata_test] "r" (&rodata_test_data), [zero] "r" (0UL)
49 ); 55 );
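test_rodata.c trades the _ASM_EXTABLE() helper for an open-coded __ex_table record: one pointer-sized entry (.long on 32-bit, .quad on 64-bit) pairing the possibly-faulting store at label 0 with the fixup at label 2. A macro capturing that expansion, as an illustration rather than a drop-in replacement:

#ifdef CONFIG_X86_32
# define EXTABLE_WORD ".long"
#else
# define EXTABLE_WORD ".quad"
#endif

/* Emit one exception-table entry: "if the instruction at 'from' faults,
 * resume at 'to'". Mirrors the open-coded block in the hunk above. */
#define EXTABLE_ENTRY(from, to)			\
	".section __ex_table,\"a\"\n"		\
	" .align 16\n"				\
	" " EXTABLE_WORD " " from "," to "\n"	\
	".previous\n"

/* Usage inside an asm() statement, pairing label 0 (the store that may
 * fault) with label 2 (the fixup that jumps back past it):
 *
 *	asm volatile("0: mov %[zero], (%[addr])\n"
 *		     "1:\n"
 *		     ".section .fixup,\"ax\"\n"
 *		     "2: jmp 1b\n"
 *		     ".previous\n"
 *		     EXTABLE_ENTRY("0b", "2b")
 *		     : : [addr] "r" (p), [zero] "r" (0UL));
 */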
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index 24d3c91e981..5a64d057be5 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -13,7 +13,7 @@
13#include <linux/interrupt.h> 13#include <linux/interrupt.h>
14#include <linux/i8253.h> 14#include <linux/i8253.h>
15#include <linux/time.h> 15#include <linux/time.h>
16#include <linux/export.h> 16#include <linux/mca.h>
17 17
18#include <asm/vsyscall.h> 18#include <asm/vsyscall.h>
19#include <asm/x86_init.h> 19#include <asm/x86_init.h>
@@ -56,7 +56,15 @@ EXPORT_SYMBOL(profile_pc);
56 */ 56 */
57static irqreturn_t timer_interrupt(int irq, void *dev_id) 57static irqreturn_t timer_interrupt(int irq, void *dev_id)
58{ 58{
59 /* Keep nmi watchdog up to date */
60 inc_irq_stat(irq0_irqs);
61
59 global_clock_event->event_handler(global_clock_event); 62 global_clock_event->event_handler(global_clock_event);
63
64 /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */
65 if (MCA_bus)
66 outb_p(inb_p(0x61)| 0x80, 0x61);
67
60 return IRQ_HANDLED; 68 return IRQ_HANDLED;
61} 69}
62 70
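The restored timer_interrupt() also bumps irq0_irqs (so the NMI watchdog sees timer progress) and, on Micro Channel machines, acknowledges IRQ0 by setting bit 7 of system control port 0x61. That acknowledge in isolation, as a userspace-style sketch; in the kernel it runs inside the handler with the kernel's own port-I/O helpers:

#include <sys/io.h>	/* glibc inb_p/outb_p; needs ioperm(0x61, 1, 1) and root */

/* Read port 0x61, set bit 7, write it back -- the MCA-bus quirk above. */
static inline void mca_ack_irq0(void)
{
	unsigned char v = inb_p(0x61);	/* pausing variants, as in the hunk */
	outb_p(v | 0x80, 0x61);
}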
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index 9d9d2f9e77a..6bb7b8579e7 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -6,6 +6,7 @@
6 6
7#include <asm/uaccess.h> 7#include <asm/uaccess.h>
8#include <asm/desc.h> 8#include <asm/desc.h>
9#include <asm/system.h>
9#include <asm/ldt.h> 10#include <asm/ldt.h>
10#include <asm/processor.h> 11#include <asm/processor.h>
11#include <asm/proto.h> 12#include <asm/proto.h>
@@ -162,7 +163,7 @@ int regset_tls_get(struct task_struct *target, const struct user_regset *regset,
162{ 163{
163 const struct desc_struct *tls; 164 const struct desc_struct *tls;
164 165
165 if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) || 166 if (pos > GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
166 (pos % sizeof(struct user_desc)) != 0 || 167 (pos % sizeof(struct user_desc)) != 0 ||
167 (count % sizeof(struct user_desc)) != 0) 168 (count % sizeof(struct user_desc)) != 0)
168 return -EINVAL; 169 return -EINVAL;
@@ -197,7 +198,7 @@ int regset_tls_set(struct task_struct *target, const struct user_regset *regset,
197 struct user_desc infobuf[GDT_ENTRY_TLS_ENTRIES]; 198 struct user_desc infobuf[GDT_ENTRY_TLS_ENTRIES];
198 const struct user_desc *info; 199 const struct user_desc *info;
199 200
200 if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) || 201 if (pos > GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) ||
201 (pos % sizeof(struct user_desc)) != 0 || 202 (pos % sizeof(struct user_desc)) != 0 ||
202 (count % sizeof(struct user_desc)) != 0) 203 (count % sizeof(struct user_desc)) != 0)
203 return -EINVAL; 204 return -EINVAL;
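In tls.c the regset bounds test changes from ">=" back to ">"; the difference only matters for a request that starts exactly at the end of the three-entry TLS area. The shape of the whole validation, with a 16-byte struct standing in for struct user_desc:

#include <stddef.h>

struct desc { unsigned int a, b, c, d; };	/* 16 bytes, like user_desc */
#define N_ENTRIES 3

/* pos/count must lie inside the table and be whole descriptors. */
int validate(size_t pos, size_t count)
{
	if (pos > N_ENTRIES * sizeof(struct desc) ||
	    (pos % sizeof(struct desc)) != 0 ||
	    (count % sizeof(struct desc)) != 0)
		return -1;			/* -EINVAL in the kernel */
	return 0;
}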
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index 6e60b5fe224..8927486a464 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -26,114 +26,26 @@
26 * Send feedback to <colpatch@us.ibm.com> 26 * Send feedback to <colpatch@us.ibm.com>
27 */ 27 */
28#include <linux/nodemask.h> 28#include <linux/nodemask.h>
29#include <linux/export.h>
30#include <linux/mmzone.h> 29#include <linux/mmzone.h>
31#include <linux/init.h> 30#include <linux/init.h>
32#include <linux/smp.h> 31#include <linux/smp.h>
33#include <linux/irq.h>
34#include <asm/cpu.h> 32#include <asm/cpu.h>
35 33
36static DEFINE_PER_CPU(struct x86_cpu, cpu_devices); 34static DEFINE_PER_CPU(struct x86_cpu, cpu_devices);
37 35
38#ifdef CONFIG_HOTPLUG_CPU 36#ifdef CONFIG_HOTPLUG_CPU
39
40#ifdef CONFIG_BOOTPARAM_HOTPLUG_CPU0
41static int cpu0_hotpluggable = 1;
42#else
43static int cpu0_hotpluggable;
44static int __init enable_cpu0_hotplug(char *str)
45{
46 cpu0_hotpluggable = 1;
47 return 1;
48}
49
50__setup("cpu0_hotplug", enable_cpu0_hotplug);
51#endif
52
53#ifdef CONFIG_DEBUG_HOTPLUG_CPU0
54/*
55 * This function offlines a CPU as early as possible and allows userspace to
56 * boot up without the CPU. The CPU can be onlined back by user after boot.
57 *
58 * This is only called for debugging CPU offline/online feature.
59 */
60int __ref _debug_hotplug_cpu(int cpu, int action)
61{
62 struct device *dev = get_cpu_device(cpu);
63 int ret;
64
65 if (!cpu_is_hotpluggable(cpu))
66 return -EINVAL;
67
68 cpu_hotplug_driver_lock();
69
70 switch (action) {
71 case 0:
72 ret = cpu_down(cpu);
73 if (!ret) {
74 pr_info("CPU %u is now offline\n", cpu);
75 kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
76 } else
77 pr_debug("Can't offline CPU%d.\n", cpu);
78 break;
79 case 1:
80 ret = cpu_up(cpu);
81 if (!ret)
82 kobject_uevent(&dev->kobj, KOBJ_ONLINE);
83 else
84 pr_debug("Can't online CPU%d.\n", cpu);
85 break;
86 default:
87 ret = -EINVAL;
88 }
89
90 cpu_hotplug_driver_unlock();
91
92 return ret;
93}
94
95static int __init debug_hotplug_cpu(void)
96{
97 _debug_hotplug_cpu(0, 0);
98 return 0;
99}
100
101late_initcall_sync(debug_hotplug_cpu);
102#endif /* CONFIG_DEBUG_HOTPLUG_CPU0 */
103
104int __ref arch_register_cpu(int num) 37int __ref arch_register_cpu(int num)
105{ 38{
106 struct cpuinfo_x86 *c = &cpu_data(num);
107
108 /*
109 * Currently CPU0 is only hotpluggable on Intel platforms. Other
110 * vendors can add hotplug support later.
111 */
112 if (c->x86_vendor != X86_VENDOR_INTEL)
113 cpu0_hotpluggable = 0;
114
115 /* 39 /*
116 * Two known BSP/CPU0 dependencies: Resume from suspend/hibernate 40 * CPU0 cannot be offlined due to several
117 * depends on BSP. PIC interrupts depend on BSP. 41 * restrictions and assumptions in kernel. This basically
42 * doesn't add a control file, one cannot attempt to offline
43 * BSP.
118 * 44 *
119 * If the BSP depencies are under control, one can tell kernel to 45 * Also certain PCI quirks require not to enable hotplug control
120 * enable BSP hotplug. This basically adds a control file and 46 * for all CPU's.
121 * one can attempt to offline BSP.
122 */ 47 */
123 if (num == 0 && cpu0_hotpluggable) { 48 if (num)
124 unsigned int irq;
125 /*
126 * We won't take down the boot processor on i386 if some
127 * interrupts only are able to be serviced by the BSP in PIC.
128 */
129 for_each_active_irq(irq) {
130 if (!IO_APIC_IRQ(irq) && irq_has_action(irq)) {
131 cpu0_hotpluggable = 0;
132 break;
133 }
134 }
135 }
136 if (num || cpu0_hotpluggable)
137 per_cpu(cpu_devices, num).cpu.hotpluggable = 1; 49 per_cpu(cpu_devices, num).cpu.hotpluggable = 1;
138 50
139 return register_cpu(&per_cpu(cpu_devices, num).cpu, num); 51 return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
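topology.c loses the CPU0-hotplug machinery (the cpu0_hotplug boot parameter, the debug offline helper, the PIC-IRQ check) and returns to the simple policy of exposing a hotplug control only for non-boot CPUs. Distilled to its core, with a plain array assumed in place of the per-CPU x86_cpu structures:

#define NR_CPUS 8

struct cpu_dev { int hotpluggable; };
static struct cpu_dev cpu_devices[NR_CPUS];

/* CPU0 stays non-hotpluggable; every other CPU gets a control file. */
void register_cpu_policy(int num)
{
	if (num)
		cpu_devices[num].hotpluggable = 1;
}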
diff --git a/arch/x86/kernel/trace_clock.c b/arch/x86/kernel/trace_clock.c
deleted file mode 100644
index 25b993729f9..00000000000
--- a/arch/x86/kernel/trace_clock.c
+++ /dev/null
@@ -1,21 +0,0 @@
1/*
2 * X86 trace clocks
3 */
4#include <asm/trace_clock.h>
5#include <asm/barrier.h>
6#include <asm/msr.h>
7
8/*
9 * trace_clock_x86_tsc(): A clock that is just the cycle counter.
10 *
11 * Unlike the other clocks, this is not in nanoseconds.
12 */
13u64 notrace trace_clock_x86_tsc(void)
14{
15 u64 ret;
16
17 rdtsc_barrier();
18 rdtscll(ret);
19
20 return ret;
21}
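The removed trace_clock.c provided trace_clock_x86_tsc(), a raw cycle-counter clock: a barrier followed by an rdtsc, returning cycles rather than nanoseconds. A userspace analogue built on compiler intrinsics (lfence is what rdtsc_barrier() amounts to on Intel parts):

#include <stdint.h>
#include <x86intrin.h>

/* Order the read, then grab the cycle counter. Result is in cycles. */
static inline uint64_t tsc_cycles(void)
{
	_mm_lfence();		/* rdtsc_barrier() equivalent */
	return __rdtsc();	/* rdtscll() equivalent */
}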
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ecffca11f4e..6913369c234 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -9,9 +9,6 @@
9/* 9/*
10 * Handle hardware traps and faults. 10 * Handle hardware traps and faults.
11 */ 11 */
12
13#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
14
15#include <linux/interrupt.h> 12#include <linux/interrupt.h>
16#include <linux/kallsyms.h> 13#include <linux/kallsyms.h>
17#include <linux/spinlock.h> 14#include <linux/spinlock.h>
@@ -40,6 +37,10 @@
40#include <linux/eisa.h> 37#include <linux/eisa.h>
41#endif 38#endif
42 39
40#ifdef CONFIG_MCA
41#include <linux/mca.h>
42#endif
43
43#if defined(CONFIG_EDAC) 44#if defined(CONFIG_EDAC)
44#include <linux/edac.h> 45#include <linux/edac.h>
45#endif 46#endif
@@ -49,13 +50,11 @@
49#include <asm/processor.h> 50#include <asm/processor.h>
50#include <asm/debugreg.h> 51#include <asm/debugreg.h>
51#include <linux/atomic.h> 52#include <linux/atomic.h>
52#include <asm/ftrace.h> 53#include <asm/system.h>
53#include <asm/traps.h> 54#include <asm/traps.h>
54#include <asm/desc.h> 55#include <asm/desc.h>
55#include <asm/i387.h> 56#include <asm/i387.h>
56#include <asm/fpu-internal.h>
57#include <asm/mce.h> 57#include <asm/mce.h>
58#include <asm/context_tracking.h>
59 58
60#include <asm/mach_traps.h> 59#include <asm/mach_traps.h>
61 60
@@ -69,6 +68,9 @@
69 68
70asmlinkage int system_call(void); 69asmlinkage int system_call(void);
71 70
71/* Do we ignore FPU interrupts ? */
72char ignore_fpu_irq;
73
72/* 74/*
73 * The IDT has to be page-aligned to simplify the Pentium 75 * The IDT has to be page-aligned to simplify the Pentium
74 * F0 0F bug workaround. 76 * F0 0F bug workaround.
@@ -79,6 +81,15 @@ gate_desc idt_table[NR_VECTORS] __page_aligned_data = { { { { 0, 0 } } }, };
79DECLARE_BITMAP(used_vectors, NR_VECTORS); 81DECLARE_BITMAP(used_vectors, NR_VECTORS);
80EXPORT_SYMBOL_GPL(used_vectors); 82EXPORT_SYMBOL_GPL(used_vectors);
81 83
84static int ignore_nmis;
85
86int unknown_nmi_panic;
87/*
88 * Prevent NMI reason port (0x61) being accessed simultaneously, can
89 * only be used in NMI handler.
90 */
91static DEFINE_RAW_SPINLOCK(nmi_reason_lock);
92
82static inline void conditional_sti(struct pt_regs *regs) 93static inline void conditional_sti(struct pt_regs *regs)
83{ 94{
84 if (regs->flags & X86_EFLAGS_IF) 95 if (regs->flags & X86_EFLAGS_IF)
@@ -105,47 +116,32 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
105 dec_preempt_count(); 116 dec_preempt_count();
106} 117}
107 118
108static int __kprobes 119static void __kprobes
109do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, 120do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
110 struct pt_regs *regs, long error_code) 121 long error_code, siginfo_t *info)
111{ 122{
123 struct task_struct *tsk = current;
124
112#ifdef CONFIG_X86_32 125#ifdef CONFIG_X86_32
113 if (regs->flags & X86_VM_MASK) { 126 if (regs->flags & X86_VM_MASK) {
114 /* 127 /*
115 * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86. 128 * traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
116 * On nmi (interrupt 2), do_trap should not be called. 129 * On nmi (interrupt 2), do_trap should not be called.
117 */ 130 */
118 if (trapnr < X86_TRAP_UD) { 131 if (trapnr < 6)
119 if (!handle_vm86_trap((struct kernel_vm86_regs *) regs, 132 goto vm86_trap;
120 error_code, trapnr)) 133 goto trap_signal;
121 return 0;
122 }
123 return -1;
124 } 134 }
125#endif 135#endif
126 if (!user_mode(regs)) {
127 if (!fixup_exception(regs)) {
128 tsk->thread.error_code = error_code;
129 tsk->thread.trap_nr = trapnr;
130 die(str, regs, error_code);
131 }
132 return 0;
133 }
134
135 return -1;
136}
137
138static void __kprobes
139do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
140 long error_code, siginfo_t *info)
141{
142 struct task_struct *tsk = current;
143 136
137 if (!user_mode(regs))
138 goto kernel_trap;
144 139
145 if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code)) 140#ifdef CONFIG_X86_32
146 return; 141trap_signal:
142#endif
147 /* 143 /*
148 * We want error_code and trap_nr set for userspace faults and 144 * We want error_code and trap_no set for userspace faults and
149 * kernelspace faults which result in die(), but not 145 * kernelspace faults which result in die(), but not
150 * kernelspace faults which are fixed up. die() gives the 146 * kernelspace faults which are fixed up. die() gives the
151 * process no chance to handle the signal and notice the 147 * process no chance to handle the signal and notice the
@@ -154,16 +150,17 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
154 * delivered, faults. See also do_general_protection below. 150 * delivered, faults. See also do_general_protection below.
155 */ 151 */
156 tsk->thread.error_code = error_code; 152 tsk->thread.error_code = error_code;
157 tsk->thread.trap_nr = trapnr; 153 tsk->thread.trap_no = trapnr;
158 154
159#ifdef CONFIG_X86_64 155#ifdef CONFIG_X86_64
160 if (show_unhandled_signals && unhandled_signal(tsk, signr) && 156 if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
161 printk_ratelimit()) { 157 printk_ratelimit()) {
162 pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx", 158 printk(KERN_INFO
163 tsk->comm, tsk->pid, str, 159 "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
164 regs->ip, regs->sp, error_code); 160 tsk->comm, tsk->pid, str,
161 regs->ip, regs->sp, error_code);
165 print_vma_addr(" in ", regs->ip); 162 print_vma_addr(" in ", regs->ip);
166 pr_cont("\n"); 163 printk("\n");
167 } 164 }
168#endif 165#endif
169 166
@@ -171,20 +168,33 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
171 force_sig_info(signr, info, tsk); 168 force_sig_info(signr, info, tsk);
172 else 169 else
173 force_sig(signr, tsk); 170 force_sig(signr, tsk);
171 return;
172
173kernel_trap:
174 if (!fixup_exception(regs)) {
175 tsk->thread.error_code = error_code;
176 tsk->thread.trap_no = trapnr;
177 die(str, regs, error_code);
178 }
179 return;
180
181#ifdef CONFIG_X86_32
182vm86_trap:
183 if (handle_vm86_trap((struct kernel_vm86_regs *) regs,
184 error_code, trapnr))
185 goto trap_signal;
186 return;
187#endif
174} 188}
175 189
176#define DO_ERROR(trapnr, signr, str, name) \ 190#define DO_ERROR(trapnr, signr, str, name) \
177dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ 191dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
178{ \ 192{ \
179 exception_enter(regs); \ 193 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
180 if (notify_die(DIE_TRAP, str, regs, error_code, \ 194 == NOTIFY_STOP) \
181 trapnr, signr) == NOTIFY_STOP) { \
182 exception_exit(regs); \
183 return; \ 195 return; \
184 } \
185 conditional_sti(regs); \ 196 conditional_sti(regs); \
186 do_trap(trapnr, signr, str, regs, error_code, NULL); \ 197 do_trap(trapnr, signr, str, regs, error_code, NULL); \
187 exception_exit(regs); \
188} 198}
189 199
190#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ 200#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
@@ -195,45 +205,35 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
195 info.si_errno = 0; \ 205 info.si_errno = 0; \
196 info.si_code = sicode; \ 206 info.si_code = sicode; \
197 info.si_addr = (void __user *)siaddr; \ 207 info.si_addr = (void __user *)siaddr; \
198 exception_enter(regs); \ 208 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
199 if (notify_die(DIE_TRAP, str, regs, error_code, \ 209 == NOTIFY_STOP) \
200 trapnr, signr) == NOTIFY_STOP) { \
201 exception_exit(regs); \
202 return; \ 210 return; \
203 } \
204 conditional_sti(regs); \ 211 conditional_sti(regs); \
205 do_trap(trapnr, signr, str, regs, error_code, &info); \ 212 do_trap(trapnr, signr, str, regs, error_code, &info); \
206 exception_exit(regs); \ 213}
207} 214
208 215DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
209DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV, 216DO_ERROR(4, SIGSEGV, "overflow", overflow)
210 regs->ip) 217DO_ERROR(5, SIGSEGV, "bounds", bounds)
211DO_ERROR(X86_TRAP_OF, SIGSEGV, "overflow", overflow) 218DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
212DO_ERROR(X86_TRAP_BR, SIGSEGV, "bounds", bounds) 219DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
213DO_ERROR_INFO(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, 220DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
214 regs->ip) 221DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
215DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun",
216 coprocessor_segment_overrun)
217DO_ERROR(X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS)
218DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present)
219#ifdef CONFIG_X86_32 222#ifdef CONFIG_X86_32
220DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment) 223DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
221#endif 224#endif
222DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check, 225DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
223 BUS_ADRALN, 0)
224 226
225#ifdef CONFIG_X86_64 227#ifdef CONFIG_X86_64
226/* Runs on IST stack */ 228/* Runs on IST stack */
227dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) 229dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code)
228{ 230{
229 exception_enter(regs);
230 if (notify_die(DIE_TRAP, "stack segment", regs, error_code, 231 if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
231 X86_TRAP_SS, SIGBUS) != NOTIFY_STOP) { 232 12, SIGBUS) == NOTIFY_STOP)
232 preempt_conditional_sti(regs); 233 return;
233 do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL); 234 preempt_conditional_sti(regs);
234 preempt_conditional_cli(regs); 235 do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL);
235 } 236 preempt_conditional_cli(regs);
236 exception_exit(regs);
237} 237}
238 238
239dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) 239dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
@@ -241,12 +241,11 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
241 static const char str[] = "double fault"; 241 static const char str[] = "double fault";
242 struct task_struct *tsk = current; 242 struct task_struct *tsk = current;
243 243
244 exception_enter(regs);
245 /* Return not checked because double check cannot be ignored */ 244 /* Return not checked because double check cannot be ignored */
246 notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); 245 notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
247 246
248 tsk->thread.error_code = error_code; 247 tsk->thread.error_code = error_code;
249 tsk->thread.trap_nr = X86_TRAP_DF; 248 tsk->thread.trap_no = 8;
250 249
251 /* 250 /*
252 * This is always a kernel trap and never fixable (and thus must 251 * This is always a kernel trap and never fixable (and thus must
@@ -262,81 +261,219 @@ do_general_protection(struct pt_regs *regs, long error_code)
262{ 261{
263 struct task_struct *tsk; 262 struct task_struct *tsk;
264 263
265 exception_enter(regs);
266 conditional_sti(regs); 264 conditional_sti(regs);
267 265
268#ifdef CONFIG_X86_32 266#ifdef CONFIG_X86_32
269 if (regs->flags & X86_VM_MASK) { 267 if (regs->flags & X86_VM_MASK)
270 local_irq_enable(); 268 goto gp_in_vm86;
271 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
272 goto exit;
273 }
274#endif 269#endif
275 270
276 tsk = current; 271 tsk = current;
277 if (!user_mode(regs)) { 272 if (!user_mode(regs))
278 if (fixup_exception(regs)) 273 goto gp_in_kernel;
279 goto exit;
280
281 tsk->thread.error_code = error_code;
282 tsk->thread.trap_nr = X86_TRAP_GP;
283 if (notify_die(DIE_GPF, "general protection fault", regs, error_code,
284 X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
285 die("general protection fault", regs, error_code);
286 goto exit;
287 }
288 274
289 tsk->thread.error_code = error_code; 275 tsk->thread.error_code = error_code;
290 tsk->thread.trap_nr = X86_TRAP_GP; 276 tsk->thread.trap_no = 13;
291 277
292 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && 278 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
293 printk_ratelimit()) { 279 printk_ratelimit()) {
294 pr_info("%s[%d] general protection ip:%lx sp:%lx error:%lx", 280 printk(KERN_INFO
281 "%s[%d] general protection ip:%lx sp:%lx error:%lx",
295 tsk->comm, task_pid_nr(tsk), 282 tsk->comm, task_pid_nr(tsk),
296 regs->ip, regs->sp, error_code); 283 regs->ip, regs->sp, error_code);
297 print_vma_addr(" in ", regs->ip); 284 print_vma_addr(" in ", regs->ip);
298 pr_cont("\n"); 285 printk("\n");
299 } 286 }
300 287
301 force_sig(SIGSEGV, tsk); 288 force_sig(SIGSEGV, tsk);
302exit: 289 return;
303 exception_exit(regs); 290
291#ifdef CONFIG_X86_32
292gp_in_vm86:
293 local_irq_enable();
294 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
295 return;
296#endif
297
298gp_in_kernel:
299 if (fixup_exception(regs))
300 return;
301
302 tsk->thread.error_code = error_code;
303 tsk->thread.trap_no = 13;
304 if (notify_die(DIE_GPF, "general protection fault", regs,
305 error_code, 13, SIGSEGV) == NOTIFY_STOP)
306 return;
307 die("general protection fault", regs, error_code);
304} 308}
305 309
306/* May run on IST stack. */ 310static int __init setup_unknown_nmi_panic(char *str)
307dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_code) 311{
312 unknown_nmi_panic = 1;
313 return 1;
314}
315__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
316
317static notrace __kprobes void
318pci_serr_error(unsigned char reason, struct pt_regs *regs)
308{ 319{
309#ifdef CONFIG_DYNAMIC_FTRACE 320 pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
321 reason, smp_processor_id());
322
310 /* 323 /*
311 * ftrace must be first, everything else may cause a recursive crash. 324 * On some machines, PCI SERR line is used to report memory
312 * See note by declaration of modifying_ftrace_code in ftrace.c 325 * errors. EDAC makes use of it.
313 */ 326 */
314 if (unlikely(atomic_read(&modifying_ftrace_code)) && 327#if defined(CONFIG_EDAC)
315 ftrace_int3_handler(regs)) 328 if (edac_handler_set()) {
329 edac_atomic_assert_error();
316 return; 330 return;
331 }
317#endif 332#endif
318 exception_enter(regs);
319#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
320 if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
321 SIGTRAP) == NOTIFY_STOP)
322 goto exit;
323#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
324 333
325 if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, 334 if (panic_on_unrecovered_nmi)
326 SIGTRAP) == NOTIFY_STOP) 335 panic("NMI: Not continuing");
327 goto exit; 336
337 pr_emerg("Dazed and confused, but trying to continue\n");
338
339 /* Clear and disable the PCI SERR error line. */
340 reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR;
341 outb(reason, NMI_REASON_PORT);
342}
343
344static notrace __kprobes void
345io_check_error(unsigned char reason, struct pt_regs *regs)
346{
347 unsigned long i;
348
349 pr_emerg(
350 "NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n",
351 reason, smp_processor_id());
352 show_registers(regs);
328 353
354 if (panic_on_io_nmi)
355 panic("NMI IOCK error: Not continuing");
356
357 /* Re-enable the IOCK line, wait for a few seconds */
358 reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
359 outb(reason, NMI_REASON_PORT);
360
361 i = 20000;
362 while (--i) {
363 touch_nmi_watchdog();
364 udelay(100);
365 }
366
367 reason &= ~NMI_REASON_CLEAR_IOCHK;
368 outb(reason, NMI_REASON_PORT);
369}
370
371static notrace __kprobes void
372unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
373{
374 if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) ==
375 NOTIFY_STOP)
376 return;
377#ifdef CONFIG_MCA
329 /* 378 /*
330 * Let others (NMI) know that the debug stack is in use 379 * Might actually be able to figure out what the guilty party
331 * as we may switch to the interrupt stack. 380 * is:
332 */ 381 */
333 debug_stack_usage_inc(); 382 if (MCA_bus) {
383 mca_handle_nmi();
384 return;
385 }
386#endif
387 pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
388 reason, smp_processor_id());
389
390 pr_emerg("Do you have a strange power saving mode enabled?\n");
391 if (unknown_nmi_panic || panic_on_unrecovered_nmi)
392 panic("NMI: Not continuing");
393
394 pr_emerg("Dazed and confused, but trying to continue\n");
395}
396
397static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
398{
399 unsigned char reason = 0;
400
401 /*
402 * CPU-specific NMI must be processed before non-CPU-specific
403 * NMI, otherwise we may lose it, because the CPU-specific
404 * NMI can not be detected/processed on other CPUs.
405 */
406 if (notify_die(DIE_NMI, "nmi", regs, 0, 2, SIGINT) == NOTIFY_STOP)
407 return;
408
409 /* Non-CPU-specific NMI: NMI sources can be processed on any CPU */
410 raw_spin_lock(&nmi_reason_lock);
411 reason = get_nmi_reason();
412
413 if (reason & NMI_REASON_MASK) {
414 if (reason & NMI_REASON_SERR)
415 pci_serr_error(reason, regs);
416 else if (reason & NMI_REASON_IOCHK)
417 io_check_error(reason, regs);
418#ifdef CONFIG_X86_32
419 /*
420 * Reassert NMI in case it became active
421 * meanwhile as it's edge-triggered:
422 */
423 reassert_nmi();
424#endif
425 raw_spin_unlock(&nmi_reason_lock);
426 return;
427 }
428 raw_spin_unlock(&nmi_reason_lock);
429
430 unknown_nmi_error(reason, regs);
431}
432
433dotraplinkage notrace __kprobes void
434do_nmi(struct pt_regs *regs, long error_code)
435{
436 nmi_enter();
437
438 inc_irq_stat(__nmi_count);
439
440 if (!ignore_nmis)
441 default_do_nmi(regs);
442
443 nmi_exit();
444}
445
446void stop_nmi(void)
447{
448 ignore_nmis++;
449}
450
451void restart_nmi(void)
452{
453 ignore_nmis--;
454}
455
456/* May run on IST stack. */
457dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
458{
459#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
460 if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
461 == NOTIFY_STOP)
462 return;
463#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
464#ifdef CONFIG_KPROBES
465 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
466 == NOTIFY_STOP)
467 return;
468#else
469 if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP)
470 == NOTIFY_STOP)
471 return;
472#endif
473
334 preempt_conditional_sti(regs); 474 preempt_conditional_sti(regs);
335 do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL); 475 do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
336 preempt_conditional_cli(regs); 476 preempt_conditional_cli(regs);
337 debug_stack_usage_dec();
338exit:
339 exception_exit(regs);
340} 477}
341 478
342#ifdef CONFIG_X86_64 479#ifdef CONFIG_X86_64
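This hunk reinstates the in-file NMI path: default_do_nmi() takes nmi_reason_lock, reads the reason byte from port 0x61 and dispatches PCI SERR, IOCHK or "unknown" handling. A decode of that reason byte using the traditional PC/AT bit meanings (SERR in bit 7, IOCHK in bit 6); treat the exact masks as illustrative:

#include <stdio.h>

#define NMI_REASON_SERR   0x80
#define NMI_REASON_IOCHK  0x40

static void classify_nmi(unsigned char reason)
{
	if (reason & NMI_REASON_SERR)
		puts("PCI SERR: system error / possible memory error");
	else if (reason & NMI_REASON_IOCHK)
		puts("IOCHK: I/O channel check");
	else
		puts("unknown NMI reason");
}

int main(void)
{
	classify_nmi(0x80);	/* PCI SERR */
	classify_nmi(0x00);	/* unknown */
	return 0;
}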
@@ -397,8 +534,6 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
397 unsigned long dr6; 534 unsigned long dr6;
398 int si_code; 535 int si_code;
399 536
400 exception_enter(regs);
401
402 get_debugreg(dr6, 6); 537 get_debugreg(dr6, 6);
403 538
404 /* Filter out all the reserved bits which are preset to 1 */ 539 /* Filter out all the reserved bits which are preset to 1 */
@@ -414,7 +549,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
414 549
415 /* Catch kmemcheck conditions first of all! */ 550 /* Catch kmemcheck conditions first of all! */
416 if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) 551 if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
417 goto exit; 552 return;
418 553
419 /* DR6 may or may not be cleared by the CPU */ 554 /* DR6 may or may not be cleared by the CPU */
420 set_debugreg(0, 6); 555 set_debugreg(0, 6);
@@ -429,23 +564,16 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
429 564
430 if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code, 565 if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code,
431 SIGTRAP) == NOTIFY_STOP) 566 SIGTRAP) == NOTIFY_STOP)
432 goto exit; 567 return;
433
434 /*
435 * Let others (NMI) know that the debug stack is in use
436 * as we may switch to the interrupt stack.
437 */
438 debug_stack_usage_inc();
439 568
440 /* It's safe to allow irq's after DR6 has been saved */ 569 /* It's safe to allow irq's after DR6 has been saved */
441 preempt_conditional_sti(regs); 570 preempt_conditional_sti(regs);
442 571
443 if (regs->flags & X86_VM_MASK) { 572 if (regs->flags & X86_VM_MASK) {
444 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 573 handle_vm86_trap((struct kernel_vm86_regs *) regs,
445 X86_TRAP_DB); 574 error_code, 1);
446 preempt_conditional_cli(regs); 575 preempt_conditional_cli(regs);
447 debug_stack_usage_dec(); 576 return;
448 goto exit;
449 } 577 }
450 578
451 /* 579 /*
@@ -464,10 +592,8 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
464 if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) 592 if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
465 send_sigtrap(tsk, regs, error_code, si_code); 593 send_sigtrap(tsk, regs, error_code, si_code);
466 preempt_conditional_cli(regs); 594 preempt_conditional_cli(regs);
467 debug_stack_usage_dec();
468 595
469exit: 596 return;
470 exception_exit(regs);
471} 597}
472 598
473/* 599/*
@@ -480,8 +606,7 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr)
480 struct task_struct *task = current; 606 struct task_struct *task = current;
481 siginfo_t info; 607 siginfo_t info;
482 unsigned short err; 608 unsigned short err;
483 char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" : 609 char *str = (trapnr == 16) ? "fpu exception" : "simd exception";
484 "simd exception";
485 610
486 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP) 611 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP)
487 return; 612 return;
@@ -491,7 +616,7 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr)
491 { 616 {
492 if (!fixup_exception(regs)) { 617 if (!fixup_exception(regs)) {
493 task->thread.error_code = error_code; 618 task->thread.error_code = error_code;
494 task->thread.trap_nr = trapnr; 619 task->thread.trap_no = trapnr;
495 die(str, regs, error_code); 620 die(str, regs, error_code);
496 } 621 }
497 return; 622 return;
@@ -501,12 +626,12 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr)
501 * Save the info for the exception handler and clear the error. 626 * Save the info for the exception handler and clear the error.
502 */ 627 */
503 save_init_fpu(task); 628 save_init_fpu(task);
504 task->thread.trap_nr = trapnr; 629 task->thread.trap_no = trapnr;
505 task->thread.error_code = error_code; 630 task->thread.error_code = error_code;
506 info.si_signo = SIGFPE; 631 info.si_signo = SIGFPE;
507 info.si_errno = 0; 632 info.si_errno = 0;
508 info.si_addr = (void __user *)regs->ip; 633 info.si_addr = (void __user *)regs->ip;
509 if (trapnr == X86_TRAP_MF) { 634 if (trapnr == 16) {
510 unsigned short cwd, swd; 635 unsigned short cwd, swd;
511 /* 636 /*
512 * (~cwd & swd) will mask out exceptions that are not set to unmasked 637 * (~cwd & swd) will mask out exceptions that are not set to unmasked
@@ -550,28 +675,27 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr)
550 info.si_code = FPE_FLTRES; 675 info.si_code = FPE_FLTRES;
551 } else { 676 } else {
552 /* 677 /*
553 * If we're using IRQ 13, or supposedly even some trap 678 * If we're using IRQ 13, or supposedly even some trap 16
554 * X86_TRAP_MF implementations, it's possible 679 * implementations, it's possible we get a spurious trap...
555 * we get a spurious trap, which is not an error.
556 */ 680 */
557 return; 681 return; /* Spurious trap, no error */
558 } 682 }
559 force_sig_info(SIGFPE, &info, task); 683 force_sig_info(SIGFPE, &info, task);
560} 684}
561 685
562dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) 686dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
563{ 687{
564 exception_enter(regs); 688#ifdef CONFIG_X86_32
565 math_error(regs, error_code, X86_TRAP_MF); 689 ignore_fpu_irq = 1;
566 exception_exit(regs); 690#endif
691
692 math_error(regs, error_code, 16);
567} 693}
568 694
569dotraplinkage void 695dotraplinkage void
570do_simd_coprocessor_error(struct pt_regs *regs, long error_code) 696do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
571{ 697{
572 exception_enter(regs); 698 math_error(regs, error_code, 19);
573 math_error(regs, error_code, X86_TRAP_XF);
574 exception_exit(regs);
575} 699}
576 700
577dotraplinkage void 701dotraplinkage void
@@ -580,7 +704,7 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
580 conditional_sti(regs); 704 conditional_sti(regs);
581#if 0 705#if 0
582 /* No need to warn about this any longer. */ 706 /* No need to warn about this any longer. */
583 pr_info("Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); 707 printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
584#endif 708#endif
585} 709}
586 710
@@ -593,18 +717,41 @@ asmlinkage void __attribute__((weak)) smp_threshold_interrupt(void)
593} 717}
594 718
595/* 719/*
720 * __math_state_restore assumes that cr0.TS is already clear and the
721 * fpu state is all ready for use. Used during context switch.
722 */
723void __math_state_restore(void)
724{
725 struct thread_info *thread = current_thread_info();
726 struct task_struct *tsk = thread->task;
727
728 /*
729 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
730 */
731 if (unlikely(restore_fpu_checking(tsk))) {
732 stts();
733 force_sig(SIGSEGV, tsk);
734 return;
735 }
736
737 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
738 tsk->fpu_counter++;
739}
740
741/*
596 * 'math_state_restore()' saves the current math information in the 742 * 'math_state_restore()' saves the current math information in the
597 * old math state array, and gets the new ones from the current task 743 * old math state array, and gets the new ones from the current task
598 * 744 *
599 * Careful.. There are problems with IBM-designed IRQ13 behaviour. 745 * Careful.. There are problems with IBM-designed IRQ13 behaviour.
600 * Don't touch unless you *really* know how it works. 746 * Don't touch unless you *really* know how it works.
601 * 747 *
602 * Must be called with kernel preemption disabled (eg with local 748 * Must be called with kernel preemption disabled (in this case,
603 * local interrupts as in the case of do_device_not_available). 749 * local interrupts are disabled at the call-site in entry.S).
604 */ 750 */
605void math_state_restore(void) 751asmlinkage void math_state_restore(void)
606{ 752{
607 struct task_struct *tsk = current; 753 struct thread_info *thread = current_thread_info();
754 struct task_struct *tsk = thread->task;
608 755
609 if (!tsk_used_math(tsk)) { 756 if (!tsk_used_math(tsk)) {
610 local_irq_enable(); 757 local_irq_enable();
@@ -621,27 +768,15 @@ void math_state_restore(void)
621 local_irq_disable(); 768 local_irq_disable();
622 } 769 }
623 770
624 __thread_fpu_begin(tsk); 771 clts(); /* Allow maths ops (or we recurse) */
625 772
626 /* 773 __math_state_restore();
627 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
628 */
629 if (unlikely(restore_fpu_checking(tsk))) {
630 drop_init_fpu(tsk);
631 force_sig(SIGSEGV, tsk);
632 return;
633 }
634
635 tsk->fpu_counter++;
636} 774}
637EXPORT_SYMBOL_GPL(math_state_restore); 775EXPORT_SYMBOL_GPL(math_state_restore);
638 776
639dotraplinkage void __kprobes 777dotraplinkage void __kprobes
640do_device_not_available(struct pt_regs *regs, long error_code) 778do_device_not_available(struct pt_regs *regs, long error_code)
641{ 779{
642 exception_enter(regs);
643 BUG_ON(use_eager_fpu());
644
645#ifdef CONFIG_MATH_EMULATION 780#ifdef CONFIG_MATH_EMULATION
646 if (read_cr0() & X86_CR0_EM) { 781 if (read_cr0() & X86_CR0_EM) {
647 struct math_emu_info info = { }; 782 struct math_emu_info info = { };
@@ -650,7 +785,6 @@ do_device_not_available(struct pt_regs *regs, long error_code)
650 785
651 info.regs = regs; 786 info.regs = regs;
652 math_emulate(&info); 787 math_emulate(&info);
653 exception_exit(regs);
654 return; 788 return;
655 } 789 }
656#endif 790#endif
@@ -658,37 +792,32 @@ do_device_not_available(struct pt_regs *regs, long error_code)
658#ifdef CONFIG_X86_32 792#ifdef CONFIG_X86_32
659 conditional_sti(regs); 793 conditional_sti(regs);
660#endif 794#endif
661 exception_exit(regs);
662} 795}
663 796
664#ifdef CONFIG_X86_32 797#ifdef CONFIG_X86_32
665dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) 798dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
666{ 799{
667 siginfo_t info; 800 siginfo_t info;
668
669 exception_enter(regs);
670 local_irq_enable(); 801 local_irq_enable();
671 802
672 info.si_signo = SIGILL; 803 info.si_signo = SIGILL;
673 info.si_errno = 0; 804 info.si_errno = 0;
674 info.si_code = ILL_BADSTK; 805 info.si_code = ILL_BADSTK;
675 info.si_addr = NULL; 806 info.si_addr = NULL;
676 if (notify_die(DIE_TRAP, "iret exception", regs, error_code, 807 if (notify_die(DIE_TRAP, "iret exception",
677 X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) { 808 regs, error_code, 32, SIGILL) == NOTIFY_STOP)
678 do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code, 809 return;
679 &info); 810 do_trap(32, SIGILL, "iret exception", regs, error_code, &info);
680 }
681 exception_exit(regs);
682} 811}
683#endif 812#endif
684 813
685/* Set of traps needed for early debugging. */ 814/* Set of traps needed for early debugging. */
686void __init early_trap_init(void) 815void __init early_trap_init(void)
687{ 816{
688 set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); 817 set_intr_gate_ist(1, &debug, DEBUG_STACK);
689 /* int3 can be called from all */ 818 /* int3 can be called from all */
690 set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK); 819 set_system_intr_gate_ist(3, &int3, DEBUG_STACK);
691 set_intr_gate(X86_TRAP_PF, &page_fault); 820 set_intr_gate(14, &page_fault);
692 load_idt(&idt_descr); 821 load_idt(&idt_descr);
693} 822}
694 823
@@ -704,30 +833,30 @@ void __init trap_init(void)
704 early_iounmap(p, 4); 833 early_iounmap(p, 4);
705#endif 834#endif
706 835
707 set_intr_gate(X86_TRAP_DE, &divide_error); 836 set_intr_gate(0, &divide_error);
708 set_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK); 837 set_intr_gate_ist(2, &nmi, NMI_STACK);
709 /* int4 can be called from all */ 838 /* int4 can be called from all */
710 set_system_intr_gate(X86_TRAP_OF, &overflow); 839 set_system_intr_gate(4, &overflow);
711 set_intr_gate(X86_TRAP_BR, &bounds); 840 set_intr_gate(5, &bounds);
712 set_intr_gate(X86_TRAP_UD, &invalid_op); 841 set_intr_gate(6, &invalid_op);
713 set_intr_gate(X86_TRAP_NM, &device_not_available); 842 set_intr_gate(7, &device_not_available);
714#ifdef CONFIG_X86_32 843#ifdef CONFIG_X86_32
715 set_task_gate(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS); 844 set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS);
716#else 845#else
717 set_intr_gate_ist(X86_TRAP_DF, &double_fault, DOUBLEFAULT_STACK); 846 set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK);
718#endif 847#endif
719 set_intr_gate(X86_TRAP_OLD_MF, &coprocessor_segment_overrun); 848 set_intr_gate(9, &coprocessor_segment_overrun);
720 set_intr_gate(X86_TRAP_TS, &invalid_TSS); 849 set_intr_gate(10, &invalid_TSS);
721 set_intr_gate(X86_TRAP_NP, &segment_not_present); 850 set_intr_gate(11, &segment_not_present);
722 set_intr_gate_ist(X86_TRAP_SS, &stack_segment, STACKFAULT_STACK); 851 set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK);
723 set_intr_gate(X86_TRAP_GP, &general_protection); 852 set_intr_gate(13, &general_protection);
724 set_intr_gate(X86_TRAP_SPURIOUS, &spurious_interrupt_bug); 853 set_intr_gate(15, &spurious_interrupt_bug);
725 set_intr_gate(X86_TRAP_MF, &coprocessor_error); 854 set_intr_gate(16, &coprocessor_error);
726 set_intr_gate(X86_TRAP_AC, &alignment_check); 855 set_intr_gate(17, &alignment_check);
727#ifdef CONFIG_X86_MCE 856#ifdef CONFIG_X86_MCE
728 set_intr_gate_ist(X86_TRAP_MC, &machine_check, MCE_STACK); 857 set_intr_gate_ist(18, &machine_check, MCE_STACK);
729#endif 858#endif
730 set_intr_gate(X86_TRAP_XF, &simd_coprocessor_error); 859 set_intr_gate(19, &simd_coprocessor_error);
731 860
732 /* Reserve all the builtin and the syscall vector: */ 861 /* Reserve all the builtin and the syscall vector: */
733 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) 862 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
@@ -749,10 +878,4 @@ void __init trap_init(void)
749 cpu_init(); 878 cpu_init();
750 879
751 x86_init.irqs.trap_init(); 880 x86_init.irqs.trap_init();
752
753#ifdef CONFIG_X86_64
754 memcpy(&nmi_idt_table, &idt_table, IDT_ENTRIES * 16);
755 set_nmi_gate(X86_TRAP_DB, &debug);
756 set_nmi_gate(X86_TRAP_BP, &int3);
757#endif
758} 881}
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 06ccb5073a3..db483369f10 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1,5 +1,3 @@
1#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
2
3#include <linux/kernel.h> 1#include <linux/kernel.h>
4#include <linux/sched.h> 2#include <linux/sched.h>
5#include <linux/init.h> 3#include <linux/init.h>
@@ -37,7 +35,7 @@ static int __read_mostly tsc_unstable;
37 erroneous rdtsc usage on !cpu_has_tsc processors */ 35 erroneous rdtsc usage on !cpu_has_tsc processors */
38static int __read_mostly tsc_disabled = -1; 36static int __read_mostly tsc_disabled = -1;
39 37
40int tsc_clocksource_reliable; 38static int tsc_clocksource_reliable;
41/* 39/*
42 * Scheduler clock - returns current time in nanosec units. 40 * Scheduler clock - returns current time in nanosec units.
43 */ 41 */
@@ -77,12 +75,6 @@ unsigned long long
77sched_clock(void) __attribute__((alias("native_sched_clock"))); 75sched_clock(void) __attribute__((alias("native_sched_clock")));
78#endif 76#endif
79 77
80unsigned long long native_read_tsc(void)
81{
82 return __native_read_tsc();
83}
84EXPORT_SYMBOL(native_read_tsc);
85
86int check_tsc_unstable(void) 78int check_tsc_unstable(void)
87{ 79{
88 return tsc_unstable; 80 return tsc_unstable;
@@ -92,7 +84,8 @@ EXPORT_SYMBOL_GPL(check_tsc_unstable);
92#ifdef CONFIG_X86_TSC 84#ifdef CONFIG_X86_TSC
93int __init notsc_setup(char *str) 85int __init notsc_setup(char *str)
94{ 86{
95 pr_warn("Kernel compiled with CONFIG_X86_TSC, cannot disable TSC completely\n"); 87 printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
88 "cannot disable TSC completely.\n");
96 tsc_disabled = 1; 89 tsc_disabled = 1;
97 return 1; 90 return 1;
98} 91}
@@ -185,11 +178,11 @@ static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2)
185} 178}
186 179
187#define CAL_MS 10 180#define CAL_MS 10
188#define CAL_LATCH (PIT_TICK_RATE / (1000 / CAL_MS)) 181#define CAL_LATCH (CLOCK_TICK_RATE / (1000 / CAL_MS))
189#define CAL_PIT_LOOPS 1000 182#define CAL_PIT_LOOPS 1000
190 183
191#define CAL2_MS 50 184#define CAL2_MS 50
192#define CAL2_LATCH (PIT_TICK_RATE / (1000 / CAL2_MS)) 185#define CAL2_LATCH (CLOCK_TICK_RATE / (1000 / CAL2_MS))
193#define CAL2_PIT_LOOPS 5000 186#define CAL2_PIT_LOOPS 5000
194 187
195 188
@@ -297,15 +290,14 @@ static inline int pit_verify_msb(unsigned char val)
297static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap) 290static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap)
298{ 291{
299 int count; 292 int count;
300 u64 tsc = 0, prev_tsc = 0; 293 u64 tsc = 0;
301 294
302 for (count = 0; count < 50000; count++) { 295 for (count = 0; count < 50000; count++) {
303 if (!pit_verify_msb(val)) 296 if (!pit_verify_msb(val))
304 break; 297 break;
305 prev_tsc = tsc;
306 tsc = get_cycles(); 298 tsc = get_cycles();
307 } 299 }
308 *deltap = get_cycles() - prev_tsc; 300 *deltap = get_cycles() - tsc;
309 *tscp = tsc; 301 *tscp = tsc;
310 302
311 /* 303 /*
@@ -319,9 +311,9 @@ static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *de
319 * How many MSB values do we want to see? We aim for 311 * How many MSB values do we want to see? We aim for
320 * a maximum error rate of 500ppm (in practice the 312 * a maximum error rate of 500ppm (in practice the
321 * real error is much smaller), but refuse to spend 313 * real error is much smaller), but refuse to spend
322 * more than 50ms on it. 314 * more than 25ms on it.
323 */ 315 */
324#define MAX_QUICK_PIT_MS 50 316#define MAX_QUICK_PIT_MS 25
325#define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256) 317#define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
326 318
327static unsigned long quick_pit_calibrate(void) 319static unsigned long quick_pit_calibrate(void)
@@ -380,7 +372,7 @@ static unsigned long quick_pit_calibrate(void)
380 goto success; 372 goto success;
381 } 373 }
382 } 374 }
383 pr_err("Fast TSC calibration failed\n"); 375 printk("Fast TSC calibration failed\n");
384 return 0; 376 return 0;
385 377
386success: 378success:
@@ -391,15 +383,18 @@ success:
391 * 383 *
392 * As a result, we can depend on there not being 384 * As a result, we can depend on there not being
393 * any odd delays anywhere, and the TSC reads are 385 * any odd delays anywhere, and the TSC reads are
394 * reliable (within the error). 386 * reliable (within the error). We also adjust the
387 * delta to the middle of the error bars, just
388 * because it looks nicer.
395 * 389 *
396 * kHz = ticks / time-in-seconds / 1000; 390 * kHz = ticks / time-in-seconds / 1000;
397 * kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000 391 * kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000
398 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000) 392 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000)
399 */ 393 */
394 delta += (long)(d2 - d1)/2;
400 delta *= PIT_TICK_RATE; 395 delta *= PIT_TICK_RATE;
401 do_div(delta, i*256*1000); 396 do_div(delta, i*256*1000);
402 pr_info("Fast TSC calibration using PIT\n"); 397 printk("Fast TSC calibration using PIT\n");
403 return delta; 398 return delta;
404} 399}
405 400
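The success path of quick_pit_calibrate() converts the observed TSC delta over i PIT wraparounds (256 ticks each) into kHz, and the restored version first centres delta between the two error readings with (d2 - d1)/2. The arithmetic with made-up numbers:

#include <stdint.h>
#include <stdio.h>

#define PIT_TICK_RATE 1193182ULL	/* PIT input clock, Hz */

/* kHz = delta / (iterations * 256 / PIT_TICK_RATE) / 1000, exactly as
 * the comment in the hunk states. The values below are invented just
 * to show the conversion. */
int main(void)
{
	uint64_t delta = 51500000;	/* TSC cycles observed */
	uint64_t iterations = 100;	/* PIT MSB decrements of 256 ticks */

	uint64_t khz = delta * PIT_TICK_RATE / (iterations * 256 * 1000);
	printf("%llu kHz (~%.2f GHz)\n",
	       (unsigned long long)khz, khz / 1e6);
	return 0;
}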
@@ -494,8 +489,9 @@ unsigned long native_calibrate_tsc(void)
494 * use the reference value, as it is more precise. 489 * use the reference value, as it is more precise.
495 */ 490 */
496 if (delta >= 90 && delta <= 110) { 491 if (delta >= 90 && delta <= 110) {
497 pr_info("PIT calibration matches %s. %d loops\n", 492 printk(KERN_INFO
498 hpet ? "HPET" : "PMTIMER", i + 1); 493 "TSC: PIT calibration matches %s. %d loops\n",
494 hpet ? "HPET" : "PMTIMER", i + 1);
499 return tsc_ref_min; 495 return tsc_ref_min;
500 } 496 }
501 497
@@ -517,36 +513,38 @@ unsigned long native_calibrate_tsc(void)
517 */ 513 */
518 if (tsc_pit_min == ULONG_MAX) { 514 if (tsc_pit_min == ULONG_MAX) {
519 /* PIT gave no useful value */ 515 /* PIT gave no useful value */
520 pr_warn("Unable to calibrate against PIT\n"); 516 printk(KERN_WARNING "TSC: Unable to calibrate against PIT\n");
521 517
522 /* We don't have an alternative source, disable TSC */ 518 /* We don't have an alternative source, disable TSC */
523 if (!hpet && !ref1 && !ref2) { 519 if (!hpet && !ref1 && !ref2) {
524 pr_notice("No reference (HPET/PMTIMER) available\n"); 520 printk("TSC: No reference (HPET/PMTIMER) available\n");
525 return 0; 521 return 0;
526 } 522 }
527 523
528 /* The alternative source failed as well, disable TSC */ 524 /* The alternative source failed as well, disable TSC */
529 if (tsc_ref_min == ULONG_MAX) { 525 if (tsc_ref_min == ULONG_MAX) {
530 pr_warn("HPET/PMTIMER calibration failed\n"); 526 printk(KERN_WARNING "TSC: HPET/PMTIMER calibration "
527 "failed.\n");
531 return 0; 528 return 0;
532 } 529 }
533 530
534 /* Use the alternative source */ 531 /* Use the alternative source */
535 pr_info("using %s reference calibration\n", 532 printk(KERN_INFO "TSC: using %s reference calibration\n",
536 hpet ? "HPET" : "PMTIMER"); 533 hpet ? "HPET" : "PMTIMER");
537 534
538 return tsc_ref_min; 535 return tsc_ref_min;
539 } 536 }
540 537
541 /* We don't have an alternative source, use the PIT calibration value */ 538 /* We don't have an alternative source, use the PIT calibration value */
542 if (!hpet && !ref1 && !ref2) { 539 if (!hpet && !ref1 && !ref2) {
543 pr_info("Using PIT calibration value\n"); 540 printk(KERN_INFO "TSC: Using PIT calibration value\n");
544 return tsc_pit_min; 541 return tsc_pit_min;
545 } 542 }
546 543
547 /* The alternative source failed, use the PIT calibration value */ 544 /* The alternative source failed, use the PIT calibration value */
548 if (tsc_ref_min == ULONG_MAX) { 545 if (tsc_ref_min == ULONG_MAX) {
549 pr_warn("HPET/PMTIMER calibration failed. Using PIT calibration.\n"); 546 printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed. "
547 "Using PIT calibration\n");
550 return tsc_pit_min; 548 return tsc_pit_min;
551 } 549 }
552 550
@@ -555,9 +553,9 @@ unsigned long native_calibrate_tsc(void)
555 * the PIT value as we know that there are PMTIMERs around 553 * the PIT value as we know that there are PMTIMERs around
556 * running at double speed. At least we let the user know: 554 * running at double speed. At least we let the user know:
557 */ 555 */
558 pr_warn("PIT calibration deviates from %s: %lu %lu\n", 556 printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n",
559 hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min); 557 hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
560 pr_info("Using PIT calibration value\n"); 558 printk(KERN_INFO "TSC: Using PIT calibration value\n");
561 return tsc_pit_min; 559 return tsc_pit_min;
562} 560}
563 561
@@ -624,8 +622,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
624 622
625 if (cpu_khz) { 623 if (cpu_khz) {
626 *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; 624 *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
627 *offset = ns_now - mult_frac(tsc_now, *scale, 625 *offset = ns_now - (tsc_now * *scale >> CYC2NS_SCALE_FACTOR);
628 (1UL << CYC2NS_SCALE_FACTOR));
629 } 626 }
630 627
631 sched_clock_idle_wakeup_event(0); 628 sched_clock_idle_wakeup_event(0);
@@ -634,7 +631,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
634 631
635static unsigned long long cyc2ns_suspend; 632static unsigned long long cyc2ns_suspend;
636 633
637void tsc_save_sched_clock_state(void) 634void save_sched_clock_state(void)
638{ 635{
639 if (!sched_clock_stable) 636 if (!sched_clock_stable)
640 return; 637 return;
@@ -650,7 +647,7 @@ void tsc_save_sched_clock_state(void)
650 * that sched_clock() continues from the point where it was left off during 647 * that sched_clock() continues from the point where it was left off during
651 * suspend. 648 * suspend.
652 */ 649 */
653void tsc_restore_sched_clock_state(void) 650void restore_sched_clock_state(void)
654{ 651{
655 unsigned long long offset; 652 unsigned long long offset;
656 unsigned long flags; 653 unsigned long flags;
@@ -789,7 +786,7 @@ void mark_tsc_unstable(char *reason)
789 tsc_unstable = 1; 786 tsc_unstable = 1;
790 sched_clock_stable = 0; 787 sched_clock_stable = 0;
791 disable_sched_clock_irqtime(); 788 disable_sched_clock_irqtime();
792 pr_info("Marking TSC unstable due to %s\n", reason); 789 printk(KERN_INFO "Marking TSC unstable due to %s\n", reason);
793 /* Change only the rating, when not registered */ 790 /* Change only the rating, when not registered */
794 if (clocksource_tsc.mult) 791 if (clocksource_tsc.mult)
795 clocksource_mark_unstable(&clocksource_tsc); 792 clocksource_mark_unstable(&clocksource_tsc);
@@ -916,9 +913,9 @@ static void tsc_refine_calibration_work(struct work_struct *work)
916 goto out; 913 goto out;
917 914
918 tsc_khz = freq; 915 tsc_khz = freq;
919 pr_info("Refined TSC clocksource calibration: %lu.%03lu MHz\n", 916 printk(KERN_INFO "Refined TSC clocksource calibration: "
920 (unsigned long)tsc_khz / 1000, 917 "%lu.%03lu MHz.\n", (unsigned long)tsc_khz / 1000,
921 (unsigned long)tsc_khz % 1000); 918 (unsigned long)tsc_khz % 1000);
922 919
923out: 920out:
924 clocksource_register_khz(&clocksource_tsc, tsc_khz); 921 clocksource_register_khz(&clocksource_tsc, tsc_khz);
@@ -937,16 +934,6 @@ static int __init init_tsc_clocksource(void)
937 clocksource_tsc.rating = 0; 934 clocksource_tsc.rating = 0;
938 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; 935 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
939 } 936 }
940
941 /*
942 * Trust the results of the earlier calibration on systems
943 * exporting a reliable TSC.
944 */
945 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
946 clocksource_register_khz(&clocksource_tsc, tsc_khz);
947 return 0;
948 }
949
950 schedule_delayed_work(&tsc_irqwork, 0); 937 schedule_delayed_work(&tsc_irqwork, 0);
951 return 0; 938 return 0;
952} 939}
@@ -974,9 +961,9 @@ void __init tsc_init(void)
974 return; 961 return;
975 } 962 }
976 963
977 pr_info("Detected %lu.%03lu MHz processor\n", 964 printk("Detected %lu.%03lu MHz processor.\n",
978 (unsigned long)cpu_khz / 1000, 965 (unsigned long)cpu_khz / 1000,
979 (unsigned long)cpu_khz % 1000); 966 (unsigned long)cpu_khz % 1000);
980 967
981 /* 968 /*
982 * Secondary CPUs do not run through tsc_init(), so set up 969 * Secondary CPUs do not run through tsc_init(), so set up
@@ -1008,23 +995,3 @@ void __init tsc_init(void)
1008 check_system_tsc_reliable(); 995 check_system_tsc_reliable();
1009} 996}
1010 997
1011#ifdef CONFIG_SMP
1012/*
1013 * If we have a constant TSC and are using the TSC for the delay loop,
1014 * we can skip clock calibration if another cpu in the same socket has already
1015 * been calibrated. This assumes that CONSTANT_TSC applies to all
1016 * cpus in the socket - this should be a safe assumption.
1017 */
1018unsigned long __cpuinit calibrate_delay_is_known(void)
1019{
1020 int i, cpu = smp_processor_id();
1021
1022 if (!tsc_disabled && !cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC))
1023 return 0;
1024
1025 for_each_online_cpu(i)
1026 if (cpu_data(i).phys_proc_id == cpu_data(cpu).phys_proc_id)
1027 return cpu_data(i).loops_per_jiffy;
1028 return 0;
1029}
1030#endif
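
A minimal user-space sketch of the cyc2ns fixed-point conversion that the set_cyc2ns_scale() hunk above reverts from mult_frac() back to a plain shift. The CYC2NS_SCALE_FACTOR of 10 and the 2.4 GHz clock are assumptions made for illustration; only the arithmetic mirrors the kernel code.

#include <stdio.h>

#define CYC2NS_SCALE_FACTOR 10          /* assumed: 2^10 fixed-point shift */
#define NSEC_PER_MSEC       1000000ULL

/* ns = cyc * scale / 2^CYC2NS_SCALE_FACTOR + offset */
static unsigned long long cycles_to_ns(unsigned long long cyc,
                                       unsigned long scale,
                                       unsigned long long offset)
{
        return ((cyc * scale) >> CYC2NS_SCALE_FACTOR) + offset;
}

int main(void)
{
        unsigned long cpu_khz = 2400000;        /* pretend 2.4 GHz CPU */
        unsigned long scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) / cpu_khz;

        /* 2.4e9 cycles should come out close to one second */
        printf("scale=%lu, 2.4e9 cycles -> %llu ns\n",
               scale, cycles_to_ns(2400000000ULL, scale, 0));
        return 0;
}

With these numbers the result lands slightly under a full second because the integer division truncates the scale, which is part of why the refinement work queued by init_tsc_clocksource() above is worthwhile.
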
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index fc25e60a588..0aa5fed8b9e 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -42,7 +42,7 @@ static __cpuinitdata int nr_warps;
42/* 42/*
43 * TSC-warp measurement loop running on both CPUs: 43 * TSC-warp measurement loop running on both CPUs:
44 */ 44 */
45static __cpuinit void check_tsc_warp(unsigned int timeout) 45static __cpuinit void check_tsc_warp(void)
46{ 46{
47 cycles_t start, now, prev, end; 47 cycles_t start, now, prev, end;
48 int i; 48 int i;
@@ -51,9 +51,9 @@ static __cpuinit void check_tsc_warp(unsigned int timeout)
51 start = get_cycles(); 51 start = get_cycles();
52 rdtsc_barrier(); 52 rdtsc_barrier();
53 /* 53 /*
54 * The measurement runs for 'timeout' msecs: 54 * The measurement runs for 20 msecs:
55 */ 55 */
56 end = start + (cycles_t) tsc_khz * timeout; 56 end = start + tsc_khz * 20ULL;
57 now = start; 57 now = start;
58 58
59 for (i = 0; ; i++) { 59 for (i = 0; ; i++) {
@@ -99,25 +99,6 @@ static __cpuinit void check_tsc_warp(unsigned int timeout)
99} 99}
100 100
101/* 101/*
102 * If the target CPU coming online doesn't have any of its core-siblings
103 * online, a timeout of 20msec will be used for the TSC-warp measurement
104 * loop. Otherwise a smaller timeout of 2msec will be used, as we have some
105 * information about this socket already (and this information grows as we
106 * have more and more logical-siblings in that socket).
107 *
108 * Ideally we should be able to skip the TSC sync check on the other
109 * core-siblings, if the first logical CPU in a socket passed the sync test.
110 * But as the TSC is per-logical CPU and can potentially be modified wrongly
111 * by the bios, TSC sync test for smaller duration should be able
112 * to catch such errors. Also this will catch the condition where all the
113 * cores in the socket doesn't get reset at the same time.
114 */
115static inline unsigned int loop_timeout(int cpu)
116{
117 return (cpumask_weight(cpu_core_mask(cpu)) > 1) ? 2 : 20;
118}
119
120/*
121 * Source CPU calls into this - it waits for the freshly booted 102 * Source CPU calls into this - it waits for the freshly booted
122 * target CPU to arrive and then starts the measurement: 103 * target CPU to arrive and then starts the measurement:
123 */ 104 */
@@ -132,7 +113,7 @@ void __cpuinit check_tsc_sync_source(int cpu)
132 if (unsynchronized_tsc()) 113 if (unsynchronized_tsc())
133 return; 114 return;
134 115
135 if (tsc_clocksource_reliable) { 116 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
136 if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING) 117 if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING)
137 pr_info( 118 pr_info(
138 "Skipped synchronization checks as TSC is reliable.\n"); 119 "Skipped synchronization checks as TSC is reliable.\n");
@@ -154,7 +135,7 @@ void __cpuinit check_tsc_sync_source(int cpu)
154 */ 135 */
155 atomic_inc(&start_count); 136 atomic_inc(&start_count);
156 137
157 check_tsc_warp(loop_timeout(cpu)); 138 check_tsc_warp();
158 139
159 while (atomic_read(&stop_count) != cpus-1) 140 while (atomic_read(&stop_count) != cpus-1)
160 cpu_relax(); 141 cpu_relax();
@@ -191,7 +172,7 @@ void __cpuinit check_tsc_sync_target(void)
191{ 172{
192 int cpus = 2; 173 int cpus = 2;
193 174
194 if (unsynchronized_tsc() || tsc_clocksource_reliable) 175 if (unsynchronized_tsc() || boot_cpu_has(X86_FEATURE_TSC_RELIABLE))
195 return; 176 return;
196 177
197 /* 178 /*
@@ -202,7 +183,7 @@ void __cpuinit check_tsc_sync_target(void)
202 while (atomic_read(&start_count) != cpus) 183 while (atomic_read(&start_count) != cpus)
203 cpu_relax(); 184 cpu_relax();
204 185
205 check_tsc_warp(loop_timeout(smp_processor_id())); 186 check_tsc_warp();
206 187
207 /* 188 /*
208 * Ok, we are done: 189 * Ok, we are done:
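
To make the warp test that this hunk pins back to a fixed 20 ms easier to follow, here is a user-space analogue: two threads take turns reading the TSC under a lock and count any read that lands behind the previous reader's value. The pthread mutex, __rdtsc() and the assumed 2.4 GHz rate stand in for the kernel's arch spinlock, get_cycles() and calibrated tsc_khz; this is an illustration of the idea, not check_tsc_warp() itself.

#include <pthread.h>
#include <stdio.h>
#include <stdint.h>
#include <x86intrin.h>

static pthread_mutex_t sync_lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t last_tsc;       /* most recent value read by either thread */
static int nr_warps;

static void *warp_loop(void *arg)
{
        uint64_t end = *(uint64_t *)arg;
        uint64_t now, prev;

        do {
                pthread_mutex_lock(&sync_lock);
                prev = last_tsc;        /* what the other thread saw last */
                now = __rdtsc();
                last_tsc = now;
                if (prev > now)         /* time went backwards: a warp */
                        nr_warps++;
                pthread_mutex_unlock(&sync_lock);
        } while (now < end);
        return NULL;
}

int main(void)
{
        uint64_t tsc_khz = 2400000;                 /* assumed TSC rate */
        uint64_t end = __rdtsc() + tsc_khz * 20;    /* ~20 ms worth of cycles */
        pthread_t t;

        pthread_create(&t, NULL, warp_loop, &end);
        warp_loop(&end);
        pthread_join(t, NULL);
        printf("warps observed: %d\n", nr_warps);
        return 0;
}
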
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
deleted file mode 100644
index c71025b6746..00000000000
--- a/arch/x86/kernel/uprobes.c
+++ /dev/null
@@ -1,697 +0,0 @@
1/*
2 * User-space Probes (UProbes) for x86
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2008-2011
19 * Authors:
20 * Srikar Dronamraju
21 * Jim Keniston
22 */
23#include <linux/kernel.h>
24#include <linux/sched.h>
25#include <linux/ptrace.h>
26#include <linux/uprobes.h>
27#include <linux/uaccess.h>
28
29#include <linux/kdebug.h>
30#include <asm/processor.h>
31#include <asm/insn.h>
32
33/* Post-execution fixups. */
34
35/* No fixup needed */
36#define UPROBE_FIX_NONE 0x0
37
38/* Adjust IP back to vicinity of actual insn */
39#define UPROBE_FIX_IP 0x1
40
41/* Adjust the return address of a call insn */
42#define UPROBE_FIX_CALL 0x2
43
44/* Instruction will modify TF, don't change it */
45#define UPROBE_FIX_SETF 0x4
46
47#define UPROBE_FIX_RIP_AX 0x8000
48#define UPROBE_FIX_RIP_CX 0x4000
49
50#define UPROBE_TRAP_NR UINT_MAX
51
52/* Adaptations for mhiramat x86 decoder v14. */
53#define OPCODE1(insn) ((insn)->opcode.bytes[0])
54#define OPCODE2(insn) ((insn)->opcode.bytes[1])
55#define OPCODE3(insn) ((insn)->opcode.bytes[2])
56#define MODRM_REG(insn) X86_MODRM_REG(insn->modrm.value)
57
58#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
59 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
60 (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \
61 (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \
62 (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \
63 << (row % 32))
64
65/*
66 * Good-instruction tables for 32-bit apps. This is non-const and volatile
67 * to keep gcc from statically optimizing it out, as variable_test_bit makes
68 * some versions of gcc to think only *(unsigned long*) is used.
69 */
70static volatile u32 good_insns_32[256 / 32] = {
71 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
72 /* ---------------------------------------------- */
73 W(0x00, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) | /* 00 */
74 W(0x10, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0) , /* 10 */
75 W(0x20, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) | /* 20 */
76 W(0x30, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1) , /* 30 */
77 W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
78 W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
79 W(0x60, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
80 W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
81 W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
82 W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
83 W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
84 W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
85 W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
86 W(0xd0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
87 W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */
88 W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */
89 /* ---------------------------------------------- */
90 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
91};
92
93/* Using this for both 64-bit and 32-bit apps */
94static volatile u32 good_2byte_insns[256 / 32] = {
95 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
96 /* ---------------------------------------------- */
97 W(0x00, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1) | /* 00 */
98 W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* 10 */
99 W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 20 */
100 W(0x30, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 30 */
101 W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 40 */
102 W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
103 W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 60 */
104 W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) , /* 70 */
105 W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
106 W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
107 W(0xa0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1) | /* a0 */
108 W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
109 W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* c0 */
110 W(0xd0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
111 W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* e0 */
112 W(0xf0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* f0 */
113 /* ---------------------------------------------- */
114 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
115};
116
117#ifdef CONFIG_X86_64
118/* Good-instruction tables for 64-bit apps */
119static volatile u32 good_insns_64[256 / 32] = {
120 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
121 /* ---------------------------------------------- */
122 W(0x00, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 00 */
123 W(0x10, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 10 */
124 W(0x20, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) | /* 20 */
125 W(0x30, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0) , /* 30 */
126 W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */
127 W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 50 */
128 W(0x60, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 60 */
129 W(0x70, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 70 */
130 W(0x80, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
131 W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 90 */
132 W(0xa0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* a0 */
133 W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* b0 */
134 W(0xc0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0) | /* c0 */
135 W(0xd0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
136 W(0xe0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* e0 */
137 W(0xf0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1) /* f0 */
138 /* ---------------------------------------------- */
139 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
140};
141#endif
142#undef W
143
144/*
145 * opcodes we'll probably never support:
146 *
147 * 6c-6d, e4-e5, ec-ed - in
148 * 6e-6f, e6-e7, ee-ef - out
149 * cc, cd - int3, int
150 * cf - iret
151 * d6 - illegal instruction
152 * f1 - int1/icebp
153 * f4 - hlt
154 * fa, fb - cli, sti
155 * 0f - lar, lsl, syscall, clts, sysret, sysenter, sysexit, invd, wbinvd, ud2
156 *
157 * invalid opcodes in 64-bit mode:
158 *
159 * 06, 0e, 16, 1e, 27, 2f, 37, 3f, 60-62, 82, c4-c5, d4-d5
160 * 63 - we support this opcode in x86_64 but not in i386.
161 *
162 * opcodes we may need to refine support for:
163 *
164 * 0f - 2-byte instructions: For many of these instructions, the validity
165 * depends on the prefix and/or the reg field. On such instructions, we
166 * just consider the opcode combination valid if it corresponds to any
167 * valid instruction.
168 *
169 * 8f - Group 1 - only reg = 0 is OK
170 * c6-c7 - Group 11 - only reg = 0 is OK
171 * d9-df - fpu insns with some illegal encodings
172 * f2, f3 - repnz, repz prefixes. These are also the first byte for
173 * certain floating-point instructions, such as addsd.
174 *
175 * fe - Group 4 - only reg = 0 or 1 is OK
176 * ff - Group 5 - only reg = 0-6 is OK
177 *
178 * others -- Do we need to support these?
179 *
180 * 0f - (floating-point?) prefetch instructions
181 * 07, 17, 1f - pop es, pop ss, pop ds
182 * 26, 2e, 36, 3e - es:, cs:, ss:, ds: segment prefixes --
183 * but 64 and 65 (fs: and gs:) seem to be used, so we support them
184 * 67 - addr16 prefix
185 * ce - into
186 * f0 - lock prefix
187 */
188
189/*
190 * TODO:
191 * - Where necessary, examine the modrm byte and allow only valid instructions
192 * in the different Groups and fpu instructions.
193 */
194
195static bool is_prefix_bad(struct insn *insn)
196{
197 int i;
198
199 for (i = 0; i < insn->prefixes.nbytes; i++) {
200 switch (insn->prefixes.bytes[i]) {
201 case 0x26: /* INAT_PFX_ES */
202 case 0x2E: /* INAT_PFX_CS */
203 case 0x36: /* INAT_PFX_DS */
204 case 0x3E: /* INAT_PFX_SS */
205 case 0xF0: /* INAT_PFX_LOCK */
206 return true;
207 }
208 }
209 return false;
210}
211
212static int validate_insn_32bits(struct arch_uprobe *auprobe, struct insn *insn)
213{
214 insn_init(insn, auprobe->insn, false);
215
216 /* Skip good instruction prefixes; reject "bad" ones. */
217 insn_get_opcode(insn);
218 if (is_prefix_bad(insn))
219 return -ENOTSUPP;
220
221 if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_32))
222 return 0;
223
224 if (insn->opcode.nbytes == 2) {
225 if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns))
226 return 0;
227 }
228
229 return -ENOTSUPP;
230}
231
232/*
233 * Figure out which fixups arch_uprobe_post_xol() will need to perform, and
234 * annotate arch_uprobe->fixups accordingly. To start with,
235 * arch_uprobe->fixups is either zero or it reflects rip-related fixups.
236 */
237static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn)
238{
239 bool fix_ip = true, fix_call = false; /* defaults */
240 int reg;
241
242 insn_get_opcode(insn); /* should be a nop */
243
244 switch (OPCODE1(insn)) {
245 case 0x9d:
246 /* popf */
247 auprobe->fixups |= UPROBE_FIX_SETF;
248 break;
249 case 0xc3: /* ret/lret */
250 case 0xcb:
251 case 0xc2:
252 case 0xca:
253 /* ip is correct */
254 fix_ip = false;
255 break;
256 case 0xe8: /* call relative - Fix return addr */
257 fix_call = true;
258 break;
259 case 0x9a: /* call absolute - Fix return addr, not ip */
260 fix_call = true;
261 fix_ip = false;
262 break;
263 case 0xff:
264 insn_get_modrm(insn);
265 reg = MODRM_REG(insn);
266 if (reg == 2 || reg == 3) {
267 /* call or lcall, indirect */
268 /* Fix return addr; ip is correct. */
269 fix_call = true;
270 fix_ip = false;
271 } else if (reg == 4 || reg == 5) {
272 /* jmp or ljmp, indirect */
273 /* ip is correct. */
274 fix_ip = false;
275 }
276 break;
277 case 0xea: /* jmp absolute -- ip is correct */
278 fix_ip = false;
279 break;
280 default:
281 break;
282 }
283 if (fix_ip)
284 auprobe->fixups |= UPROBE_FIX_IP;
285 if (fix_call)
286 auprobe->fixups |= UPROBE_FIX_CALL;
287}
288
289#ifdef CONFIG_X86_64
290/*
291 * If arch_uprobe->insn doesn't use rip-relative addressing, return
292 * immediately. Otherwise, rewrite the instruction so that it accesses
293 * its memory operand indirectly through a scratch register. Set
294 * arch_uprobe->fixups and arch_uprobe->rip_rela_target_address
295 * accordingly. (The contents of the scratch register will be saved
296 * before we single-step the modified instruction, and restored
297 * afterward.)
298 *
299 * We do this because a rip-relative instruction can access only a
300 * relatively small area (+/- 2 GB from the instruction), and the XOL
301 * area typically lies beyond that area. At least for instructions
302 * that store to memory, we can't execute the original instruction
303 * and "fix things up" later, because the misdirected store could be
304 * disastrous.
305 *
306 * Some useful facts about rip-relative instructions:
307 *
308 * - There's always a modrm byte.
309 * - There's never a SIB byte.
310 * - The displacement is always 4 bytes.
311 */
312static void
313handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
314{
315 u8 *cursor;
316 u8 reg;
317
318 if (mm->context.ia32_compat)
319 return;
320
321 auprobe->rip_rela_target_address = 0x0;
322 if (!insn_rip_relative(insn))
323 return;
324
325 /*
326 * insn_rip_relative() would have decoded rex_prefix, modrm.
327 * Clear REX.b bit (extension of MODRM.rm field):
328 * we want to encode rax/rcx, not r8/r9.
329 */
330 if (insn->rex_prefix.nbytes) {
331 cursor = auprobe->insn + insn_offset_rex_prefix(insn);
332 *cursor &= 0xfe; /* Clearing REX.B bit */
333 }
334
335 /*
336 * Point cursor at the modrm byte. The next 4 bytes are the
337 * displacement. Beyond the displacement, for some instructions,
338 * is the immediate operand.
339 */
340 cursor = auprobe->insn + insn_offset_modrm(insn);
341 insn_get_length(insn);
342
343 /*
344 * Convert from rip-relative addressing to indirect addressing
345 * via a scratch register. Change the r/m field from 0x5 (%rip)
346 * to 0x0 (%rax) or 0x1 (%rcx), and squeeze out the offset field.
347 */
348 reg = MODRM_REG(insn);
349 if (reg == 0) {
350 /*
351 * The register operand (if any) is either the A register
352 * (%rax, %eax, etc.) or (if the 0x4 bit is set in the
353 * REX prefix) %r8. In any case, we know the C register
354 * is NOT the register operand, so we use %rcx (register
355 * #1) for the scratch register.
356 */
357 auprobe->fixups = UPROBE_FIX_RIP_CX;
358 /* Change modrm from 00 000 101 to 00 000 001. */
359 *cursor = 0x1;
360 } else {
361 /* Use %rax (register #0) for the scratch register. */
362 auprobe->fixups = UPROBE_FIX_RIP_AX;
363 /* Change modrm from 00 xxx 101 to 00 xxx 000 */
364 *cursor = (reg << 3);
365 }
366
367 /* Target address = address of next instruction + (signed) offset */
368 auprobe->rip_rela_target_address = (long)insn->length + insn->displacement.value;
369
370 /* Displacement field is gone; slide immediate field (if any) over. */
371 if (insn->immediate.nbytes) {
372 cursor++;
373 memmove(cursor, cursor + insn->displacement.nbytes, insn->immediate.nbytes);
374 }
375 return;
376}
377
378static int validate_insn_64bits(struct arch_uprobe *auprobe, struct insn *insn)
379{
380 insn_init(insn, auprobe->insn, true);
381
382 /* Skip good instruction prefixes; reject "bad" ones. */
383 insn_get_opcode(insn);
384 if (is_prefix_bad(insn))
385 return -ENOTSUPP;
386
387 if (test_bit(OPCODE1(insn), (unsigned long *)good_insns_64))
388 return 0;
389
390 if (insn->opcode.nbytes == 2) {
391 if (test_bit(OPCODE2(insn), (unsigned long *)good_2byte_insns))
392 return 0;
393 }
394 return -ENOTSUPP;
395}
396
397static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
398{
399 if (mm->context.ia32_compat)
400 return validate_insn_32bits(auprobe, insn);
401 return validate_insn_64bits(auprobe, insn);
402}
403#else /* 32-bit: */
404static void handle_riprel_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
405{
406 /* No RIP-relative addressing on 32-bit */
407}
408
409static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, struct insn *insn)
410{
411 return validate_insn_32bits(auprobe, insn);
412}
413#endif /* CONFIG_X86_64 */
414
415/**
416 * arch_uprobe_analyze_insn - instruction analysis including validity and fixups.
417 * @mm: the probed address space.
418 * @arch_uprobe: the probepoint information.
419 * @addr: virtual address at which to install the probepoint
420 * Return 0 on success or a -ve number on error.
421 */
422int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr)
423{
424 int ret;
425 struct insn insn;
426
427 auprobe->fixups = 0;
428 ret = validate_insn_bits(auprobe, mm, &insn);
429 if (ret != 0)
430 return ret;
431
432 handle_riprel_insn(auprobe, mm, &insn);
433 prepare_fixups(auprobe, &insn);
434
435 return 0;
436}
437
438#ifdef CONFIG_X86_64
439/*
440 * If we're emulating a rip-relative instruction, save the contents
441 * of the scratch register and store the target address in that register.
442 */
443static void
444pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs,
445 struct arch_uprobe_task *autask)
446{
447 if (auprobe->fixups & UPROBE_FIX_RIP_AX) {
448 autask->saved_scratch_register = regs->ax;
449 regs->ax = current->utask->vaddr;
450 regs->ax += auprobe->rip_rela_target_address;
451 } else if (auprobe->fixups & UPROBE_FIX_RIP_CX) {
452 autask->saved_scratch_register = regs->cx;
453 regs->cx = current->utask->vaddr;
454 regs->cx += auprobe->rip_rela_target_address;
455 }
456}
457#else
458static void
459pre_xol_rip_insn(struct arch_uprobe *auprobe, struct pt_regs *regs,
460 struct arch_uprobe_task *autask)
461{
462 /* No RIP-relative addressing on 32-bit */
463}
464#endif
465
466/*
467 * arch_uprobe_pre_xol - prepare to execute out of line.
468 * @auprobe: the probepoint information.
469 * @regs: reflects the saved user state of current task.
470 */
471int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
472{
473 struct arch_uprobe_task *autask;
474
475 autask = &current->utask->autask;
476 autask->saved_trap_nr = current->thread.trap_nr;
477 current->thread.trap_nr = UPROBE_TRAP_NR;
478 regs->ip = current->utask->xol_vaddr;
479 pre_xol_rip_insn(auprobe, regs, autask);
480
481 autask->saved_tf = !!(regs->flags & X86_EFLAGS_TF);
482 regs->flags |= X86_EFLAGS_TF;
483 if (test_tsk_thread_flag(current, TIF_BLOCKSTEP))
484 set_task_blockstep(current, false);
485
486 return 0;
487}
488
489/*
490 * This function is called by arch_uprobe_post_xol() to adjust the return
491 * address pushed by a call instruction executed out of line.
492 */
493static int adjust_ret_addr(unsigned long sp, long correction)
494{
495 int rasize, ncopied;
496 long ra = 0;
497
498 if (is_ia32_task())
499 rasize = 4;
500 else
501 rasize = 8;
502
503 ncopied = copy_from_user(&ra, (void __user *)sp, rasize);
504 if (unlikely(ncopied))
505 return -EFAULT;
506
507 ra += correction;
508 ncopied = copy_to_user((void __user *)sp, &ra, rasize);
509 if (unlikely(ncopied))
510 return -EFAULT;
511
512 return 0;
513}
514
515#ifdef CONFIG_X86_64
516static bool is_riprel_insn(struct arch_uprobe *auprobe)
517{
518 return ((auprobe->fixups & (UPROBE_FIX_RIP_AX | UPROBE_FIX_RIP_CX)) != 0);
519}
520
521static void
522handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction)
523{
524 if (is_riprel_insn(auprobe)) {
525 struct arch_uprobe_task *autask;
526
527 autask = &current->utask->autask;
528 if (auprobe->fixups & UPROBE_FIX_RIP_AX)
529 regs->ax = autask->saved_scratch_register;
530 else
531 regs->cx = autask->saved_scratch_register;
532
533 /*
534 * The original instruction includes a displacement, and so
535 * is 4 bytes longer than what we've just single-stepped.
536 * Fall through to handle stuff like "jmpq *...(%rip)" and
537 * "callq *...(%rip)".
538 */
539 if (correction)
540 *correction += 4;
541 }
542}
543#else
544static void
545handle_riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs, long *correction)
546{
547 /* No RIP-relative addressing on 32-bit */
548}
549#endif
550
551/*
552 * If xol insn itself traps and generates a signal(Say,
553 * SIGILL/SIGSEGV/etc), then detect the case where a singlestepped
554 * instruction jumps back to its own address. It is assumed that anything
555 * like do_page_fault/do_trap/etc sets thread.trap_nr != -1.
556 *
557 * arch_uprobe_pre_xol/arch_uprobe_post_xol save/restore thread.trap_nr,
558 * arch_uprobe_xol_was_trapped() simply checks that ->trap_nr is not equal to
559 * UPROBE_TRAP_NR == -1 set by arch_uprobe_pre_xol().
560 */
561bool arch_uprobe_xol_was_trapped(struct task_struct *t)
562{
563 if (t->thread.trap_nr != UPROBE_TRAP_NR)
564 return true;
565
566 return false;
567}
568
569/*
570 * Called after single-stepping. To avoid the SMP problems that can
571 * occur when we temporarily put back the original opcode to
572 * single-step, we single-stepped a copy of the instruction.
573 *
574 * This function prepares to resume execution after the single-step.
575 * We have to fix things up as follows:
576 *
577 * Typically, the new ip is relative to the copied instruction. We need
578 * to make it relative to the original instruction (FIX_IP). Exceptions
579 * are return instructions and absolute or indirect jump or call instructions.
580 *
581 * If the single-stepped instruction was a call, the return address that
582 * is atop the stack is the address following the copied instruction. We
583 * need to make it the address following the original instruction (FIX_CALL).
584 *
585 * If the original instruction was a rip-relative instruction such as
586 * "movl %edx,0xnnnn(%rip)", we have instead executed an equivalent
587 * instruction using a scratch register -- e.g., "movl %edx,(%rax)".
588 * We need to restore the contents of the scratch register and adjust
589 * the ip, keeping in mind that the instruction we executed is 4 bytes
590 * shorter than the original instruction (since we squeezed out the offset
591 * field). (FIX_RIP_AX or FIX_RIP_CX)
592 */
593int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
594{
595 struct uprobe_task *utask;
596 long correction;
597 int result = 0;
598
599 WARN_ON_ONCE(current->thread.trap_nr != UPROBE_TRAP_NR);
600
601 utask = current->utask;
602 current->thread.trap_nr = utask->autask.saved_trap_nr;
603 correction = (long)(utask->vaddr - utask->xol_vaddr);
604 handle_riprel_post_xol(auprobe, regs, &correction);
605 if (auprobe->fixups & UPROBE_FIX_IP)
606 regs->ip += correction;
607
608 if (auprobe->fixups & UPROBE_FIX_CALL)
609 result = adjust_ret_addr(regs->sp, correction);
610
611 /*
612 * arch_uprobe_pre_xol() doesn't save the state of TIF_BLOCKSTEP
613 * so we can get an extra SIGTRAP if we do not clear TF. We need
614 * to examine the opcode to make it right.
615 */
616 if (utask->autask.saved_tf)
617 send_sig(SIGTRAP, current, 0);
618 else if (!(auprobe->fixups & UPROBE_FIX_SETF))
619 regs->flags &= ~X86_EFLAGS_TF;
620
621 return result;
622}
623
624/* callback routine for handling exceptions. */
625int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data)
626{
627 struct die_args *args = data;
628 struct pt_regs *regs = args->regs;
629 int ret = NOTIFY_DONE;
630
631 /* We are only interested in userspace traps */
632 if (regs && !user_mode_vm(regs))
633 return NOTIFY_DONE;
634
635 switch (val) {
636 case DIE_INT3:
637 if (uprobe_pre_sstep_notifier(regs))
638 ret = NOTIFY_STOP;
639
640 break;
641
642 case DIE_DEBUG:
643 if (uprobe_post_sstep_notifier(regs))
644 ret = NOTIFY_STOP;
645
646 default:
647 break;
648 }
649
650 return ret;
651}
652
653/*
654 * This function gets called when XOL instruction either gets trapped or
655 * the thread has a fatal signal, so reset the instruction pointer to its
656 * probed address.
657 */
658void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
659{
660 struct uprobe_task *utask = current->utask;
661
662 current->thread.trap_nr = utask->autask.saved_trap_nr;
663 handle_riprel_post_xol(auprobe, regs, NULL);
664 instruction_pointer_set(regs, utask->vaddr);
665
666 /* clear TF if it was set by us in arch_uprobe_pre_xol() */
667 if (!utask->autask.saved_tf)
668 regs->flags &= ~X86_EFLAGS_TF;
669}
670
671/*
672 * Skip these instructions as per the currently known x86 ISA.
673 * rep=0x66*; nop=0x90
674 */
675static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
676{
677 int i;
678
679 for (i = 0; i < MAX_UINSN_BYTES; i++) {
680 if (auprobe->insn[i] == 0x66)
681 continue;
682
683 if (auprobe->insn[i] == 0x90)
684 return true;
685
686 break;
687 }
688 return false;
689}
690
691bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
692{
693 bool ret = __skip_sstep(auprobe, regs);
694 if (ret && (regs->flags & X86_EFLAGS_TF))
695 send_sig(SIGTRAP, current, 0);
696 return ret;
697}
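
The deleted uprobes code above validates the first opcode byte against 256-bit "good instruction" tables built with the W() macro and queried through test_bit(). Below is a standalone sketch of the same lookup using plain C bit operations instead of the kernel helpers; the opcodes marked here are arbitrary examples, not a faithful copy of the tables.

#include <stdio.h>
#include <stdint.h>

static uint32_t good_insns[256 / 32];   /* one bit per first opcode byte */

static void mark_good(uint8_t opcode)
{
        good_insns[opcode / 32] |= 1u << (opcode % 32);
}

static int is_good(uint8_t opcode)
{
        return (good_insns[opcode / 32] >> (opcode % 32)) & 1;
}

int main(void)
{
        mark_good(0x90);        /* nop */
        mark_good(0xe8);        /* call rel32 */

        /* int3 (0xcc) was never marked, so it is rejected */
        printf("0x90 -> %d, 0xcc -> %d\n", is_good(0x90), is_good(0xcc));
        return 0;
}
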
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 1dfe69cc78a..863f8753ab0 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -28,8 +28,6 @@
28 * 28 *
29 */ 29 */
30 30
31#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
32
33#include <linux/capability.h> 31#include <linux/capability.h>
34#include <linux/errno.h> 32#include <linux/errno.h>
35#include <linux/interrupt.h> 33#include <linux/interrupt.h>
@@ -139,14 +137,14 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
139 local_irq_enable(); 137 local_irq_enable();
140 138
141 if (!current->thread.vm86_info) { 139 if (!current->thread.vm86_info) {
142 pr_alert("no vm86_info: BAD\n"); 140 printk("no vm86_info: BAD\n");
143 do_exit(SIGSEGV); 141 do_exit(SIGSEGV);
144 } 142 }
145 set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | current->thread.v86mask); 143 set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | current->thread.v86mask);
146 tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs, regs); 144 tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs, regs);
147 tmp += put_user(current->thread.screen_bitmap, &current->thread.vm86_info->screen_bitmap); 145 tmp += put_user(current->thread.screen_bitmap, &current->thread.vm86_info->screen_bitmap);
148 if (tmp) { 146 if (tmp) {
149 pr_alert("could not access userspace vm86_info\n"); 147 printk("vm86: could not access userspace vm86_info\n");
150 do_exit(SIGSEGV); 148 do_exit(SIGSEGV);
151 } 149 }
152 150
@@ -174,7 +172,6 @@ static void mark_screen_rdonly(struct mm_struct *mm)
174 spinlock_t *ptl; 172 spinlock_t *ptl;
175 int i; 173 int i;
176 174
177 down_write(&mm->mmap_sem);
178 pgd = pgd_offset(mm, 0xA0000); 175 pgd = pgd_offset(mm, 0xA0000);
179 if (pgd_none_or_clear_bad(pgd)) 176 if (pgd_none_or_clear_bad(pgd))
180 goto out; 177 goto out;
@@ -182,7 +179,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
182 if (pud_none_or_clear_bad(pud)) 179 if (pud_none_or_clear_bad(pud))
183 goto out; 180 goto out;
184 pmd = pmd_offset(pud, 0xA0000); 181 pmd = pmd_offset(pud, 0xA0000);
185 split_huge_page_pmd_mm(mm, 0xA0000, pmd); 182 split_huge_page_pmd(mm, pmd);
186 if (pmd_none_or_clear_bad(pmd)) 183 if (pmd_none_or_clear_bad(pmd))
187 goto out; 184 goto out;
188 pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); 185 pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
@@ -193,7 +190,6 @@ static void mark_screen_rdonly(struct mm_struct *mm)
193 } 190 }
194 pte_unmap_unlock(pte, ptl); 191 pte_unmap_unlock(pte, ptl);
195out: 192out:
196 up_write(&mm->mmap_sem);
197 flush_tlb(); 193 flush_tlb();
198} 194}
199 195
@@ -339,11 +335,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
339 if (info->flags & VM86_SCREEN_BITMAP) 335 if (info->flags & VM86_SCREEN_BITMAP)
340 mark_screen_rdonly(tsk->mm); 336 mark_screen_rdonly(tsk->mm);
341 337
342 /*call __audit_syscall_exit since we do not exit via the normal paths */ 338 /*call audit_syscall_exit since we do not exit via the normal paths */
343#ifdef CONFIG_AUDITSYSCALL
344 if (unlikely(current->audit_context)) 339 if (unlikely(current->audit_context))
345 __audit_syscall_exit(1, 0); 340 audit_syscall_exit(AUDITSC_RESULT(0), 0);
346#endif
347 341
348 __asm__ __volatile__( 342 __asm__ __volatile__(
349 "movl %0,%%esp\n\t" 343 "movl %0,%%esp\n\t"
@@ -561,9 +555,9 @@ int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno)
561 if ((trapno == 3) || (trapno == 1)) { 555 if ((trapno == 3) || (trapno == 1)) {
562 KVM86->regs32->ax = VM86_TRAP + (trapno << 8); 556 KVM86->regs32->ax = VM86_TRAP + (trapno << 8);
563 /* setting this flag forces the code in entry_32.S to 557 /* setting this flag forces the code in entry_32.S to
564 the path where we call save_v86_state() and change 558 call save_v86_state() and change the stack pointer
565 the stack pointer to KVM86->regs32 */ 559 to KVM86->regs32 */
566 set_thread_flag(TIF_NOTIFY_RESUME); 560 set_thread_flag(TIF_IRET);
567 return 0; 561 return 0;
568 } 562 }
569 do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs)); 563 do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs));
@@ -571,7 +565,7 @@ int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno)
571 } 565 }
572 if (trapno != 1) 566 if (trapno != 1)
573 return 1; /* we let this handle by the calling routine */ 567 return 1; /* we let this handle by the calling routine */
574 current->thread.trap_nr = trapno; 568 current->thread.trap_no = trapno;
575 current->thread.error_code = error_code; 569 current->thread.error_code = error_code;
576 force_sig(SIGTRAP, current); 570 force_sig(SIGTRAP, current);
577 return 0; 571 return 0;
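
A side note on the handle_vm86_trap() hunk above: the value stored in regs32->ax packs a reason code in the low byte and the trap number in bits 8..15, which userspace later unpacks. A tiny worked example, assuming the usual VM86_TRAP/VM86_TYPE/VM86_ARG definitions from <asm/vm86.h>:

#include <stdio.h>

#define VM86_TRAP      6                /* assumed reason code */
#define VM86_TYPE(r)   ((r) & 0xff)
#define VM86_ARG(r)    ((r) >> 8)

int main(void)
{
        int trapno = 3;                          /* int3 hit in vm86 mode */
        int retval = VM86_TRAP + (trapno << 8);  /* as in the hunk above */

        printf("type=%d arg=%d\n", VM86_TYPE(retval), VM86_ARG(retval));
        return 0;
}
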
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 22a1530146a..0f703f10901 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -197,6 +197,18 @@ SECTIONS
197 197
198 INIT_DATA_SECTION(16) 198 INIT_DATA_SECTION(16)
199 199
200 /*
201 * Code and data for a variety of lowlevel trampolines, to be
202 * copied into base memory (< 1 MiB) during initialization.
203 * Since it is copied early, the main copy can be discarded
204 * afterwards.
205 */
206 .x86_trampoline : AT(ADDR(.x86_trampoline) - LOAD_OFFSET) {
207 x86_trampoline_start = .;
208 *(.x86_trampoline)
209 x86_trampoline_end = .;
210 }
211
200 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { 212 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
201 __x86_cpu_dev_start = .; 213 __x86_cpu_dev_start = .;
202 *(.x86_cpu_dev.init) 214 *(.x86_cpu_dev.init)
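
The .x86_trampoline section added above exposes x86_trampoline_start/_end so early boot code can copy the whole range into low memory and then discard the original. The same start/stop-symbol idiom can be reproduced in user space, which may make the intent clearer; the section name "blob" and the payload below are invented for the demonstration, relying on GNU ld synthesizing __start_*/__stop_* symbols for identifier-named sections.

#include <stdio.h>
#include <string.h>

__attribute__((used, section("blob"))) static const char piece1[] = "first ";
__attribute__((used, section("blob"))) static const char piece2[] = "second";

extern const char __start_blob[], __stop_blob[];    /* provided by the linker */

int main(void)
{
        size_t size = (size_t)(__stop_blob - __start_blob);
        char copy[64] = { 0 };

        /* treat [start, stop) as one blob to relocate, as the kernel does */
        memcpy(copy, __start_blob, size < sizeof(copy) ? size : sizeof(copy) - 1);
        printf("copied %zu bytes out of the section\n", size);
        return 0;
}
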
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index 992f890283e..a1d804bcd48 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -15,8 +15,6 @@
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/pci_ids.h> 16#include <linux/pci_ids.h>
17#include <linux/pci_regs.h> 17#include <linux/pci_regs.h>
18#include <linux/smp.h>
19#include <linux/irq.h>
20 18
21#include <asm/apic.h> 19#include <asm/apic.h>
22#include <asm/pci-direct.h> 20#include <asm/pci-direct.h>
@@ -24,8 +22,6 @@
24#include <asm/paravirt.h> 22#include <asm/paravirt.h>
25#include <asm/setup.h> 23#include <asm/setup.h>
26 24
27#define TOPOLOGY_REGISTER_OFFSET 0x10
28
29#if defined CONFIG_PCI && defined CONFIG_PARAVIRT 25#if defined CONFIG_PCI && defined CONFIG_PARAVIRT
30/* 26/*
31 * Interrupt control on vSMPowered systems: 27 * Interrupt control on vSMPowered systems:
@@ -96,18 +92,6 @@ static void __init set_vsmp_pv_ops(void)
96 ctl = readl(address + 4); 92 ctl = readl(address + 4);
97 printk(KERN_INFO "vSMP CTL: capabilities:0x%08x control:0x%08x\n", 93 printk(KERN_INFO "vSMP CTL: capabilities:0x%08x control:0x%08x\n",
98 cap, ctl); 94 cap, ctl);
99
100 /* If possible, let the vSMP foundation route the interrupt optimally */
101#ifdef CONFIG_SMP
102 if (cap & ctl & BIT(8)) {
103 ctl &= ~BIT(8);
104#ifdef CONFIG_PROC_FS
105 /* Don't let users change irq affinity via procfs */
106 no_irq_affinity = 1;
107#endif
108 }
109#endif
110
111 if (cap & ctl & (1 << 4)) { 95 if (cap & ctl & (1 << 4)) {
112 /* Setup irq ops and turn on vSMP IRQ fastpath handling */ 96 /* Setup irq ops and turn on vSMP IRQ fastpath handling */
113 pv_irq_ops.irq_disable = PV_CALLEE_SAVE(vsmp_irq_disable); 97 pv_irq_ops.irq_disable = PV_CALLEE_SAVE(vsmp_irq_disable);
@@ -115,11 +99,12 @@ static void __init set_vsmp_pv_ops(void)
115 pv_irq_ops.save_fl = PV_CALLEE_SAVE(vsmp_save_fl); 99 pv_irq_ops.save_fl = PV_CALLEE_SAVE(vsmp_save_fl);
116 pv_irq_ops.restore_fl = PV_CALLEE_SAVE(vsmp_restore_fl); 100 pv_irq_ops.restore_fl = PV_CALLEE_SAVE(vsmp_restore_fl);
117 pv_init_ops.patch = vsmp_patch; 101 pv_init_ops.patch = vsmp_patch;
102
118 ctl &= ~(1 << 4); 103 ctl &= ~(1 << 4);
104 writel(ctl, address + 4);
105 ctl = readl(address + 4);
106 printk(KERN_INFO "vSMP CTL: control set to:0x%08x\n", ctl);
119 } 107 }
120 writel(ctl, address + 4);
121 ctl = readl(address + 4);
122 pr_info("vSMP CTL: control set to:0x%08x\n", ctl);
123 108
124 early_iounmap(address, 8); 109 early_iounmap(address, 8);
125} 110}
@@ -164,73 +149,12 @@ int is_vsmp_box(void)
164 return 0; 149 return 0;
165} 150}
166#endif 151#endif
167
168static void __init vsmp_cap_cpus(void)
169{
170#if !defined(CONFIG_X86_VSMP) && defined(CONFIG_SMP)
171 void __iomem *address;
172 unsigned int cfg, topology, node_shift, maxcpus;
173
174 /*
175 * CONFIG_X86_VSMP is not configured, so limit the number CPUs to the
176 * ones present in the first board, unless explicitly overridden by
177 * setup_max_cpus
178 */
179 if (setup_max_cpus != NR_CPUS)
180 return;
181
182 /* Read the vSMP Foundation topology register */
183 cfg = read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0);
184 address = early_ioremap(cfg + TOPOLOGY_REGISTER_OFFSET, 4);
185 if (WARN_ON(!address))
186 return;
187
188 topology = readl(address);
189 node_shift = (topology >> 16) & 0x7;
190 if (!node_shift)
191 /* The value 0 should be decoded as 8 */
192 node_shift = 8;
193 maxcpus = (topology & ((1 << node_shift) - 1)) + 1;
194
195 pr_info("vSMP CTL: Capping CPUs to %d (CONFIG_X86_VSMP is unset)\n",
196 maxcpus);
197 setup_max_cpus = maxcpus;
198 early_iounmap(address, 4);
199#endif
200}
201
202static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
203{
204 return hard_smp_processor_id() >> index_msb;
205}
206
207/*
208 * In vSMP, all cpus should be capable of handling interrupts, regardless of
209 * the APIC used.
210 */
211static void fill_vector_allocation_domain(int cpu, struct cpumask *retmask,
212 const struct cpumask *mask)
213{
214 cpumask_setall(retmask);
215}
216
217static void vsmp_apic_post_init(void)
218{
219 /* need to update phys_pkg_id */
220 apic->phys_pkg_id = apicid_phys_pkg_id;
221 apic->vector_allocation_domain = fill_vector_allocation_domain;
222}
223
224void __init vsmp_init(void) 152void __init vsmp_init(void)
225{ 153{
226 detect_vsmp_box(); 154 detect_vsmp_box();
227 if (!is_vsmp_box()) 155 if (!is_vsmp_box())
228 return; 156 return;
229 157
230 x86_platform.apic_post_init = vsmp_apic_post_init;
231
232 vsmp_cap_cpus();
233
234 set_vsmp_pv_ops(); 158 set_vsmp_pv_ops();
235 return; 159 return;
236} 160}
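
A worked example of the topology decoding in the vsmp_cap_cpus() function removed above: bits 18..16 of the register give node_shift (with 0 decoding as 8), and the low node_shift bits give the CPU count of the first board minus one. The register value here is invented purely to show the arithmetic.

#include <stdio.h>

int main(void)
{
        unsigned int topology = 0x0003000f;     /* hypothetical readl() result */
        unsigned int node_shift = (topology >> 16) & 0x7;
        unsigned int maxcpus;

        if (!node_shift)
                node_shift = 8;                 /* the value 0 decodes as 8 */
        maxcpus = (topology & ((1 << node_shift) - 1)) + 1;

        printf("node_shift=%u maxcpus=%u\n", node_shift, maxcpus);
        return 0;
}

With 0x0003000f the shift comes out as 3 and the cap as 8 CPUs.
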
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 9a907a67be8..b56c65de384 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -18,8 +18,6 @@
18 * use the vDSO. 18 * use the vDSO.
19 */ 19 */
20 20
21#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
22
23#include <linux/time.h> 21#include <linux/time.h>
24#include <linux/init.h> 22#include <linux/init.h>
25#include <linux/kernel.h> 23#include <linux/kernel.h>
@@ -27,8 +25,7 @@
27#include <linux/seqlock.h> 25#include <linux/seqlock.h>
28#include <linux/jiffies.h> 26#include <linux/jiffies.h>
29#include <linux/sysctl.h> 27#include <linux/sysctl.h>
30#include <linux/topology.h> 28#include <linux/clocksource.h>
31#include <linux/timekeeper_internal.h>
32#include <linux/getcpu.h> 29#include <linux/getcpu.h>
33#include <linux/cpu.h> 30#include <linux/cpu.h>
34#include <linux/smp.h> 31#include <linux/smp.h>
@@ -54,9 +51,12 @@
54#include "vsyscall_trace.h" 51#include "vsyscall_trace.h"
55 52
56DEFINE_VVAR(int, vgetcpu_mode); 53DEFINE_VVAR(int, vgetcpu_mode);
57DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data); 54DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
55{
56 .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock),
57};
58 58
59static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; 59static enum { EMULATE, NATIVE, NONE } vsyscall_mode = NATIVE;
60 60
61static int __init vsyscall_setup(char *str) 61static int __init vsyscall_setup(char *str)
62{ 62{
@@ -79,56 +79,50 @@ early_param("vsyscall", vsyscall_setup);
79 79
80void update_vsyscall_tz(void) 80void update_vsyscall_tz(void)
81{ 81{
82 unsigned long flags;
83
84 write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
85 /* sys_tz has changed */
82 vsyscall_gtod_data.sys_tz = sys_tz; 86 vsyscall_gtod_data.sys_tz = sys_tz;
87 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
83} 88}
84 89
85void update_vsyscall(struct timekeeper *tk) 90void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
91 struct clocksource *clock, u32 mult)
86{ 92{
87 struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data; 93 unsigned long flags;
88 94
89 write_seqcount_begin(&vdata->seq); 95 write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
90 96
91 /* copy vsyscall data */ 97 /* copy vsyscall data */
92 vdata->clock.vclock_mode = tk->clock->archdata.vclock_mode; 98 vsyscall_gtod_data.clock.vclock_mode = clock->archdata.vclock_mode;
93 vdata->clock.cycle_last = tk->clock->cycle_last; 99 vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
94 vdata->clock.mask = tk->clock->mask; 100 vsyscall_gtod_data.clock.mask = clock->mask;
95 vdata->clock.mult = tk->mult; 101 vsyscall_gtod_data.clock.mult = mult;
96 vdata->clock.shift = tk->shift; 102 vsyscall_gtod_data.clock.shift = clock->shift;
97 103 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
98 vdata->wall_time_sec = tk->xtime_sec; 104 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
99 vdata->wall_time_snsec = tk->xtime_nsec; 105 vsyscall_gtod_data.wall_to_monotonic = *wtm;
100 106 vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
101 vdata->monotonic_time_sec = tk->xtime_sec 107
102 + tk->wall_to_monotonic.tv_sec; 108 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
103 vdata->monotonic_time_snsec = tk->xtime_nsec
104 + (tk->wall_to_monotonic.tv_nsec
105 << tk->shift);
106 while (vdata->monotonic_time_snsec >=
107 (((u64)NSEC_PER_SEC) << tk->shift)) {
108 vdata->monotonic_time_snsec -=
109 ((u64)NSEC_PER_SEC) << tk->shift;
110 vdata->monotonic_time_sec++;
111 }
112
113 vdata->wall_time_coarse.tv_sec = tk->xtime_sec;
114 vdata->wall_time_coarse.tv_nsec = (long)(tk->xtime_nsec >> tk->shift);
115
116 vdata->monotonic_time_coarse = timespec_add(vdata->wall_time_coarse,
117 tk->wall_to_monotonic);
118
119 write_seqcount_end(&vdata->seq);
120} 109}
121 110
122static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, 111static void warn_bad_vsyscall(const char *level, struct pt_regs *regs,
123 const char *message) 112 const char *message)
124{ 113{
125 if (!show_unhandled_signals) 114 static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST);
115 struct task_struct *tsk;
116
117 if (!show_unhandled_signals || !__ratelimit(&rs))
126 return; 118 return;
127 119
128 pr_notice_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", 120 tsk = current;
129 level, current->comm, task_pid_nr(current), 121
130 message, regs->ip, regs->cs, 122 printk("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n",
131 regs->sp, regs->ax, regs->si, regs->di); 123 level, tsk->comm, task_pid_nr(tsk),
124 message, regs->ip, regs->cs,
125 regs->sp, regs->ax, regs->si, regs->di);
132} 126}
133 127
134static int addr_to_vsyscall_nr(unsigned long addr) 128static int addr_to_vsyscall_nr(unsigned long addr)
@@ -145,40 +139,11 @@ static int addr_to_vsyscall_nr(unsigned long addr)
145 return nr; 139 return nr;
146} 140}
147 141
148static bool write_ok_or_segv(unsigned long ptr, size_t size)
149{
150 /*
151 * XXX: if access_ok, get_user, and put_user handled
152 * sig_on_uaccess_error, this could go away.
153 */
154
155 if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) {
156 siginfo_t info;
157 struct thread_struct *thread = &current->thread;
158
159 thread->error_code = 6; /* user fault, no page, write */
160 thread->cr2 = ptr;
161 thread->trap_nr = X86_TRAP_PF;
162
163 memset(&info, 0, sizeof(info));
164 info.si_signo = SIGSEGV;
165 info.si_errno = 0;
166 info.si_code = SEGV_MAPERR;
167 info.si_addr = (void __user *)ptr;
168
169 force_sig_info(SIGSEGV, &info, current);
170 return false;
171 } else {
172 return true;
173 }
174}
175
176bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) 142bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
177{ 143{
178 struct task_struct *tsk; 144 struct task_struct *tsk;
179 unsigned long caller; 145 unsigned long caller;
180 int vsyscall_nr, syscall_nr, tmp; 146 int vsyscall_nr;
181 int prev_sig_on_uaccess_error;
182 long ret; 147 long ret;
183 148
184 /* 149 /*
@@ -211,72 +176,9 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
211 } 176 }
212 177
213 tsk = current; 178 tsk = current;
179 if (seccomp_mode(&tsk->seccomp))
180 do_exit(SIGKILL);
214 181
215 /*
216 * Check for access_ok violations and find the syscall nr.
217 *
218 * NULL is a valid user pointer (in the access_ok sense) on 32-bit and
219 * 64-bit, so we don't need to special-case it here. For all the
220 * vsyscalls, NULL means "don't write anything" not "write it at
221 * address 0".
222 */
223 switch (vsyscall_nr) {
224 case 0:
225 if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) ||
226 !write_ok_or_segv(regs->si, sizeof(struct timezone))) {
227 ret = -EFAULT;
228 goto check_fault;
229 }
230
231 syscall_nr = __NR_gettimeofday;
232 break;
233
234 case 1:
235 if (!write_ok_or_segv(regs->di, sizeof(time_t))) {
236 ret = -EFAULT;
237 goto check_fault;
238 }
239
240 syscall_nr = __NR_time;
241 break;
242
243 case 2:
244 if (!write_ok_or_segv(regs->di, sizeof(unsigned)) ||
245 !write_ok_or_segv(regs->si, sizeof(unsigned))) {
246 ret = -EFAULT;
247 goto check_fault;
248 }
249
250 syscall_nr = __NR_getcpu;
251 break;
252 }
253
254 /*
255 * Handle seccomp. regs->ip must be the original value.
256 * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt.
257 *
258 * We could optimize the seccomp disabled case, but performance
259 * here doesn't matter.
260 */
261 regs->orig_ax = syscall_nr;
262 regs->ax = -ENOSYS;
263 tmp = secure_computing(syscall_nr);
264 if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) {
265 warn_bad_vsyscall(KERN_DEBUG, regs,
266 "seccomp tried to change syscall nr or ip");
267 do_exit(SIGSYS);
268 }
269 if (tmp)
270 goto do_ret; /* skip requested */
271
272 /*
273 * With a real vsyscall, page faults cause SIGSEGV. We want to
274 * preserve that behavior to make writing exploits harder.
275 */
276 prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error;
277 current_thread_info()->sig_on_uaccess_error = 1;
278
279 ret = -EFAULT;
280 switch (vsyscall_nr) { 182 switch (vsyscall_nr) {
281 case 0: 183 case 0:
282 ret = sys_gettimeofday( 184 ret = sys_gettimeofday(
@@ -291,35 +193,29 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
291 case 2: 193 case 2:
292 ret = sys_getcpu((unsigned __user *)regs->di, 194 ret = sys_getcpu((unsigned __user *)regs->di,
293 (unsigned __user *)regs->si, 195 (unsigned __user *)regs->si,
294 NULL); 196 0);
295 break; 197 break;
296 } 198 }
297 199
298 current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error;
299
300check_fault:
301 if (ret == -EFAULT) { 200 if (ret == -EFAULT) {
302 /* Bad news -- userspace fed a bad pointer to a vsyscall. */
303 warn_bad_vsyscall(KERN_INFO, regs,
304 "vsyscall fault (exploit attempt?)");
305
306 /* 201 /*
307 * If we failed to generate a signal for any reason, 202 * Bad news -- userspace fed a bad pointer to a vsyscall.
308 * generate one here. (This should be impossible.) 203 *
204 * With a real vsyscall, that would have caused SIGSEGV.
205 * To make writing reliable exploits using the emulated
206 * vsyscalls harder, generate SIGSEGV here as well.
309 */ 207 */
310 if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) && 208 warn_bad_vsyscall(KERN_INFO, regs,
311 !sigismember(&tsk->pending.signal, SIGSEGV))) 209 "vsyscall fault (exploit attempt?)");
312 goto sigsegv; 210 goto sigsegv;
313
314 return true; /* Don't emulate the ret. */
315 } 211 }
316 212
317 regs->ax = ret; 213 regs->ax = ret;
318 214
319do_ret:
320 /* Emulate a ret instruction. */ 215 /* Emulate a ret instruction. */
321 regs->ip = caller; 216 regs->ip = caller;
322 regs->sp += 8; 217 regs->sp += 8;
218
323 return true; 219 return true;
324 220
325sigsegv: 221sigsegv:
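
One detail of the timekeeper-based update_vsyscall() removed above that is easy to miss: monotonic_time_snsec holds nanoseconds pre-scaled by 2^shift, and the while loop carries whole seconds out of that field. A small worked example of a single carry step, with an invented shift and starting value:

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
        unsigned int shift = 8;                         /* example clocksource shift */
        uint64_t snsec = 1300000000ULL << shift;        /* 1.3 s worth, pre-scaled */
        uint64_t sec = 41;

        while (snsec >= (NSEC_PER_SEC << shift)) {      /* carry whole seconds out */
                snsec -= NSEC_PER_SEC << shift;
                sec++;
        }

        printf("sec=%llu, leftover nsec=%llu\n",
               (unsigned long long)sec,
               (unsigned long long)(snsec >> shift));
        return 0;
}
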
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 1330dd10295..9796c2f3d07 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -13,13 +13,9 @@
13#include <asm/ftrace.h> 13#include <asm/ftrace.h>
14 14
15#ifdef CONFIG_FUNCTION_TRACER 15#ifdef CONFIG_FUNCTION_TRACER
16/* mcount and __fentry__ are defined in assembly */ 16/* mcount is defined in assembly */
17#ifdef CC_USING_FENTRY
18EXPORT_SYMBOL(__fentry__);
19#else
20EXPORT_SYMBOL(mcount); 17EXPORT_SYMBOL(mcount);
21#endif 18#endif
22#endif
23 19
24EXPORT_SYMBOL(__get_user_1); 20EXPORT_SYMBOL(__get_user_1);
25EXPORT_SYMBOL(__get_user_2); 21EXPORT_SYMBOL(__get_user_2);
@@ -32,7 +28,6 @@ EXPORT_SYMBOL(__put_user_8);
32 28
33EXPORT_SYMBOL(copy_user_generic_string); 29EXPORT_SYMBOL(copy_user_generic_string);
34EXPORT_SYMBOL(copy_user_generic_unrolled); 30EXPORT_SYMBOL(copy_user_generic_unrolled);
35EXPORT_SYMBOL(copy_user_enhanced_fast_string);
36EXPORT_SYMBOL(__copy_user_nocache); 31EXPORT_SYMBOL(__copy_user_nocache);
37EXPORT_SYMBOL(_copy_from_user); 32EXPORT_SYMBOL(_copy_from_user);
38EXPORT_SYMBOL(_copy_to_user); 33EXPORT_SYMBOL(_copy_to_user);
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 7a3d075a814..6f164bd5e14 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -18,14 +18,13 @@
18#include <asm/e820.h> 18#include <asm/e820.h>
19#include <asm/time.h> 19#include <asm/time.h>
20#include <asm/irq.h> 20#include <asm/irq.h>
21#include <asm/io_apic.h>
22#include <asm/pat.h> 21#include <asm/pat.h>
23#include <asm/tsc.h> 22#include <asm/tsc.h>
24#include <asm/iommu.h> 23#include <asm/iommu.h>
25#include <asm/mach_traps.h>
26 24
27void __cpuinit x86_init_noop(void) { } 25void __cpuinit x86_init_noop(void) { }
28void __init x86_init_uint_noop(unsigned int unused) { } 26void __init x86_init_uint_noop(unsigned int unused) { }
27void __init x86_init_pgd_noop(pgd_t *unused) { }
29int __init iommu_init_noop(void) { return 0; } 28int __init iommu_init_noop(void) { return 0; }
30void iommu_shutdown_noop(void) { } 29void iommu_shutdown_noop(void) { }
31 30
@@ -67,7 +66,8 @@ struct x86_init_ops x86_init __initdata = {
67 }, 66 },
68 67
69 .paging = { 68 .paging = {
70 .pagetable_init = native_pagetable_init, 69 .pagetable_setup_start = native_pagetable_setup_start,
70 .pagetable_setup_done = native_pagetable_setup_done,
71 }, 71 },
72 72
73 .timers = { 73 .timers = {
@@ -89,7 +89,6 @@ struct x86_init_ops x86_init __initdata = {
89}; 89};
90 90
91struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { 91struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
92 .early_percpu_clock_init = x86_init_noop,
93 .setup_percpu_clockev = setup_secondary_APIC_clock, 92 .setup_percpu_clockev = setup_secondary_APIC_clock,
94}; 93};
95 94
@@ -103,10 +102,7 @@ struct x86_platform_ops x86_platform = {
103 .iommu_shutdown = iommu_shutdown_noop, 102 .iommu_shutdown = iommu_shutdown_noop,
104 .is_untracked_pat_range = is_ISA_range, 103 .is_untracked_pat_range = is_ISA_range,
105 .nmi_init = default_nmi_init, 104 .nmi_init = default_nmi_init,
106 .get_nmi_reason = default_get_nmi_reason, 105 .i8042_detect = default_i8042_detect
107 .i8042_detect = default_i8042_detect,
108 .save_sched_clock_state = tsc_save_sched_clock_state,
109 .restore_sched_clock_state = tsc_restore_sched_clock_state,
110}; 106};
111 107
112EXPORT_SYMBOL_GPL(x86_platform); 108EXPORT_SYMBOL_GPL(x86_platform);
@@ -114,12 +110,4 @@ struct x86_msi_ops x86_msi = {
114 .setup_msi_irqs = native_setup_msi_irqs, 110 .setup_msi_irqs = native_setup_msi_irqs,
115 .teardown_msi_irq = native_teardown_msi_irq, 111 .teardown_msi_irq = native_teardown_msi_irq,
116 .teardown_msi_irqs = default_teardown_msi_irqs, 112 .teardown_msi_irqs = default_teardown_msi_irqs,
117 .restore_msi_irqs = default_restore_msi_irqs,
118};
119
120struct x86_io_apic_ops x86_io_apic_ops = {
121 .init = native_io_apic_init_mappings,
122 .read = native_io_apic_read,
123 .write = native_io_apic_write,
124 .modify = native_io_apic_modify,
125}; 113};
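
The x86_init/x86_platform hunks above all follow one pattern: a struct of function pointers preloaded with safe defaults (often noops) that platform code overrides at boot, the way the removed vSMP code earlier in this diff hooked x86_platform.apic_post_init. A compact sketch of that pattern, with simplified member and hook names:

#include <stdio.h>

struct platform_ops {
        void (*nmi_init)(void);
        int (*i8042_detect)(void);
};

static void default_nmi_init(void) { }
static int default_i8042_detect(void) { return 1; }

/* defaults filled in up front, so every slot is always callable */
static struct platform_ops platform = {
        .nmi_init       = default_nmi_init,
        .i8042_detect   = default_i8042_detect,
};

static int quirky_i8042_detect(void) { return 0; }

int main(void)
{
        platform.i8042_detect = quirky_i8042_detect;    /* platform override */
        printf("i8042 present: %d\n", platform.i8042_detect());
        return 0;
}
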
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index ada87a329ed..a3911343976 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -3,14 +3,12 @@
3 * 3 *
4 * Author: Suresh Siddha <suresh.b.siddha@intel.com> 4 * Author: Suresh Siddha <suresh.b.siddha@intel.com>
5 */ 5 */
6
7#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
8
9#include <linux/bootmem.h> 6#include <linux/bootmem.h>
10#include <linux/compat.h> 7#include <linux/compat.h>
11#include <asm/i387.h> 8#include <asm/i387.h>
12#include <asm/fpu-internal.h> 9#ifdef CONFIG_IA32_EMULATION
13#include <asm/sigframe.h> 10#include <asm/sigcontext32.h>
11#endif
14#include <asm/xcr.h> 12#include <asm/xcr.h>
15 13
16/* 14/*
@@ -21,9 +19,13 @@ u64 pcntxt_mask;
21/* 19/*
22 * Represents init state for the supported extended state. 20 * Represents init state for the supported extended state.
23 */ 21 */
24struct xsave_struct *init_xstate_buf; 22static struct xsave_struct *init_xstate_buf;
23
24struct _fpx_sw_bytes fx_sw_reserved;
25#ifdef CONFIG_IA32_EMULATION
26struct _fpx_sw_bytes fx_sw_reserved_ia32;
27#endif
25 28
26static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32;
27static unsigned int *xstate_offsets, *xstate_sizes, xstate_features; 29static unsigned int *xstate_offsets, *xstate_sizes, xstate_features;
28 30
29/* 31/*
@@ -38,13 +40,15 @@ static unsigned int *xstate_offsets, *xstate_sizes, xstate_features;
38 */ 40 */
39void __sanitize_i387_state(struct task_struct *tsk) 41void __sanitize_i387_state(struct task_struct *tsk)
40{ 42{
41 struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave;
42 int feature_bit = 0x2;
43 u64 xstate_bv; 43 u64 xstate_bv;
44 int feature_bit = 0x2;
45 struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave;
44 46
45 if (!fx) 47 if (!fx)
46 return; 48 return;
47 49
50 BUG_ON(task_thread_info(tsk)->status & TS_USEDFPU);
51
48 xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv; 52 xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv;
49 53
50 /* 54 /*
@@ -98,326 +102,217 @@ void __sanitize_i387_state(struct task_struct *tsk)
98 * Check for the presence of extended state information in the 102 * Check for the presence of extended state information in the
99 * user fpstate pointer in the sigcontext. 103 * user fpstate pointer in the sigcontext.
100 */ 104 */
101static inline int check_for_xstate(struct i387_fxsave_struct __user *buf, 105int check_for_xstate(struct i387_fxsave_struct __user *buf,
102 void __user *fpstate, 106 void __user *fpstate,
103 struct _fpx_sw_bytes *fx_sw) 107 struct _fpx_sw_bytes *fx_sw_user)
104{ 108{
105 int min_xstate_size = sizeof(struct i387_fxsave_struct) + 109 int min_xstate_size = sizeof(struct i387_fxsave_struct) +
106 sizeof(struct xsave_hdr_struct); 110 sizeof(struct xsave_hdr_struct);
107 unsigned int magic2; 111 unsigned int magic2;
112 int err;
108 113
109 if (__copy_from_user(fx_sw, &buf->sw_reserved[0], sizeof(*fx_sw))) 114 err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0],
110 return -1; 115 sizeof(struct _fpx_sw_bytes));
116 if (err)
117 return -EFAULT;
111 118
112 /* Check for the first magic field and other error scenarios. */ 119 /*
113 if (fx_sw->magic1 != FP_XSTATE_MAGIC1 || 120 * First Magic check failed.
114 fx_sw->xstate_size < min_xstate_size || 121 */
115 fx_sw->xstate_size > xstate_size || 122 if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1)
116 fx_sw->xstate_size > fx_sw->extended_size) 123 return -EINVAL;
117 return -1;
118 124
119 /* 125 /*
126 * Check for error scenarios.
127 */
128 if (fx_sw_user->xstate_size < min_xstate_size ||
129 fx_sw_user->xstate_size > xstate_size ||
130 fx_sw_user->xstate_size > fx_sw_user->extended_size)
131 return -EINVAL;
132
133 err = __get_user(magic2, (__u32 *) (((void *)fpstate) +
134 fx_sw_user->extended_size -
135 FP_XSTATE_MAGIC2_SIZE));
136 if (err)
137 return err;
138 /*
120 * Check for the presence of second magic word at the end of memory 139 * Check for the presence of second magic word at the end of memory
121 * layout. This detects the case where the user just copied the legacy 140 * layout. This detects the case where the user just copied the legacy
122 * fpstate layout with out copying the extended state information 141 * fpstate layout with out copying the extended state information
123 * in the memory layout. 142 * in the memory layout.
124 */ 143 */
125 if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size)) 144 if (magic2 != FP_XSTATE_MAGIC2)
126 || magic2 != FP_XSTATE_MAGIC2) 145 return -EFAULT;
127 return -1;
128 146
129 return 0; 147 return 0;
130} 148}
131 149
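
Both versions of check_for_xstate() validate the same on-frame protocol: a first magic word and size fields stored in the fxsave image's software-reserved bytes, plus a second magic word placed past the extended state so a frame that only copied the legacy layout is detected. Below is a minimal user-space sketch of that validation over a flat byte buffer, with hypothetical constants and types (DEMO_MAGIC1/2, demo_sw_bytes); the kernel reads these fields from user memory with __copy_from_user/__get_user and uses the real FP_XSTATE_MAGIC* values.

/*
 * Sketch only: two-magic-word validation of a signal-frame style buffer.
 * Layout and constants are illustrative, not the kernel's.
 */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define DEMO_MAGIC1 0x46505853u
#define DEMO_MAGIC2 0x46505845u

struct demo_sw_bytes {
        uint32_t magic1;
        uint32_t extended_size;   /* total frame incl. trailing magic2 */
        uint32_t xstate_size;     /* bytes of [f]xsave + xsave header  */
};

static int demo_check_frame(const uint8_t *frame, size_t frame_len,
                            const struct demo_sw_bytes *sw,
                            size_t min_size, size_t max_size)
{
        uint32_t magic2;

        if (sw->magic1 != DEMO_MAGIC1)
                return -1;                      /* no extended layout    */
        if (sw->xstate_size < min_size ||
            sw->xstate_size > max_size ||
            sw->xstate_size > sw->extended_size)
                return -1;                      /* inconsistent sizes    */
        if (sw->extended_size > frame_len || sw->extended_size < sizeof(magic2))
                return -1;                      /* would read outside    */

        /* magic2 sits in the last 4 bytes of the extended area */
        memcpy(&magic2, frame + sw->extended_size - sizeof(magic2),
               sizeof(magic2));
        return magic2 == DEMO_MAGIC2 ? 0 : -1;
}

int main(void)
{
        uint8_t frame[256] = {0};
        struct demo_sw_bytes sw = {
                .magic1 = DEMO_MAGIC1,
                .extended_size = 132,
                .xstate_size = 128,
        };
        uint32_t magic2 = DEMO_MAGIC2;

        memcpy(frame + sw.extended_size - sizeof(magic2), &magic2, sizeof(magic2));
        printf("frame valid: %s\n",
               demo_check_frame(frame, sizeof(frame), &sw, 64, 128) == 0 ? "yes" : "no");
        return 0;
}

A frame that fails this check is what sends the restore path down the FP/SSE-only fallback.
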
150#ifdef CONFIG_X86_64
132/* 151/*
133 * Signal frame handlers. 152 * Signal frame handlers.
134 */ 153 */
135static inline int save_fsave_header(struct task_struct *tsk, void __user *buf)
136{
137 if (use_fxsr()) {
138 struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave;
139 struct user_i387_ia32_struct env;
140 struct _fpstate_ia32 __user *fp = buf;
141
142 convert_from_fxsr(&env, tsk);
143
144 if (__copy_to_user(buf, &env, sizeof(env)) ||
145 __put_user(xsave->i387.swd, &fp->status) ||
146 __put_user(X86_FXSR_MAGIC, &fp->magic))
147 return -1;
148 } else {
149 struct i387_fsave_struct __user *fp = buf;
150 u32 swd;
151 if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status))
152 return -1;
153 }
154
155 return 0;
156}
157
158static inline int save_xstate_epilog(void __user *buf, int ia32_frame)
159{
160 struct xsave_struct __user *x = buf;
161 struct _fpx_sw_bytes *sw_bytes;
162 u32 xstate_bv;
163 int err;
164
165 /* Setup the bytes not touched by the [f]xsave and reserved for SW. */
166 sw_bytes = ia32_frame ? &fx_sw_reserved_ia32 : &fx_sw_reserved;
167 err = __copy_to_user(&x->i387.sw_reserved, sw_bytes, sizeof(*sw_bytes));
168
169 if (!use_xsave())
170 return err;
171
172 err |= __put_user(FP_XSTATE_MAGIC2, (__u32 *)(buf + xstate_size));
173
174 /*
175 * Read the xstate_bv which we copied (directly from the cpu or
176 * from the state in task struct) to the user buffers.
177 */
178 err |= __get_user(xstate_bv, (__u32 *)&x->xsave_hdr.xstate_bv);
179
180 /*
181 * For legacy compatible, we always set FP/SSE bits in the bit
182 * vector while saving the state to the user context. This will
183 * enable us capturing any changes(during sigreturn) to
184 * the FP/SSE bits by the legacy applications which don't touch
185 * xstate_bv in the xsave header.
186 *
187 * xsave aware apps can change the xstate_bv in the xsave
188 * header as well as change any contents in the memory layout.
189 * xrestore as part of sigreturn will capture all the changes.
190 */
191 xstate_bv |= XSTATE_FPSSE;
192
193 err |= __put_user(xstate_bv, (__u32 *)&x->xsave_hdr.xstate_bv);
194
195 return err;
196}
197 154
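
The epilog removed above (and re-added on the right inside save_i387_xstate) always does three things to the user frame: copy the precomputed _fpx_sw_bytes into sw_reserved, force the FP and SSE bits into the copied xstate_bv so sigreturn picks up changes made by legacy applications, and append FP_XSTATE_MAGIC2. A rough sketch with an invented frame layout (demo_frame, offsets only approximate) and the caller guaranteeing that xstate_size plus the magic word fits in the frame:

/*
 * Sketch only: build the software-reserved epilog of a signal frame.
 * The struct layout is a stand-in for the real xsave image.
 */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define DEMO_MAGIC2       0x46505845u
#define DEMO_XSTATE_FPSSE 0x3ull              /* FP | SSE feature bits */

struct demo_frame {
        uint8_t  fxsave[464];                 /* legacy fxsave image    */
        uint8_t  sw_reserved[48];             /* _fpx_sw_bytes lives here */
        uint64_t xstate_bv;                   /* start of xsave header  */
        uint8_t  rest[504];                   /* remaining xstate + magic2 */
};

static void demo_write_epilog(struct demo_frame *f,
                              const void *sw_bytes, size_t sw_len,
                              size_t xstate_size)
{
        uint32_t magic2 = DEMO_MAGIC2;

        /* 1. software-reserved bytes describing the extended layout */
        memcpy(f->sw_reserved, sw_bytes, sw_len);

        /* 2. legacy apps may touch only FP/SSE, so always advertise them */
        f->xstate_bv |= DEMO_XSTATE_FPSSE;

        /* 3. trailing magic word right after the xstate area
         *    (caller ensures xstate_size + 4 <= sizeof(*f))             */
        memcpy((uint8_t *)f + xstate_size, &magic2, sizeof(magic2));
}

int main(void)
{
        static struct demo_frame f;
        uint8_t sw[48] = { 0x53 };            /* pretend _fpx_sw_bytes  */

        demo_write_epilog(&f, sw, sizeof(sw), 832);  /* 832 = demo xstate_size */
        printf("xstate_bv=%#llx\n", (unsigned long long)f.xstate_bv);
        return 0;
}
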
198static inline int save_user_xstate(struct xsave_struct __user *buf) 155int save_i387_xstate(void __user *buf)
199{ 156{
200 int err; 157 struct task_struct *tsk = current;
201 158 int err = 0;
202 if (use_xsave())
203 err = xsave_user(buf);
204 else if (use_fxsr())
205 err = fxsave_user((struct i387_fxsave_struct __user *) buf);
206 else
207 err = fsave_user((struct i387_fsave_struct __user *) buf);
208 159
209 if (unlikely(err) && __clear_user(buf, xstate_size)) 160 if (!access_ok(VERIFY_WRITE, buf, sig_xstate_size))
210 err = -EFAULT; 161 return -EACCES;
211 return err;
212}
213 162
214/* 163 BUG_ON(sig_xstate_size < xstate_size);
215 * Save the fpu, extended register state to the user signal frame.
216 *
217 * 'buf_fx' is the 64-byte aligned pointer at which the [f|fx|x]save
218 * state is copied.
219 * 'buf' points to the 'buf_fx' or to the fsave header followed by 'buf_fx'.
220 *
221 * buf == buf_fx for 64-bit frames and 32-bit fsave frame.
222 * buf != buf_fx for 32-bit frames with fxstate.
223 *
224 * If the fpu, extended register state is live, save the state directly
225 * to the user frame pointed by the aligned pointer 'buf_fx'. Otherwise,
226 * copy the thread's fpu state to the user frame starting at 'buf_fx'.
227 *
228 * If this is a 32-bit frame with fxstate, put a fsave header before
229 * the aligned state at 'buf_fx'.
230 *
231 * For [f]xsave state, update the SW reserved fields in the [f]xsave frame
232 * indicating the absence/presence of the extended state to the user.
233 */
234int save_xstate_sig(void __user *buf, void __user *buf_fx, int size)
235{
236 struct xsave_struct *xsave = &current->thread.fpu.state->xsave;
237 struct task_struct *tsk = current;
238 int ia32_fxstate = (buf != buf_fx);
239 164
240 ia32_fxstate &= (config_enabled(CONFIG_X86_32) || 165 if ((unsigned long)buf % 64)
241 config_enabled(CONFIG_IA32_EMULATION)); 166 printk("save_i387_xstate: bad fpstate %p\n", buf);
242 167
243 if (!access_ok(VERIFY_WRITE, buf, size)) 168 if (!used_math())
244 return -EACCES; 169 return 0;
245 170
246 if (!HAVE_HWFP) 171 if (task_thread_info(tsk)->status & TS_USEDFPU) {
247 return fpregs_soft_get(current, NULL, 0, 172 if (use_xsave())
248 sizeof(struct user_i387_ia32_struct), NULL, 173 err = xsave_user(buf);
249 (struct _fpstate_ia32 __user *) buf) ? -1 : 1; 174 else
175 err = fxsave_user(buf);
250 176
251 if (user_has_fpu()) { 177 if (err)
252 /* Save the live register state to the user directly. */ 178 return err;
253 if (save_user_xstate(buf_fx)) 179 task_thread_info(tsk)->status &= ~TS_USEDFPU;
254 return -1; 180 stts();
255 /* Update the thread's fxstate to save the fsave header. */
256 if (ia32_fxstate)
257 fpu_fxsave(&tsk->thread.fpu);
258 } else { 181 } else {
259 sanitize_i387_state(tsk); 182 sanitize_i387_state(tsk);
260 if (__copy_to_user(buf_fx, xsave, xstate_size)) 183 if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave,
184 xstate_size))
261 return -1; 185 return -1;
262 } 186 }
263 187
264 /* Save the fsave header for the 32-bit frames. */ 188 clear_used_math(); /* trigger finit */
265 if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf))
266 return -1;
267
268 if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate))
269 return -1;
270 189
271 drop_init_fpu(tsk); /* trigger finit */ 190 if (use_xsave()) {
191 struct _fpstate __user *fx = buf;
192 struct _xstate __user *x = buf;
193 u64 xstate_bv;
272 194
273 return 0; 195 err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved,
274} 196 sizeof(struct _fpx_sw_bytes));
275 197
276static inline void 198 err |= __put_user(FP_XSTATE_MAGIC2,
277sanitize_restored_xstate(struct task_struct *tsk, 199 (__u32 __user *) (buf + sig_xstate_size
278 struct user_i387_ia32_struct *ia32_env, 200 - FP_XSTATE_MAGIC2_SIZE));
279 u64 xstate_bv, int fx_only)
280{
281 struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave;
282 struct xsave_hdr_struct *xsave_hdr = &xsave->xsave_hdr;
283
284 if (use_xsave()) {
285 /* These bits must be zero. */
286 xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0;
287 201
288 /* 202 /*
289 * Init the state that is not present in the memory 203 * Read the xstate_bv which we copied (directly from the cpu or
290 * layout and not enabled by the OS. 204 * from the state in task struct) to the user buffers and
205 * set the FP/SSE bits.
291 */ 206 */
292 if (fx_only) 207 err |= __get_user(xstate_bv, &x->xstate_hdr.xstate_bv);
293 xsave_hdr->xstate_bv = XSTATE_FPSSE;
294 else
295 xsave_hdr->xstate_bv &= (pcntxt_mask & xstate_bv);
296 }
297 208
298 if (use_fxsr()) {
299 /* 209 /*
300 * mscsr reserved bits must be masked to zero for security 210 * For legacy compatible, we always set FP/SSE bits in the bit
301 * reasons. 211 * vector while saving the state to the user context. This will
212 * enable us capturing any changes(during sigreturn) to
213 * the FP/SSE bits by the legacy applications which don't touch
214 * xstate_bv in the xsave header.
215 *
216 * xsave aware apps can change the xstate_bv in the xsave
217 * header as well as change any contents in the memory layout.
218 * xrestore as part of sigreturn will capture all the changes.
302 */ 219 */
303 xsave->i387.mxcsr &= mxcsr_feature_mask; 220 xstate_bv |= XSTATE_FPSSE;
304 221
305 convert_to_fxsr(tsk, ia32_env); 222 err |= __put_user(xstate_bv, &x->xstate_hdr.xstate_bv);
223
224 if (err)
225 return err;
306 } 226 }
227
228 return 1;
307} 229}
308 230
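
The long comment deleted above spells out the save-path decision that both save_xstate_sig() (left column) and save_i387_xstate() (right column) implement: if the task currently owns the FPU, dump the registers straight into the user frame; otherwise sanitize the copy held in the task struct and copy that out; 32-bit fxstate frames additionally get an fsave header in front of the aligned image, and every frame gets the sw_reserved epilog. The stubbed-out sketch below shows only that control flow, not the real register operations:

/*
 * Control-flow sketch of the signal save path; operations are stubbed
 * with printf.  "fpu_live" stands for "this task currently owns the FPU".
 */
#include <stdio.h>

static void demo_save_to_sigframe(int fpu_live, int ia32_fxstate)
{
        if (fpu_live)
                puts("xsave/fxsave directly into the user frame");
        else
                puts("sanitize the saved thread state, then copy_to_user it");

        if (ia32_fxstate)
                puts("prepend an fsave header for the 32-bit frame");

        puts("fill sw_reserved, force FP/SSE in xstate_bv, append MAGIC2");
}

int main(void)
{
        demo_save_to_sigframe(1, 0);   /* 64-bit frame, registers live   */
        demo_save_to_sigframe(0, 1);   /* 32-bit fxstate frame, not live */
        return 0;
}
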
309/* 231/*
310 * Restore the extended state if present. Otherwise, restore the FP/SSE state. 232 * Restore the extended state if present. Otherwise, restore the FP/SSE
233 * state.
311 */ 234 */
312static inline int restore_user_xstate(void __user *buf, u64 xbv, int fx_only) 235static int restore_user_xstate(void __user *buf)
313{ 236{
314 if (use_xsave()) { 237 struct _fpx_sw_bytes fx_sw_user;
315 if ((unsigned long)buf % 64 || fx_only) { 238 u64 mask;
316 u64 init_bv = pcntxt_mask & ~XSTATE_FPSSE; 239 int err;
317 xrstor_state(init_xstate_buf, init_bv); 240
318 return fxrstor_user(buf); 241 if (((unsigned long)buf % 64) ||
319 } else { 242 check_for_xstate(buf, buf, &fx_sw_user))
320 u64 init_bv = pcntxt_mask & ~xbv; 243 goto fx_only;
321 if (unlikely(init_bv)) 244
322 xrstor_state(init_xstate_buf, init_bv); 245 mask = fx_sw_user.xstate_bv;
323 return xrestore_user(buf, xbv); 246
324 } 247 /*
325 } else if (use_fxsr()) { 248 * restore the state passed by the user.
326 return fxrstor_user(buf); 249 */
327 } else 250 err = xrestore_user(buf, mask);
328 return frstor_user(buf); 251 if (err)
252 return err;
253
254 /*
255 * init the state skipped by the user.
256 */
257 mask = pcntxt_mask & ~mask;
258 if (unlikely(mask))
259 xrstor_state(init_xstate_buf, mask);
260
261 return 0;
262
263fx_only:
264 /*
265 * couldn't find the extended state information in the
266 * memory layout. Restore just the FP/SSE and init all
267 * the other extended state.
268 */
269 xrstor_state(init_xstate_buf, pcntxt_mask & ~XSTATE_FPSSE);
270 return fxrstor_checking((__force struct i387_fxsave_struct *)buf);
329} 271}
330 272
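
restore_user_xstate() in both variants applies the same mask arithmetic: restore whatever feature bits the signal frame advertises from the user buffer, then restore the init image for every other feature the kernel enabled (pcntxt_mask & ~mask), and fall back to an FP/SSE-only restore when the extended information is missing or the pointer is not 64-byte aligned. A small demo of just the mask handling, with the xrstor/fxrstor calls replaced by printf and made-up masks:

/*
 * Sketch only: restore-path mask arithmetic.  Real restores are stubbed.
 */
#include <stdint.h>
#include <stdio.h>

static void demo_restore(uint64_t supported_mask, uint64_t user_bv, int frame_ok)
{
        if (!frame_ok) {
                /* no extended info: restore FP/SSE only, init the rest */
                printf("xrstor(init_buf, 0x%llx)\n",
                       (unsigned long long)(supported_mask & ~0x3ull));
                printf("fxrstor(user_buf)\n");
                return;
        }

        printf("xrstor(user_buf, 0x%llx)\n", (unsigned long long)user_bv);

        if (supported_mask & ~user_bv)          /* features the user skipped */
                printf("xrstor(init_buf, 0x%llx)\n",
                       (unsigned long long)(supported_mask & ~user_bv));
}

int main(void)
{
        /* hypothetical: CPU supports FP|SSE|AVX (0x7), frame carries FP|SSE */
        demo_restore(0x7, 0x3, 1);
        demo_restore(0x7, 0x3, 0);
        return 0;
}
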
331int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) 273/*
274 * This restores directly out of user space. Exceptions are handled.
275 */
276int restore_i387_xstate(void __user *buf)
332{ 277{
333 int ia32_fxstate = (buf != buf_fx);
334 struct task_struct *tsk = current; 278 struct task_struct *tsk = current;
335 int state_size = xstate_size; 279 int err = 0;
336 u64 xstate_bv = 0;
337 int fx_only = 0;
338
339 ia32_fxstate &= (config_enabled(CONFIG_X86_32) ||
340 config_enabled(CONFIG_IA32_EMULATION));
341 280
342 if (!buf) { 281 if (!buf) {
343 drop_init_fpu(tsk); 282 if (used_math())
283 goto clear;
344 return 0; 284 return 0;
345 } 285 } else
346 286 if (!access_ok(VERIFY_READ, buf, sig_xstate_size))
347 if (!access_ok(VERIFY_READ, buf, size)) 287 return -EACCES;
348 return -EACCES;
349
350 if (!used_math() && init_fpu(tsk))
351 return -1;
352 288
353 if (!HAVE_HWFP) { 289 if (!used_math()) {
354 return fpregs_soft_set(current, NULL, 290 err = init_fpu(tsk);
355 0, sizeof(struct user_i387_ia32_struct), 291 if (err)
356 NULL, buf) != 0; 292 return err;
357 } 293 }
358 294
359 if (use_xsave()) { 295 if (!(task_thread_info(current)->status & TS_USEDFPU)) {
360 struct _fpx_sw_bytes fx_sw_user; 296 clts();
361 if (unlikely(check_for_xstate(buf_fx, buf_fx, &fx_sw_user))) { 297 task_thread_info(current)->status |= TS_USEDFPU;
362 /*
363 * Couldn't find the extended state information in the
364 * memory layout. Restore just the FP/SSE and init all
365 * the other extended state.
366 */
367 state_size = sizeof(struct i387_fxsave_struct);
368 fx_only = 1;
369 } else {
370 state_size = fx_sw_user.xstate_size;
371 xstate_bv = fx_sw_user.xstate_bv;
372 }
373 } 298 }
374 299 if (use_xsave())
375 if (ia32_fxstate) { 300 err = restore_user_xstate(buf);
376 /* 301 else
377 * For 32-bit frames with fxstate, copy the user state to the 302 err = fxrstor_checking((__force struct i387_fxsave_struct *)
378 * thread's fpu state, reconstruct fxstate from the fsave 303 buf);
379 * header. Sanitize the copied state etc. 304 if (unlikely(err)) {
380 */
381 struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave;
382 struct user_i387_ia32_struct env;
383 int err = 0;
384
385 /*
386 * Drop the current fpu which clears used_math(). This ensures
387 * that any context-switch during the copy of the new state,
388 * avoids the intermediate state from getting restored/saved.
389 * Thus avoiding the new restored state from getting corrupted.
390 * We will be ready to restore/save the state only after
391 * set_used_math() is again set.
392 */
393 drop_fpu(tsk);
394
395 if (__copy_from_user(xsave, buf_fx, state_size) ||
396 __copy_from_user(&env, buf, sizeof(env))) {
397 err = -1;
398 } else {
399 sanitize_restored_xstate(tsk, &env, xstate_bv, fx_only);
400 set_used_math();
401 }
402
403 if (use_eager_fpu())
404 math_state_restore();
405
406 return err;
407 } else {
408 /* 305 /*
409 * For 64-bit frames and 32-bit fsave frames, restore the user 306 * Encountered an error while doing the restore from the
410 * state to the registers directly (with exceptions handled). 307 * user buffer, clear the fpu state.
411 */ 308 */
412 user_fpu_begin(); 309clear:
413 if (restore_user_xstate(buf_fx, xstate_bv, fx_only)) { 310 clear_fpu(tsk);
414 drop_init_fpu(tsk); 311 clear_used_math();
415 return -1;
416 }
417 } 312 }
418 313 return err;
419 return 0;
420} 314}
315#endif
421 316
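
When the 32-bit fxstate path copies a user image into the task's fpu state (sanitize_restored_xstate() in the left column), the header must be cleaned before it can ever reach xrstor: the reserved words are zeroed, xstate_bv may only keep bits that are both enabled by the kernel and present in the frame (or just FP/SSE in the fx_only case), and MXCSR reserved bits are masked with mxcsr_feature_mask. A sketch of that clean-up over hypothetical fields:

/*
 * Sketch only: sanitize an xsave header and MXCSR copied in from
 * user space before it is allowed to be restored.
 */
#include <stdint.h>
#include <stdio.h>

struct demo_xsave_hdr {
        uint64_t xstate_bv;
        uint64_t reserved1[2];
};

static void demo_sanitize(struct demo_xsave_hdr *hdr, uint32_t *mxcsr,
                          uint64_t kernel_mask, uint64_t user_bv,
                          uint32_t mxcsr_feature_mask, int fx_only)
{
        hdr->reserved1[0] = hdr->reserved1[1] = 0;      /* must be zero  */

        if (fx_only)
                hdr->xstate_bv = 0x3;                   /* FP|SSE only   */
        else
                hdr->xstate_bv &= kernel_mask & user_bv;

        *mxcsr &= mxcsr_feature_mask;                   /* reserved bits */
}

int main(void)
{
        struct demo_xsave_hdr hdr = { .xstate_bv = 0xffull, .reserved1 = {1, 2} };
        uint32_t mxcsr = 0xffffffffu;

        demo_sanitize(&hdr, &mxcsr, 0x7, 0x7, 0x0000ffbfu, 0);
        printf("xstate_bv=%#llx mxcsr=%#x\n",
               (unsigned long long)hdr.xstate_bv, (unsigned)mxcsr);
        return 0;
}
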
422/* 317/*
423 * Prepare the SW reserved portion of the fxsave memory layout, indicating 318 * Prepare the SW reserved portion of the fxsave memory layout, indicating
@@ -428,23 +323,32 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size)
428 */ 323 */
429static void prepare_fx_sw_frame(void) 324static void prepare_fx_sw_frame(void)
430{ 325{
431 int fsave_header_size = sizeof(struct i387_fsave_struct); 326 int size_extended = (xstate_size - sizeof(struct i387_fxsave_struct)) +
432 int size = xstate_size + FP_XSTATE_MAGIC2_SIZE; 327 FP_XSTATE_MAGIC2_SIZE;
328
329 sig_xstate_size = sizeof(struct _fpstate) + size_extended;
433 330
434 if (config_enabled(CONFIG_X86_32)) 331#ifdef CONFIG_IA32_EMULATION
435 size += fsave_header_size; 332 sig_xstate_ia32_size = sizeof(struct _fpstate_ia32) + size_extended;
333#endif
334
335 memset(&fx_sw_reserved, 0, sizeof(fx_sw_reserved));
436 336
437 fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; 337 fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1;
438 fx_sw_reserved.extended_size = size; 338 fx_sw_reserved.extended_size = sig_xstate_size;
439 fx_sw_reserved.xstate_bv = pcntxt_mask; 339 fx_sw_reserved.xstate_bv = pcntxt_mask;
440 fx_sw_reserved.xstate_size = xstate_size; 340 fx_sw_reserved.xstate_size = xstate_size;
441 341#ifdef CONFIG_IA32_EMULATION
442 if (config_enabled(CONFIG_IA32_EMULATION)) { 342 memcpy(&fx_sw_reserved_ia32, &fx_sw_reserved,
443 fx_sw_reserved_ia32 = fx_sw_reserved; 343 sizeof(struct _fpx_sw_bytes));
444 fx_sw_reserved_ia32.extended_size += fsave_header_size; 344 fx_sw_reserved_ia32.extended_size = sig_xstate_ia32_size;
445 } 345#endif
446} 346}
447 347
348#ifdef CONFIG_X86_64
349unsigned int sig_xstate_size = sizeof(struct _fpstate);
350#endif
351
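
prepare_fx_sw_frame() is size arithmetic. The newer code (left) computes extended_size as the aligned xstate image plus the trailing magic word, adding an fsave header for 32-bit frames; the older code (right) computes sig_xstate_size as sizeof(struct _fpstate) plus everything beyond the legacy fxsave image plus the magic word. On 64-bit, where struct _fpstate is the 512-byte fxsave layout, the two formulations should agree, as this worked example with made-up feature sizes shows:

/*
 * Worked example only: a 512-byte fxsave image, a 64-byte xsave header
 * and one 256-byte extended feature, with a 4-byte trailing magic word.
 */
#include <stdio.h>

int main(void)
{
        unsigned int fxsave_size     = 512;             /* legacy fxsave image    */
        unsigned int xstate_size     = 512 + 64 + 256;  /* + header + one feature */
        unsigned int magic2_size     = 4;

        /* newer layout: aligned xstate image plus trailing magic2 */
        unsigned int extended_size   = xstate_size + magic2_size;

        /* older layout: legacy _fpstate plus everything beyond fxsave */
        unsigned int size_extended   = (xstate_size - fxsave_size) + magic2_size;
        unsigned int sig_xstate_size = fxsave_size + size_extended;

        printf("extended_size=%u sig_xstate_size=%u\n",
               extended_size, sig_xstate_size);
        /* a 32-bit fsave-style frame additionally prepends the fsave header */
        return 0;
}
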
448/* 352/*
449 * Enable the extended processor state save/restore feature 353 * Enable the extended processor state save/restore feature
450 */ 354 */
@@ -482,21 +386,19 @@ static void __init setup_xstate_features(void)
482/* 386/*
483 * setup the xstate image representing the init state 387 * setup the xstate image representing the init state
484 */ 388 */
485static void __init setup_init_fpu_buf(void) 389static void __init setup_xstate_init(void)
486{ 390{
391 setup_xstate_features();
392
487 /* 393 /*
488 * Setup init_xstate_buf to represent the init state of 394 * Setup init_xstate_buf to represent the init state of
489 * all the features managed by the xsave 395 * all the features managed by the xsave
490 */ 396 */
491 init_xstate_buf = alloc_bootmem_align(xstate_size, 397 init_xstate_buf = alloc_bootmem_align(xstate_size,
492 __alignof__(struct xsave_struct)); 398 __alignof__(struct xsave_struct));
493 fx_finit(&init_xstate_buf->i387); 399 init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT;
494
495 if (!cpu_has_xsave)
496 return;
497
498 setup_xstate_features();
499 400
401 clts();
500 /* 402 /*
501 * Init all the features state with header_bv being 0x0 403 * Init all the features state with header_bv being 0x0
502 */ 404 */
@@ -506,21 +408,9 @@ static void __init setup_init_fpu_buf(void)
506 * of any feature which is not represented by all zero's. 408 * of any feature which is not represented by all zero's.
507 */ 409 */
508 xsave_state(init_xstate_buf, -1); 410 xsave_state(init_xstate_buf, -1);
411 stts();
509} 412}
510 413
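
setup_xstate_init() and setup_init_fpu_buf() both build init_xstate_buf: a suitably aligned buffer holding the processor's init state for every enabled feature, captured once at boot (the kernel does this with xsave and an all-zero header bit vector, bracketed by clts()/stts() in the older code) so the restore paths can source defaults from memory. The user-space sketch below shows only the aligned allocation and the MXCSR default; the xsave snapshot is faked with memset and the struct layout is illustrative:

/*
 * Sketch only: allocate a 64-byte aligned "init image" once and fill
 * it with default values.
 */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

#define DEMO_MXCSR_DEFAULT 0x1f80u

struct demo_xsave {
        uint8_t  i387[24];
        uint32_t mxcsr;
        uint8_t  rest[484];
};

int main(void)
{
        /* xsave images must be 64-byte aligned */
        struct demo_xsave *init_buf = aligned_alloc(64, sizeof(*init_buf));
        if (!init_buf)
                return 1;

        memset(init_buf, 0, sizeof(*init_buf));       /* stand-in for xsave */
        init_buf->mxcsr = DEMO_MXCSR_DEFAULT;         /* like fx_finit()    */

        printf("init image at %p, mxcsr=%#x\n",
               (void *)init_buf, (unsigned)init_buf->mxcsr);
        free(init_buf);
        return 0;
}
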
511static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO;
512static int __init eager_fpu_setup(char *s)
513{
514 if (!strcmp(s, "on"))
515 eagerfpu = ENABLE;
516 else if (!strcmp(s, "off"))
517 eagerfpu = DISABLE;
518 else if (!strcmp(s, "auto"))
519 eagerfpu = AUTO;
520 return 1;
521}
522__setup("eagerfpu=", eager_fpu_setup);
523
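
The removed eagerfpu= handler is a plain tri-state parser: "on" forces eager FPU switching, "off" disables it, anything else leaves it at AUTO (which the boot code below promotes to ENABLE when xsaveopt is present). The same logic as a stand-alone helper, without the kernel's __setup() plumbing:

/*
 * Sketch only: tri-state boot-parameter parsing.
 */
#include <stdio.h>
#include <string.h>

enum demo_eager { DEMO_AUTO, DEMO_ENABLE, DEMO_DISABLE };

static enum demo_eager demo_parse_eagerfpu(const char *s)
{
        if (!strcmp(s, "on"))
                return DEMO_ENABLE;
        if (!strcmp(s, "off"))
                return DEMO_DISABLE;
        return DEMO_AUTO;            /* "auto" or anything unrecognised */
}

int main(void)
{
        printf("%d %d %d\n",
               demo_parse_eagerfpu("on"),
               demo_parse_eagerfpu("off"),
               demo_parse_eagerfpu("auto"));
        return 0;
}
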
524/* 414/*
525 * Enable and initialize the xsave feature. 415 * Enable and initialize the xsave feature.
526 */ 416 */
@@ -537,7 +427,7 @@ static void __init xstate_enable_boot_cpu(void)
537 pcntxt_mask = eax + ((u64)edx << 32); 427 pcntxt_mask = eax + ((u64)edx << 32);
538 428
539 if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { 429 if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) {
540 pr_err("FP/SSE not shown under xsave features 0x%llx\n", 430 printk(KERN_ERR "FP/SSE not shown under xsave features 0x%llx\n",
541 pcntxt_mask); 431 pcntxt_mask);
542 BUG(); 432 BUG();
543 } 433 }
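
pcntxt_mask is assembled from CPUID leaf 0xd: EAX holds the low 32 bits of the supported xstate feature mask and EDX the high 32 bits, and the FP and SSE bits must always be present or the kernel BUG()s. A tiny demo of the same assembly and check with made-up CPUID values:

/*
 * Sketch only: combine two 32-bit CPUID registers into a 64-bit mask
 * and verify the mandatory FP/SSE bits.
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_XSTATE_FPSSE 0x3ull

int main(void)
{
        uint32_t eax = 0x00000007;            /* pretend: FP, SSE, AVX */
        uint32_t edx = 0x00000000;
        uint64_t pcntxt_mask = eax + ((uint64_t)edx << 32);

        if ((pcntxt_mask & DEMO_XSTATE_FPSSE) != DEMO_XSTATE_FPSSE) {
                fprintf(stderr, "FP/SSE not shown under xsave features 0x%llx\n",
                        (unsigned long long)pcntxt_mask);
                return 1;
        }
        printf("supported xstate features: 0x%llx\n",
               (unsigned long long)pcntxt_mask);
        return 0;
}
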
@@ -557,14 +447,12 @@ static void __init xstate_enable_boot_cpu(void)
557 447
558 update_regset_xstate_info(xstate_size, pcntxt_mask); 448 update_regset_xstate_info(xstate_size, pcntxt_mask);
559 prepare_fx_sw_frame(); 449 prepare_fx_sw_frame();
560 setup_init_fpu_buf();
561 450
562 /* Auto enable eagerfpu for xsaveopt */ 451 setup_xstate_init();
563 if (cpu_has_xsaveopt && eagerfpu != DISABLE)
564 eagerfpu = ENABLE;
565 452
566 pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n", 453 printk(KERN_INFO "xsave/xrstor: enabled xstate_bv 0x%llx, "
567 pcntxt_mask, xstate_size); 454 "cntxt size 0x%x\n",
455 pcntxt_mask, xstate_size);
568} 456}
569 457
570/* 458/*
@@ -586,43 +474,3 @@ void __cpuinit xsave_init(void)
586 next_func = xstate_enable; 474 next_func = xstate_enable;
587 this_func(); 475 this_func();
588} 476}
589
590static inline void __init eager_fpu_init_bp(void)
591{
592 current->thread.fpu.state =
593 alloc_bootmem_align(xstate_size, __alignof__(struct xsave_struct));
594 if (!init_xstate_buf)
595 setup_init_fpu_buf();
596}
597
598void __cpuinit eager_fpu_init(void)
599{
600 static __refdata void (*boot_func)(void) = eager_fpu_init_bp;
601
602 clear_used_math();
603 current_thread_info()->status = 0;
604
605 if (eagerfpu == ENABLE)
606 setup_force_cpu_cap(X86_FEATURE_EAGER_FPU);
607
608 if (!cpu_has_eager_fpu) {
609 stts();
610 return;
611 }
612
613 if (boot_func) {
614 boot_func();
615 boot_func = NULL;
616 }
617
618 /*
619 * This is same as math_state_restore(). But use_xsave() is
620 * not yet patched to use math_state_restore().
621 */
622 init_fpu(current);
623 __thread_fpu_begin(current);
624 if (cpu_has_xsave)
625 xrstor_state(init_xstate_buf, -1);
626 else
627 fxrstor_checking(&init_xstate_buf->i387);
628}
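
eager_fpu_init() runs on every CPU, but the allocation in eager_fpu_init_bp() must happen only once: a static __refdata function pointer is called on the boot CPU and then cleared, so secondary CPUs skip it. That run-once idiom in isolation (no SMP or locking concerns here, since the kernel brings CPUs up serially at this stage):

/*
 * Sketch only: call a boot-time-only helper exactly once through a
 * static function pointer that is cleared after the first call.
 */
#include <stdio.h>

static void demo_boot_only_setup(void)
{
        puts("allocating boot FPU buffers");   /* done exactly once */
}

static void demo_eager_fpu_init(void)
{
        static void (*boot_func)(void) = demo_boot_only_setup;

        if (boot_func) {
                boot_func();
                boot_func = NULL;
        }
        puts("per-cpu eager FPU init");
}

int main(void)
{
        demo_eager_fpu_init();   /* boot CPU  */
        demo_eager_fpu_init();   /* secondary */
        return 0;
}
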