author     Glenn Elliott <gelliott@cs.unc.edu>  2012-03-04 19:47:13 -0500
committer  Glenn Elliott <gelliott@cs.unc.edu>  2012-03-04 19:47:13 -0500
commit     c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree       ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /arch/x86/kernel
parent     ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent     6a00f206debf8a5c8899055726ad127dbeeed098 (diff)

Merge branch 'mpi-master' into wip-k-fmlp

Conflicts:
	litmus/sched_cedf.c
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--  arch/x86/kernel/Makefile | 44
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 150
-rw-r--r--  arch/x86/kernel/acpi/cstate.c | 11
-rw-r--r--  arch/x86/kernel/acpi/realmode/wakeup.S | 35
-rw-r--r--  arch/x86/kernel/acpi/realmode/wakeup.h | 11
-rw-r--r--  arch/x86/kernel/acpi/realmode/wakeup.lds.S | 28
-rw-r--r--  arch/x86/kernel/acpi/sleep.c | 86
-rw-r--r--  arch/x86/kernel/acpi/sleep.h | 5
-rw-r--r--  arch/x86/kernel/acpi/wakeup_rm.S | 12
-rw-r--r--  arch/x86/kernel/alternative.c | 242
-rw-r--r--  arch/x86/kernel/amd_gart_64.c (renamed from arch/x86/kernel/pci-gart_64.c) | 84
-rw-r--r--  arch/x86/kernel/amd_iommu.c | 581
-rw-r--r--  arch/x86/kernel/amd_iommu_init.c | 221
-rw-r--r--  arch/x86/kernel/amd_nb.c | 255
-rw-r--r--  arch/x86/kernel/apb_timer.c | 133
-rw-r--r--  arch/x86/kernel/aperture_64.c | 150
-rw-r--r--  arch/x86/kernel/apic/Makefile | 22
-rw-r--r--  arch/x86/kernel/apic/apic.c | 554
-rw-r--r--  arch/x86/kernel/apic/apic_flat_64.c | 30
-rw-r--r--  arch/x86/kernel/apic/apic_noop.c | 17
-rw-r--r--  arch/x86/kernel/apic/bigsmp_32.c | 45
-rw-r--r--  arch/x86/kernel/apic/es7000_32.c | 46
-rw-r--r--  arch/x86/kernel/apic/hw_nmi.c | 48
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 1639
-rw-r--r--  arch/x86/kernel/apic/ipi.c | 12
-rw-r--r--  arch/x86/kernel/apic/nmi.c | 567
-rw-r--r--  arch/x86/kernel/apic/numaq_32.c | 62
-rw-r--r--  arch/x86/kernel/apic/probe_32.c | 120
-rw-r--r--  arch/x86/kernel/apic/probe_64.c | 69
-rw-r--r--  arch/x86/kernel/apic/summit_32.c | 50
-rw-r--r--  arch/x86/kernel/apic/x2apic_cluster.c | 224
-rw-r--r--  arch/x86/kernel/apic/x2apic_phys.c | 117
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c | 219
-rw-r--r--  arch/x86/kernel/apm_32.c | 35
-rw-r--r--  arch/x86/kernel/asm-offsets.c | 65
-rw-r--r--  arch/x86/kernel/asm-offsets_32.c | 71
-rw-r--r--  arch/x86/kernel/asm-offsets_64.c | 90
-rw-r--r--  arch/x86/kernel/bios_uv.c | 215
-rw-r--r--  arch/x86/kernel/check.c | 24
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 1
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 167
-rw-r--r--  arch/x86/kernel/cpu/bugs.c | 1
-rw-r--r--  arch/x86/kernel/cpu/common.c | 66
-rw-r--r--  arch/x86/kernel/cpu/cpu.h | 1
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/Kconfig | 266
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/Makefile | 21
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 775
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c | 446
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/e_powersaver.c | 367
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/elanfreq.c | 309
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/gx-suspmod.c | 517
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/longhaul.c | 1029
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/longhaul.h | 353
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/longrun.c | 327
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/mperf.c | 51
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/mperf.h | 9
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | 331
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 626
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k6.c | 261
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k7.c | 752
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k7.h | 43
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 1601
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k8.h | 224
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/sc520_freq.c | 194
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c | 636
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | 452
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-lib.c | 481
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-lib.h | 49
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-smi.c | 467
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 40
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c | 261
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-apei.c | 42
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-inject.c | 7
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-severity.c | 1
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 71
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd.c | 122
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mcheck/therm_throt.c | 65
-rw-r--r--  arch/x86/kernel/cpu/mtrr/cleanup.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c | 130
-rw-r--r--  arch/x86/kernel/cpu/mtrr/main.c | 40
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c | 615
-rw-r--r--  arch/x86/kernel/cpu/perf_event_amd.c | 229
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c | 537
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_ds.c | 312
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p4.c | 368
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p6.c | 4
-rw-r--r--  arch/x86/kernel/cpu/perfctr-watchdog.c | 649
-rw-r--r--  arch/x86/kernel/cpu/scattered.c | 6
-rw-r--r--  arch/x86/kernel/cpu/vmware.c | 2
-rw-r--r--  arch/x86/kernel/cpuid.c | 1
-rw-r--r--  arch/x86/kernel/crash_dump_32.c | 5
-rw-r--r--  arch/x86/kernel/crash_dump_64.c | 6
-rw-r--r--  arch/x86/kernel/devicetree.c | 452
-rw-r--r--  arch/x86/kernel/dumpstack.c | 63
-rw-r--r--  arch/x86/kernel/dumpstack_32.c | 22
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 26
-rw-r--r--  arch/x86/kernel/e820.c | 211
-rw-r--r--  arch/x86/kernel/early-quirks.c | 23
-rw-r--r--  arch/x86/kernel/early_printk.c | 12
-rw-r--r--  arch/x86/kernel/efi.c | 612
-rw-r--r--  arch/x86/kernel/efi_32.c | 112
-rw-r--r--  arch/x86/kernel/efi_64.c | 114
-rw-r--r--  arch/x86/kernel/efi_stub_32.S | 123
-rw-r--r--  arch/x86/kernel/efi_stub_64.S | 116
-rw-r--r--  arch/x86/kernel/entry_32.S | 325
-rw-r--r--  arch/x86/kernel/entry_64.S | 186
-rw-r--r--  arch/x86/kernel/ftrace.c | 101
-rw-r--r--  arch/x86/kernel/head.c | 3
-rw-r--r--  arch/x86/kernel/head32.c | 20
-rw-r--r--  arch/x86/kernel/head64.c | 10
-rw-r--r--  arch/x86/kernel/head_32.S | 188
-rw-r--r--  arch/x86/kernel/head_64.S | 3
-rw-r--r--  arch/x86/kernel/hpet.c | 147
-rw-r--r--  arch/x86/kernel/hw_breakpoint.c | 16
-rw-r--r--  arch/x86/kernel/i387.c | 61
-rw-r--r--  arch/x86/kernel/i8237.c | 30
-rw-r--r--  arch/x86/kernel/i8253.c | 86
-rw-r--r--  arch/x86/kernel/i8259.c | 98
-rw-r--r--  arch/x86/kernel/ioport.c | 20
-rw-r--r--  arch/x86/kernel/irq.c | 102
-rw-r--r--  arch/x86/kernel/irq_32.c | 38
-rw-r--r--  arch/x86/kernel/irq_work.c | 30
-rw-r--r--  arch/x86/kernel/irqinit.c | 113
-rw-r--r--  arch/x86/kernel/jump_label.c | 51
-rw-r--r--  arch/x86/kernel/k8.c | 137
-rw-r--r--  arch/x86/kernel/kdebugfs.c | 1
-rw-r--r--  arch/x86/kernel/kgdb.c | 48
-rw-r--r--  arch/x86/kernel/kprobes.c | 154
-rw-r--r--  arch/x86/kernel/kvm.c | 317
-rw-r--r--  arch/x86/kernel/kvmclock.c | 25
-rw-r--r--  arch/x86/kernel/machine_kexec_64.c | 4
-rw-r--r--  arch/x86/kernel/mca_32.c | 2
-rw-r--r--  arch/x86/kernel/microcode_amd.c | 208
-rw-r--r--  arch/x86/kernel/microcode_core.c | 44
-rw-r--r--  arch/x86/kernel/microcode_intel.c | 18
-rw-r--r--  arch/x86/kernel/mmconf-fam10h_64.c | 71
-rw-r--r--  arch/x86/kernel/module.c | 21
-rw-r--r--  arch/x86/kernel/mpparse.c | 139
-rw-r--r--  arch/x86/kernel/mrst.c | 311
-rw-r--r--  arch/x86/kernel/msr.c | 1
-rw-r--r--  arch/x86/kernel/olpc.c | 260
-rw-r--r--  arch/x86/kernel/olpc_ofw.c | 106
-rw-r--r--  arch/x86/kernel/paravirt.c | 4
-rw-r--r--  arch/x86/kernel/pci-calgary_64.c | 22
-rw-r--r--  arch/x86/kernel/pci-dma.c | 110
-rw-r--r--  arch/x86/kernel/pci-iommu_table.c | 79
-rw-r--r--  arch/x86/kernel/pci-swiotlb.c | 44
-rw-r--r--  arch/x86/kernel/pmtimer_64.c | 69
-rw-r--r--  arch/x86/kernel/probe_roms.c (renamed from arch/x86/kernel/probe_roms_32.c) | 101
-rw-r--r--  arch/x86/kernel/process.c | 105
-rw-r--r--  arch/x86/kernel/process_32.c | 5
-rw-r--r--  arch/x86/kernel/process_64.c | 15
-rw-r--r--  arch/x86/kernel/ptrace.c | 57
-rw-r--r--  arch/x86/kernel/pvclock.c | 46
-rw-r--r--  arch/x86/kernel/quirks.c | 2
-rw-r--r--  arch/x86/kernel/reboot.c | 204
-rw-r--r--  arch/x86/kernel/reboot_32.S | 135
-rw-r--r--  arch/x86/kernel/reboot_fixups_32.c | 16
-rw-r--r--  arch/x86/kernel/resource.c | 48
-rw-r--r--  arch/x86/kernel/rtc.c | 5
-rw-r--r--  arch/x86/kernel/scx200_32.c | 131
-rw-r--r--  arch/x86/kernel/setup.c | 327
-rw-r--r--  arch/x86/kernel/setup_percpu.c | 19
-rw-r--r--  arch/x86/kernel/sfi.c | 120
-rw-r--r--  arch/x86/kernel/signal.c | 14
-rw-r--r--  arch/x86/kernel/smp.c | 24
-rw-r--r--  arch/x86/kernel/smpboot.c | 325
-rw-r--r--  arch/x86/kernel/stacktrace.c | 17
-rw-r--r--  arch/x86/kernel/step.c | 2
-rw-r--r--  arch/x86/kernel/sys_i386_32.c | 4
-rw-r--r--  arch/x86/kernel/syscall_table_32.S | 12
-rw-r--r--  arch/x86/kernel/tboot.c | 3
-rw-r--r--  arch/x86/kernel/test_nx.c | 2
-rw-r--r--  arch/x86/kernel/time.c | 20
-rw-r--r--  arch/x86/kernel/tlb_uv.c | 1655
-rw-r--r--  arch/x86/kernel/topology.c | 2
-rw-r--r--  arch/x86/kernel/trampoline.c | 62
-rw-r--r--  arch/x86/kernel/trampoline_32.S | 15
-rw-r--r--  arch/x86/kernel/trampoline_64.S | 30
-rw-r--r--  arch/x86/kernel/traps.c | 167
-rw-r--r--  arch/x86/kernel/tsc.c | 183
-rw-r--r--  arch/x86/kernel/uv_irq.c | 302
-rw-r--r--  arch/x86/kernel/uv_sysfs.c | 76
-rw-r--r--  arch/x86/kernel/uv_time.c | 423
-rw-r--r--  arch/x86/kernel/verify_cpu.S (renamed from arch/x86/kernel/verify_cpu_64.S) | 49
-rw-r--r--  arch/x86/kernel/visws_quirks.c | 666
-rw-r--r--  arch/x86/kernel/vm86_32.c | 11
-rw-r--r--  arch/x86/kernel/vmi_32.c | 893
-rw-r--r--  arch/x86/kernel/vmiclock_32.c | 317
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S | 95
-rw-r--r--  arch/x86/kernel/vread_tsc_64.c | 36
-rw-r--r--  arch/x86/kernel/vsyscall_64.c | 48
-rw-r--r--  arch/x86/kernel/x8664_ksyms_64.c | 1
-rw-r--r--  arch/x86/kernel/x86_init.c | 14
-rw-r--r--  arch/x86/kernel/xsave.c | 5
196 files changed, 8552 insertions, 24695 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 6890dbb9ac15..d727f8f94333 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -24,28 +24,34 @@ endif
24nostackp := $(call cc-option, -fno-stack-protector) 24nostackp := $(call cc-option, -fno-stack-protector)
25CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp) 25CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
26CFLAGS_hpet.o := $(nostackp) 26CFLAGS_hpet.o := $(nostackp)
27CFLAGS_tsc.o := $(nostackp) 27CFLAGS_vread_tsc_64.o := $(nostackp)
28CFLAGS_paravirt.o := $(nostackp) 28CFLAGS_paravirt.o := $(nostackp)
29GCOV_PROFILE_vsyscall_64.o := n 29GCOV_PROFILE_vsyscall_64.o := n
30GCOV_PROFILE_hpet.o := n 30GCOV_PROFILE_hpet.o := n
31GCOV_PROFILE_tsc.o := n 31GCOV_PROFILE_tsc.o := n
32GCOV_PROFILE_vread_tsc_64.o := n
32GCOV_PROFILE_paravirt.o := n 33GCOV_PROFILE_paravirt.o := n
33 34
35# vread_tsc_64 is hot and should be fully optimized:
36CFLAGS_REMOVE_vread_tsc_64.o = -pg -fno-optimize-sibling-calls
37
34obj-y := process_$(BITS).o signal.o entry_$(BITS).o 38obj-y := process_$(BITS).o signal.o entry_$(BITS).o
35obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o 39obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
36obj-y += time.o ioport.o ldt.o dumpstack.o 40obj-y += time.o ioport.o ldt.o dumpstack.o
37obj-y += setup.o x86_init.o i8259.o irqinit.o 41obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o
38obj-$(CONFIG_X86_VISWS) += visws_quirks.o 42obj-$(CONFIG_IRQ_WORK) += irq_work.o
39obj-$(CONFIG_X86_32) += probe_roms_32.o 43obj-y += probe_roms.o
40obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o 44obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
41obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o 45obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
42obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o 46obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o vread_tsc_64.o
43obj-y += bootflag.o e820.o 47obj-y += bootflag.o e820.o
44obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o 48obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
45obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o 49obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
46obj-y += tsc.o io_delay.o rtc.o 50obj-y += tsc.o io_delay.o rtc.o
51obj-y += pci-iommu_table.o
52obj-y += resource.o
47 53
48obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o 54obj-y += trampoline.o trampoline_$(BITS).o
49obj-y += process.o 55obj-y += process.o
50obj-y += i387.o xsave.o 56obj-y += i387.o xsave.o
51obj-y += ptrace.o 57obj-y += ptrace.o
@@ -53,11 +59,12 @@ obj-$(CONFIG_X86_32) += tls.o
53obj-$(CONFIG_IA32_EMULATION) += tls.o 59obj-$(CONFIG_IA32_EMULATION) += tls.o
54obj-y += step.o 60obj-y += step.o
55obj-$(CONFIG_INTEL_TXT) += tboot.o 61obj-$(CONFIG_INTEL_TXT) += tboot.o
62obj-$(CONFIG_ISA_DMA_API) += i8237.o
56obj-$(CONFIG_STACKTRACE) += stacktrace.o 63obj-$(CONFIG_STACKTRACE) += stacktrace.o
57obj-y += cpu/ 64obj-y += cpu/
58obj-y += acpi/ 65obj-y += acpi/
59obj-$(CONFIG_SFI) += sfi.o
60obj-y += reboot.o 66obj-y += reboot.o
67obj-$(CONFIG_X86_32) += reboot_32.o
61obj-$(CONFIG_MCA) += mca_32.o 68obj-$(CONFIG_MCA) += mca_32.o
62obj-$(CONFIG_X86_MSR) += msr.o 69obj-$(CONFIG_X86_MSR) += msr.o
63obj-$(CONFIG_X86_CPUID) += cpuid.o 70obj-$(CONFIG_X86_CPUID) += cpuid.o
@@ -65,10 +72,9 @@ obj-$(CONFIG_PCI) += early-quirks.o
65apm-y := apm_32.o 72apm-y := apm_32.o
66obj-$(CONFIG_APM) += apm.o 73obj-$(CONFIG_APM) += apm.o
67obj-$(CONFIG_SMP) += smp.o 74obj-$(CONFIG_SMP) += smp.o
68obj-$(CONFIG_SMP) += smpboot.o tsc_sync.o 75obj-$(CONFIG_SMP) += smpboot.o
76obj-$(CONFIG_SMP) += tsc_sync.o
69obj-$(CONFIG_SMP) += setup_percpu.o 77obj-$(CONFIG_SMP) += setup_percpu.o
70obj-$(CONFIG_X86_64_SMP) += tsc_sync.o
71obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o
72obj-$(CONFIG_X86_MPPARSE) += mpparse.o 78obj-$(CONFIG_X86_MPPARSE) += mpparse.o
73obj-y += apic/ 79obj-y += apic/
74obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o 80obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
@@ -80,7 +86,6 @@ obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
80obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o 86obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
81obj-$(CONFIG_KPROBES) += kprobes.o 87obj-$(CONFIG_KPROBES) += kprobes.o
82obj-$(CONFIG_MODULES) += module.o 88obj-$(CONFIG_MODULES) += module.o
83obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o
84obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o 89obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
85obj-$(CONFIG_KGDB) += kgdb.o 90obj-$(CONFIG_KGDB) += kgdb.o
86obj-$(CONFIG_VM86) += vm86_32.o 91obj-$(CONFIG_VM86) += vm86_32.o
@@ -89,11 +94,10 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
89obj-$(CONFIG_HPET_TIMER) += hpet.o 94obj-$(CONFIG_HPET_TIMER) += hpet.o
90obj-$(CONFIG_APB_TIMER) += apb_timer.o 95obj-$(CONFIG_APB_TIMER) += apb_timer.o
91 96
92obj-$(CONFIG_K8_NB) += k8.o 97obj-$(CONFIG_AMD_NB) += amd_nb.o
93obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o 98obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
94obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o 99obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
95 100
96obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
97obj-$(CONFIG_KVM_GUEST) += kvm.o 101obj-$(CONFIG_KVM_GUEST) += kvm.o
98obj-$(CONFIG_KVM_CLOCK) += kvmclock.o 102obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
99obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o 103obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
@@ -102,13 +106,6 @@ obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
102 106
103obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o 107obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
104 108
105obj-$(CONFIG_SCx200) += scx200.o
106scx200-y += scx200_32.o
107
108obj-$(CONFIG_OLPC) += olpc.o
109obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o
110obj-$(CONFIG_X86_MRST) += mrst.o
111
112microcode-y := microcode_core.o 109microcode-y := microcode_core.o
113microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o 110microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o
114microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o 111microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o
@@ -117,17 +114,16 @@ obj-$(CONFIG_MICROCODE) += microcode.o
117obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o 114obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
118 115
119obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o 116obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
117obj-$(CONFIG_OF) += devicetree.o
120 118
121obj-$(CONFIG_FEATHER_TRACE) += ft_event.o 119obj-$(CONFIG_FEATHER_TRACE) += ft_event.o
122 120
123### 121###
124# 64 bit specific files 122# 64 bit specific files
125ifeq ($(CONFIG_X86_64),y) 123ifeq ($(CONFIG_X86_64),y)
126 obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o
127 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
128 obj-$(CONFIG_AUDIT) += audit_64.o 124 obj-$(CONFIG_AUDIT) += audit_64.o
129 125
130 obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o 126 obj-$(CONFIG_GART_IOMMU) += amd_gart_64.o aperture_64.o
131 obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o 127 obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
132 obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o 128 obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o
133 129
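
The Makefile changes above split the vsyscall TSC read path into its own object, vread_tsc_64.o, give it the same no-stack-protector treatment as vsyscall_64.o, and strip -pg and -fno-optimize-sibling-calls from it because, as the added comment says, it is hot and should be fully optimized. As a rough illustration of what such a reader does at its core -- this is a user-space sketch, not the kernel's vread_tsc implementation, and rdtsc_demo is a made-up name -- reading the time-stamp counter boils down to the RDTSC instruction:

#include <stdint.h>
#include <stdio.h>

/* Illustrative only: read the time-stamp counter with RDTSC.
 * The real kernel vread path adds barriers and clocksource checks. */
static inline uint64_t rdtsc_demo(void)
{
	uint32_t lo, hi;
	__asm__ volatile("rdtsc" : "=a" (lo), "=d" (hi));
	return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
	uint64_t t0 = rdtsc_demo();
	uint64_t t1 = rdtsc_demo();
	printf("delta cycles: %llu\n", (unsigned long long)(t1 - t0));
	return 0;
}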
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index c05872aa3ce0..4558f0d0822d 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -72,6 +72,7 @@ u8 acpi_sci_flags __initdata;
72int acpi_sci_override_gsi __initdata; 72int acpi_sci_override_gsi __initdata;
73int acpi_skip_timer_override __initdata; 73int acpi_skip_timer_override __initdata;
74int acpi_use_timer_override __initdata; 74int acpi_use_timer_override __initdata;
75int acpi_fix_pin2_polarity __initdata;
75 76
76#ifdef CONFIG_X86_LOCAL_APIC 77#ifdef CONFIG_X86_LOCAL_APIC
77static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; 78static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
@@ -198,6 +199,11 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled)
198{ 199{
199 unsigned int ver = 0; 200 unsigned int ver = 0;
200 201
202 if (id >= (MAX_LOCAL_APIC-1)) {
203 printk(KERN_INFO PREFIX "skipped apicid that is too big\n");
204 return;
205 }
206
201 if (!enabled) { 207 if (!enabled) {
202 ++disabled_cpus; 208 ++disabled_cpus;
203 return; 209 return;
@@ -410,10 +416,15 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
410 return 0; 416 return 0;
411 } 417 }
412 418
413 if (acpi_skip_timer_override && 419 if (intsrc->source_irq == 0 && intsrc->global_irq == 2) {
414 intsrc->source_irq == 0 && intsrc->global_irq == 2) { 420 if (acpi_skip_timer_override) {
415 printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n"); 421 printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n");
416 return 0; 422 return 0;
423 }
424 if (acpi_fix_pin2_polarity && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) {
425 intsrc->inti_flags &= ~ACPI_MADT_POLARITY_MASK;
426 printk(PREFIX "BIOS IRQ0 pin2 override: forcing polarity to high active.\n");
427 }
417 } 428 }
418 429
419 mp_override_legacy_irq(intsrc->source_irq, 430 mp_override_legacy_irq(intsrc->source_irq,
@@ -504,6 +515,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
504 515
505 return 0; 516 return 0;
506} 517}
518EXPORT_SYMBOL_GPL(acpi_gsi_to_irq);
507 519
508int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi) 520int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
509{ 521{
@@ -513,35 +525,62 @@ int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
513 return 0; 525 return 0;
514} 526}
515 527
516/* 528static int acpi_register_gsi_pic(struct device *dev, u32 gsi,
517 * success: return IRQ number (>=0) 529 int trigger, int polarity)
518 * failure: return < 0
519 */
520int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
521{ 530{
522 unsigned int irq;
523 unsigned int plat_gsi = gsi;
524
525#ifdef CONFIG_PCI 531#ifdef CONFIG_PCI
526 /* 532 /*
527 * Make sure all (legacy) PCI IRQs are set as level-triggered. 533 * Make sure all (legacy) PCI IRQs are set as level-triggered.
528 */ 534 */
529 if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) { 535 if (trigger == ACPI_LEVEL_SENSITIVE)
530 if (trigger == ACPI_LEVEL_SENSITIVE) 536 eisa_set_level_irq(gsi);
531 eisa_set_level_irq(gsi);
532 }
533#endif 537#endif
534 538
539 return gsi;
540}
541
542static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi,
543 int trigger, int polarity)
544{
535#ifdef CONFIG_X86_IO_APIC 545#ifdef CONFIG_X86_IO_APIC
536 if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) { 546 gsi = mp_register_gsi(dev, gsi, trigger, polarity);
537 plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity);
538 }
539#endif 547#endif
548
549 return gsi;
550}
551
552int (*__acpi_register_gsi)(struct device *dev, u32 gsi,
553 int trigger, int polarity) = acpi_register_gsi_pic;
554
555/*
556 * success: return IRQ number (>=0)
557 * failure: return < 0
558 */
559int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
560{
561 unsigned int irq;
562 unsigned int plat_gsi = gsi;
563
564 plat_gsi = (*__acpi_register_gsi)(dev, gsi, trigger, polarity);
540 irq = gsi_to_irq(plat_gsi); 565 irq = gsi_to_irq(plat_gsi);
541 566
542 return irq; 567 return irq;
543} 568}
544 569
570void __init acpi_set_irq_model_pic(void)
571{
572 acpi_irq_model = ACPI_IRQ_MODEL_PIC;
573 __acpi_register_gsi = acpi_register_gsi_pic;
574 acpi_ioapic = 0;
575}
576
577void __init acpi_set_irq_model_ioapic(void)
578{
579 acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC;
580 __acpi_register_gsi = acpi_register_gsi_ioapic;
581 acpi_ioapic = 1;
582}
583
545/* 584/*
546 * ACPI based hotplug support for CPU 585 * ACPI based hotplug support for CPU
547 */ 586 */
@@ -556,14 +595,8 @@ static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
556 nid = acpi_get_node(handle); 595 nid = acpi_get_node(handle);
557 if (nid == -1 || !node_online(nid)) 596 if (nid == -1 || !node_online(nid))
558 return; 597 return;
559#ifdef CONFIG_X86_64 598 set_apicid_to_node(physid, nid);
560 apicid_to_node[physid] = nid;
561 numa_set_node(cpu, nid); 599 numa_set_node(cpu, nid);
562#else /* CONFIG_X86_32 */
563 apicid_2_node[physid] = nid;
564 cpu_to_node_map[cpu] = nid;
565#endif
566
567#endif 600#endif
568} 601}
569 602
@@ -820,18 +853,6 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table)
820 * returns 0 on success, < 0 on error 853 * returns 0 on success, < 0 on error
821 */ 854 */
822 855
823static void __init acpi_register_lapic_address(unsigned long address)
824{
825 mp_lapic_addr = address;
826
827 set_fixmap_nocache(FIX_APIC_BASE, address);
828 if (boot_cpu_physical_apicid == -1U) {
829 boot_cpu_physical_apicid = read_apic_id();
830 apic_version[boot_cpu_physical_apicid] =
831 GET_APIC_VERSION(apic_read(APIC_LVR));
832 }
833}
834
835static int __init early_acpi_parse_madt_lapic_addr_ovr(void) 856static int __init early_acpi_parse_madt_lapic_addr_ovr(void)
836{ 857{
837 int count; 858 int count;
@@ -853,7 +874,7 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void)
853 return count; 874 return count;
854 } 875 }
855 876
856 acpi_register_lapic_address(acpi_lapic_addr); 877 register_lapic_address(acpi_lapic_addr);
857 878
858 return count; 879 return count;
859} 880}
@@ -880,16 +901,16 @@ static int __init acpi_parse_madt_lapic_entries(void)
880 return count; 901 return count;
881 } 902 }
882 903
883 acpi_register_lapic_address(acpi_lapic_addr); 904 register_lapic_address(acpi_lapic_addr);
884 905
885 count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC, 906 count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC,
886 acpi_parse_sapic, MAX_APICS); 907 acpi_parse_sapic, MAX_LOCAL_APIC);
887 908
888 if (!count) { 909 if (!count) {
889 x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC, 910 x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC,
890 acpi_parse_x2apic, MAX_APICS); 911 acpi_parse_x2apic, MAX_LOCAL_APIC);
891 count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC, 912 count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC,
892 acpi_parse_lapic, MAX_APICS); 913 acpi_parse_lapic, MAX_LOCAL_APIC);
893 } 914 }
894 if (!count && !x2count) { 915 if (!count && !x2count) {
895 printk(KERN_ERR PREFIX "No LAPIC entries present\n"); 916 printk(KERN_ERR PREFIX "No LAPIC entries present\n");
@@ -922,32 +943,6 @@ static int __init acpi_parse_madt_lapic_entries(void)
922extern int es7000_plat; 943extern int es7000_plat;
923#endif 944#endif
924 945
925static void assign_to_mp_irq(struct mpc_intsrc *m,
926 struct mpc_intsrc *mp_irq)
927{
928 memcpy(mp_irq, m, sizeof(struct mpc_intsrc));
929}
930
931static int mp_irq_cmp(struct mpc_intsrc *mp_irq,
932 struct mpc_intsrc *m)
933{
934 return memcmp(mp_irq, m, sizeof(struct mpc_intsrc));
935}
936
937static void save_mp_irq(struct mpc_intsrc *m)
938{
939 int i;
940
941 for (i = 0; i < mp_irq_entries; i++) {
942 if (!mp_irq_cmp(&mp_irqs[i], m))
943 return;
944 }
945
946 assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
947 if (++mp_irq_entries == MAX_IRQ_SOURCES)
948 panic("Max # of irq sources exceeded!!\n");
949}
950
951void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) 946void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
952{ 947{
953 int ioapic; 948 int ioapic;
@@ -975,10 +970,10 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
975 mp_irq.irqflag = (trigger << 2) | polarity; 970 mp_irq.irqflag = (trigger << 2) | polarity;
976 mp_irq.srcbus = MP_ISA_BUS; 971 mp_irq.srcbus = MP_ISA_BUS;
977 mp_irq.srcbusirq = bus_irq; /* IRQ */ 972 mp_irq.srcbusirq = bus_irq; /* IRQ */
978 mp_irq.dstapic = mp_ioapics[ioapic].apicid; /* APIC ID */ 973 mp_irq.dstapic = mpc_ioapic_id(ioapic); /* APIC ID */
979 mp_irq.dstirq = pin; /* INTIN# */ 974 mp_irq.dstirq = pin; /* INTIN# */
980 975
981 save_mp_irq(&mp_irq); 976 mp_save_irq(&mp_irq);
982 977
983 isa_irq_to_gsi[bus_irq] = gsi; 978 isa_irq_to_gsi[bus_irq] = gsi;
984} 979}
@@ -1026,7 +1021,7 @@ void __init mp_config_acpi_legacy_irqs(void)
1026 if (ioapic < 0) 1021 if (ioapic < 0)
1027 continue; 1022 continue;
1028 pin = mp_find_ioapic_pin(ioapic, gsi); 1023 pin = mp_find_ioapic_pin(ioapic, gsi);
1029 dstapic = mp_ioapics[ioapic].apicid; 1024 dstapic = mpc_ioapic_id(ioapic);
1030 1025
1031 for (idx = 0; idx < mp_irq_entries; idx++) { 1026 for (idx = 0; idx < mp_irq_entries; idx++) {
1032 struct mpc_intsrc *irq = mp_irqs + idx; 1027 struct mpc_intsrc *irq = mp_irqs + idx;
@@ -1053,7 +1048,7 @@ void __init mp_config_acpi_legacy_irqs(void)
1053 mp_irq.srcbusirq = i; /* Identity mapped */ 1048 mp_irq.srcbusirq = i; /* Identity mapped */
1054 mp_irq.dstirq = pin; 1049 mp_irq.dstirq = pin;
1055 1050
1056 save_mp_irq(&mp_irq); 1051 mp_save_irq(&mp_irq);
1057 } 1052 }
1058} 1053}
1059 1054
@@ -1087,10 +1082,10 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
1087 mp_irq.srcbus = number; 1082 mp_irq.srcbus = number;
1088 mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); 1083 mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
1089 ioapic = mp_find_ioapic(gsi); 1084 ioapic = mp_find_ioapic(gsi);
1090 mp_irq.dstapic = mp_ioapics[ioapic].apicid; 1085 mp_irq.dstapic = mpc_ioapic_id(ioapic);
1091 mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi); 1086 mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
1092 1087
1093 save_mp_irq(&mp_irq); 1088 mp_save_irq(&mp_irq);
1094#endif 1089#endif
1095 return 0; 1090 return 0;
1096} 1091}
@@ -1118,7 +1113,7 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
1118 1113
1119 if (ioapic_pin > MP_MAX_IOAPIC_PIN) { 1114 if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
1120 printk(KERN_ERR "Invalid reference to IOAPIC pin " 1115 printk(KERN_ERR "Invalid reference to IOAPIC pin "
1121 "%d-%d\n", mp_ioapics[ioapic].apicid, 1116 "%d-%d\n", mpc_ioapic_id(ioapic),
1122 ioapic_pin); 1117 ioapic_pin);
1123 return gsi; 1118 return gsi;
1124 } 1119 }
@@ -1259,8 +1254,7 @@ static void __init acpi_process_madt(void)
1259 */ 1254 */
1260 error = acpi_parse_madt_ioapic_entries(); 1255 error = acpi_parse_madt_ioapic_entries();
1261 if (!error) { 1256 if (!error) {
1262 acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC; 1257 acpi_set_irq_model_ioapic();
1263 acpi_ioapic = 1;
1264 1258
1265 smp_found_config = 1; 1259 smp_found_config = 1;
1266 } 1260 }
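
A notable pattern in the boot.c hunks above: instead of testing acpi_irq_model on every call, acpi_register_gsi() now dispatches through the __acpi_register_gsi function pointer, which acpi_set_irq_model_pic() or acpi_set_irq_model_ioapic() installs once during boot. Reduced to a self-contained sketch (the names register_gsi_pic/register_gsi_ioapic and the printf output below are mine, not kernel code), the pattern looks like this:

#include <stdio.h>

/* Illustrative sketch of the function-pointer dispatch used above. */
typedef int (*register_gsi_fn)(unsigned int gsi, int trigger, int polarity);

static int register_gsi_pic(unsigned int gsi, int trigger, int polarity)
{
	(void)trigger; (void)polarity;
	printf("PIC route for GSI %u\n", gsi);
	return (int)gsi;
}

static int register_gsi_ioapic(unsigned int gsi, int trigger, int polarity)
{
	(void)trigger; (void)polarity;
	printf("IO-APIC route for GSI %u\n", gsi);
	return (int)gsi;
}

/* Default to the legacy PIC handler, as the kernel code above does. */
static register_gsi_fn register_gsi = register_gsi_pic;

static void set_irq_model_ioapic(void)
{
	register_gsi = register_gsi_ioapic;	/* switched once at "boot" */
}

int main(void)
{
	register_gsi(9, 0, 0);		/* dispatched to the PIC variant */
	set_irq_model_ioapic();
	register_gsi(9, 0, 0);		/* now dispatched to the IO-APIC variant */
	return 0;
}

Installing the handler once keeps the per-call path branch-free and keeps the PIC and IO-APIC variants from growing interleaved special cases.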
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index fb16f17e59be..5812404a0d4c 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -13,6 +13,7 @@
13 13
14#include <acpi/processor.h> 14#include <acpi/processor.h>
15#include <asm/acpi.h> 15#include <asm/acpi.h>
16#include <asm/mwait.h>
16 17
17/* 18/*
18 * Initialize bm_flags based on the CPU cache properties 19 * Initialize bm_flags based on the CPU cache properties
@@ -65,16 +66,6 @@ static struct cstate_entry __percpu *cpu_cstate_entry; /* per CPU ptr */
65 66
66static short mwait_supported[ACPI_PROCESSOR_MAX_POWER]; 67static short mwait_supported[ACPI_PROCESSOR_MAX_POWER];
67 68
68#define MWAIT_SUBSTATE_MASK (0xf)
69#define MWAIT_CSTATE_MASK (0xf)
70#define MWAIT_SUBSTATE_SIZE (4)
71
72#define CPUID_MWAIT_LEAF (5)
73#define CPUID5_ECX_EXTENSIONS_SUPPORTED (0x1)
74#define CPUID5_ECX_INTERRUPT_BREAK (0x2)
75
76#define MWAIT_ECX_INTERRUPT_BREAK (0x1)
77
78#define NATIVE_CSTATE_BEYOND_HALT (2) 69#define NATIVE_CSTATE_BEYOND_HALT (2)
79 70
80static long acpi_processor_ffh_cstate_probe_cpu(void *_cx) 71static long acpi_processor_ffh_cstate_probe_cpu(void *_cx)
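
The cstate.c change above drops the locally duplicated MWAIT constants and picks them up from <asm/mwait.h> instead. For context, those constants describe CPUID leaf 5 (the MONITOR/MWAIT leaf): EDX packs the number of MWAIT sub-states per C-state into 4-bit fields, and ECX advertises the interrupt-break-event extension. A hedged user-space sketch of decoding that leaf, using the constant values visible in the removed lines (cpuid_demo is an illustrative wrapper, not a kernel interface):

#include <stdint.h>
#include <stdio.h>

#define CPUID_MWAIT_LEAF		5
#define CPUID5_ECX_INTERRUPT_BREAK	0x2
#define MWAIT_SUBSTATE_MASK		0xf
#define MWAIT_SUBSTATE_SIZE		4

/* Illustrative CPUID wrapper (x86 only). */
static void cpuid_demo(uint32_t leaf, uint32_t *a, uint32_t *b,
		       uint32_t *c, uint32_t *d)
{
	__asm__ volatile("cpuid"
			 : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d)
			 : "a" (leaf), "c" (0));
}

int main(void)
{
	uint32_t eax, ebx, ecx, edx;

	cpuid_demo(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
	printf("monitor line size: %u..%u bytes\n", eax & 0xffff, ebx & 0xffff);
	printf("interrupt break-event: %s\n",
	       (ecx & CPUID5_ECX_INTERRUPT_BREAK) ? "yes" : "no");

	/* EDX holds the number of MWAIT sub-states, 4 bits per C-state. */
	for (int cstate = 0; cstate < 8; cstate++)
		printf("C%d: %u sub-state(s)\n", cstate,
		       (edx >> (cstate * MWAIT_SUBSTATE_SIZE)) & MWAIT_SUBSTATE_MASK);
	return 0;
}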
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S
index 28595d6df47c..b4fd836e4053 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.S
@@ -6,11 +6,17 @@
6#include <asm/page_types.h> 6#include <asm/page_types.h>
7#include <asm/pgtable_types.h> 7#include <asm/pgtable_types.h>
8#include <asm/processor-flags.h> 8#include <asm/processor-flags.h>
9#include "wakeup.h"
9 10
10 .code16 11 .code16
11 .section ".header", "a" 12 .section ".jump", "ax"
13 .globl _start
14_start:
15 cli
16 jmp wakeup_code
12 17
13/* This should match the structure in wakeup.h */ 18/* This should match the structure in wakeup.h */
19 .section ".header", "a"
14 .globl wakeup_header 20 .globl wakeup_header
15wakeup_header: 21wakeup_header:
16video_mode: .short 0 /* Video mode number */ 22video_mode: .short 0 /* Video mode number */
@@ -22,6 +28,8 @@ pmode_cr3: .long 0 /* Saved %cr3 */
22pmode_cr4: .long 0 /* Saved %cr4 */ 28pmode_cr4: .long 0 /* Saved %cr4 */
23pmode_efer: .quad 0 /* Saved EFER */ 29pmode_efer: .quad 0 /* Saved EFER */
24pmode_gdt: .quad 0 30pmode_gdt: .quad 0
31pmode_misc_en: .quad 0 /* Saved MISC_ENABLE MSR */
32pmode_behavior: .long 0 /* Wakeup behavior flags */
25realmode_flags: .long 0 33realmode_flags: .long 0
26real_magic: .long 0 34real_magic: .long 0
27trampoline_segment: .word 0 35trampoline_segment: .word 0
@@ -30,14 +38,11 @@ wakeup_jmp: .byte 0xea /* ljmpw */
30wakeup_jmp_off: .word 3f 38wakeup_jmp_off: .word 3f
31wakeup_jmp_seg: .word 0 39wakeup_jmp_seg: .word 0
32wakeup_gdt: .quad 0, 0, 0 40wakeup_gdt: .quad 0, 0, 0
33signature: .long 0x51ee1111 41signature: .long WAKEUP_HEADER_SIGNATURE
34 42
35 .text 43 .text
36 .globl _start
37 .code16 44 .code16
38wakeup_code: 45wakeup_code:
39_start:
40 cli
41 cld 46 cld
42 47
43 /* Apparently some dimwit BIOS programmers don't know how to 48 /* Apparently some dimwit BIOS programmers don't know how to
@@ -77,17 +82,29 @@ _start:
77 82
78 /* Check header signature... */ 83 /* Check header signature... */
79 movl signature, %eax 84 movl signature, %eax
80 cmpl $0x51ee1111, %eax 85 cmpl $WAKEUP_HEADER_SIGNATURE, %eax
81 jne bogus_real_magic 86 jne bogus_real_magic
82 87
83 /* Check we really have everything... */ 88 /* Check we really have everything... */
84 movl end_signature, %eax 89 movl end_signature, %eax
85 cmpl $0x65a22c82, %eax 90 cmpl $WAKEUP_END_SIGNATURE, %eax
86 jne bogus_real_magic 91 jne bogus_real_magic
87 92
88 /* Call the C code */ 93 /* Call the C code */
89 calll main 94 calll main
90 95
96 /* Restore MISC_ENABLE before entering protected mode, in case
97 BIOS decided to clear XD_DISABLE during S3. */
98 movl pmode_behavior, %eax
99 btl $WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE, %eax
100 jnc 1f
101
102 movl pmode_misc_en, %eax
103 movl pmode_misc_en + 4, %edx
104 movl $MSR_IA32_MISC_ENABLE, %ecx
105 wrmsr
1061:
107
91 /* Do any other stuff... */ 108 /* Do any other stuff... */
92 109
93#ifndef CONFIG_64BIT 110#ifndef CONFIG_64BIT
@@ -147,3 +164,7 @@ wakeup_heap:
147wakeup_stack: 164wakeup_stack:
148 .space 2048 165 .space 2048
149wakeup_stack_end: 166wakeup_stack_end:
167
168 .section ".signature","a"
169end_signature:
170 .long WAKEUP_END_SIGNATURE
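
The new code in wakeup.S checks a behavior flag and, when it is set, restores the MISC_ENABLE MSR saved before suspend -- per the added comment, some BIOSes clear XD_DISABLE during S3. Expressed in C rather than real-mode assembly (wrmsr_demo is a mock; the real code executes WRMSR with the saved value in EDX:EAX), the flag-guarded restore is just:

#include <stdint.h>
#include <stdio.h>

#define WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE	0	/* bit number, as in wakeup.h */

/* Mocked MSR write for illustration only. */
static void wrmsr_demo(uint32_t msr, uint32_t lo, uint32_t hi)
{
	printf("wrmsr(0x%x) <- %08x:%08x\n", msr, hi, lo);
}

static void maybe_restore_misc_enable(uint32_t behavior,
				       uint32_t misc_lo, uint32_t misc_hi)
{
	/* Only restore if the header said the saved value is valid. */
	if (behavior & (1u << WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE))
		wrmsr_demo(0x1a0 /* MSR_IA32_MISC_ENABLE */, misc_lo, misc_hi);
}

int main(void)
{
	maybe_restore_misc_enable(1u << WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE,
				  0x00850089, 0x0);
	return 0;
}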
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.h b/arch/x86/kernel/acpi/realmode/wakeup.h
index 69d38d0b2b64..97a29e1430e3 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.h
+++ b/arch/x86/kernel/acpi/realmode/wakeup.h
@@ -21,6 +21,9 @@ struct wakeup_header {
21 u32 pmode_efer_low; /* Protected mode EFER */ 21 u32 pmode_efer_low; /* Protected mode EFER */
22 u32 pmode_efer_high; 22 u32 pmode_efer_high;
23 u64 pmode_gdt; 23 u64 pmode_gdt;
24 u32 pmode_misc_en_low; /* Protected mode MISC_ENABLE */
25 u32 pmode_misc_en_high;
26 u32 pmode_behavior; /* Wakeup routine behavior flags */
24 u32 realmode_flags; 27 u32 realmode_flags;
25 u32 real_magic; 28 u32 real_magic;
26 u16 trampoline_segment; /* segment with trampoline code, 64-bit only */ 29 u16 trampoline_segment; /* segment with trampoline code, 64-bit only */
@@ -35,7 +38,11 @@ struct wakeup_header {
35extern struct wakeup_header wakeup_header; 38extern struct wakeup_header wakeup_header;
36#endif 39#endif
37 40
38#define HEADER_OFFSET 0x3f00 41#define WAKEUP_HEADER_OFFSET 8
39#define WAKEUP_SIZE 0x4000 42#define WAKEUP_HEADER_SIGNATURE 0x51ee1111
43#define WAKEUP_END_SIGNATURE 0x65a22c82
44
45/* Wakeup behavior bits */
46#define WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE 0
40 47
41#endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */ 48#endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */
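
wakeup.S carries the comment "This should match the structure in wakeup.h", so every field added here (pmode_misc_en_low/high, pmode_behavior) has to land at the same offset as its assembly-side label. A trimmed sketch of how one could eyeball such a layout with offsetof -- the struct below is abbreviated and its offsets are illustrative only, not those of the real struct wakeup_header:

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* Abbreviated stand-in for struct wakeup_header, for illustration. */
struct demo_wakeup_header {
	uint16_t video_mode;
	uint32_t pmode_entry;
	uint32_t pmode_cr0;
	uint64_t pmode_misc_en;		/* saved MISC_ENABLE MSR */
	uint32_t pmode_behavior;	/* wakeup behavior flags */
	uint32_t signature;
} __attribute__((packed));

int main(void)
{
	/* Print the offsets the assembly labels would have to match. */
	printf("pmode_misc_en  at %zu\n", offsetof(struct demo_wakeup_header, pmode_misc_en));
	printf("pmode_behavior at %zu\n", offsetof(struct demo_wakeup_header, pmode_behavior));
	printf("signature      at %zu\n", offsetof(struct demo_wakeup_header, signature));
	return 0;
}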
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.lds.S b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
index 060fff8f5c5b..d4f8010a5b1b 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.lds.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
@@ -13,9 +13,19 @@ ENTRY(_start)
13SECTIONS 13SECTIONS
14{ 14{
15 . = 0; 15 . = 0;
16 .jump : {
17 *(.jump)
18 } = 0x90909090
19
20 . = WAKEUP_HEADER_OFFSET;
21 .header : {
22 *(.header)
23 }
24
25 . = ALIGN(16);
16 .text : { 26 .text : {
17 *(.text*) 27 *(.text*)
18 } 28 } = 0x90909090
19 29
20 . = ALIGN(16); 30 . = ALIGN(16);
21 .rodata : { 31 .rodata : {
@@ -33,11 +43,6 @@ SECTIONS
33 *(.data*) 43 *(.data*)
34 } 44 }
35 45
36 .signature : {
37 end_signature = .;
38 LONG(0x65a22c82)
39 }
40
41 . = ALIGN(16); 46 . = ALIGN(16);
42 .bss : { 47 .bss : {
43 __bss_start = .; 48 __bss_start = .;
@@ -45,20 +50,13 @@ SECTIONS
45 __bss_end = .; 50 __bss_end = .;
46 } 51 }
47 52
48 . = HEADER_OFFSET; 53 .signature : {
49 .header : { 54 *(.signature)
50 *(.header)
51 } 55 }
52 56
53 . = ALIGN(16);
54 _end = .; 57 _end = .;
55 58
56 /DISCARD/ : { 59 /DISCARD/ : {
57 *(.note*) 60 *(.note*)
58 } 61 }
59
60 /*
61 * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility:
62 */
63 . = ASSERT(_end <= WAKEUP_SIZE, "Wakeup too big!");
64} 62}
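
The linker script now places a tiny .jump stub at offset 0, the header section at WAKEUP_HEADER_OFFSET (8), and the end signature in a dedicated .signature section emitted last, instead of pinning the header at the old 0x3f00 offset and asserting a fixed WAKEUP_SIZE. Purely as a hypothetical host-side sanity check (the file name wakeup.bin and the idea of an external checker are assumptions of mine; the magic value comes from wakeup.h above), one could verify the trailing signature of the produced blob like so:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define WAKEUP_END_SIGNATURE	0x65a22c82u

/* Schematic check: the .signature section is emitted last, so the final
 * four bytes of the flat binary should hold WAKEUP_END_SIGNATURE. */
int main(int argc, char **argv)
{
	unsigned char buf[64 * 1024];
	FILE *f = fopen(argc > 1 ? argv[1] : "wakeup.bin", "rb");
	size_t n;
	uint32_t sig;

	if (!f)
		return 1;
	n = fread(buf, 1, sizeof(buf), f);
	fclose(f);
	if (n < sizeof(sig))
		return 1;

	memcpy(&sig, buf + n - sizeof(sig), sizeof(sig));
	printf("end signature: 0x%08x (%s)\n", sig,
	       sig == WAKEUP_END_SIGNATURE ? "ok" : "mismatch");
	return sig == WAKEUP_END_SIGNATURE ? 0 : 1;
}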
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 33cec152070d..103b6ab368d3 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -7,45 +7,39 @@
7 7
8#include <linux/acpi.h> 8#include <linux/acpi.h>
9#include <linux/bootmem.h> 9#include <linux/bootmem.h>
10#include <linux/memblock.h>
10#include <linux/dmi.h> 11#include <linux/dmi.h>
11#include <linux/cpumask.h> 12#include <linux/cpumask.h>
12#include <asm/segment.h> 13#include <asm/segment.h>
13#include <asm/desc.h> 14#include <asm/desc.h>
15#include <asm/pgtable.h>
16#include <asm/cacheflush.h>
14 17
15#include "realmode/wakeup.h" 18#include "realmode/wakeup.h"
16#include "sleep.h" 19#include "sleep.h"
17 20
18unsigned long acpi_wakeup_address;
19unsigned long acpi_realmode_flags; 21unsigned long acpi_realmode_flags;
20 22
21/* address in low memory of the wakeup routine. */
22static unsigned long acpi_realmode;
23
24#if defined(CONFIG_SMP) && defined(CONFIG_64BIT) 23#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
25static char temp_stack[4096]; 24static char temp_stack[4096];
26#endif 25#endif
27 26
28/** 27/**
29 * acpi_save_state_mem - save kernel state 28 * acpi_suspend_lowlevel - save kernel state
30 * 29 *
31 * Create an identity mapped page table and copy the wakeup routine to 30 * Create an identity mapped page table and copy the wakeup routine to
32 * low memory. 31 * low memory.
33 *
34 * Note that this is too late to change acpi_wakeup_address.
35 */ 32 */
36int acpi_save_state_mem(void) 33int acpi_suspend_lowlevel(void)
37{ 34{
38 struct wakeup_header *header; 35 struct wakeup_header *header;
36 /* address in low memory of the wakeup routine. */
37 char *acpi_realmode;
39 38
40 if (!acpi_realmode) { 39 acpi_realmode = TRAMPOLINE_SYM(acpi_wakeup_code);
41 printk(KERN_ERR "Could not allocate memory during boot, "
42 "S3 disabled\n");
43 return -ENOMEM;
44 }
45 memcpy((void *)acpi_realmode, &wakeup_code_start, WAKEUP_SIZE);
46 40
47 header = (struct wakeup_header *)(acpi_realmode + HEADER_OFFSET); 41 header = (struct wakeup_header *)(acpi_realmode + WAKEUP_HEADER_OFFSET);
48 if (header->signature != 0x51ee1111) { 42 if (header->signature != WAKEUP_HEADER_SIGNATURE) {
49 printk(KERN_ERR "wakeup header does not match\n"); 43 printk(KERN_ERR "wakeup header does not match\n");
50 return -EINVAL; 44 return -EINVAL;
51 } 45 }
@@ -65,9 +59,7 @@ int acpi_save_state_mem(void)
65 /* GDT[0]: GDT self-pointer */ 59 /* GDT[0]: GDT self-pointer */
66 header->wakeup_gdt[0] = 60 header->wakeup_gdt[0] =
67 (u64)(sizeof(header->wakeup_gdt) - 1) + 61 (u64)(sizeof(header->wakeup_gdt) - 1) +
68 ((u64)(acpi_wakeup_address + 62 ((u64)__pa(&header->wakeup_gdt) << 16);
69 ((char *)&header->wakeup_gdt - (char *)acpi_realmode))
70 << 16);
71 /* GDT[1]: big real mode-like code segment */ 63 /* GDT[1]: big real mode-like code segment */
72 header->wakeup_gdt[1] = 64 header->wakeup_gdt[1] =
73 GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff); 65 GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff);
@@ -85,17 +77,23 @@ int acpi_save_state_mem(void)
85 77
86 header->pmode_cr0 = read_cr0(); 78 header->pmode_cr0 = read_cr0();
87 header->pmode_cr4 = read_cr4_safe(); 79 header->pmode_cr4 = read_cr4_safe();
80 header->pmode_behavior = 0;
81 if (!rdmsr_safe(MSR_IA32_MISC_ENABLE,
82 &header->pmode_misc_en_low,
83 &header->pmode_misc_en_high))
84 header->pmode_behavior |=
85 (1 << WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE);
88 header->realmode_flags = acpi_realmode_flags; 86 header->realmode_flags = acpi_realmode_flags;
89 header->real_magic = 0x12345678; 87 header->real_magic = 0x12345678;
90 88
91#ifndef CONFIG_64BIT 89#ifndef CONFIG_64BIT
92 header->pmode_entry = (u32)&wakeup_pmode_return; 90 header->pmode_entry = (u32)&wakeup_pmode_return;
93 header->pmode_cr3 = (u32)(swsusp_pg_dir - __PAGE_OFFSET); 91 header->pmode_cr3 = (u32)__pa(&initial_page_table);
94 saved_magic = 0x12345678; 92 saved_magic = 0x12345678;
95#else /* CONFIG_64BIT */ 93#else /* CONFIG_64BIT */
96 header->trampoline_segment = setup_trampoline() >> 4; 94 header->trampoline_segment = trampoline_address() >> 4;
97#ifdef CONFIG_SMP 95#ifdef CONFIG_SMP
98 stack_start.sp = temp_stack + sizeof(temp_stack); 96 stack_start = (unsigned long)temp_stack + sizeof(temp_stack);
99 early_gdt_descr.address = 97 early_gdt_descr.address =
100 (unsigned long)get_cpu_gdt_table(smp_processor_id()); 98 (unsigned long)get_cpu_gdt_table(smp_processor_id());
101 initial_gs = per_cpu_offset(smp_processor_id()); 99 initial_gs = per_cpu_offset(smp_processor_id());
@@ -104,47 +102,10 @@ int acpi_save_state_mem(void)
104 saved_magic = 0x123456789abcdef0L; 102 saved_magic = 0x123456789abcdef0L;
105#endif /* CONFIG_64BIT */ 103#endif /* CONFIG_64BIT */
106 104
105 do_suspend_lowlevel();
107 return 0; 106 return 0;
108} 107}
109 108
110/*
111 * acpi_restore_state - undo effects of acpi_save_state_mem
112 */
113void acpi_restore_state_mem(void)
114{
115}
116
117
118/**
119 * acpi_reserve_wakeup_memory - do _very_ early ACPI initialisation
120 *
121 * We allocate a page from the first 1MB of memory for the wakeup
122 * routine for when we come back from a sleep state. The
123 * runtime allocator allows specification of <16MB pages, but not
124 * <1MB pages.
125 */
126void __init acpi_reserve_wakeup_memory(void)
127{
128 unsigned long mem;
129
130 if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) {
131 printk(KERN_ERR
132 "ACPI: Wakeup code way too big, S3 disabled.\n");
133 return;
134 }
135
136 mem = find_e820_area(0, 1<<20, WAKEUP_SIZE, PAGE_SIZE);
137
138 if (mem == -1L) {
139 printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
140 return;
141 }
142 acpi_realmode = (unsigned long) phys_to_virt(mem);
143 acpi_wakeup_address = mem;
144 reserve_early(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP");
145}
146
147
148static int __init acpi_sleep_setup(char *str) 109static int __init acpi_sleep_setup(char *str)
149{ 110{
150 while ((str != NULL) && (*str != '\0')) { 111 while ((str != NULL) && (*str != '\0')) {
@@ -157,11 +118,6 @@ static int __init acpi_sleep_setup(char *str)
157#ifdef CONFIG_HIBERNATION 118#ifdef CONFIG_HIBERNATION
158 if (strncmp(str, "s4_nohwsig", 10) == 0) 119 if (strncmp(str, "s4_nohwsig", 10) == 0)
159 acpi_no_s4_hw_signature(); 120 acpi_no_s4_hw_signature();
160 if (strncmp(str, "s4_nonvs", 8) == 0) {
161 pr_warning("ACPI: acpi_sleep=s4_nonvs is deprecated, "
162 "please use acpi_sleep=nonvs instead");
163 acpi_nvs_nosave();
164 }
165#endif 121#endif
166 if (strncmp(str, "nonvs", 5) == 0) 122 if (strncmp(str, "nonvs", 5) == 0)
167 acpi_nvs_nosave(); 123 acpi_nvs_nosave();
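
On the save side, acpi_suspend_lowlevel() above reads MSR_IA32_MISC_ENABLE with rdmsr_safe() and only sets the restore bit in pmode_behavior when that read succeeds, so the wakeup stub never writes back junk on a CPU that faults on the MSR. The guarded-save shape, as a standalone sketch (rdmsr_safe_demo is a stand-in that can report failure; 0x1a0 is the architectural MSR number):

#include <stdint.h>
#include <stdio.h>

#define WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE	0

struct demo_header {
	uint32_t pmode_misc_en_low;
	uint32_t pmode_misc_en_high;
	uint32_t pmode_behavior;
};

/* Stand-in for rdmsr_safe(): returns 0 on success, nonzero on fault. */
static int rdmsr_safe_demo(uint32_t msr, uint32_t *lo, uint32_t *hi)
{
	(void)msr;
	*lo = 0x00850089;
	*hi = 0;
	return 0;
}

static void save_misc_enable(struct demo_header *h)
{
	h->pmode_behavior = 0;
	if (!rdmsr_safe_demo(0x1a0 /* MSR_IA32_MISC_ENABLE */,
			     &h->pmode_misc_en_low, &h->pmode_misc_en_high))
		h->pmode_behavior |= 1u << WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE;
}

int main(void)
{
	struct demo_header h;

	save_misc_enable(&h);
	printf("behavior flags: 0x%x\n", h.pmode_behavior);
	return 0;
}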
diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h
index adbcbaa6f1df..416d4be13fef 100644
--- a/arch/x86/kernel/acpi/sleep.h
+++ b/arch/x86/kernel/acpi/sleep.h
@@ -4,13 +4,12 @@
4 4
5#include <asm/trampoline.h> 5#include <asm/trampoline.h>
6 6
7extern char wakeup_code_start, wakeup_code_end;
8
9extern unsigned long saved_video_mode; 7extern unsigned long saved_video_mode;
10extern long saved_magic; 8extern long saved_magic;
11 9
12extern int wakeup_pmode_return; 10extern int wakeup_pmode_return;
13extern char swsusp_pg_dir[PAGE_SIZE];
14 11
15extern unsigned long acpi_copy_wakeup_routine(unsigned long); 12extern unsigned long acpi_copy_wakeup_routine(unsigned long);
16extern void wakeup_long64(void); 13extern void wakeup_long64(void);
14
15extern void do_suspend_lowlevel(void);
diff --git a/arch/x86/kernel/acpi/wakeup_rm.S b/arch/x86/kernel/acpi/wakeup_rm.S
index 6ff3b5730575..63b8ab524f2c 100644
--- a/arch/x86/kernel/acpi/wakeup_rm.S
+++ b/arch/x86/kernel/acpi/wakeup_rm.S
@@ -2,9 +2,11 @@
2 * Wrapper script for the realmode binary as a transport object 2 * Wrapper script for the realmode binary as a transport object
3 * before copying to low memory. 3 * before copying to low memory.
4 */ 4 */
5 .section ".rodata","a" 5#include <asm/page_types.h>
6 .globl wakeup_code_start, wakeup_code_end 6
7wakeup_code_start: 7 .section ".x86_trampoline","a"
8 .balign PAGE_SIZE
9 .globl acpi_wakeup_code
10acpi_wakeup_code:
8 .incbin "arch/x86/kernel/acpi/realmode/wakeup.bin" 11 .incbin "arch/x86/kernel/acpi/realmode/wakeup.bin"
9wakeup_code_end: 12 .size acpi_wakeup_code, .-acpi_wakeup_code
10 .size wakeup_code_start, .-wakeup_code_start
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index f65ab8b014c4..a81f2d52f869 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -67,17 +67,30 @@ __setup("noreplace-paravirt", setup_noreplace_paravirt);
67#define DPRINTK(fmt, args...) if (debug_alternative) \ 67#define DPRINTK(fmt, args...) if (debug_alternative) \
68 printk(KERN_DEBUG fmt, args) 68 printk(KERN_DEBUG fmt, args)
69 69
70/*
71 * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes
72 * that correspond to that nop. Getting from one nop to the next, we
73 * add to the array the offset that is equal to the sum of all sizes of
74 * nops preceding the one we are after.
75 *
76 * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the
77 * nice symmetry of sizes of the previous nops.
78 */
70#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64) 79#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64)
71/* Use inline assembly to define this because the nops are defined 80static const unsigned char intelnops[] =
72 as inline assembly strings in the include files and we cannot 81{
73 get them easily into strings. */ 82 GENERIC_NOP1,
74asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nintelnops: " 83 GENERIC_NOP2,
75 GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6 84 GENERIC_NOP3,
76 GENERIC_NOP7 GENERIC_NOP8 85 GENERIC_NOP4,
77 "\t.previous"); 86 GENERIC_NOP5,
78extern const unsigned char intelnops[]; 87 GENERIC_NOP6,
79static const unsigned char *const __initconst_or_module 88 GENERIC_NOP7,
80intel_nops[ASM_NOP_MAX+1] = { 89 GENERIC_NOP8,
90 GENERIC_NOP5_ATOMIC
91};
92static const unsigned char * const intel_nops[ASM_NOP_MAX+2] =
93{
81 NULL, 94 NULL,
82 intelnops, 95 intelnops,
83 intelnops + 1, 96 intelnops + 1,
@@ -87,17 +100,25 @@ intel_nops[ASM_NOP_MAX+1] = {
87 intelnops + 1 + 2 + 3 + 4 + 5, 100 intelnops + 1 + 2 + 3 + 4 + 5,
88 intelnops + 1 + 2 + 3 + 4 + 5 + 6, 101 intelnops + 1 + 2 + 3 + 4 + 5 + 6,
89 intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7, 102 intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
103 intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
90}; 104};
91#endif 105#endif
92 106
93#ifdef K8_NOP1 107#ifdef K8_NOP1
94asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk8nops: " 108static const unsigned char k8nops[] =
95 K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 109{
96 K8_NOP7 K8_NOP8 110 K8_NOP1,
97 "\t.previous"); 111 K8_NOP2,
98extern const unsigned char k8nops[]; 112 K8_NOP3,
99static const unsigned char *const __initconst_or_module 113 K8_NOP4,
100k8_nops[ASM_NOP_MAX+1] = { 114 K8_NOP5,
115 K8_NOP6,
116 K8_NOP7,
117 K8_NOP8,
118 K8_NOP5_ATOMIC
119};
120static const unsigned char * const k8_nops[ASM_NOP_MAX+2] =
121{
101 NULL, 122 NULL,
102 k8nops, 123 k8nops,
103 k8nops + 1, 124 k8nops + 1,
@@ -107,17 +128,25 @@ k8_nops[ASM_NOP_MAX+1] = {
107 k8nops + 1 + 2 + 3 + 4 + 5, 128 k8nops + 1 + 2 + 3 + 4 + 5,
108 k8nops + 1 + 2 + 3 + 4 + 5 + 6, 129 k8nops + 1 + 2 + 3 + 4 + 5 + 6,
109 k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, 130 k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
131 k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
110}; 132};
111#endif 133#endif
112 134
113#if defined(K7_NOP1) && !defined(CONFIG_X86_64) 135#if defined(K7_NOP1) && !defined(CONFIG_X86_64)
114asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk7nops: " 136static const unsigned char k7nops[] =
115 K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6 137{
116 K7_NOP7 K7_NOP8 138 K7_NOP1,
117 "\t.previous"); 139 K7_NOP2,
118extern const unsigned char k7nops[]; 140 K7_NOP3,
119static const unsigned char *const __initconst_or_module 141 K7_NOP4,
120k7_nops[ASM_NOP_MAX+1] = { 142 K7_NOP5,
143 K7_NOP6,
144 K7_NOP7,
145 K7_NOP8,
146 K7_NOP5_ATOMIC
147};
148static const unsigned char * const k7_nops[ASM_NOP_MAX+2] =
149{
121 NULL, 150 NULL,
122 k7nops, 151 k7nops,
123 k7nops + 1, 152 k7nops + 1,
@@ -127,17 +156,25 @@ k7_nops[ASM_NOP_MAX+1] = {
127 k7nops + 1 + 2 + 3 + 4 + 5, 156 k7nops + 1 + 2 + 3 + 4 + 5,
128 k7nops + 1 + 2 + 3 + 4 + 5 + 6, 157 k7nops + 1 + 2 + 3 + 4 + 5 + 6,
129 k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, 158 k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
159 k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
130}; 160};
131#endif 161#endif
132 162
133#ifdef P6_NOP1 163#ifdef P6_NOP1
134asm("\t" __stringify(__INITRODATA_OR_MODULE) "\np6nops: " 164static const unsigned char __initconst_or_module p6nops[] =
135 P6_NOP1 P6_NOP2 P6_NOP3 P6_NOP4 P6_NOP5 P6_NOP6 165{
136 P6_NOP7 P6_NOP8 166 P6_NOP1,
137 "\t.previous"); 167 P6_NOP2,
138extern const unsigned char p6nops[]; 168 P6_NOP3,
139static const unsigned char *const __initconst_or_module 169 P6_NOP4,
140p6_nops[ASM_NOP_MAX+1] = { 170 P6_NOP5,
171 P6_NOP6,
172 P6_NOP7,
173 P6_NOP8,
174 P6_NOP5_ATOMIC
175};
176static const unsigned char * const p6_nops[ASM_NOP_MAX+2] =
177{
141 NULL, 178 NULL,
142 p6nops, 179 p6nops,
143 p6nops + 1, 180 p6nops + 1,
@@ -147,47 +184,65 @@ p6_nops[ASM_NOP_MAX+1] = {
147 p6nops + 1 + 2 + 3 + 4 + 5, 184 p6nops + 1 + 2 + 3 + 4 + 5,
148 p6nops + 1 + 2 + 3 + 4 + 5 + 6, 185 p6nops + 1 + 2 + 3 + 4 + 5 + 6,
149 p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, 186 p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
187 p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
150}; 188};
151#endif 189#endif
152 190
191/* Initialize these to a safe default */
153#ifdef CONFIG_X86_64 192#ifdef CONFIG_X86_64
193const unsigned char * const *ideal_nops = p6_nops;
194#else
195const unsigned char * const *ideal_nops = intel_nops;
196#endif
154 197
155extern char __vsyscall_0; 198void __init arch_init_ideal_nops(void)
156static const unsigned char *const *__init_or_module find_nop_table(void)
157{ 199{
158 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && 200 switch (boot_cpu_data.x86_vendor) {
159 boot_cpu_has(X86_FEATURE_NOPL)) 201 case X86_VENDOR_INTEL:
160 return p6_nops; 202 /*
161 else 203 * Due to a decoder implementation quirk, some
162 return k8_nops; 204 * specific Intel CPUs actually perform better with
163} 205 * the "k8_nops" than with the SDM-recommended NOPs.
164 206 */
165#else /* CONFIG_X86_64 */ 207 if (boot_cpu_data.x86 == 6 &&
208 boot_cpu_data.x86_model >= 0x0f &&
209 boot_cpu_data.x86_model != 0x1c &&
210 boot_cpu_data.x86_model != 0x26 &&
211 boot_cpu_data.x86_model != 0x27 &&
212 boot_cpu_data.x86_model < 0x30) {
213 ideal_nops = k8_nops;
214 } else if (boot_cpu_has(X86_FEATURE_NOPL)) {
215 ideal_nops = p6_nops;
216 } else {
217#ifdef CONFIG_X86_64
218 ideal_nops = k8_nops;
219#else
220 ideal_nops = intel_nops;
221#endif
222 }
166 223
167static const unsigned char *const *__init_or_module find_nop_table(void) 224 default:
168{ 225#ifdef CONFIG_X86_64
169 if (boot_cpu_has(X86_FEATURE_K8)) 226 ideal_nops = k8_nops;
170 return k8_nops; 227#else
171 else if (boot_cpu_has(X86_FEATURE_K7)) 228 if (boot_cpu_has(X86_FEATURE_K8))
172 return k7_nops; 229 ideal_nops = k8_nops;
173 else if (boot_cpu_has(X86_FEATURE_NOPL)) 230 else if (boot_cpu_has(X86_FEATURE_K7))
174 return p6_nops; 231 ideal_nops = k7_nops;
175 else 232 else
176 return intel_nops; 233 ideal_nops = intel_nops;
234#endif
235 }
177} 236}
178 237
179#endif /* CONFIG_X86_64 */
180
181/* Use this to add nops to a buffer, then text_poke the whole buffer. */ 238/* Use this to add nops to a buffer, then text_poke the whole buffer. */
182static void __init_or_module add_nops(void *insns, unsigned int len) 239static void __init_or_module add_nops(void *insns, unsigned int len)
183{ 240{
184 const unsigned char *const *noptable = find_nop_table();
185
186 while (len > 0) { 241 while (len > 0) {
187 unsigned int noplen = len; 242 unsigned int noplen = len;
188 if (noplen > ASM_NOP_MAX) 243 if (noplen > ASM_NOP_MAX)
189 noplen = ASM_NOP_MAX; 244 noplen = ASM_NOP_MAX;
190 memcpy(insns, noptable[noplen], noplen); 245 memcpy(insns, ideal_nops[noplen], noplen);
191 insns += noplen; 246 insns += noplen;
192 len -= noplen; 247 len -= noplen;
193 } 248 }
@@ -195,11 +250,12 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
195 250
196extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; 251extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
197extern s32 __smp_locks[], __smp_locks_end[]; 252extern s32 __smp_locks[], __smp_locks_end[];
198static void *text_poke_early(void *addr, const void *opcode, size_t len); 253extern char __vsyscall_0;
254void *text_poke_early(void *addr, const void *opcode, size_t len);
199 255
200/* Replace instructions with better alternatives for this CPU type. 256/* Replace instructions with better alternatives for this CPU type.
201 This runs before SMP is initialized to avoid SMP problems with 257 This runs before SMP is initialized to avoid SMP problems with
202 self modifying code. This implies that assymetric systems where 258 self modifying code. This implies that asymmetric systems where
203 APs have less capabilities than the boot processor are not handled. 259 APs have less capabilities than the boot processor are not handled.
204 Tough. Make sure you disable such features by hand. */ 260 Tough. Make sure you disable such features by hand. */
205 261
@@ -210,6 +266,15 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
210 u8 insnbuf[MAX_PATCH_LEN]; 266 u8 insnbuf[MAX_PATCH_LEN];
211 267
212 DPRINTK("%s: alt table %p -> %p\n", __func__, start, end); 268 DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
269 /*
270 * The scan order should be from start to end. A later scanned
271 * alternative code can overwrite a previous scanned alternative code.
272 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
273 * patch code.
274 *
275 * So be careful if you want to change the scan order to any other
276 * order.
277 */
213 for (a = start; a < end; a++) { 278 for (a = start; a < end; a++) {
214 u8 *instr = a->instr; 279 u8 *instr = a->instr;
215 BUG_ON(a->replacementlen > a->instrlen); 280 BUG_ON(a->replacementlen > a->instrlen);
@@ -353,6 +418,7 @@ void __init_or_module alternatives_smp_module_del(struct module *mod)
353 mutex_unlock(&smp_alt); 418 mutex_unlock(&smp_alt);
354} 419}
355 420
421bool skip_smp_alternatives;
356void alternatives_smp_switch(int smp) 422void alternatives_smp_switch(int smp)
357{ 423{
358 struct smp_alt_module *mod; 424 struct smp_alt_module *mod;
@@ -368,7 +434,7 @@ void alternatives_smp_switch(int smp)
368 printk("lockdep: fixing up alternatives.\n"); 434 printk("lockdep: fixing up alternatives.\n");
369#endif 435#endif
370 436
371 if (noreplace_smp || smp_alt_once) 437 if (noreplace_smp || smp_alt_once || skip_smp_alternatives)
372 return; 438 return;
373 BUG_ON(!smp && (num_online_cpus() > 1)); 439 BUG_ON(!smp && (num_online_cpus() > 1));
374 440
@@ -522,7 +588,7 @@ void __init alternative_instructions(void)
522 * instructions. And on the local CPU you need to be protected again NMI or MCE 588 * instructions. And on the local CPU you need to be protected again NMI or MCE
523 * handlers seeing an inconsistent instruction while you patch. 589 * handlers seeing an inconsistent instruction while you patch.
524 */ 590 */
525static void *__init_or_module text_poke_early(void *addr, const void *opcode, 591void *__init_or_module text_poke_early(void *addr, const void *opcode,
526 size_t len) 592 size_t len)
527{ 593{
528 unsigned long flags; 594 unsigned long flags;
@@ -591,17 +657,21 @@ static atomic_t stop_machine_first;
591static int wrote_text; 657static int wrote_text;
592 658
593struct text_poke_params { 659struct text_poke_params {
594 void *addr; 660 struct text_poke_param *params;
595 const void *opcode; 661 int nparams;
596 size_t len;
597}; 662};
598 663
599static int __kprobes stop_machine_text_poke(void *data) 664static int __kprobes stop_machine_text_poke(void *data)
600{ 665{
601 struct text_poke_params *tpp = data; 666 struct text_poke_params *tpp = data;
667 struct text_poke_param *p;
668 int i;
602 669
603 if (atomic_dec_and_test(&stop_machine_first)) { 670 if (atomic_dec_and_test(&stop_machine_first)) {
604 text_poke(tpp->addr, tpp->opcode, tpp->len); 671 for (i = 0; i < tpp->nparams; i++) {
672 p = &tpp->params[i];
673 text_poke(p->addr, p->opcode, p->len);
674 }
605 smp_wmb(); /* Make sure other cpus see that this has run */ 675 smp_wmb(); /* Make sure other cpus see that this has run */
606 wrote_text = 1; 676 wrote_text = 1;
607 } else { 677 } else {
@@ -610,8 +680,17 @@ static int __kprobes stop_machine_text_poke(void *data)
610 smp_mb(); /* Load wrote_text before following execution */ 680 smp_mb(); /* Load wrote_text before following execution */
611 } 681 }
612 682
613 flush_icache_range((unsigned long)tpp->addr, 683 for (i = 0; i < tpp->nparams; i++) {
614 (unsigned long)tpp->addr + tpp->len); 684 p = &tpp->params[i];
685 flush_icache_range((unsigned long)p->addr,
686 (unsigned long)p->addr + p->len);
687 }
688 /*
689 * Intel Archiecture Software Developer's Manual section 7.1.3 specifies
690 * that a core serializing instruction such as "cpuid" should be
691 * executed on _each_ core before the new instruction is made visible.
692 */
693 sync_core();
615 return 0; 694 return 0;
616} 695}
617 696
@@ -631,13 +710,36 @@ static int __kprobes stop_machine_text_poke(void *data)
631void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len) 710void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
632{ 711{
633 struct text_poke_params tpp; 712 struct text_poke_params tpp;
713 struct text_poke_param p;
634 714
635 tpp.addr = addr; 715 p.addr = addr;
636 tpp.opcode = opcode; 716 p.opcode = opcode;
637 tpp.len = len; 717 p.len = len;
718 tpp.params = &p;
719 tpp.nparams = 1;
638 atomic_set(&stop_machine_first, 1); 720 atomic_set(&stop_machine_first, 1);
639 wrote_text = 0; 721 wrote_text = 0;
640 stop_machine(stop_machine_text_poke, (void *)&tpp, NULL); 722 /* Use __stop_machine() because the caller already got online_cpus. */
723 __stop_machine(stop_machine_text_poke, (void *)&tpp, cpu_online_mask);
641 return addr; 724 return addr;
642} 725}
643 726
727/**
728 * text_poke_smp_batch - Update instructions on a live kernel on SMP
729 * @params: an array of text_poke parameters
730 * @n: the number of elements in params.
731 *
732 * Modify multi-byte instructions by using stop_machine() on SMP. Since
733 * stop_machine() is a heavy operation, it is better to aggregate text_poke
734 * requests and issue them in a single batch if possible.
735 *
736 * Note: Must be called under get_online_cpus() and text_mutex.
737 */
738void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n)
739{
740 struct text_poke_params tpp = {.params = params, .nparams = n};
741
742 atomic_set(&stop_machine_first, 1);
743 wrote_text = 0;
744 __stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
745}
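
To make the intended use of the new batch interface concrete, here is a minimal caller-side sketch (not taken from this patch): the jump_sites[] array and NR_SITES are hypothetical, and the locking mirrors the requirement stated in the kerneldoc above.

/* Hypothetical caller of text_poke_smp_batch(); jump_sites[] and
 * NR_SITES are invented for this sketch. */
#include <linux/cpu.h>
#include <linux/memory.h>	/* text_mutex */
#include <asm/alternative.h>	/* struct text_poke_param */

static struct text_poke_param params[NR_SITES];

static void patch_all_sites(void)
{
	int i;

	for (i = 0; i < NR_SITES; i++) {
		params[i].addr   = jump_sites[i].addr;
		params[i].opcode = jump_sites[i].new_insn;
		params[i].len    = jump_sites[i].len;
	}

	get_online_cpus();			/* required by text_poke_smp_batch() */
	mutex_lock(&text_mutex);
	text_poke_smp_batch(params, NR_SITES);	/* one stop_machine() for all sites */
	mutex_unlock(&text_mutex);
	put_online_cpus();
}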
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/amd_gart_64.c
index 0f7f130caa67..b117efd24f71 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -27,7 +27,7 @@
27#include <linux/kdebug.h> 27#include <linux/kdebug.h>
28#include <linux/scatterlist.h> 28#include <linux/scatterlist.h>
29#include <linux/iommu-helper.h> 29#include <linux/iommu-helper.h>
30#include <linux/sysdev.h> 30#include <linux/syscore_ops.h>
31#include <linux/io.h> 31#include <linux/io.h>
32#include <linux/gfp.h> 32#include <linux/gfp.h>
33#include <asm/atomic.h> 33#include <asm/atomic.h>
@@ -39,8 +39,9 @@
39#include <asm/cacheflush.h> 39#include <asm/cacheflush.h>
40#include <asm/swiotlb.h> 40#include <asm/swiotlb.h>
41#include <asm/dma.h> 41#include <asm/dma.h>
42#include <asm/k8.h> 42#include <asm/amd_nb.h>
43#include <asm/x86_init.h> 43#include <asm/x86_init.h>
44#include <asm/iommu_table.h>
44 45
45static unsigned long iommu_bus_base; /* GART remapping area (physical) */ 46static unsigned long iommu_bus_base; /* GART remapping area (physical) */
46static unsigned long iommu_size; /* size of remapping area bytes */ 47static unsigned long iommu_size; /* size of remapping area bytes */
@@ -80,6 +81,9 @@ static u32 gart_unmapped_entry;
80#define AGPEXTERN 81#define AGPEXTERN
81#endif 82#endif
82 83
84/* GART can only remap to physical addresses < 1TB */
85#define GART_MAX_PHYS_ADDR (1ULL << 40)
86
83/* backdoor interface to AGP driver */ 87/* backdoor interface to AGP driver */
84AGPEXTERN int agp_memory_reserved; 88AGPEXTERN int agp_memory_reserved;
85AGPEXTERN __u32 *agp_gatt_table; 89AGPEXTERN __u32 *agp_gatt_table;
@@ -142,7 +146,7 @@ static void flush_gart(void)
142 146
143 spin_lock_irqsave(&iommu_bitmap_lock, flags); 147 spin_lock_irqsave(&iommu_bitmap_lock, flags);
144 if (need_flush) { 148 if (need_flush) {
145 k8_flush_garts(); 149 amd_flush_garts();
146 need_flush = false; 150 need_flush = false;
147 } 151 }
148 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 152 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
@@ -211,9 +215,13 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
211 size_t size, int dir, unsigned long align_mask) 215 size_t size, int dir, unsigned long align_mask)
212{ 216{
213 unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE); 217 unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE);
214 unsigned long iommu_page = alloc_iommu(dev, npages, align_mask); 218 unsigned long iommu_page;
215 int i; 219 int i;
216 220
221 if (unlikely(phys_mem + size > GART_MAX_PHYS_ADDR))
222 return bad_dma_addr;
223
224 iommu_page = alloc_iommu(dev, npages, align_mask);
217 if (iommu_page == -1) { 225 if (iommu_page == -1) {
218 if (!nonforced_iommu(dev, phys_mem, size)) 226 if (!nonforced_iommu(dev, phys_mem, size))
219 return phys_mem; 227 return phys_mem;
@@ -560,14 +568,17 @@ static void enable_gart_translations(void)
560{ 568{
561 int i; 569 int i;
562 570
563 for (i = 0; i < num_k8_northbridges; i++) { 571 if (!amd_nb_has_feature(AMD_NB_GART))
564 struct pci_dev *dev = k8_northbridges[i]; 572 return;
573
574 for (i = 0; i < amd_nb_num(); i++) {
575 struct pci_dev *dev = node_to_amd_nb(i)->misc;
565 576
566 enable_gart_translation(dev, __pa(agp_gatt_table)); 577 enable_gart_translation(dev, __pa(agp_gatt_table));
567 } 578 }
568 579
569 /* Flush the GART-TLB to remove stale entries */ 580 /* Flush the GART-TLB to remove stale entries */
570 k8_flush_garts(); 581 amd_flush_garts();
571} 582}
572 583
573/* 584/*
@@ -585,72 +596,62 @@ void set_up_gart_resume(u32 aper_order, u32 aper_alloc)
585 aperture_alloc = aper_alloc; 596 aperture_alloc = aper_alloc;
586} 597}
587 598
588static void gart_fixup_northbridges(struct sys_device *dev) 599static void gart_fixup_northbridges(void)
589{ 600{
590 int i; 601 int i;
591 602
592 if (!fix_up_north_bridges) 603 if (!fix_up_north_bridges)
593 return; 604 return;
594 605
606 if (!amd_nb_has_feature(AMD_NB_GART))
607 return;
608
595 pr_info("PCI-DMA: Restoring GART aperture settings\n"); 609 pr_info("PCI-DMA: Restoring GART aperture settings\n");
596 610
597 for (i = 0; i < num_k8_northbridges; i++) { 611 for (i = 0; i < amd_nb_num(); i++) {
598 struct pci_dev *dev = k8_northbridges[i]; 612 struct pci_dev *dev = node_to_amd_nb(i)->misc;
599 613
600 /* 614 /*
601 * Don't enable translations just yet. That is the next 615 * Don't enable translations just yet. That is the next
602 * step. Restore the pre-suspend aperture settings. 616 * step. Restore the pre-suspend aperture settings.
603 */ 617 */
604 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, aperture_order << 1); 618 gart_set_size_and_enable(dev, aperture_order);
605 pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25); 619 pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25);
606 } 620 }
607} 621}
608 622
609static int gart_resume(struct sys_device *dev) 623static void gart_resume(void)
610{ 624{
611 pr_info("PCI-DMA: Resuming GART IOMMU\n"); 625 pr_info("PCI-DMA: Resuming GART IOMMU\n");
612 626
613 gart_fixup_northbridges(dev); 627 gart_fixup_northbridges();
614 628
615 enable_gart_translations(); 629 enable_gart_translations();
616
617 return 0;
618} 630}
619 631
620static int gart_suspend(struct sys_device *dev, pm_message_t state) 632static struct syscore_ops gart_syscore_ops = {
621{
622 return 0;
623}
624
625static struct sysdev_class gart_sysdev_class = {
626 .name = "gart",
627 .suspend = gart_suspend,
628 .resume = gart_resume, 633 .resume = gart_resume,
629 634
630}; 635};
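
For readers unfamiliar with the interface this hunk migrates to, a minimal syscore_ops sketch follows; the foo names are placeholders, not code from this patch. Unlike the removed sysdev pair, syscore callbacks take no device argument, the resume hook returns void, and registration cannot fail.

#include <linux/syscore_ops.h>

static int foo_suspend(void)
{
	/* save any state the hardware loses across suspend */
	return 0;
}

static void foo_resume(void)
{
	/* reprogram the hardware from the saved state */
}

static struct syscore_ops foo_syscore_ops = {
	.suspend = foo_suspend,		/* optional, may be omitted */
	.resume  = foo_resume,
};

static int __init foo_init(void)
{
	register_syscore_ops(&foo_syscore_ops);	/* returns void, cannot fail */
	return 0;
}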
631 636
632static struct sys_device device_gart = {
633 .cls = &gart_sysdev_class,
634};
635
636/* 637/*
637 * Private Northbridge GATT initialization in case we cannot use the 638 * Private Northbridge GATT initialization in case we cannot use the
638 * AGP driver for some reason. 639 * AGP driver for some reason.
639 */ 640 */
640static __init int init_k8_gatt(struct agp_kern_info *info) 641static __init int init_amd_gatt(struct agp_kern_info *info)
641{ 642{
642 unsigned aper_size, gatt_size, new_aper_size; 643 unsigned aper_size, gatt_size, new_aper_size;
643 unsigned aper_base, new_aper_base; 644 unsigned aper_base, new_aper_base;
644 struct pci_dev *dev; 645 struct pci_dev *dev;
645 void *gatt; 646 void *gatt;
646 int i, error; 647 int i;
647 648
648 pr_info("PCI-DMA: Disabling AGP.\n"); 649 pr_info("PCI-DMA: Disabling AGP.\n");
649 650
650 aper_size = aper_base = info->aper_size = 0; 651 aper_size = aper_base = info->aper_size = 0;
651 dev = NULL; 652 dev = NULL;
652 for (i = 0; i < num_k8_northbridges; i++) { 653 for (i = 0; i < amd_nb_num(); i++) {
653 dev = k8_northbridges[i]; 654 dev = node_to_amd_nb(i)->misc;
654 new_aper_base = read_aperture(dev, &new_aper_size); 655 new_aper_base = read_aperture(dev, &new_aper_size);
655 if (!new_aper_base) 656 if (!new_aper_base)
656 goto nommu; 657 goto nommu;
@@ -678,12 +679,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
678 679
679 agp_gatt_table = gatt; 680 agp_gatt_table = gatt;
680 681
681 error = sysdev_class_register(&gart_sysdev_class); 682 register_syscore_ops(&gart_syscore_ops);
682 if (!error)
683 error = sysdev_register(&device_gart);
684 if (error)
685 panic("Could not register gart_sysdev -- "
686 "would corrupt data on next suspend");
687 683
688 flush_gart(); 684 flush_gart();
689 685
@@ -718,10 +714,13 @@ static void gart_iommu_shutdown(void)
718 if (!no_agp) 714 if (!no_agp)
719 return; 715 return;
720 716
721 for (i = 0; i < num_k8_northbridges; i++) { 717 if (!amd_nb_has_feature(AMD_NB_GART))
718 return;
719
720 for (i = 0; i < amd_nb_num(); i++) {
722 u32 ctl; 721 u32 ctl;
723 722
724 dev = k8_northbridges[i]; 723 dev = node_to_amd_nb(i)->misc;
725 pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); 724 pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
726 725
727 ctl &= ~GARTEN; 726 ctl &= ~GARTEN;
@@ -739,14 +738,14 @@ int __init gart_iommu_init(void)
739 unsigned long scratch; 738 unsigned long scratch;
740 long i; 739 long i;
741 740
742 if (num_k8_northbridges == 0) 741 if (!amd_nb_has_feature(AMD_NB_GART))
743 return 0; 742 return 0;
744 743
745#ifndef CONFIG_AGP_AMD64 744#ifndef CONFIG_AGP_AMD64
746 no_agp = 1; 745 no_agp = 1;
747#else 746#else
748 /* Makefile puts PCI initialization via subsys_initcall first. */ 747 /* Makefile puts PCI initialization via subsys_initcall first. */
749 /* Add other K8 AGP bridge drivers here */ 748 /* Add other AMD AGP bridge drivers here */
750 no_agp = no_agp || 749 no_agp = no_agp ||
751 (agp_amd64_init() < 0) || 750 (agp_amd64_init() < 0) ||
752 (agp_copy_info(agp_bridge, &info) < 0); 751 (agp_copy_info(agp_bridge, &info) < 0);
@@ -755,7 +754,7 @@ int __init gart_iommu_init(void)
755 if (no_iommu || 754 if (no_iommu ||
756 (!force_iommu && max_pfn <= MAX_DMA32_PFN) || 755 (!force_iommu && max_pfn <= MAX_DMA32_PFN) ||
757 !gart_iommu_aperture || 756 !gart_iommu_aperture ||
758 (no_agp && init_k8_gatt(&info) < 0)) { 757 (no_agp && init_amd_gatt(&info) < 0)) {
759 if (max_pfn > MAX_DMA32_PFN) { 758 if (max_pfn > MAX_DMA32_PFN) {
760 pr_warning("More than 4GB of memory but GART IOMMU not available.\n"); 759 pr_warning("More than 4GB of memory but GART IOMMU not available.\n");
761 pr_warning("falling back to iommu=soft.\n"); 760 pr_warning("falling back to iommu=soft.\n");
@@ -896,3 +895,4 @@ void __init gart_parse_options(char *p)
896 } 895 }
897 } 896 }
898} 897}
898IOMMU_INIT_POST(gart_iommu_hole_init);
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 679b6450382b..7c3a95e54ec5 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. 2 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com> 4 * Leo Duran <leo.duran@amd.com>
5 * 5 *
@@ -18,6 +18,7 @@
18 */ 18 */
19 19
20#include <linux/pci.h> 20#include <linux/pci.h>
21#include <linux/pci-ats.h>
21#include <linux/bitmap.h> 22#include <linux/bitmap.h>
22#include <linux/slab.h> 23#include <linux/slab.h>
23#include <linux/debugfs.h> 24#include <linux/debugfs.h>
@@ -25,16 +26,18 @@
25#include <linux/dma-mapping.h> 26#include <linux/dma-mapping.h>
26#include <linux/iommu-helper.h> 27#include <linux/iommu-helper.h>
27#include <linux/iommu.h> 28#include <linux/iommu.h>
29#include <linux/delay.h>
28#include <asm/proto.h> 30#include <asm/proto.h>
29#include <asm/iommu.h> 31#include <asm/iommu.h>
30#include <asm/gart.h> 32#include <asm/gart.h>
33#include <asm/dma.h>
31#include <asm/amd_iommu_proto.h> 34#include <asm/amd_iommu_proto.h>
32#include <asm/amd_iommu_types.h> 35#include <asm/amd_iommu_types.h>
33#include <asm/amd_iommu.h> 36#include <asm/amd_iommu.h>
34 37
35#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28)) 38#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
36 39
37#define EXIT_LOOP_COUNT 10000000 40#define LOOP_TIMEOUT 100000
38 41
39static DEFINE_RWLOCK(amd_iommu_devtable_lock); 42static DEFINE_RWLOCK(amd_iommu_devtable_lock);
40 43
@@ -57,7 +60,6 @@ struct iommu_cmd {
57 u32 data[4]; 60 u32 data[4];
58}; 61};
59 62
60static void reset_iommu_command_buffer(struct amd_iommu *iommu);
61static void update_domain(struct protection_domain *domain); 63static void update_domain(struct protection_domain *domain);
62 64
63/**************************************************************************** 65/****************************************************************************
@@ -153,6 +155,10 @@ static int iommu_init_device(struct device *dev)
153 pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff); 155 pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff);
154 if (pdev) 156 if (pdev)
155 dev_data->alias = &pdev->dev; 157 dev_data->alias = &pdev->dev;
158 else {
159 kfree(dev_data);
160 return -ENOTSUPP;
161 }
156 162
157 atomic_set(&dev_data->bind, 0); 163 atomic_set(&dev_data->bind, 0);
158 164
@@ -162,6 +168,20 @@ static int iommu_init_device(struct device *dev)
162 return 0; 168 return 0;
163} 169}
164 170
171static void iommu_ignore_device(struct device *dev)
172{
173 u16 devid, alias;
174
175 devid = get_device_id(dev);
176 alias = amd_iommu_alias_table[devid];
177
178 memset(&amd_iommu_dev_table[devid], 0, sizeof(struct dev_table_entry));
179 memset(&amd_iommu_dev_table[alias], 0, sizeof(struct dev_table_entry));
180
181 amd_iommu_rlookup_table[devid] = NULL;
182 amd_iommu_rlookup_table[alias] = NULL;
183}
184
165static void iommu_uninit_device(struct device *dev) 185static void iommu_uninit_device(struct device *dev)
166{ 186{
167 kfree(dev->archdata.iommu); 187 kfree(dev->archdata.iommu);
@@ -191,7 +211,9 @@ int __init amd_iommu_init_devices(void)
191 continue; 211 continue;
192 212
193 ret = iommu_init_device(&pdev->dev); 213 ret = iommu_init_device(&pdev->dev);
194 if (ret) 214 if (ret == -ENOTSUPP)
215 iommu_ignore_device(&pdev->dev);
216 else if (ret)
195 goto out_free; 217 goto out_free;
196 } 218 }
197 219
@@ -322,8 +344,6 @@ static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
322 break; 344 break;
323 case EVENT_TYPE_ILL_CMD: 345 case EVENT_TYPE_ILL_CMD:
324 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address); 346 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
325 iommu->reset_in_progress = true;
326 reset_iommu_command_buffer(iommu);
327 dump_command(address); 347 dump_command(address);
328 break; 348 break;
329 case EVENT_TYPE_CMD_HARD_ERR: 349 case EVENT_TYPE_CMD_HARD_ERR:
@@ -367,7 +387,7 @@ static void iommu_poll_events(struct amd_iommu *iommu)
367 spin_unlock_irqrestore(&iommu->lock, flags); 387 spin_unlock_irqrestore(&iommu->lock, flags);
368} 388}
369 389
370irqreturn_t amd_iommu_int_handler(int irq, void *data) 390irqreturn_t amd_iommu_int_thread(int irq, void *data)
371{ 391{
372 struct amd_iommu *iommu; 392 struct amd_iommu *iommu;
373 393
@@ -377,192 +397,300 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data)
377 return IRQ_HANDLED; 397 return IRQ_HANDLED;
378} 398}
379 399
400irqreturn_t amd_iommu_int_handler(int irq, void *data)
401{
402 return IRQ_WAKE_THREAD;
403}
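
The hard handler above only wakes the threaded handler; the heavy event polling now runs in amd_iommu_int_thread(). A generic sketch of this split is shown below with made-up foo names; the actual request_threaded_irq() wiring for AMD-Vi appears in the amd_iommu_init.c hunk later in this patch.

#include <linux/interrupt.h>

static irqreturn_t foo_hardirq(int irq, void *data)
{
	/* hard interrupt context: defer all real work to the thread */
	return IRQ_WAKE_THREAD;
}

static irqreturn_t foo_thread_fn(int irq, void *data)
{
	/* runs in a kernel thread and may therefore sleep */
	return IRQ_HANDLED;
}

static int foo_setup_irq(unsigned int irq, void *dev)
{
	return request_threaded_irq(irq, foo_hardirq, foo_thread_fn,
				    0, "foo", dev);
}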
404
380/**************************************************************************** 405/****************************************************************************
381 * 406 *
382 * IOMMU command queuing functions 407 * IOMMU command queuing functions
383 * 408 *
384 ****************************************************************************/ 409 ****************************************************************************/
385 410
386/* 411static int wait_on_sem(volatile u64 *sem)
387 * Writes the command to the IOMMUs command buffer and informs the 412{
388 * hardware about the new command. Must be called with iommu->lock held. 413 int i = 0;
389 */ 414
390static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) 415 while (*sem == 0 && i < LOOP_TIMEOUT) {
416 udelay(1);
417 i += 1;
418 }
419
420 if (i == LOOP_TIMEOUT) {
421 pr_alert("AMD-Vi: Completion-Wait loop timed out\n");
422 return -EIO;
423 }
424
425 return 0;
426}
427
428static void copy_cmd_to_buffer(struct amd_iommu *iommu,
429 struct iommu_cmd *cmd,
430 u32 tail)
391{ 431{
392 u32 tail, head;
393 u8 *target; 432 u8 *target;
394 433
395 WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED);
396 tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
397 target = iommu->cmd_buf + tail; 434 target = iommu->cmd_buf + tail;
398 memcpy_toio(target, cmd, sizeof(*cmd)); 435 tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
399 tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size; 436
400 head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); 437 /* Copy command to buffer */
401 if (tail == head) 438 memcpy(target, cmd, sizeof(*cmd));
402 return -ENOMEM; 439
440 /* Tell the IOMMU about it */
403 writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); 441 writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
442}
404 443
405 return 0; 444static void build_completion_wait(struct iommu_cmd *cmd, u64 address)
445{
446 WARN_ON(address & 0x7ULL);
447
448 memset(cmd, 0, sizeof(*cmd));
449 cmd->data[0] = lower_32_bits(__pa(address)) | CMD_COMPL_WAIT_STORE_MASK;
450 cmd->data[1] = upper_32_bits(__pa(address));
451 cmd->data[2] = 1;
452 CMD_SET_TYPE(cmd, CMD_COMPL_WAIT);
453}
454
455static void build_inv_dte(struct iommu_cmd *cmd, u16 devid)
456{
457 memset(cmd, 0, sizeof(*cmd));
458 cmd->data[0] = devid;
459 CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY);
460}
461
462static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
463 size_t size, u16 domid, int pde)
464{
465 u64 pages;
466 int s;
467
468 pages = iommu_num_pages(address, size, PAGE_SIZE);
469 s = 0;
470
471 if (pages > 1) {
472 /*
473 * If we have to flush more than one page, flush all
474 * TLB entries for this domain
475 */
476 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
477 s = 1;
478 }
479
480 address &= PAGE_MASK;
481
482 memset(cmd, 0, sizeof(*cmd));
483 cmd->data[1] |= domid;
484 cmd->data[2] = lower_32_bits(address);
485 cmd->data[3] = upper_32_bits(address);
486 CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
487 if (s) /* size bit - we flush more than one 4kb page */
488 cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
489 if (pde) /* PDE bit - we want to flush everything, not only the PTEs */
490 cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
491}
492
493static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep,
494 u64 address, size_t size)
495{
496 u64 pages;
497 int s;
498
499 pages = iommu_num_pages(address, size, PAGE_SIZE);
500 s = 0;
501
502 if (pages > 1) {
503 /*
504 * If we have to flush more than one page, flush all
505 * TLB entries for this domain
506 */
507 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
508 s = 1;
509 }
510
511 address &= PAGE_MASK;
512
513 memset(cmd, 0, sizeof(*cmd));
514 cmd->data[0] = devid;
515 cmd->data[0] |= (qdep & 0xff) << 24;
516 cmd->data[1] = devid;
517 cmd->data[2] = lower_32_bits(address);
518 cmd->data[3] = upper_32_bits(address);
519 CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
520 if (s)
521 cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
522}
523
524static void build_inv_all(struct iommu_cmd *cmd)
525{
526 memset(cmd, 0, sizeof(*cmd));
527 CMD_SET_TYPE(cmd, CMD_INV_ALL);
406} 528}
407 529
408/* 530/*
409 * General queuing function for commands. Takes iommu->lock and calls 531 * Writes the command to the IOMMUs command buffer and informs the
410 * __iommu_queue_command(). 532 * hardware about the new command.
411 */ 533 */
412static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) 534static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
413{ 535{
536 u32 left, tail, head, next_tail;
414 unsigned long flags; 537 unsigned long flags;
415 int ret;
416 538
539 WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED);
540
541again:
417 spin_lock_irqsave(&iommu->lock, flags); 542 spin_lock_irqsave(&iommu->lock, flags);
418 ret = __iommu_queue_command(iommu, cmd);
419 if (!ret)
420 iommu->need_sync = true;
421 spin_unlock_irqrestore(&iommu->lock, flags);
422 543
423 return ret; 544 head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
424} 545 tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
546 next_tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
547 left = (head - next_tail) % iommu->cmd_buf_size;
425 548
426/* 549 if (left <= 2) {
427 * This function waits until an IOMMU has completed a completion 550 struct iommu_cmd sync_cmd;
428 * wait command 551 volatile u64 sem = 0;
429 */ 552 int ret;
430static void __iommu_wait_for_completion(struct amd_iommu *iommu)
431{
432 int ready = 0;
433 unsigned status = 0;
434 unsigned long i = 0;
435 553
436 INC_STATS_COUNTER(compl_wait); 554 build_completion_wait(&sync_cmd, (u64)&sem);
555 copy_cmd_to_buffer(iommu, &sync_cmd, tail);
437 556
438 while (!ready && (i < EXIT_LOOP_COUNT)) { 557 spin_unlock_irqrestore(&iommu->lock, flags);
439 ++i; 558
440 /* wait for the bit to become one */ 559 if ((ret = wait_on_sem(&sem)) != 0)
441 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); 560 return ret;
442 ready = status & MMIO_STATUS_COM_WAIT_INT_MASK; 561
562 goto again;
443 } 563 }
444 564
445 /* set bit back to zero */ 565 copy_cmd_to_buffer(iommu, cmd, tail);
446 status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; 566
447 writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); 567 /* We need to sync now to make sure all commands are processed */
568 iommu->need_sync = true;
448 569
449 if (unlikely(i == EXIT_LOOP_COUNT)) 570 spin_unlock_irqrestore(&iommu->lock, flags);
450 iommu->reset_in_progress = true; 571
572 return 0;
451} 573}
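
A worked example of the free-space check above, for illustration only; the numbers assume a 512-byte command buffer and 16-byte commands.

/*
 * head = 0x020, tail = 0x1f0:  next_tail = (0x1f0 + 0x10) % 0x200 = 0x000,
 *                              left = (0x020 - 0x000) % 0x200 = 0x20
 *                              -> plenty of room, the command is copied.
 *
 * head = 0x010, tail = 0x000:  next_tail = 0x010,
 *                              left = (0x010 - 0x010) % 0x200 = 0
 *                              -> left <= 2, so a COMPLETION_WAIT is queued
 *                                 instead, the CPU spins in wait_on_sem()
 *                                 until the IOMMU drains the ring, and the
 *                                 write is retried from the "again" label.
 */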
452 574
453/* 575/*
454 * This function queues a completion wait command into the command 576 * This function queues a completion wait command into the command
455 * buffer of an IOMMU 577 * buffer of an IOMMU
456 */ 578 */
457static int __iommu_completion_wait(struct amd_iommu *iommu) 579static int iommu_completion_wait(struct amd_iommu *iommu)
458{ 580{
459 struct iommu_cmd cmd; 581 struct iommu_cmd cmd;
582 volatile u64 sem = 0;
583 int ret;
460 584
461 memset(&cmd, 0, sizeof(cmd)); 585 if (!iommu->need_sync)
462 cmd.data[0] = CMD_COMPL_WAIT_INT_MASK; 586 return 0;
463 CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
464 587
465 return __iommu_queue_command(iommu, &cmd); 588 build_completion_wait(&cmd, (u64)&sem);
589
590 ret = iommu_queue_command(iommu, &cmd);
591 if (ret)
592 return ret;
593
594 return wait_on_sem(&sem);
466} 595}
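
Putting the helpers above together, a hedged sketch (not part of the patch) of flushing a small range for a domain; a three-page size forces the S bit, so the whole domain TLB is invalidated rather than iterating page by page.

static void example_flush_range(struct amd_iommu *iommu, u16 domid)
{
	struct iommu_cmd cmd;

	/* 3 pages -> the address is replaced by CMD_INV_IOMMU_ALL_PAGES_ADDRESS
	 * and CMD_INV_IOMMU_PAGES_SIZE_MASK is set by build_inv_iommu_pages() */
	build_inv_iommu_pages(&cmd, 0x1000, 3 * PAGE_SIZE, domid, 0);

	iommu_queue_command(iommu, &cmd);
	iommu_completion_wait(iommu);	/* wait until the IOMMU has processed it */
}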
467 596
468/* 597static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid)
469 * This function is called whenever we need to ensure that the IOMMU has
470 * completed execution of all commands we sent. It sends a
471 * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs
472 * us about that by writing a value to a physical address we pass with
473 * the command.
474 */
475static int iommu_completion_wait(struct amd_iommu *iommu)
476{ 598{
477 int ret = 0; 599 struct iommu_cmd cmd;
478 unsigned long flags;
479
480 spin_lock_irqsave(&iommu->lock, flags);
481 600
482 if (!iommu->need_sync) 601 build_inv_dte(&cmd, devid);
483 goto out;
484 602
485 ret = __iommu_completion_wait(iommu); 603 return iommu_queue_command(iommu, &cmd);
604}
486 605
487 iommu->need_sync = false; 606static void iommu_flush_dte_all(struct amd_iommu *iommu)
607{
608 u32 devid;
488 609
489 if (ret) 610 for (devid = 0; devid <= 0xffff; ++devid)
490 goto out; 611 iommu_flush_dte(iommu, devid);
491 612
492 __iommu_wait_for_completion(iommu); 613 iommu_completion_wait(iommu);
614}
493 615
494out: 616/*
495 spin_unlock_irqrestore(&iommu->lock, flags); 617 * This function uses heavy locking and may disable irqs for some time. But
618 * this is no issue because it is only called during resume.
619 */
620static void iommu_flush_tlb_all(struct amd_iommu *iommu)
621{
622 u32 dom_id;
496 623
497 if (iommu->reset_in_progress) 624 for (dom_id = 0; dom_id <= 0xffff; ++dom_id) {
498 reset_iommu_command_buffer(iommu); 625 struct iommu_cmd cmd;
626 build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
627 dom_id, 1);
628 iommu_queue_command(iommu, &cmd);
629 }
499 630
500 return 0; 631 iommu_completion_wait(iommu);
501} 632}
502 633
503static void iommu_flush_complete(struct protection_domain *domain) 634static void iommu_flush_all(struct amd_iommu *iommu)
504{ 635{
505 int i; 636 struct iommu_cmd cmd;
506 637
507 for (i = 0; i < amd_iommus_present; ++i) { 638 build_inv_all(&cmd);
508 if (!domain->dev_iommu[i])
509 continue;
510 639
511 /* 640 iommu_queue_command(iommu, &cmd);
512 * Devices of this domain are behind this IOMMU 641 iommu_completion_wait(iommu);
513 * We need to wait for completion of all commands. 642}
514 */ 643
515 iommu_completion_wait(amd_iommus[i]); 644void iommu_flush_all_caches(struct amd_iommu *iommu)
645{
646 if (iommu_feature(iommu, FEATURE_IA)) {
647 iommu_flush_all(iommu);
648 } else {
649 iommu_flush_dte_all(iommu);
650 iommu_flush_tlb_all(iommu);
516 } 651 }
517} 652}
518 653
519/* 654/*
520 * Command send function for invalidating a device table entry 655 * Command send function for flushing on-device TLB
521 */ 656 */
522static int iommu_flush_device(struct device *dev) 657static int device_flush_iotlb(struct device *dev, u64 address, size_t size)
523{ 658{
659 struct pci_dev *pdev = to_pci_dev(dev);
524 struct amd_iommu *iommu; 660 struct amd_iommu *iommu;
525 struct iommu_cmd cmd; 661 struct iommu_cmd cmd;
526 u16 devid; 662 u16 devid;
663 int qdep;
527 664
665 qdep = pci_ats_queue_depth(pdev);
528 devid = get_device_id(dev); 666 devid = get_device_id(dev);
529 iommu = amd_iommu_rlookup_table[devid]; 667 iommu = amd_iommu_rlookup_table[devid];
530 668
531 /* Build command */ 669 build_inv_iotlb_pages(&cmd, devid, qdep, address, size);
532 memset(&cmd, 0, sizeof(cmd));
533 CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY);
534 cmd.data[0] = devid;
535 670
536 return iommu_queue_command(iommu, &cmd); 671 return iommu_queue_command(iommu, &cmd);
537} 672}
538 673
539static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
540 u16 domid, int pde, int s)
541{
542 memset(cmd, 0, sizeof(*cmd));
543 address &= PAGE_MASK;
544 CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
545 cmd->data[1] |= domid;
546 cmd->data[2] = lower_32_bits(address);
547 cmd->data[3] = upper_32_bits(address);
548 if (s) /* size bit - we flush more than one 4kb page */
549 cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
550 if (pde) /* PDE bit - we want to flush everything, not only the PTEs */
551 cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
552}
553
554/* 674/*
555 * Generic command send function for invalidating TLB entries 675 * Command send function for invalidating a device table entry
556 */ 676 */
557static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, 677static int device_flush_dte(struct device *dev)
558 u64 address, u16 domid, int pde, int s)
559{ 678{
560 struct iommu_cmd cmd; 679 struct amd_iommu *iommu;
680 struct pci_dev *pdev;
681 u16 devid;
561 int ret; 682 int ret;
562 683
563 __iommu_build_inv_iommu_pages(&cmd, address, domid, pde, s); 684 pdev = to_pci_dev(dev);
685 devid = get_device_id(dev);
686 iommu = amd_iommu_rlookup_table[devid];
564 687
565 ret = iommu_queue_command(iommu, &cmd); 688 ret = iommu_flush_dte(iommu, devid);
689 if (ret)
690 return ret;
691
692 if (pci_ats_enabled(pdev))
693 ret = device_flush_iotlb(dev, 0, ~0UL);
566 694
567 return ret; 695 return ret;
568} 696}
@@ -572,23 +700,14 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
572 * It invalidates a single PTE if the range to flush is within a single 700 * It invalidates a single PTE if the range to flush is within a single
573 * page. Otherwise it flushes the whole TLB of the IOMMU. 701 * page. Otherwise it flushes the whole TLB of the IOMMU.
574 */ 702 */
575static void __iommu_flush_pages(struct protection_domain *domain, 703static void __domain_flush_pages(struct protection_domain *domain,
576 u64 address, size_t size, int pde) 704 u64 address, size_t size, int pde)
577{ 705{
578 int s = 0, i; 706 struct iommu_dev_data *dev_data;
579 unsigned long pages = iommu_num_pages(address, size, PAGE_SIZE); 707 struct iommu_cmd cmd;
580 708 int ret = 0, i;
581 address &= PAGE_MASK;
582
583 if (pages > 1) {
584 /*
585 * If we have to flush more than one page, flush all
586 * TLB entries for this domain
587 */
588 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
589 s = 1;
590 }
591 709
710 build_inv_iommu_pages(&cmd, address, size, domain->id, pde);
592 711
593 for (i = 0; i < amd_iommus_present; ++i) { 712 for (i = 0; i < amd_iommus_present; ++i) {
594 if (!domain->dev_iommu[i]) 713 if (!domain->dev_iommu[i])
@@ -598,101 +717,70 @@ static void __iommu_flush_pages(struct protection_domain *domain,
598 * Devices of this domain are behind this IOMMU 717 * Devices of this domain are behind this IOMMU
599 * We need a TLB flush 718 * We need a TLB flush
600 */ 719 */
601 iommu_queue_inv_iommu_pages(amd_iommus[i], address, 720 ret |= iommu_queue_command(amd_iommus[i], &cmd);
602 domain->id, pde, s); 721 }
722
723 list_for_each_entry(dev_data, &domain->dev_list, list) {
724 struct pci_dev *pdev = to_pci_dev(dev_data->dev);
725
726 if (!pci_ats_enabled(pdev))
727 continue;
728
729 ret |= device_flush_iotlb(dev_data->dev, address, size);
603 } 730 }
604 731
605 return; 732 WARN_ON(ret);
606} 733}
607 734
608static void iommu_flush_pages(struct protection_domain *domain, 735static void domain_flush_pages(struct protection_domain *domain,
609 u64 address, size_t size) 736 u64 address, size_t size)
610{ 737{
611 __iommu_flush_pages(domain, address, size, 0); 738 __domain_flush_pages(domain, address, size, 0);
612} 739}
613 740
614/* Flush the whole IO/TLB for a given protection domain */ 741/* Flush the whole IO/TLB for a given protection domain */
615static void iommu_flush_tlb(struct protection_domain *domain) 742static void domain_flush_tlb(struct protection_domain *domain)
616{ 743{
617 __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0); 744 __domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0);
618} 745}
619 746
620/* Flush the whole IO/TLB for a given protection domain - including PDE */ 747/* Flush the whole IO/TLB for a given protection domain - including PDE */
621static void iommu_flush_tlb_pde(struct protection_domain *domain) 748static void domain_flush_tlb_pde(struct protection_domain *domain)
622{
623 __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
624}
625
626
627/*
628 * This function flushes the DTEs for all devices in domain
629 */
630static void iommu_flush_domain_devices(struct protection_domain *domain)
631{ 749{
632 struct iommu_dev_data *dev_data; 750 __domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
633 unsigned long flags;
634
635 spin_lock_irqsave(&domain->lock, flags);
636
637 list_for_each_entry(dev_data, &domain->dev_list, list)
638 iommu_flush_device(dev_data->dev);
639
640 spin_unlock_irqrestore(&domain->lock, flags);
641} 751}
642 752
643static void iommu_flush_all_domain_devices(void) 753static void domain_flush_complete(struct protection_domain *domain)
644{ 754{
645 struct protection_domain *domain; 755 int i;
646 unsigned long flags;
647 756
648 spin_lock_irqsave(&amd_iommu_pd_lock, flags); 757 for (i = 0; i < amd_iommus_present; ++i) {
758 if (!domain->dev_iommu[i])
759 continue;
649 760
650 list_for_each_entry(domain, &amd_iommu_pd_list, list) { 761 /*
651 iommu_flush_domain_devices(domain); 762 * Devices of this domain are behind this IOMMU
652 iommu_flush_complete(domain); 763 * We need to wait for completion of all commands.
764 */
765 iommu_completion_wait(amd_iommus[i]);
653 } 766 }
654
655 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
656} 767}
657 768
658void amd_iommu_flush_all_devices(void)
659{
660 iommu_flush_all_domain_devices();
661}
662 769
663/* 770/*
664 * This function uses heavy locking and may disable irqs for some time. But 771 * This function flushes the DTEs for all devices in domain
665 * this is no issue because it is only called during resume.
666 */ 772 */
667void amd_iommu_flush_all_domains(void) 773static void domain_flush_devices(struct protection_domain *domain)
668{ 774{
669 struct protection_domain *domain; 775 struct iommu_dev_data *dev_data;
670 unsigned long flags; 776 unsigned long flags;
671 777
672 spin_lock_irqsave(&amd_iommu_pd_lock, flags); 778 spin_lock_irqsave(&domain->lock, flags);
673
674 list_for_each_entry(domain, &amd_iommu_pd_list, list) {
675 spin_lock(&domain->lock);
676 iommu_flush_tlb_pde(domain);
677 iommu_flush_complete(domain);
678 spin_unlock(&domain->lock);
679 }
680
681 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
682}
683
684static void reset_iommu_command_buffer(struct amd_iommu *iommu)
685{
686 pr_err("AMD-Vi: Resetting IOMMU command buffer\n");
687
688 if (iommu->reset_in_progress)
689 panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n");
690 779
691 amd_iommu_reset_cmd_buffer(iommu); 780 list_for_each_entry(dev_data, &domain->dev_list, list)
692 amd_iommu_flush_all_devices(); 781 device_flush_dte(dev_data->dev);
693 amd_iommu_flush_all_domains();
694 782
695 iommu->reset_in_progress = false; 783 spin_unlock_irqrestore(&domain->lock, flags);
696} 784}
697 785
698/**************************************************************************** 786/****************************************************************************
@@ -1086,7 +1174,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom,
1086 1174
1087 dma_dom->aperture_size += APERTURE_RANGE_SIZE; 1175 dma_dom->aperture_size += APERTURE_RANGE_SIZE;
1088 1176
1089 /* Intialize the exclusion range if necessary */ 1177 /* Initialize the exclusion range if necessary */
1090 for_each_iommu(iommu) { 1178 for_each_iommu(iommu) {
1091 if (iommu->exclusion_start && 1179 if (iommu->exclusion_start &&
1092 iommu->exclusion_start >= dma_dom->aperture[index]->offset 1180 iommu->exclusion_start >= dma_dom->aperture[index]->offset
@@ -1353,7 +1441,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
1353 1441
1354/* 1442/*
1355 * Allocates a new protection domain usable for the dma_ops functions. 1443 * Allocates a new protection domain usable for the dma_ops functions.
1356 * It also intializes the page table and the address allocator data 1444 * It also initializes the page table and the address allocator data
1357 * structures required for the dma_ops interface 1445 * structures required for the dma_ops interface
1358 */ 1446 */
1359static struct dma_ops_domain *dma_ops_domain_alloc(void) 1447static struct dma_ops_domain *dma_ops_domain_alloc(void)
@@ -1410,17 +1498,22 @@ static bool dma_ops_domain(struct protection_domain *domain)
1410 return domain->flags & PD_DMA_OPS_MASK; 1498 return domain->flags & PD_DMA_OPS_MASK;
1411} 1499}
1412 1500
1413static void set_dte_entry(u16 devid, struct protection_domain *domain) 1501static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats)
1414{ 1502{
1415 u64 pte_root = virt_to_phys(domain->pt_root); 1503 u64 pte_root = virt_to_phys(domain->pt_root);
1504 u32 flags = 0;
1416 1505
1417 pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) 1506 pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
1418 << DEV_ENTRY_MODE_SHIFT; 1507 << DEV_ENTRY_MODE_SHIFT;
1419 pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV; 1508 pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
1420 1509
1421 amd_iommu_dev_table[devid].data[2] = domain->id; 1510 if (ats)
1422 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); 1511 flags |= DTE_FLAG_IOTLB;
1423 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); 1512
1513 amd_iommu_dev_table[devid].data[3] |= flags;
1514 amd_iommu_dev_table[devid].data[2] = domain->id;
1515 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
1516 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
1424} 1517}
1425 1518
1426static void clear_dte_entry(u16 devid) 1519static void clear_dte_entry(u16 devid)
@@ -1437,23 +1530,29 @@ static void do_attach(struct device *dev, struct protection_domain *domain)
1437{ 1530{
1438 struct iommu_dev_data *dev_data; 1531 struct iommu_dev_data *dev_data;
1439 struct amd_iommu *iommu; 1532 struct amd_iommu *iommu;
1533 struct pci_dev *pdev;
1534 bool ats = false;
1440 u16 devid; 1535 u16 devid;
1441 1536
1442 devid = get_device_id(dev); 1537 devid = get_device_id(dev);
1443 iommu = amd_iommu_rlookup_table[devid]; 1538 iommu = amd_iommu_rlookup_table[devid];
1444 dev_data = get_dev_data(dev); 1539 dev_data = get_dev_data(dev);
1540 pdev = to_pci_dev(dev);
1541
1542 if (amd_iommu_iotlb_sup)
1543 ats = pci_ats_enabled(pdev);
1445 1544
1446 /* Update data structures */ 1545 /* Update data structures */
1447 dev_data->domain = domain; 1546 dev_data->domain = domain;
1448 list_add(&dev_data->list, &domain->dev_list); 1547 list_add(&dev_data->list, &domain->dev_list);
1449 set_dte_entry(devid, domain); 1548 set_dte_entry(devid, domain, ats);
1450 1549
1451 /* Do reference counting */ 1550 /* Do reference counting */
1452 domain->dev_iommu[iommu->index] += 1; 1551 domain->dev_iommu[iommu->index] += 1;
1453 domain->dev_cnt += 1; 1552 domain->dev_cnt += 1;
1454 1553
1455 /* Flush the DTE entry */ 1554 /* Flush the DTE entry */
1456 iommu_flush_device(dev); 1555 device_flush_dte(dev);
1457} 1556}
1458 1557
1459static void do_detach(struct device *dev) 1558static void do_detach(struct device *dev)
@@ -1476,7 +1575,7 @@ static void do_detach(struct device *dev)
1476 clear_dte_entry(devid); 1575 clear_dte_entry(devid);
1477 1576
1478 /* Flush the DTE entry */ 1577 /* Flush the DTE entry */
1479 iommu_flush_device(dev); 1578 device_flush_dte(dev);
1480} 1579}
1481 1580
1482/* 1581/*
@@ -1539,9 +1638,13 @@ out_unlock:
1539static int attach_device(struct device *dev, 1638static int attach_device(struct device *dev,
1540 struct protection_domain *domain) 1639 struct protection_domain *domain)
1541{ 1640{
1641 struct pci_dev *pdev = to_pci_dev(dev);
1542 unsigned long flags; 1642 unsigned long flags;
1543 int ret; 1643 int ret;
1544 1644
1645 if (amd_iommu_iotlb_sup)
1646 pci_enable_ats(pdev, PAGE_SHIFT);
1647
1545 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 1648 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1546 ret = __attach_device(dev, domain); 1649 ret = __attach_device(dev, domain);
1547 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 1650 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
@@ -1551,7 +1654,7 @@ static int attach_device(struct device *dev,
1551 * left the caches in the IOMMU dirty. So we have to flush 1654 * left the caches in the IOMMU dirty. So we have to flush
1552 * here to evict all dirty stuff. 1655 * here to evict all dirty stuff.
1553 */ 1656 */
1554 iommu_flush_tlb_pde(domain); 1657 domain_flush_tlb_pde(domain);
1555 1658
1556 return ret; 1659 return ret;
1557} 1660}
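
A condensed sketch (not from this patch) of the ATS lifecycle these hunks wire up, using only the <linux/pci-ats.h> helpers the patch itself calls; pdev stands for any PCI device behind the IOMMU.

#include <linux/pci-ats.h>

static void ats_lifecycle_sketch(struct pci_dev *pdev)
{
	/* attach_device(): turn on ATS if the IOMMUs support IOTLBs */
	if (amd_iommu_iotlb_sup)
		pci_enable_ats(pdev, PAGE_SHIFT);

	/* device_flush_iotlb(): invalidate the on-device IOTLB, sized by
	 * the device's invalidate queue depth */
	if (pci_ats_enabled(pdev)) {
		int qdep = pci_ats_queue_depth(pdev);
		(void)qdep;	/* passed into build_inv_iotlb_pages() */
	}

	/* detach_device(): tear ATS down again */
	if (amd_iommu_iotlb_sup && pci_ats_enabled(pdev))
		pci_disable_ats(pdev);
}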
@@ -1598,12 +1701,16 @@ static void __detach_device(struct device *dev)
1598 */ 1701 */
1599static void detach_device(struct device *dev) 1702static void detach_device(struct device *dev)
1600{ 1703{
1704 struct pci_dev *pdev = to_pci_dev(dev);
1601 unsigned long flags; 1705 unsigned long flags;
1602 1706
1603 /* lock device table */ 1707 /* lock device table */
1604 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 1708 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1605 __detach_device(dev); 1709 __detach_device(dev);
1606 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 1710 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1711
1712 if (amd_iommu_iotlb_sup && pci_ats_enabled(pdev))
1713 pci_disable_ats(pdev);
1607} 1714}
1608 1715
1609/* 1716/*
@@ -1615,10 +1722,9 @@ static struct protection_domain *domain_for_device(struct device *dev)
1615 struct protection_domain *dom; 1722 struct protection_domain *dom;
1616 struct iommu_dev_data *dev_data, *alias_data; 1723 struct iommu_dev_data *dev_data, *alias_data;
1617 unsigned long flags; 1724 unsigned long flags;
1618 u16 devid, alias; 1725 u16 devid;
1619 1726
1620 devid = get_device_id(dev); 1727 devid = get_device_id(dev);
1621 alias = amd_iommu_alias_table[devid];
1622 dev_data = get_dev_data(dev); 1728 dev_data = get_dev_data(dev);
1623 alias_data = get_dev_data(dev_data->alias); 1729 alias_data = get_dev_data(dev_data->alias);
1624 if (!alias_data) 1730 if (!alias_data)
@@ -1692,7 +1798,7 @@ static int device_change_notifier(struct notifier_block *nb,
1692 goto out; 1798 goto out;
1693 } 1799 }
1694 1800
1695 iommu_flush_device(dev); 1801 device_flush_dte(dev);
1696 iommu_completion_wait(iommu); 1802 iommu_completion_wait(iommu);
1697 1803
1698out: 1804out:
@@ -1753,8 +1859,9 @@ static void update_device_table(struct protection_domain *domain)
1753 struct iommu_dev_data *dev_data; 1859 struct iommu_dev_data *dev_data;
1754 1860
1755 list_for_each_entry(dev_data, &domain->dev_list, list) { 1861 list_for_each_entry(dev_data, &domain->dev_list, list) {
1862 struct pci_dev *pdev = to_pci_dev(dev_data->dev);
1756 u16 devid = get_device_id(dev_data->dev); 1863 u16 devid = get_device_id(dev_data->dev);
1757 set_dte_entry(devid, domain); 1864 set_dte_entry(devid, domain, pci_ats_enabled(pdev));
1758 } 1865 }
1759} 1866}
1760 1867
@@ -1764,8 +1871,9 @@ static void update_domain(struct protection_domain *domain)
1764 return; 1871 return;
1765 1872
1766 update_device_table(domain); 1873 update_device_table(domain);
1767 iommu_flush_domain_devices(domain); 1874
1768 iommu_flush_tlb_pde(domain); 1875 domain_flush_devices(domain);
1876 domain_flush_tlb_pde(domain);
1769 1877
1770 domain->updated = false; 1878 domain->updated = false;
1771} 1879}
@@ -1924,10 +2032,10 @@ retry:
1924 ADD_STATS_COUNTER(alloced_io_mem, size); 2032 ADD_STATS_COUNTER(alloced_io_mem, size);
1925 2033
1926 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) { 2034 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
1927 iommu_flush_tlb(&dma_dom->domain); 2035 domain_flush_tlb(&dma_dom->domain);
1928 dma_dom->need_flush = false; 2036 dma_dom->need_flush = false;
1929 } else if (unlikely(amd_iommu_np_cache)) 2037 } else if (unlikely(amd_iommu_np_cache))
1930 iommu_flush_pages(&dma_dom->domain, address, size); 2038 domain_flush_pages(&dma_dom->domain, address, size);
1931 2039
1932out: 2040out:
1933 return address; 2041 return address;
@@ -1976,7 +2084,7 @@ static void __unmap_single(struct dma_ops_domain *dma_dom,
1976 dma_ops_free_addresses(dma_dom, dma_addr, pages); 2084 dma_ops_free_addresses(dma_dom, dma_addr, pages);
1977 2085
1978 if (amd_iommu_unmap_flush || dma_dom->need_flush) { 2086 if (amd_iommu_unmap_flush || dma_dom->need_flush) {
1979 iommu_flush_pages(&dma_dom->domain, flush_addr, size); 2087 domain_flush_pages(&dma_dom->domain, flush_addr, size);
1980 dma_dom->need_flush = false; 2088 dma_dom->need_flush = false;
1981 } 2089 }
1982} 2090}
@@ -2012,7 +2120,7 @@ static dma_addr_t map_page(struct device *dev, struct page *page,
2012 if (addr == DMA_ERROR_CODE) 2120 if (addr == DMA_ERROR_CODE)
2013 goto out; 2121 goto out;
2014 2122
2015 iommu_flush_complete(domain); 2123 domain_flush_complete(domain);
2016 2124
2017out: 2125out:
2018 spin_unlock_irqrestore(&domain->lock, flags); 2126 spin_unlock_irqrestore(&domain->lock, flags);
@@ -2039,7 +2147,7 @@ static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
2039 2147
2040 __unmap_single(domain->priv, dma_addr, size, dir); 2148 __unmap_single(domain->priv, dma_addr, size, dir);
2041 2149
2042 iommu_flush_complete(domain); 2150 domain_flush_complete(domain);
2043 2151
2044 spin_unlock_irqrestore(&domain->lock, flags); 2152 spin_unlock_irqrestore(&domain->lock, flags);
2045} 2153}
@@ -2104,7 +2212,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
2104 goto unmap; 2212 goto unmap;
2105 } 2213 }
2106 2214
2107 iommu_flush_complete(domain); 2215 domain_flush_complete(domain);
2108 2216
2109out: 2217out:
2110 spin_unlock_irqrestore(&domain->lock, flags); 2218 spin_unlock_irqrestore(&domain->lock, flags);
@@ -2150,7 +2258,7 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
2150 s->dma_address = s->dma_length = 0; 2258 s->dma_address = s->dma_length = 0;
2151 } 2259 }
2152 2260
2153 iommu_flush_complete(domain); 2261 domain_flush_complete(domain);
2154 2262
2155 spin_unlock_irqrestore(&domain->lock, flags); 2263 spin_unlock_irqrestore(&domain->lock, flags);
2156} 2264}
@@ -2200,7 +2308,7 @@ static void *alloc_coherent(struct device *dev, size_t size,
2200 goto out_free; 2308 goto out_free;
2201 } 2309 }
2202 2310
2203 iommu_flush_complete(domain); 2311 domain_flush_complete(domain);
2204 2312
2205 spin_unlock_irqrestore(&domain->lock, flags); 2313 spin_unlock_irqrestore(&domain->lock, flags);
2206 2314
@@ -2232,7 +2340,7 @@ static void free_coherent(struct device *dev, size_t size,
2232 2340
2233 __unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); 2341 __unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
2234 2342
2235 iommu_flush_complete(domain); 2343 domain_flush_complete(domain);
2236 2344
2237 spin_unlock_irqrestore(&domain->lock, flags); 2345 spin_unlock_irqrestore(&domain->lock, flags);
2238 2346
@@ -2296,6 +2404,23 @@ static struct dma_map_ops amd_iommu_dma_ops = {
2296 .dma_supported = amd_iommu_dma_supported, 2404 .dma_supported = amd_iommu_dma_supported,
2297}; 2405};
2298 2406
2407static unsigned device_dma_ops_init(void)
2408{
2409 struct pci_dev *pdev = NULL;
2410 unsigned unhandled = 0;
2411
2412 for_each_pci_dev(pdev) {
2413 if (!check_device(&pdev->dev)) {
2414 unhandled += 1;
2415 continue;
2416 }
2417
2418 pdev->dev.archdata.dma_ops = &amd_iommu_dma_ops;
2419 }
2420
2421 return unhandled;
2422}
2423
2299/* 2424/*
2300 * The function which clues the AMD IOMMU driver into dma_ops. 2425 * The function which clues the AMD IOMMU driver into dma_ops.
2301 */ 2426 */
@@ -2308,7 +2433,7 @@ void __init amd_iommu_init_api(void)
2308int __init amd_iommu_init_dma_ops(void) 2433int __init amd_iommu_init_dma_ops(void)
2309{ 2434{
2310 struct amd_iommu *iommu; 2435 struct amd_iommu *iommu;
2311 int ret; 2436 int ret, unhandled;
2312 2437
2313 /* 2438 /*
2314 * first allocate a default protection domain for every IOMMU we 2439 * first allocate a default protection domain for every IOMMU we
@@ -2334,7 +2459,11 @@ int __init amd_iommu_init_dma_ops(void)
2334 swiotlb = 0; 2459 swiotlb = 0;
2335 2460
2336 /* Make the driver finally visible to the drivers */ 2461 /* Make the driver finally visible to the drivers */
2337 dma_ops = &amd_iommu_dma_ops; 2462 unhandled = device_dma_ops_init();
2463 if (unhandled && max_pfn > MAX_DMA32_PFN) {
2464 /* There are unhandled devices - initialize swiotlb for them */
2465 swiotlb = 1;
2466 }
2338 2467
2339 amd_iommu_stats_init(); 2468 amd_iommu_stats_init();
2340 2469
@@ -2476,7 +2605,7 @@ static void amd_iommu_detach_device(struct iommu_domain *dom,
2476 if (!iommu) 2605 if (!iommu)
2477 return; 2606 return;
2478 2607
2479 iommu_flush_device(dev); 2608 device_flush_dte(dev);
2480 iommu_completion_wait(iommu); 2609 iommu_completion_wait(iommu);
2481} 2610}
2482 2611
@@ -2542,7 +2671,7 @@ static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
2542 unmap_size = iommu_unmap_page(domain, iova, page_size); 2671 unmap_size = iommu_unmap_page(domain, iova, page_size);
2543 mutex_unlock(&domain->api_lock); 2672 mutex_unlock(&domain->api_lock);
2544 2673
2545 iommu_flush_tlb_pde(domain); 2674 domain_flush_tlb_pde(domain);
2546 2675
2547 return get_order(unmap_size); 2676 return get_order(unmap_size);
2548} 2677}
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 5a170cbbbed8..bfc8453bd98d 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. 2 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com> 4 * Leo Duran <leo.duran@amd.com>
5 * 5 *
@@ -21,7 +21,7 @@
21#include <linux/acpi.h> 21#include <linux/acpi.h>
22#include <linux/list.h> 22#include <linux/list.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/sysdev.h> 24#include <linux/syscore_ops.h>
25#include <linux/interrupt.h> 25#include <linux/interrupt.h>
26#include <linux/msi.h> 26#include <linux/msi.h>
27#include <asm/pci-direct.h> 27#include <asm/pci-direct.h>
@@ -31,7 +31,7 @@
31#include <asm/iommu.h> 31#include <asm/iommu.h>
32#include <asm/gart.h> 32#include <asm/gart.h>
33#include <asm/x86_init.h> 33#include <asm/x86_init.h>
34 34#include <asm/iommu_table.h>
35/* 35/*
36 * definitions for the ACPI scanning code 36 * definitions for the ACPI scanning code
37 */ 37 */
@@ -137,6 +137,7 @@ int amd_iommus_present;
137 137
138/* IOMMUs have a non-present cache? */ 138/* IOMMUs have a non-present cache? */
139bool amd_iommu_np_cache __read_mostly; 139bool amd_iommu_np_cache __read_mostly;
140bool amd_iommu_iotlb_sup __read_mostly = true;
140 141
141/* 142/*
142 * The ACPI table parsing functions set this variable on an error 143 * The ACPI table parsing functions set this variable on an error
@@ -180,6 +181,12 @@ static u32 dev_table_size; /* size of the device table */
180static u32 alias_table_size; /* size of the alias table */ 181static u32 alias_table_size; /* size of the alias table */
181static u32 rlookup_table_size; /* size of the rlookup table */ 182static u32 rlookup_table_size; /* size of the rlookup table */
182 183
184/*
185 * This function flushes all internal caches of
186 * the IOMMU used by this driver.
187 */
188extern void iommu_flush_all_caches(struct amd_iommu *iommu);
189
183static inline void update_last_devid(u16 devid) 190static inline void update_last_devid(u16 devid)
184{ 191{
185 if (devid > amd_iommu_last_bdf) 192 if (devid > amd_iommu_last_bdf)
@@ -194,6 +201,39 @@ static inline unsigned long tbl_size(int entry_size)
194 return 1UL << shift; 201 return 1UL << shift;
195} 202}
196 203
204/* Access to l1 and l2 indexed register spaces */
205
206static u32 iommu_read_l1(struct amd_iommu *iommu, u16 l1, u8 address)
207{
208 u32 val;
209
210 pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
211 pci_read_config_dword(iommu->dev, 0xfc, &val);
212 return val;
213}
214
215static void iommu_write_l1(struct amd_iommu *iommu, u16 l1, u8 address, u32 val)
216{
217 pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16 | 1 << 31));
218 pci_write_config_dword(iommu->dev, 0xfc, val);
219 pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
220}
221
222static u32 iommu_read_l2(struct amd_iommu *iommu, u8 address)
223{
224 u32 val;
225
226 pci_write_config_dword(iommu->dev, 0xf0, address);
227 pci_read_config_dword(iommu->dev, 0xf4, &val);
228 return val;
229}
230
231static void iommu_write_l2(struct amd_iommu *iommu, u8 address, u32 val)
232{
233 pci_write_config_dword(iommu->dev, 0xf0, (address | 1 << 8));
234 pci_write_config_dword(iommu->dev, 0xf4, val);
235}
236
197/**************************************************************************** 237/****************************************************************************
198 * 238 *
199 * AMD IOMMU MMIO register space handling functions 239 * AMD IOMMU MMIO register space handling functions
@@ -260,9 +300,23 @@ static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
260/* Function to enable the hardware */ 300/* Function to enable the hardware */
261static void iommu_enable(struct amd_iommu *iommu) 301static void iommu_enable(struct amd_iommu *iommu)
262{ 302{
263 printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx\n", 303 static const char * const feat_str[] = {
304 "PreF", "PPR", "X2APIC", "NX", "GT", "[5]",
305 "IA", "GA", "HE", "PC", NULL
306 };
307 int i;
308
309 printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx",
264 dev_name(&iommu->dev->dev), iommu->cap_ptr); 310 dev_name(&iommu->dev->dev), iommu->cap_ptr);
265 311
312 if (iommu->cap & (1 << IOMMU_CAP_EFR)) {
313 printk(KERN_CONT " extended features: ");
314 for (i = 0; feat_str[i]; ++i)
315 if (iommu_feature(iommu, (1ULL << i)))
316 printk(KERN_CONT " %s", feat_str[i]);
317 }
318 printk(KERN_CONT "\n");
319
266 iommu_feature_enable(iommu, CONTROL_IOMMU_EN); 320 iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
267} 321}
268 322
@@ -618,7 +672,8 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
618static void __init init_iommu_from_pci(struct amd_iommu *iommu) 672static void __init init_iommu_from_pci(struct amd_iommu *iommu)
619{ 673{
620 int cap_ptr = iommu->cap_ptr; 674 int cap_ptr = iommu->cap_ptr;
621 u32 range, misc; 675 u32 range, misc, low, high;
676 int i, j;
622 677
623 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET, 678 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
624 &iommu->cap); 679 &iommu->cap);
@@ -633,12 +688,38 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu)
633 MMIO_GET_LD(range)); 688 MMIO_GET_LD(range));
634 iommu->evt_msi_num = MMIO_MSI_NUM(misc); 689 iommu->evt_msi_num = MMIO_MSI_NUM(misc);
635 690
636 if (is_rd890_iommu(iommu->dev)) { 691 if (!(iommu->cap & (1 << IOMMU_CAP_IOTLB)))
637 pci_read_config_dword(iommu->dev, 0xf0, &iommu->cache_cfg[0]); 692 amd_iommu_iotlb_sup = false;
638 pci_read_config_dword(iommu->dev, 0xf4, &iommu->cache_cfg[1]); 693
639 pci_read_config_dword(iommu->dev, 0xf8, &iommu->cache_cfg[2]); 694 /* read extended feature bits */
640 pci_read_config_dword(iommu->dev, 0xfc, &iommu->cache_cfg[3]); 695 low = readl(iommu->mmio_base + MMIO_EXT_FEATURES);
641 } 696 high = readl(iommu->mmio_base + MMIO_EXT_FEATURES + 4);
697
698 iommu->features = ((u64)high << 32) | low;
699
700 if (!is_rd890_iommu(iommu->dev))
701 return;
702
703 /*
704 * Some rd890 systems may not be fully reconfigured by the BIOS, so
705 * it's necessary for us to store this information so it can be
706 * reprogrammed on resume
707 */
708
709 pci_read_config_dword(iommu->dev, iommu->cap_ptr + 4,
710 &iommu->stored_addr_lo);
711 pci_read_config_dword(iommu->dev, iommu->cap_ptr + 8,
712 &iommu->stored_addr_hi);
713
714 /* Low bit locks writes to configuration space */
715 iommu->stored_addr_lo &= ~1;
716
717 for (i = 0; i < 6; i++)
718 for (j = 0; j < 0x12; j++)
719 iommu->stored_l1[i][j] = iommu_read_l1(iommu, i, j);
720
721 for (i = 0; i < 0x83; i++)
722 iommu->stored_l2[i] = iommu_read_l2(iommu, i);
642} 723}
643 724
644/* 725/*
@@ -650,8 +731,8 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
650{ 731{
651 u8 *p = (u8 *)h; 732 u8 *p = (u8 *)h;
652 u8 *end = p, flags = 0; 733 u8 *end = p, flags = 0;
653 u16 dev_i, devid = 0, devid_start = 0, devid_to = 0; 734 u16 devid = 0, devid_start = 0, devid_to = 0;
654 u32 ext_flags = 0; 735 u32 dev_i, ext_flags = 0;
655 bool alias = false; 736 bool alias = false;
656 struct ivhd_entry *e; 737 struct ivhd_entry *e;
657 738
@@ -806,7 +887,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
806/* Initializes the device->iommu mapping for the driver */ 887/* Initializes the device->iommu mapping for the driver */
807static int __init init_iommu_devices(struct amd_iommu *iommu) 888static int __init init_iommu_devices(struct amd_iommu *iommu)
808{ 889{
809 u16 i; 890 u32 i;
810 891
811 for (i = iommu->first_device; i <= iommu->last_device; ++i) 892 for (i = iommu->first_device; i <= iommu->last_device; ++i)
812 set_iommu_for_device(iommu, i); 893 set_iommu_for_device(iommu, i);
@@ -953,10 +1034,11 @@ static int iommu_setup_msi(struct amd_iommu *iommu)
953 if (pci_enable_msi(iommu->dev)) 1034 if (pci_enable_msi(iommu->dev))
954 return 1; 1035 return 1;
955 1036
956 r = request_irq(iommu->dev->irq, amd_iommu_int_handler, 1037 r = request_threaded_irq(iommu->dev->irq,
957 IRQF_SAMPLE_RANDOM, 1038 amd_iommu_int_handler,
958 "AMD-Vi", 1039 amd_iommu_int_thread,
959 NULL); 1040 0, "AMD-Vi",
1041 iommu->dev);
960 1042
961 if (r) { 1043 if (r) {
962 pci_disable_msi(iommu->dev); 1044 pci_disable_msi(iommu->dev);
@@ -1095,7 +1177,7 @@ static int __init init_memory_definitions(struct acpi_table_header *table)
1095 */ 1177 */
1096static void init_device_table(void) 1178static void init_device_table(void)
1097{ 1179{
1098 u16 devid; 1180 u32 devid;
1099 1181
1100 for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) { 1182 for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) {
1101 set_dev_entry_bit(devid, DEV_ENTRY_VALID); 1183 set_dev_entry_bit(devid, DEV_ENTRY_VALID);
@@ -1127,14 +1209,53 @@ static void iommu_init_flags(struct amd_iommu *iommu)
1127 iommu_feature_enable(iommu, CONTROL_COHERENT_EN); 1209 iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
1128} 1210}
1129 1211
1130static void iommu_apply_quirks(struct amd_iommu *iommu)
1131{
1132	if (is_rd890_iommu(iommu->dev)) {
1133		pci_write_config_dword(iommu->dev, 0xf0, iommu->cache_cfg[0]);
1134		pci_write_config_dword(iommu->dev, 0xf4, iommu->cache_cfg[1]);
1135		pci_write_config_dword(iommu->dev, 0xf8, iommu->cache_cfg[2]);
1136		pci_write_config_dword(iommu->dev, 0xfc, iommu->cache_cfg[3]);
1137	}
 1212static void iommu_apply_resume_quirks(struct amd_iommu *iommu)
 1213{
 1214	int i, j;
 1215	u32 ioc_feature_control;
 1216	struct pci_dev *pdev = NULL;
 1217
 1218	/* RD890 BIOSes may not have completely reconfigured the iommu */
 1219	if (!is_rd890_iommu(iommu->dev))
1220 return;
1221
1222 /*
1223 * First, we need to ensure that the iommu is enabled. This is
1224 * controlled by a register in the northbridge
1225 */
1226 pdev = pci_get_bus_and_slot(iommu->dev->bus->number, PCI_DEVFN(0, 0));
1227
1228 if (!pdev)
1229 return;
1230
1231 /* Select Northbridge indirect register 0x75 and enable writing */
1232 pci_write_config_dword(pdev, 0x60, 0x75 | (1 << 7));
1233 pci_read_config_dword(pdev, 0x64, &ioc_feature_control);
1234
1235 /* Enable the iommu */
1236 if (!(ioc_feature_control & 0x1))
1237 pci_write_config_dword(pdev, 0x64, ioc_feature_control | 1);
1238
1239 pci_dev_put(pdev);
1240
1241 /* Restore the iommu BAR */
1242 pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
1243 iommu->stored_addr_lo);
1244 pci_write_config_dword(iommu->dev, iommu->cap_ptr + 8,
1245 iommu->stored_addr_hi);
1246
1247 /* Restore the l1 indirect regs for each of the 6 l1s */
1248 for (i = 0; i < 6; i++)
1249 for (j = 0; j < 0x12; j++)
1250 iommu_write_l1(iommu, i, j, iommu->stored_l1[i][j]);
1251
1252 /* Restore the l2 indirect regs */
1253 for (i = 0; i < 0x83; i++)
1254 iommu_write_l2(iommu, i, iommu->stored_l2[i]);
1255
1256 /* Lock PCI setup registers */
1257 pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
1258 iommu->stored_addr_lo | 1);
1138} 1259}
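iommu_apply_resume_quirks() relies on the RD890 northbridge's indirect register pair: the register index is written to config offset 0x60 (with bit 7 acting as a write enable) and the data is accessed through offset 0x64. A hedged kernel-style sketch of that access pattern; the helper names are illustrative and assume pdev points at the bus 0, device 0, function 0 northbridge as in the quirk above:

    #include <linux/pci.h>

    /* Indirect read: select the register via 0x60, read the data via 0x64. */
    static u32 nb_indirect_read(struct pci_dev *pdev, u32 index)
    {
    	u32 val;

    	pci_write_config_dword(pdev, 0x60, index);
    	pci_read_config_dword(pdev, 0x64, &val);
    	return val;
    }

    /* Indirect write: bit 7 of the index acts as the write enable. */
    static void nb_indirect_write(struct pci_dev *pdev, u32 index, u32 val)
    {
    	pci_write_config_dword(pdev, 0x60, index | (1 << 7));
    	pci_write_config_dword(pdev, 0x64, val);
    }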
1139 1260
1140/* 1261/*
@@ -1147,7 +1268,6 @@ static void enable_iommus(void)
1147 1268
1148 for_each_iommu(iommu) { 1269 for_each_iommu(iommu) {
1149 iommu_disable(iommu); 1270 iommu_disable(iommu);
1150 iommu_apply_quirks(iommu);
1151 iommu_init_flags(iommu); 1271 iommu_init_flags(iommu);
1152 iommu_set_device_table(iommu); 1272 iommu_set_device_table(iommu);
1153 iommu_enable_command_buffer(iommu); 1273 iommu_enable_command_buffer(iommu);
@@ -1155,6 +1275,7 @@ static void enable_iommus(void)
1155 iommu_set_exclusion_range(iommu); 1275 iommu_set_exclusion_range(iommu);
1156 iommu_init_msi(iommu); 1276 iommu_init_msi(iommu);
1157 iommu_enable(iommu); 1277 iommu_enable(iommu);
1278 iommu_flush_all_caches(iommu);
1158 } 1279 }
1159} 1280}
1160 1281
@@ -1171,8 +1292,13 @@ static void disable_iommus(void)
1171 * disable suspend until real resume implemented 1292 * disable suspend until real resume implemented
1172 */ 1293 */
1173 1294
1174static int amd_iommu_resume(struct sys_device *dev) 1295static void amd_iommu_resume(void)
1175{ 1296{
1297 struct amd_iommu *iommu;
1298
1299 for_each_iommu(iommu)
1300 iommu_apply_resume_quirks(iommu);
1301
1176 /* re-load the hardware */ 1302 /* re-load the hardware */
1177 enable_iommus(); 1303 enable_iommus();
1178 1304
@@ -1180,13 +1306,11 @@ static int amd_iommu_resume(struct sys_device *dev)
1180 * we have to flush after the IOMMUs are enabled because a 1306 * we have to flush after the IOMMUs are enabled because a
1181 * disabled IOMMU will never execute the commands we send 1307 * disabled IOMMU will never execute the commands we send
1182 */ 1308 */
1183	amd_iommu_flush_all_devices();
1184	amd_iommu_flush_all_domains();
1185
1186	return 0;
 1309	for_each_iommu(iommu)
 1310		iommu_flush_all_caches(iommu);
1187} 1311}
1188 1312
1189static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state) 1313static int amd_iommu_suspend(void)
1190{ 1314{
1191 /* disable IOMMUs to go out of the way for BIOS */ 1315 /* disable IOMMUs to go out of the way for BIOS */
1192 disable_iommus(); 1316 disable_iommus();
@@ -1194,17 +1318,11 @@ static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state)
1194 return 0; 1318 return 0;
1195} 1319}
1196 1320
1197static struct sysdev_class amd_iommu_sysdev_class = { 1321static struct syscore_ops amd_iommu_syscore_ops = {
1198 .name = "amd_iommu",
1199 .suspend = amd_iommu_suspend, 1322 .suspend = amd_iommu_suspend,
1200 .resume = amd_iommu_resume, 1323 .resume = amd_iommu_resume,
1201}; 1324};
1202 1325
1203static struct sys_device device_amd_iommu = {
1204 .id = 0,
1205 .cls = &amd_iommu_sysdev_class,
1206};
1207
1208/* 1326/*
1209 * This is the core init function for AMD IOMMU hardware in the system. 1327 * This is the core init function for AMD IOMMU hardware in the system.
1210 * This function is called from the generic x86 DMA layer initialization 1328 * This function is called from the generic x86 DMA layer initialization
@@ -1321,14 +1439,6 @@ static int __init amd_iommu_init(void)
1321 goto free; 1439 goto free;
1322 } 1440 }
1323 1441
1324 ret = sysdev_class_register(&amd_iommu_sysdev_class);
1325 if (ret)
1326 goto free;
1327
1328 ret = sysdev_register(&device_amd_iommu);
1329 if (ret)
1330 goto free;
1331
1332 ret = amd_iommu_init_devices(); 1442 ret = amd_iommu_init_devices();
1333 if (ret) 1443 if (ret)
1334 goto free; 1444 goto free;
@@ -1347,6 +1457,8 @@ static int __init amd_iommu_init(void)
1347 1457
1348 amd_iommu_init_notifier(); 1458 amd_iommu_init_notifier();
1349 1459
1460 register_syscore_ops(&amd_iommu_syscore_ops);
1461
1350 if (iommu_pass_through) 1462 if (iommu_pass_through)
1351 goto out; 1463 goto out;
1352 1464
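The suspend/resume hunks above replace the sysdev class with syscore_ops, whose callbacks run on one CPU with interrupts disabled, late in suspend and early in resume, and take no device argument. A minimal kernel-style sketch of the registration pattern (demo_* names are illustrative):

    #include <linux/syscore_ops.h>

    static int demo_suspend(void)
    {
    	/* quiesce the hardware; returning nonzero aborts suspend */
    	return 0;
    }

    static void demo_resume(void)
    {
    	/* reprogram the hardware from saved state */
    }

    static struct syscore_ops demo_syscore_ops = {
    	.suspend = demo_suspend,
    	.resume  = demo_resume,
    };

    static void demo_register(void)
    {
    	register_syscore_ops(&demo_syscore_ops);
    }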
@@ -1405,13 +1517,13 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table)
1405 return 0; 1517 return 0;
1406} 1518}
1407 1519
1408void __init amd_iommu_detect(void) 1520int __init amd_iommu_detect(void)
1409{ 1521{
1410 if (no_iommu || (iommu_detected && !gart_iommu_aperture)) 1522 if (no_iommu || (iommu_detected && !gart_iommu_aperture))
1411 return; 1523 return -ENODEV;
1412 1524
1413 if (amd_iommu_disabled) 1525 if (amd_iommu_disabled)
1414 return; 1526 return -ENODEV;
1415 1527
1416 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { 1528 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
1417 iommu_detected = 1; 1529 iommu_detected = 1;
@@ -1420,7 +1532,9 @@ void __init amd_iommu_detect(void)
1420 1532
1421 /* Make sure ACS will be enabled */ 1533 /* Make sure ACS will be enabled */
1422 pci_request_acs(); 1534 pci_request_acs();
1535 return 1;
1423 } 1536 }
1537 return -ENODEV;
1424} 1538}
1425 1539
1426/**************************************************************************** 1540/****************************************************************************
@@ -1451,3 +1565,8 @@ static int __init parse_amd_iommu_options(char *str)
1451 1565
1452__setup("amd_iommu_dump", parse_amd_iommu_dump); 1566__setup("amd_iommu_dump", parse_amd_iommu_dump);
1453__setup("amd_iommu=", parse_amd_iommu_options); 1567__setup("amd_iommu=", parse_amd_iommu_options);
1568
1569IOMMU_INIT_FINISH(amd_iommu_detect,
1570 gart_iommu_hole_init,
1571 0,
1572 0);
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
new file mode 100644
index 000000000000..4c39baa8facc
--- /dev/null
+++ b/arch/x86/kernel/amd_nb.c
@@ -0,0 +1,255 @@
1/*
2 * Shared support code for AMD K8 northbridges and derivates.
3 * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2.
4 */
5#include <linux/types.h>
6#include <linux/slab.h>
7#include <linux/init.h>
8#include <linux/errno.h>
9#include <linux/module.h>
10#include <linux/spinlock.h>
11#include <asm/amd_nb.h>
12
13static u32 *flush_words;
14
15const struct pci_device_id amd_nb_misc_ids[] = {
16 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
17 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
18 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) },
19 {}
20};
21EXPORT_SYMBOL(amd_nb_misc_ids);
22
23static struct pci_device_id amd_nb_link_ids[] = {
24 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) },
25 {}
26};
27
28const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = {
29 { 0x00, 0x18, 0x20 },
30 { 0xff, 0x00, 0x20 },
31 { 0xfe, 0x00, 0x20 },
32 { }
33};
34
35struct amd_northbridge_info amd_northbridges;
36EXPORT_SYMBOL(amd_northbridges);
37
38static struct pci_dev *next_northbridge(struct pci_dev *dev,
39 const struct pci_device_id *ids)
40{
41 do {
42 dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
43 if (!dev)
44 break;
45 } while (!pci_match_id(ids, dev));
46 return dev;
47}
48
49int amd_cache_northbridges(void)
50{
51 u16 i = 0;
52 struct amd_northbridge *nb;
53 struct pci_dev *misc, *link;
54
55 if (amd_nb_num())
56 return 0;
57
58 misc = NULL;
59 while ((misc = next_northbridge(misc, amd_nb_misc_ids)) != NULL)
60 i++;
61
62 if (i == 0)
63 return 0;
64
65 nb = kzalloc(i * sizeof(struct amd_northbridge), GFP_KERNEL);
66 if (!nb)
67 return -ENOMEM;
68
69 amd_northbridges.nb = nb;
70 amd_northbridges.num = i;
71
72 link = misc = NULL;
73 for (i = 0; i != amd_nb_num(); i++) {
74 node_to_amd_nb(i)->misc = misc =
75 next_northbridge(misc, amd_nb_misc_ids);
76 node_to_amd_nb(i)->link = link =
77 next_northbridge(link, amd_nb_link_ids);
78 }
79
80 /* some CPU families (e.g. family 0x11) do not support GART */
81 if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 ||
82 boot_cpu_data.x86 == 0x15)
83 amd_northbridges.flags |= AMD_NB_GART;
84
85 /*
86 * Some CPU families support L3 Cache Index Disable. There are some
87 * limitations because of E382 and E388 on family 0x10.
88 */
89 if (boot_cpu_data.x86 == 0x10 &&
90 boot_cpu_data.x86_model >= 0x8 &&
91 (boot_cpu_data.x86_model > 0x9 ||
92 boot_cpu_data.x86_mask >= 0x1))
93 amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE;
94
95 if (boot_cpu_data.x86 == 0x15)
96 amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE;
97
98 /* L3 cache partitioning is supported on family 0x15 */
99 if (boot_cpu_data.x86 == 0x15)
100 amd_northbridges.flags |= AMD_NB_L3_PARTITIONING;
101
102 return 0;
103}
104EXPORT_SYMBOL_GPL(amd_cache_northbridges);
105
106/*
107 * Ignores subdevice/subvendor but as far as I can figure out
108 * they're useless anyways
109 */
110bool __init early_is_amd_nb(u32 device)
111{
112 const struct pci_device_id *id;
113 u32 vendor = device & 0xffff;
114
115 device >>= 16;
116 for (id = amd_nb_misc_ids; id->vendor; id++)
117 if (vendor == id->vendor && device == id->device)
118 return true;
119 return false;
120}
121
122int amd_get_subcaches(int cpu)
123{
124 struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link;
125 unsigned int mask;
126 int cuid = 0;
127
128 if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
129 return 0;
130
131 pci_read_config_dword(link, 0x1d4, &mask);
132
133#ifdef CONFIG_SMP
134 cuid = cpu_data(cpu).compute_unit_id;
135#endif
136 return (mask >> (4 * cuid)) & 0xf;
137}
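amd_get_subcaches() treats the L3 partitioning register at 0x1d4 as an array of 4-bit enable masks, one nibble per compute unit, and extracts the caller's nibble with (mask >> (4 * cuid)) & 0xf. A userspace sketch of that packing and unpacking with a made-up register value; it models only the per-unit nibble, not the BAN-mode handling done by amd_set_subcaches():

    #include <stdint.h>
    #include <stdio.h>

    /* One 4-bit subcache mask per compute unit, packed into a 32-bit word. */
    static unsigned int get_subcache_mask(uint32_t reg, unsigned int cuid)
    {
    	return (reg >> (4 * cuid)) & 0xf;
    }

    static uint32_t set_subcache_mask(uint32_t reg, unsigned int cuid,
    				  unsigned int mask)
    {
    	reg &= ~(0xfu << (4 * cuid));
    	return reg | ((mask & 0xfu) << (4 * cuid));
    }

    int main(void)
    {
    	uint32_t reg = 0x0000fff3;	/* hypothetical register contents */

    	printf("CU0 mask: %#x\n", get_subcache_mask(reg, 0));	/* 0x3 */
    	printf("CU2 mask: %#x\n", get_subcache_mask(reg, 2));	/* 0xf */
    	reg = set_subcache_mask(reg, 0, 0xf);
    	printf("updated:  %#010x\n", (unsigned int)reg);	/* 0x0000ffff */
    	return 0;
    }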
138
139int amd_set_subcaches(int cpu, int mask)
140{
141 static unsigned int reset, ban;
142 struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu));
143 unsigned int reg;
144 int cuid = 0;
145
146 if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING) || mask > 0xf)
147 return -EINVAL;
148
149 /* if necessary, collect reset state of L3 partitioning and BAN mode */
150 if (reset == 0) {
151 pci_read_config_dword(nb->link, 0x1d4, &reset);
152 pci_read_config_dword(nb->misc, 0x1b8, &ban);
153 ban &= 0x180000;
154 }
155
156 /* deactivate BAN mode if any subcaches are to be disabled */
157 if (mask != 0xf) {
158 pci_read_config_dword(nb->misc, 0x1b8, &reg);
159 pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000);
160 }
161
162#ifdef CONFIG_SMP
163 cuid = cpu_data(cpu).compute_unit_id;
164#endif
165 mask <<= 4 * cuid;
166 mask |= (0xf ^ (1 << cuid)) << 26;
167
168 pci_write_config_dword(nb->link, 0x1d4, mask);
169
170 /* reset BAN mode if L3 partitioning returned to reset state */
171 pci_read_config_dword(nb->link, 0x1d4, &reg);
172 if (reg == reset) {
173 pci_read_config_dword(nb->misc, 0x1b8, &reg);
174 reg &= ~0x180000;
175 pci_write_config_dword(nb->misc, 0x1b8, reg | ban);
176 }
177
178 return 0;
179}
180
181static int amd_cache_gart(void)
182{
183 u16 i;
184
185 if (!amd_nb_has_feature(AMD_NB_GART))
186 return 0;
187
188 flush_words = kmalloc(amd_nb_num() * sizeof(u32), GFP_KERNEL);
189 if (!flush_words) {
190 amd_northbridges.flags &= ~AMD_NB_GART;
191 return -ENOMEM;
192 }
193
194 for (i = 0; i != amd_nb_num(); i++)
195 pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c,
196 &flush_words[i]);
197
198 return 0;
199}
200
201void amd_flush_garts(void)
202{
203 int flushed, i;
204 unsigned long flags;
205 static DEFINE_SPINLOCK(gart_lock);
206
207 if (!amd_nb_has_feature(AMD_NB_GART))
208 return;
209
210 /* Avoid races between AGP and IOMMU. In theory it's not needed
211 but I'm not sure if the hardware won't lose flush requests
212 when another is pending. This whole thing is so expensive anyways
213 that it doesn't matter to serialize more. -AK */
214 spin_lock_irqsave(&gart_lock, flags);
215 flushed = 0;
216 for (i = 0; i < amd_nb_num(); i++) {
217 pci_write_config_dword(node_to_amd_nb(i)->misc, 0x9c,
218 flush_words[i] | 1);
219 flushed++;
220 }
221 for (i = 0; i < amd_nb_num(); i++) {
222 u32 w;
223 /* Make sure the hardware actually executed the flush*/
224 for (;;) {
225 pci_read_config_dword(node_to_amd_nb(i)->misc,
226 0x9c, &w);
227 if (!(w & 1))
228 break;
229 cpu_relax();
230 }
231 }
232 spin_unlock_irqrestore(&gart_lock, flags);
233 if (!flushed)
234 printk("nothing to flush?\n");
235}
236EXPORT_SYMBOL_GPL(amd_flush_garts);
237
238static __init int init_amd_nbs(void)
239{
240 int err = 0;
241
242 err = amd_cache_northbridges();
243
244 if (err < 0)
245 printk(KERN_NOTICE "AMD NB: Cannot enumerate AMD northbridges.\n");
246
247 if (amd_cache_gart() < 0)
248 printk(KERN_NOTICE "AMD NB: Cannot initialize GART flush words, "
249 "GART support disabled.\n");
250
251 return err;
252}
253
254/* This has to go after the PCI subsystem */
255fs_initcall(init_amd_nbs);
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index 8dd77800ff5d..289e92862fd9 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -177,7 +177,6 @@ static struct clocksource clocksource_apbt = {
177 .rating = APBT_CLOCKSOURCE_RATING, 177 .rating = APBT_CLOCKSOURCE_RATING,
178 .read = apbt_read_clocksource, 178 .read = apbt_read_clocksource,
179 .mask = APBT_MASK, 179 .mask = APBT_MASK,
180 .shift = APBT_SHIFT,
181 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 180 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
182 .resume = apbt_restart_clocksource, 181 .resume = apbt_restart_clocksource,
183}; 182};
@@ -231,34 +230,6 @@ static void apbt_restart_clocksource(struct clocksource *cs)
231 apbt_start_counter(phy_cs_timer_id); 230 apbt_start_counter(phy_cs_timer_id);
232} 231}
233 232
234/* Setup IRQ routing via IOAPIC */
235#ifdef CONFIG_SMP
236static void apbt_setup_irq(struct apbt_dev *adev)
237{
238 struct irq_chip *chip;
239 struct irq_desc *desc;
240
241 /* timer0 irq has been setup early */
242 if (adev->irq == 0)
243 return;
244 desc = irq_to_desc(adev->irq);
245 chip = get_irq_chip(adev->irq);
246 disable_irq(adev->irq);
247 desc->status |= IRQ_MOVE_PCNTXT;
248 irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
249 /* APB timer irqs are set up as mp_irqs, timer is edge triggerred */
250 set_irq_chip_and_handler_name(adev->irq, chip, handle_edge_irq, "edge");
251 enable_irq(adev->irq);
252 if (system_state == SYSTEM_BOOTING)
253 if (request_irq(adev->irq, apbt_interrupt_handler,
254 IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
255 adev->name, adev)) {
256 printk(KERN_ERR "Failed request IRQ for APBT%d\n",
257 adev->num);
258 }
259}
260#endif
261
262static void apbt_enable_int(int n) 233static void apbt_enable_int(int n)
263{ 234{
264 unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); 235 unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
@@ -312,7 +283,7 @@ static int __init apbt_clockevent_register(void)
312 memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device)); 283 memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device));
313 284
314 if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) { 285 if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
315 apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100; 286 adev->evt.rating = APBT_CLOCKEVENT_RATING - 100;
316 global_clock_event = &adev->evt; 287 global_clock_event = &adev->evt;
317 printk(KERN_DEBUG "%s clockevent registered as global\n", 288 printk(KERN_DEBUG "%s clockevent registered as global\n",
318 global_clock_event->name); 289 global_clock_event->name);
@@ -334,6 +305,30 @@ static int __init apbt_clockevent_register(void)
334} 305}
335 306
336#ifdef CONFIG_SMP 307#ifdef CONFIG_SMP
308
309static void apbt_setup_irq(struct apbt_dev *adev)
310{
311 /* timer0 irq has been setup early */
312 if (adev->irq == 0)
313 return;
314
315 irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT);
316 irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
317 /* APB timer irqs are set up as mp_irqs, timer is edge type */
318 __irq_set_handler(adev->irq, handle_edge_irq, 0, "edge");
319
320 if (system_state == SYSTEM_BOOTING) {
321 if (request_irq(adev->irq, apbt_interrupt_handler,
322 IRQF_TIMER | IRQF_DISABLED |
323 IRQF_NOBALANCING,
324 adev->name, adev)) {
325 printk(KERN_ERR "Failed request IRQ for APBT%d\n",
326 adev->num);
327 }
328 } else
329 enable_irq(adev->irq);
330}
331
337/* Should be called with per cpu */ 332/* Should be called with per cpu */
338void apbt_setup_secondary_clock(void) 333void apbt_setup_secondary_clock(void)
339{ 334{
@@ -343,7 +338,7 @@ void apbt_setup_secondary_clock(void)
343 338
344 /* Don't register boot CPU clockevent */ 339 /* Don't register boot CPU clockevent */
345 cpu = smp_processor_id(); 340 cpu = smp_processor_id();
346 if (cpu == boot_cpu_id) 341 if (!cpu)
347 return; 342 return;
348 /* 343 /*
349 * We need to calculate the scaled math multiplication factor for 344 * We need to calculate the scaled math multiplication factor for
@@ -389,16 +384,17 @@ static int apbt_cpuhp_notify(struct notifier_block *n,
389 384
390 switch (action & 0xf) { 385 switch (action & 0xf) {
391 case CPU_DEAD: 386 case CPU_DEAD:
387 disable_irq(adev->irq);
392 apbt_disable_int(cpu); 388 apbt_disable_int(cpu);
393 if (system_state == SYSTEM_RUNNING) 389 if (system_state == SYSTEM_RUNNING) {
394 pr_debug("skipping APBT CPU %lu offline\n", cpu); 390 pr_debug("skipping APBT CPU %lu offline\n", cpu);
395 else if (adev) { 391 } else if (adev) {
396 pr_debug("APBT clockevent for cpu %lu offline\n", cpu); 392 pr_debug("APBT clockevent for cpu %lu offline\n", cpu);
397 free_irq(adev->irq, adev); 393 free_irq(adev->irq, adev);
398 } 394 }
399 break; 395 break;
400 default: 396 default:
401 pr_debug(KERN_INFO "APBT notified %lu, no action\n", action); 397 pr_debug("APBT notified %lu, no action\n", action);
402 } 398 }
403 return NOTIFY_OK; 399 return NOTIFY_OK;
404} 400}
@@ -511,64 +507,12 @@ static int apbt_next_event(unsigned long delta,
511 return 0; 507 return 0;
512} 508}
513 509
514/*
515 * APB timer clock is not in sync with pclk on Langwell, which translates to
516 * unreliable read value caused by sampling error. the error does not add up
517 * overtime and only happens when sampling a 0 as a 1 by mistake. so the time
518 * would go backwards. the following code is trying to prevent time traveling
519 * backwards. little bit paranoid.
520 */
521static cycle_t apbt_read_clocksource(struct clocksource *cs) 510static cycle_t apbt_read_clocksource(struct clocksource *cs)
522{ 511{
523 unsigned long t0, t1, t2; 512 unsigned long current_count;
524 static unsigned long last_read; 513
525 514 current_count = apbt_readl(phy_cs_timer_id, APBTMR_N_CURRENT_VALUE);
526bad_count: 515 return (cycle_t)~current_count;
527 t1 = apbt_readl(phy_cs_timer_id,
528 APBTMR_N_CURRENT_VALUE);
529 t2 = apbt_readl(phy_cs_timer_id,
530 APBTMR_N_CURRENT_VALUE);
531 if (unlikely(t1 < t2)) {
532 pr_debug("APBT: read current count error %lx:%lx:%lx\n",
533 t1, t2, t2 - t1);
534 goto bad_count;
535 }
536 /*
537 * check against cached last read, makes sure time does not go back.
538 * it could be a normal rollover but we will do tripple check anyway
539 */
540 if (unlikely(t2 > last_read)) {
541 /* check if we have a normal rollover */
542 unsigned long raw_intr_status =
543 apbt_readl_reg(APBTMRS_RAW_INT_STATUS);
544 /*
545 * cs timer interrupt is masked but raw intr bit is set if
546 * rollover occurs. then we read EOI reg to clear it.
547 */
548 if (raw_intr_status & (1 << phy_cs_timer_id)) {
549 apbt_readl(phy_cs_timer_id, APBTMR_N_EOI);
550 goto out;
551 }
552 pr_debug("APB CS going back %lx:%lx:%lx ",
553 t2, last_read, t2 - last_read);
554bad_count_x3:
555 pr_debug(KERN_INFO "tripple check enforced\n");
556 t0 = apbt_readl(phy_cs_timer_id,
557 APBTMR_N_CURRENT_VALUE);
558 udelay(1);
559 t1 = apbt_readl(phy_cs_timer_id,
560 APBTMR_N_CURRENT_VALUE);
561 udelay(1);
562 t2 = apbt_readl(phy_cs_timer_id,
563 APBTMR_N_CURRENT_VALUE);
564 if ((t2 > t1) || (t1 > t0)) {
565 printk(KERN_ERR "Error: APB CS tripple check failed\n");
566 goto bad_count_x3;
567 }
568 }
569out:
570 last_read = t2;
571 return (cycle_t)~t2;
572} 516}
573 517
574static int apbt_clocksource_register(void) 518static int apbt_clocksource_register(void)
@@ -598,14 +542,7 @@ static int apbt_clocksource_register(void)
598 if (t1 == apbt_read_clocksource(&clocksource_apbt)) 542 if (t1 == apbt_read_clocksource(&clocksource_apbt))
599 panic("APBT counter not counting. APBT disabled\n"); 543 panic("APBT counter not counting. APBT disabled\n");
600 544
601 /* 545 clocksource_register_khz(&clocksource_apbt, (u32)apbt_freq*1000);
602 * initialize and register APBT clocksource
603 * convert that to ns/clock cycle
604 * mult = (ns/c) * 2^APBT_SHIFT
605 */
606 clocksource_apbt.mult = div_sc(MSEC_PER_SEC,
607 (unsigned long) apbt_freq, APBT_SHIFT);
608 clocksource_register(&clocksource_apbt);
609 546
610 return 0; 547 return 0;
611} 548}
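Two simplifications drive this file: the clocksource read path now returns ~current_count, which turns the APB timer's down-counter into a monotonically increasing cycle value, and clocksource_register_khz() derives mult/shift internally instead of the driver hand-computing them. A tiny userspace sketch of why the bitwise inversion is enough (assuming a 32-bit counter, as here):

    #include <stdint.h>
    #include <stdio.h>

    /* A free-running down-counter read back as an up-counting cycle value. */
    static uint32_t cycles_from_downcount(uint32_t current_count)
    {
    	return ~current_count;
    }

    int main(void)
    {
    	/* hypothetical successive hardware reads of a down-counter */
    	uint32_t reads[] = { 0xFFFFFFF0u, 0xFFFFFF00u, 0x00000010u };

    	for (unsigned int i = 0; i < 3; i++)
    		printf("raw %#010x -> cycles %#010x\n",
    		       (unsigned int)reads[i],
    		       (unsigned int)cycles_from_downcount(reads[i]));
    	return 0;
    }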
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index a2e0caf26e17..3d2661ca6542 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -13,7 +13,7 @@
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/types.h> 14#include <linux/types.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/bootmem.h> 16#include <linux/memblock.h>
17#include <linux/mmzone.h> 17#include <linux/mmzone.h>
18#include <linux/pci_ids.h> 18#include <linux/pci_ids.h>
19#include <linux/pci.h> 19#include <linux/pci.h>
@@ -27,9 +27,25 @@
27#include <asm/gart.h> 27#include <asm/gart.h>
28#include <asm/pci-direct.h> 28#include <asm/pci-direct.h>
29#include <asm/dma.h> 29#include <asm/dma.h>
30#include <asm/k8.h> 30#include <asm/amd_nb.h>
31#include <asm/x86_init.h> 31#include <asm/x86_init.h>
32 32
33/*
34 * Using 512M as goal, in case kexec will load kernel_big
35 * that will do the on-position decompress, and could overlap with
36 * with the gart aperture that is used.
37 * Sequence:
38 * kernel_small
39 * ==> kexec (with kdump trigger path or gart still enabled)
40 * ==> kernel_small (gart area become e820_reserved)
41 * ==> kexec (with kdump trigger path or gart still enabled)
42 * ==> kerne_big (uncompressed size will be big than 64M or 128M)
43 * So don't use 512M below as gart iommu, leave the space for kernel
44 * code for safe.
45 */
46#define GART_MIN_ADDR (512ULL << 20)
47#define GART_MAX_ADDR (1ULL << 32)
48
33int gart_iommu_aperture; 49int gart_iommu_aperture;
34int gart_iommu_aperture_disabled __initdata; 50int gart_iommu_aperture_disabled __initdata;
35int gart_iommu_aperture_allowed __initdata; 51int gart_iommu_aperture_allowed __initdata;
@@ -39,18 +55,6 @@ int fallback_aper_force __initdata;
39 55
40int fix_aperture __initdata = 1; 56int fix_aperture __initdata = 1;
41 57
42struct bus_dev_range {
43 int bus;
44 int dev_base;
45 int dev_limit;
46};
47
48static struct bus_dev_range bus_dev_ranges[] __initdata = {
49 { 0x00, 0x18, 0x20},
50 { 0xff, 0x00, 0x20},
51 { 0xfe, 0x00, 0x20}
52};
53
54static struct resource gart_resource = { 58static struct resource gart_resource = {
55 .name = "GART", 59 .name = "GART",
56 .flags = IORESOURCE_MEM, 60 .flags = IORESOURCE_MEM,
@@ -69,7 +73,7 @@ static void __init insert_aperture_resource(u32 aper_base, u32 aper_size)
69static u32 __init allocate_aperture(void) 73static u32 __init allocate_aperture(void)
70{ 74{
71 u32 aper_size; 75 u32 aper_size;
72 void *p; 76 unsigned long addr;
73 77
74 /* aper_size should <= 1G */ 78 /* aper_size should <= 1G */
75 if (fallback_aper_order > 5) 79 if (fallback_aper_order > 5)
@@ -82,40 +86,27 @@ static u32 __init allocate_aperture(void)
82 * memory. Unfortunately we cannot move it up because that would 86 * memory. Unfortunately we cannot move it up because that would
83 * make the IOMMU useless. 87 * make the IOMMU useless.
84 */ 88 */
85	/*
86	 * using 512M as goal, in case kexec will load kernel_big
87	 * that will do the on position decompress, and could overlap with
88	 * that positon with gart that is used.
89	 * sequende:
90	 * kernel_small
91	 * ==> kexec (with kdump trigger path or previous doesn't shutdown gart)
92	 * ==> kernel_small(gart area become e820_reserved)
93	 * ==> kexec (with kdump trigger path or previous doesn't shutdown gart)
94	 * ==> kerne_big (uncompressed size will be big than 64M or 128M)
95	 * so don't use 512M below as gart iommu, leave the space for kernel
96	 * code for safe
97	 */
98	p = __alloc_bootmem_nopanic(aper_size, aper_size, 512ULL<<20);
 89	addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR,
 90				      aper_size, aper_size);
 91	if (addr == MEMBLOCK_ERROR || addr + aper_size > GART_MAX_ADDR) {
 92		printk(KERN_ERR
 93			"Cannot allocate aperture memory hole (%lx,%uK)\n",
 94			addr, aper_size>>10);
 95		return 0;
 96	}
 97	memblock_x86_reserve_range(addr, addr + aper_size, "aperture64");
99 /* 98 /*
100 * Kmemleak should not scan this block as it may not be mapped via the 99 * Kmemleak should not scan this block as it may not be mapped via the
101 * kernel direct mapping. 100 * kernel direct mapping.
102 */ 101 */
103 kmemleak_ignore(p); 102 kmemleak_ignore(phys_to_virt(addr));
104 if (!p || __pa(p)+aper_size > 0xffffffff) {
105 printk(KERN_ERR
106 "Cannot allocate aperture memory hole (%p,%uK)\n",
107 p, aper_size>>10);
108 if (p)
109 free_bootmem(__pa(p), aper_size);
110 return 0;
111 }
112 printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n", 103 printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n",
113 aper_size >> 10, __pa(p)); 104 aper_size >> 10, addr);
114 insert_aperture_resource((u32)__pa(p), aper_size); 105 insert_aperture_resource((u32)addr, aper_size);
115 register_nosave_region((u32)__pa(p) >> PAGE_SHIFT, 106 register_nosave_region(addr >> PAGE_SHIFT,
116 (u32)__pa(p+aper_size) >> PAGE_SHIFT); 107 (addr+aper_size) >> PAGE_SHIFT);
117 108
118 return (u32)__pa(p); 109 return (u32)addr;
119} 110}
120 111
121 112
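The rewritten allocate_aperture() asks memblock for an aperture-sized, aperture-aligned block between GART_MIN_ADDR (512 MiB) and GART_MAX_ADDR (4 GiB) and rejects anything that would spill past 4 GiB. A userspace sketch of the size and placement arithmetic; aperture_fits() is an illustrative helper that models the constraints the request encodes, not a kernel function:

    #include <stdint.h>
    #include <stdio.h>

    #define GART_MIN_ADDR	(512ULL << 20)	/* keep low RAM free for kexec'd kernels */
    #define GART_MAX_ADDR	(1ULL << 32)	/* GART aperture base is a 32-bit address */

    /* Aperture size encoding used by the GART: 32 MiB << order. */
    static uint64_t aper_size_from_order(unsigned int order)
    {
    	return (32ULL * 1024 * 1024) << order;
    }

    static int aperture_fits(uint64_t addr, uint64_t size)
    {
    	return addr >= GART_MIN_ADDR && addr + size <= GART_MAX_ADDR &&
    	       (addr & (size - 1)) == 0;	/* naturally aligned */
    }

    int main(void)
    {
    	uint64_t size = aper_size_from_order(1);	/* 64 MiB */

    	printf("size = %llu MiB\n", (unsigned long long)(size >> 20));
    	printf("0x20000000 ok? %d\n", aperture_fits(0x20000000ULL, size));
    	printf("0xFFC00000 ok? %d\n", aperture_fits(0xFFC00000ULL, size));
    	return 0;
    }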
@@ -206,7 +197,7 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order)
206 * Do an PCI bus scan by hand because we're running before the PCI 197 * Do an PCI bus scan by hand because we're running before the PCI
207 * subsystem. 198 * subsystem.
208 * 199 *
209 * All K8 AGP bridges are AGPv3 compliant, so we can do this scan 200 * All AMD AGP bridges are AGPv3 compliant, so we can do this scan
210 * generically. It's probably overkill to always scan all slots because 201 * generically. It's probably overkill to always scan all slots because
211 * the AGP bridges should be always an own bus on the HT hierarchy, 202 * the AGP bridges should be always an own bus on the HT hierarchy,
212 * but do it here for future safety. 203 * but do it here for future safety.
@@ -294,20 +285,20 @@ void __init early_gart_iommu_check(void)
294 search_agp_bridge(&agp_aper_order, &valid_agp); 285 search_agp_bridge(&agp_aper_order, &valid_agp);
295 286
296 fix = 0; 287 fix = 0;
297 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { 288 for (i = 0; amd_nb_bus_dev_ranges[i].dev_limit; i++) {
298 int bus; 289 int bus;
299 int dev_base, dev_limit; 290 int dev_base, dev_limit;
300 291
301 bus = bus_dev_ranges[i].bus; 292 bus = amd_nb_bus_dev_ranges[i].bus;
302 dev_base = bus_dev_ranges[i].dev_base; 293 dev_base = amd_nb_bus_dev_ranges[i].dev_base;
303 dev_limit = bus_dev_ranges[i].dev_limit; 294 dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
304 295
305 for (slot = dev_base; slot < dev_limit; slot++) { 296 for (slot = dev_base; slot < dev_limit; slot++) {
306 if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) 297 if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
307 continue; 298 continue;
308 299
309 ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); 300 ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
310 aper_enabled = ctl & AMD64_GARTEN; 301 aper_enabled = ctl & GARTEN;
311 aper_order = (ctl >> 1) & 7; 302 aper_order = (ctl >> 1) & 7;
312 aper_size = (32 * 1024 * 1024) << aper_order; 303 aper_size = (32 * 1024 * 1024) << aper_order;
313 aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff; 304 aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
@@ -349,20 +340,20 @@ void __init early_gart_iommu_check(void)
349 return; 340 return;
350 341
351 /* disable them all at first */ 342 /* disable them all at first */
352 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { 343 for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
353 int bus; 344 int bus;
354 int dev_base, dev_limit; 345 int dev_base, dev_limit;
355 346
356 bus = bus_dev_ranges[i].bus; 347 bus = amd_nb_bus_dev_ranges[i].bus;
357 dev_base = bus_dev_ranges[i].dev_base; 348 dev_base = amd_nb_bus_dev_ranges[i].dev_base;
358 dev_limit = bus_dev_ranges[i].dev_limit; 349 dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
359 350
360 for (slot = dev_base; slot < dev_limit; slot++) { 351 for (slot = dev_base; slot < dev_limit; slot++) {
361 if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) 352 if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
362 continue; 353 continue;
363 354
364 ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); 355 ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
365 ctl &= ~AMD64_GARTEN; 356 ctl &= ~GARTEN;
366 write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); 357 write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
367 } 358 }
368 } 359 }
@@ -371,7 +362,7 @@ void __init early_gart_iommu_check(void)
371 362
372static int __initdata printed_gart_size_msg; 363static int __initdata printed_gart_size_msg;
373 364
374void __init gart_iommu_hole_init(void) 365int __init gart_iommu_hole_init(void)
375{ 366{
376 u32 agp_aper_base = 0, agp_aper_order = 0; 367 u32 agp_aper_base = 0, agp_aper_order = 0;
377 u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0; 368 u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0;
@@ -381,7 +372,7 @@ void __init gart_iommu_hole_init(void)
381 372
382 if (gart_iommu_aperture_disabled || !fix_aperture || 373 if (gart_iommu_aperture_disabled || !fix_aperture ||
383 !early_pci_allowed()) 374 !early_pci_allowed())
384 return; 375 return -ENODEV;
385 376
386 printk(KERN_INFO "Checking aperture...\n"); 377 printk(KERN_INFO "Checking aperture...\n");
387 378
@@ -390,17 +381,17 @@ void __init gart_iommu_hole_init(void)
390 381
391 fix = 0; 382 fix = 0;
392 node = 0; 383 node = 0;
393 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { 384 for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
394 int bus; 385 int bus;
395 int dev_base, dev_limit; 386 int dev_base, dev_limit;
396 u32 ctl; 387 u32 ctl;
397 388
398 bus = bus_dev_ranges[i].bus; 389 bus = amd_nb_bus_dev_ranges[i].bus;
399 dev_base = bus_dev_ranges[i].dev_base; 390 dev_base = amd_nb_bus_dev_ranges[i].dev_base;
400 dev_limit = bus_dev_ranges[i].dev_limit; 391 dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
401 392
402 for (slot = dev_base; slot < dev_limit; slot++) { 393 for (slot = dev_base; slot < dev_limit; slot++) {
403 if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) 394 if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
404 continue; 395 continue;
405 396
406 iommu_detected = 1; 397 iommu_detected = 1;
@@ -463,8 +454,9 @@ out:
463 unsigned long n = (32 * 1024 * 1024) << last_aper_order; 454 unsigned long n = (32 * 1024 * 1024) << last_aper_order;
464 455
465 insert_aperture_resource((u32)last_aper_base, n); 456 insert_aperture_resource((u32)last_aper_base, n);
457 return 1;
466 } 458 }
467 return; 459 return 0;
468 } 460 }
469 461
470 if (!fallback_aper_force) { 462 if (!fallback_aper_force) {
@@ -500,28 +492,32 @@ out:
500 panic("Not enough memory for aperture"); 492 panic("Not enough memory for aperture");
501 } 493 }
502 } else { 494 } else {
503 return; 495 return 0;
504 } 496 }
505 497
506 /* Fix up the north bridges */ 498 /* Fix up the north bridges */
507 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { 499 for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
508 int bus; 500 int bus, dev_base, dev_limit;
509 int dev_base, dev_limit; 501
510 502 /*
511 bus = bus_dev_ranges[i].bus; 503 * Don't enable translation yet but enable GART IO and CPU
512 dev_base = bus_dev_ranges[i].dev_base; 504 * accesses and set DISTLBWALKPRB since GART table memory is UC.
513 dev_limit = bus_dev_ranges[i].dev_limit; 505 */
506 u32 ctl = aper_order << 1;
507
508 bus = amd_nb_bus_dev_ranges[i].bus;
509 dev_base = amd_nb_bus_dev_ranges[i].dev_base;
510 dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
514 for (slot = dev_base; slot < dev_limit; slot++) { 511 for (slot = dev_base; slot < dev_limit; slot++) {
515 if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) 512 if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
516 continue; 513 continue;
517 514
518 /* Don't enable translation yet. That is done later. 515 write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
519 Assume this BIOS didn't initialise the GART so
520 just overwrite all previous bits */
521 write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, aper_order << 1);
522 write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25); 516 write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25);
523 } 517 }
524 } 518 }
525 519
526 set_up_gart_resume(aper_order, aper_alloc); 520 set_up_gart_resume(aper_order, aper_alloc);
521
522 return 1;
527} 523}
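Every loop in this file decodes AMD64_GARTAPERTURECTL the same way: bit 0 (GARTEN) enables the aperture and bits 3:1 hold the size order, so the aperture covers (32 MiB << order). A userspace sketch of that decoding with a hypothetical control value:

    #include <stdint.h>
    #include <stdio.h>

    #define GARTEN	0x1u		/* aperture enable, bit 0 */

    struct gart_ctl {
    	int enabled;
    	unsigned int order;
    	uint64_t size;
    };

    static struct gart_ctl decode_gart_ctl(uint32_t ctl)
    {
    	struct gart_ctl g;

    	g.enabled = (ctl & GARTEN) != 0;
    	g.order = (ctl >> 1) & 7;
    	g.size = (32ULL * 1024 * 1024) << g.order;
    	return g;
    }

    int main(void)
    {
    	struct gart_ctl g = decode_gart_ctl(0x0b);	/* hypothetical: enabled, order 5 */

    	printf("enabled=%d order=%u size=%llu MiB\n",
    	       g.enabled, g.order, (unsigned long long)(g.size >> 20));
    	return 0;
    }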
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 910f20b457c4..767fd04f2843 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -2,23 +2,25 @@
2# Makefile for local APIC drivers and for the IO-APIC code 2# Makefile for local APIC drivers and for the IO-APIC code
3# 3#
4 4
5obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o 5obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o ipi.o
6ifneq ($(CONFIG_HARDLOCKUP_DETECTOR),y) 6obj-y += hw_nmi.o
7obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o
8endif
9obj-$(CONFIG_HARDLOCKUP_DETECTOR) += hw_nmi.o
10 7
11obj-$(CONFIG_X86_IO_APIC) += io_apic.o 8obj-$(CONFIG_X86_IO_APIC) += io_apic.o
12obj-$(CONFIG_SMP) += ipi.o 9obj-$(CONFIG_SMP) += ipi.o
13 10
14ifeq ($(CONFIG_X86_64),y) 11ifeq ($(CONFIG_X86_64),y)
15obj-y += apic_flat_64.o 12# APIC probe will depend on the listing order here
16obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o
17obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o
18obj-$(CONFIG_X86_UV) += x2apic_uv_x.o 13obj-$(CONFIG_X86_UV) += x2apic_uv_x.o
14obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o
15obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o
16obj-y += apic_flat_64.o
19endif 17endif
20 18
21obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o 19# APIC probe will depend on the listing order here
22obj-$(CONFIG_X86_NUMAQ) += numaq_32.o 20obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
23obj-$(CONFIG_X86_ES7000) += es7000_32.o
24obj-$(CONFIG_X86_SUMMIT) += summit_32.o 21obj-$(CONFIG_X86_SUMMIT) += summit_32.o
22obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o
23obj-$(CONFIG_X86_ES7000) += es7000_32.o
24
25# For 32bit, probe_32 need to be listed last
26obj-$(CONFIG_X86_LOCAL_APIC) += probe_$(BITS).o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index e3b534cda49a..b9338b8cf420 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -24,14 +24,13 @@
24#include <linux/ftrace.h> 24#include <linux/ftrace.h>
25#include <linux/ioport.h> 25#include <linux/ioport.h>
26#include <linux/module.h> 26#include <linux/module.h>
27#include <linux/sysdev.h> 27#include <linux/syscore_ops.h>
28#include <linux/delay.h> 28#include <linux/delay.h>
29#include <linux/timex.h> 29#include <linux/timex.h>
30#include <linux/dmar.h> 30#include <linux/dmar.h>
31#include <linux/init.h> 31#include <linux/init.h>
32#include <linux/cpu.h> 32#include <linux/cpu.h>
33#include <linux/dmi.h> 33#include <linux/dmi.h>
34#include <linux/nmi.h>
35#include <linux/smp.h> 34#include <linux/smp.h>
36#include <linux/mm.h> 35#include <linux/mm.h>
37 36
@@ -44,14 +43,15 @@
44#include <asm/i8259.h> 43#include <asm/i8259.h>
45#include <asm/proto.h> 44#include <asm/proto.h>
46#include <asm/apic.h> 45#include <asm/apic.h>
46#include <asm/io_apic.h>
47#include <asm/desc.h> 47#include <asm/desc.h>
48#include <asm/hpet.h> 48#include <asm/hpet.h>
49#include <asm/idle.h> 49#include <asm/idle.h>
50#include <asm/mtrr.h> 50#include <asm/mtrr.h>
51#include <asm/smp.h> 51#include <asm/smp.h>
52#include <asm/mce.h> 52#include <asm/mce.h>
53#include <asm/kvm_para.h>
54#include <asm/tsc.h> 53#include <asm/tsc.h>
54#include <asm/hypervisor.h>
55 55
56unsigned int num_processors; 56unsigned int num_processors;
57 57
@@ -79,12 +79,21 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
79EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); 79EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
80 80
81#ifdef CONFIG_X86_32 81#ifdef CONFIG_X86_32
82
83/*
84 * On x86_32, the mapping between cpu and logical apicid may vary
85 * depending on apic in use. The following early percpu variable is
86 * used for the mapping. This is where the behaviors of x86_64 and 32
87 * actually diverge. Let's keep it ugly for now.
88 */
89DEFINE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid, BAD_APICID);
90
82/* 91/*
83 * Knob to control our willingness to enable the local APIC. 92 * Knob to control our willingness to enable the local APIC.
84 * 93 *
85 * +1=force-enable 94 * +1=force-enable
86 */ 95 */
87static int force_enable_local_apic; 96static int force_enable_local_apic __initdata;
88/* 97/*
89 * APIC command line parameters 98 * APIC command line parameters
90 */ 99 */
@@ -154,7 +163,7 @@ early_param("nox2apic", setup_nox2apic);
154unsigned long mp_lapic_addr; 163unsigned long mp_lapic_addr;
155int disable_apic; 164int disable_apic;
156/* Disable local APIC timer from the kernel commandline or via dmi quirk */ 165/* Disable local APIC timer from the kernel commandline or via dmi quirk */
157static int disable_apic_timer __cpuinitdata; 166static int disable_apic_timer __initdata;
158/* Local APIC timer works in C2 */ 167/* Local APIC timer works in C2 */
159int local_apic_timer_c2_ok; 168int local_apic_timer_c2_ok;
160EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); 169EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
@@ -178,29 +187,8 @@ static struct resource lapic_resource = {
178 187
179static unsigned int calibration_result; 188static unsigned int calibration_result;
180 189
181static int lapic_next_event(unsigned long delta,
182 struct clock_event_device *evt);
183static void lapic_timer_setup(enum clock_event_mode mode,
184 struct clock_event_device *evt);
185static void lapic_timer_broadcast(const struct cpumask *mask);
186static void apic_pm_activate(void); 190static void apic_pm_activate(void);
187 191
188/*
189 * The local apic timer can be used for any function which is CPU local.
190 */
191static struct clock_event_device lapic_clockevent = {
192 .name = "lapic",
193 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
194 | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
195 .shift = 32,
196 .set_mode = lapic_timer_setup,
197 .set_next_event = lapic_next_event,
198 .broadcast = lapic_timer_broadcast,
199 .rating = 100,
200 .irq = -1,
201};
202static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
203
204static unsigned long apic_phys; 192static unsigned long apic_phys;
205 193
206/* 194/*
@@ -239,7 +227,7 @@ static int modern_apic(void)
239 * right after this call apic become NOOP driven 227 * right after this call apic become NOOP driven
240 * so apic->write/read doesn't do anything 228 * so apic->write/read doesn't do anything
241 */ 229 */
242void apic_disable(void) 230static void __init apic_disable(void)
243{ 231{
244 pr_info("APIC: switched to apic NOOP\n"); 232 pr_info("APIC: switched to apic NOOP\n");
245 apic = &apic_noop; 233 apic = &apic_noop;
@@ -283,23 +271,6 @@ u64 native_apic_icr_read(void)
283 return icr1 | ((u64)icr2 << 32); 271 return icr1 | ((u64)icr2 << 32);
284} 272}
285 273
286/**
287 * enable_NMI_through_LVT0 - enable NMI through local vector table 0
288 */
289void __cpuinit enable_NMI_through_LVT0(void)
290{
291 unsigned int v;
292
293 /* unmask and set to NMI */
294 v = APIC_DM_NMI;
295
296 /* Level triggered for 82489DX (32bit mode) */
297 if (!lapic_is_integrated())
298 v |= APIC_LVT_LEVEL_TRIGGER;
299
300 apic_write(APIC_LVT0, v);
301}
302
303#ifdef CONFIG_X86_32 274#ifdef CONFIG_X86_32
304/** 275/**
305 * get_physical_broadcast - Get number of physical broadcast IDs 276 * get_physical_broadcast - Get number of physical broadcast IDs
@@ -370,38 +341,89 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
370} 341}
371 342
372/* 343/*
373 * Setup extended LVT, AMD specific (K8, family 10h) 344 * Setup extended LVT, AMD specific
374 * 345 *
375 * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and 346 * Software should use the LVT offsets the BIOS provides. The offsets
376 * MCE interrupts are supported. Thus MCE offset must be set to 0. 347 * are determined by the subsystems using it like those for MCE
348 * threshold or IBS. On K8 only offset 0 (APIC500) and MCE interrupts
349 * are supported. Beginning with family 10h at least 4 offsets are
350 * available.
377 * 351 *
378 * If mask=1, the LVT entry does not generate interrupts while mask=0 352 * Since the offsets must be consistent for all cores, we keep track
379 * enables the vector. See also the BKDGs. 353 * of the LVT offsets in software and reserve the offset for the same
354 * vector also to be used on other cores. An offset is freed by
355 * setting the entry to APIC_EILVT_MASKED.
356 *
357 * If the BIOS is right, there should be no conflicts. Otherwise a
358 * "[Firmware Bug]: ..." error message is generated. However, if
359 * software does not properly determines the offsets, it is not
360 * necessarily a BIOS bug.
380 */ 361 */
381 362
382#define APIC_EILVT_LVTOFF_MCE 0 363static atomic_t eilvt_offsets[APIC_EILVT_NR_MAX];
383#define APIC_EILVT_LVTOFF_IBS 1
384 364
385static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
386{
387	unsigned long reg = (lvt_off << 4) + APIC_EILVTn(0);
388	unsigned int v = (mask << 16) | (msg_type << 8) | vector;
389
390	apic_write(reg, v);
 365static inline int eilvt_entry_is_changeable(unsigned int old, unsigned int new)
 366{
 367	return (old & APIC_EILVT_MASKED)
 368		|| (new == APIC_EILVT_MASKED)
 369		|| ((new & ~APIC_EILVT_MASKED) == old);
391} 370}
392 371
393u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
394{
395	setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
396	return APIC_EILVT_LVTOFF_MCE;
 372static unsigned int reserve_eilvt_offset(int offset, unsigned int new)
 373{
 374	unsigned int rsvd;		/* 0: uninitialized */
 375
376 if (offset >= APIC_EILVT_NR_MAX)
377 return ~0;
378
379 rsvd = atomic_read(&eilvt_offsets[offset]) & ~APIC_EILVT_MASKED;
380 do {
381 if (rsvd &&
382 !eilvt_entry_is_changeable(rsvd, new))
383 /* may not change if vectors are different */
384 return rsvd;
385 rsvd = atomic_cmpxchg(&eilvt_offsets[offset], rsvd, new);
386 } while (rsvd != new);
387
388 return new;
397} 389}
398 390
399u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) 391/*
392 * If mask=1, the LVT entry does not generate interrupts while mask=0
393 * enables the vector. See also the BKDGs. Must be called with
394 * preemption disabled.
395 */
396
397int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask)
400{ 398{
401 setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); 399 unsigned long reg = APIC_EILVTn(offset);
402 return APIC_EILVT_LVTOFF_IBS; 400 unsigned int new, old, reserved;
401
402 new = (mask << 16) | (msg_type << 8) | vector;
403 old = apic_read(reg);
404 reserved = reserve_eilvt_offset(offset, new);
405
406 if (reserved != new) {
407 pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for "
408 "vector 0x%x, but the register is already in use for "
409 "vector 0x%x on another cpu\n",
410 smp_processor_id(), reg, offset, new, reserved);
411 return -EINVAL;
412 }
413
414 if (!eilvt_entry_is_changeable(old, new)) {
415 pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for "
416 "vector 0x%x, but the register is already in use for "
417 "vector 0x%x on this cpu\n",
418 smp_processor_id(), reg, offset, new, old);
419 return -EBUSY;
420 }
421
422 apic_write(reg, new);
423
424 return 0;
403} 425}
404EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs); 426EXPORT_SYMBOL_GPL(setup_APIC_eilvt);
405 427
406/* 428/*
407 * Program the next event, relative to now 429 * Program the next event, relative to now
@@ -459,6 +481,23 @@ static void lapic_timer_broadcast(const struct cpumask *mask)
459#endif 481#endif
460} 482}
461 483
484
485/*
486 * The local apic timer can be used for any function which is CPU local.
487 */
488static struct clock_event_device lapic_clockevent = {
489 .name = "lapic",
490 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
491 | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
492 .shift = 32,
493 .set_mode = lapic_timer_setup,
494 .set_next_event = lapic_next_event,
495 .broadcast = lapic_timer_broadcast,
496 .rating = 100,
497 .irq = -1,
498};
499static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
500
462/* 501/*
463 * Setup the local APIC timer for this CPU. Copy the initialized values 502 * Setup the local APIC timer for this CPU. Copy the initialized values
464 * of the boot CPU and register the clock event in the framework. 503 * of the boot CPU and register the clock event in the framework.
@@ -467,7 +506,7 @@ static void __cpuinit setup_APIC_timer(void)
467{ 506{
468 struct clock_event_device *levt = &__get_cpu_var(lapic_events); 507 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
469 508
470 if (cpu_has(&current_cpu_data, X86_FEATURE_ARAT)) { 509 if (this_cpu_has(X86_FEATURE_ARAT)) {
471 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP; 510 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP;
472 /* Make LAPIC timer preferrable over percpu HPET */ 511 /* Make LAPIC timer preferrable over percpu HPET */
473 lapic_clockevent.rating = 150; 512 lapic_clockevent.rating = 150;
@@ -635,7 +674,7 @@ static int __init calibrate_APIC_clock(void)
635 lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, 674 lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS,
636 lapic_clockevent.shift); 675 lapic_clockevent.shift);
637 lapic_clockevent.max_delta_ns = 676 lapic_clockevent.max_delta_ns =
638 clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); 677 clockevent_delta2ns(0x7FFFFFFF, &lapic_clockevent);
639 lapic_clockevent.min_delta_ns = 678 lapic_clockevent.min_delta_ns =
640 clockevent_delta2ns(0xF, &lapic_clockevent); 679 clockevent_delta2ns(0xF, &lapic_clockevent);
641 680
@@ -750,11 +789,7 @@ void __init setup_boot_APIC_clock(void)
750 * PIT/HPET going. Otherwise register lapic as a dummy 789 * PIT/HPET going. Otherwise register lapic as a dummy
751 * device. 790 * device.
752 */ 791 */
753 if (nmi_watchdog != NMI_IO_APIC) 792 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
754 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
755 else
756 pr_warning("APIC timer registered as dummy,"
757 " due to nmi_watchdog=%d!\n", nmi_watchdog);
758 793
759 /* Setup the lapic or request the broadcast */ 794 /* Setup the lapic or request the broadcast */
760 setup_APIC_timer(); 795 setup_APIC_timer();
@@ -1146,12 +1181,15 @@ static void __cpuinit lapic_setup_esr(void)
1146 oldvalue, value); 1181 oldvalue, value);
1147} 1182}
1148 1183
1149
1150/** 1184/**
1151 * setup_local_APIC - setup the local APIC 1185 * setup_local_APIC - setup the local APIC
1186 *
1187 * Used to setup local APIC while initializing BSP or bringin up APs.
1188 * Always called with preemption disabled.
1152 */ 1189 */
1153void __cpuinit setup_local_APIC(void) 1190void __cpuinit setup_local_APIC(void)
1154{ 1191{
1192 int cpu = smp_processor_id();
1155 unsigned int value, queued; 1193 unsigned int value, queued;
1156 int i, j, acked = 0; 1194 int i, j, acked = 0;
1157 unsigned long long tsc = 0, ntsc; 1195 unsigned long long tsc = 0, ntsc;
@@ -1161,7 +1199,7 @@ void __cpuinit setup_local_APIC(void)
1161 rdtscll(tsc); 1199 rdtscll(tsc);
1162 1200
1163 if (disable_apic) { 1201 if (disable_apic) {
1164 arch_disable_smp_support(); 1202 disable_ioapic_support();
1165 return; 1203 return;
1166 } 1204 }
1167 1205
@@ -1176,8 +1214,6 @@ void __cpuinit setup_local_APIC(void)
1176#endif 1214#endif
1177 perf_events_lapic_init(); 1215 perf_events_lapic_init();
1178 1216
1179 preempt_disable();
1180
1181 /* 1217 /*
1182 * Double-check whether this APIC is really registered. 1218 * Double-check whether this APIC is really registered.
1183 * This is meaningless in clustered apic mode, so we skip it. 1219 * This is meaningless in clustered apic mode, so we skip it.
@@ -1191,6 +1227,30 @@ void __cpuinit setup_local_APIC(void)
1191 */ 1227 */
1192 apic->init_apic_ldr(); 1228 apic->init_apic_ldr();
1193 1229
1230#ifdef CONFIG_X86_32
1231 /*
1232 * APIC LDR is initialized. If logical_apicid mapping was
1233 * initialized during get_smp_config(), make sure it matches the
1234 * actual value.
1235 */
1236 i = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
1237 WARN_ON(i != BAD_APICID && i != logical_smp_processor_id());
1238 /* always use the value from LDR */
1239 early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
1240 logical_smp_processor_id();
1241
1242 /*
1243 * Some NUMA implementations (NUMAQ) don't initialize apicid to
1244 * node mapping during NUMA init. Now that logical apicid is
1245 * guaranteed to be known, give it another chance. This is already
1246 * a bit too late - percpu allocation has already happened without
1247 * proper NUMA affinity.
1248 */
1249 if (apic->x86_32_numa_cpu_node)
1250 set_apicid_to_node(early_per_cpu(x86_cpu_to_apicid, cpu),
1251 apic->x86_32_numa_cpu_node(cpu));
1252#endif
1253
1194 /* 1254 /*
1195 * Set Task Priority to 'accept all'. We never change this 1255 * Set Task Priority to 'accept all'. We never change this
1196 * later on. 1256 * later on.
@@ -1293,21 +1353,19 @@ void __cpuinit setup_local_APIC(void)
1293 * TODO: set up through-local-APIC from through-I/O-APIC? --macro 1353 * TODO: set up through-local-APIC from through-I/O-APIC? --macro
1294 */ 1354 */
1295 value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; 1355 value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
1296 if (!smp_processor_id() && (pic_mode || !value)) { 1356 if (!cpu && (pic_mode || !value)) {
1297 value = APIC_DM_EXTINT; 1357 value = APIC_DM_EXTINT;
1298 apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", 1358 apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", cpu);
1299 smp_processor_id());
1300 } else { 1359 } else {
1301 value = APIC_DM_EXTINT | APIC_LVT_MASKED; 1360 value = APIC_DM_EXTINT | APIC_LVT_MASKED;
1302 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", 1361 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", cpu);
1303 smp_processor_id());
1304 } 1362 }
1305 apic_write(APIC_LVT0, value); 1363 apic_write(APIC_LVT0, value);
1306 1364
1307 /* 1365 /*
1308 * only the BP should see the LINT1 NMI signal, obviously. 1366 * only the BP should see the LINT1 NMI signal, obviously.
1309 */ 1367 */
1310 if (!smp_processor_id()) 1368 if (!cpu)
1311 value = APIC_DM_NMI; 1369 value = APIC_DM_NMI;
1312 else 1370 else
1313 value = APIC_DM_NMI | APIC_LVT_MASKED; 1371 value = APIC_DM_NMI | APIC_LVT_MASKED;
@@ -1315,11 +1373,9 @@ void __cpuinit setup_local_APIC(void)
1315 value |= APIC_LVT_LEVEL_TRIGGER; 1373 value |= APIC_LVT_LEVEL_TRIGGER;
1316 apic_write(APIC_LVT1, value); 1374 apic_write(APIC_LVT1, value);
1317 1375
1318 preempt_enable();
1319
1320#ifdef CONFIG_X86_MCE_INTEL 1376#ifdef CONFIG_X86_MCE_INTEL
1321 /* Recheck CMCI information after local APIC is up on CPU #0 */ 1377 /* Recheck CMCI information after local APIC is up on CPU #0 */
1322 if (smp_processor_id() == 0) 1378 if (!cpu)
1323 cmci_recheck(); 1379 cmci_recheck();
1324#endif 1380#endif
1325} 1381}
@@ -1338,10 +1394,22 @@ void __cpuinit end_local_APIC_setup(void)
1338 } 1394 }
1339#endif 1395#endif
1340 1396
1341 setup_apic_nmi_watchdog(NULL);
1342 apic_pm_activate(); 1397 apic_pm_activate();
1343} 1398}
1344 1399
1400void __init bsp_end_local_APIC_setup(void)
1401{
1402 end_local_APIC_setup();
1403
1404 /*
1405 * Now that local APIC setup is completed for BP, configure the fault
1406 * handling for interrupt remapping.
1407 */
1408 if (intr_remapping_enabled)
1409 enable_drhd_fault_handling();
1410
1411}
1412
1345#ifdef CONFIG_X86_X2APIC 1413#ifdef CONFIG_X86_X2APIC
1346void check_x2apic(void) 1414void check_x2apic(void)
1347{ 1415{
@@ -1394,7 +1462,6 @@ int __init enable_IR(void)
1394void __init enable_IR_x2apic(void) 1462void __init enable_IR_x2apic(void)
1395{ 1463{
1396 unsigned long flags; 1464 unsigned long flags;
1397 struct IO_APIC_route_entry **ioapic_entries = NULL;
1398 int ret, x2apic_enabled = 0; 1465 int ret, x2apic_enabled = 0;
1399 int dmar_table_init_ret; 1466 int dmar_table_init_ret;
1400 1467
@@ -1402,13 +1469,7 @@ void __init enable_IR_x2apic(void)
1402 if (dmar_table_init_ret && !x2apic_supported()) 1469 if (dmar_table_init_ret && !x2apic_supported())
1403 return; 1470 return;
1404 1471
1405 ioapic_entries = alloc_ioapic_entries(); 1472 ret = save_ioapic_entries();
1406 if (!ioapic_entries) {
1407 pr_err("Allocate ioapic_entries failed\n");
1408 goto out;
1409 }
1410
1411 ret = save_IO_APIC_setup(ioapic_entries);
1412 if (ret) { 1473 if (ret) {
1413 pr_info("Saving IO-APIC state failed: %d\n", ret); 1474 pr_info("Saving IO-APIC state failed: %d\n", ret);
1414 goto out; 1475 goto out;
@@ -1416,7 +1477,7 @@ void __init enable_IR_x2apic(void)
1416 1477
1417 local_irq_save(flags); 1478 local_irq_save(flags);
1418 legacy_pic->mask_all(); 1479 legacy_pic->mask_all();
1419 mask_IO_APIC_setup(ioapic_entries); 1480 mask_ioapic_entries();
1420 1481
1421 if (dmar_table_init_ret) 1482 if (dmar_table_init_ret)
1422 ret = 0; 1483 ret = 0;
@@ -1427,7 +1488,8 @@ void __init enable_IR_x2apic(void)
1427 /* IR is required if there is APIC ID > 255 even when running 1488 /* IR is required if there is APIC ID > 255 even when running
1428 * under KVM 1489 * under KVM
1429 */ 1490 */
1430 if (max_physical_apicid > 255 || !kvm_para_available()) 1491 if (max_physical_apicid > 255 ||
1492 !hypervisor_x2apic_available())
1431 goto nox2apic; 1493 goto nox2apic;
1432 /* 1494 /*
1433 * without IR all CPUs can be addressed by IOAPIC/MSI 1495 * without IR all CPUs can be addressed by IOAPIC/MSI
@@ -1446,14 +1508,11 @@ void __init enable_IR_x2apic(void)
1446 1508
1447nox2apic: 1509nox2apic:
1448 if (!ret) /* IR enabling failed */ 1510 if (!ret) /* IR enabling failed */
1449 restore_IO_APIC_setup(ioapic_entries); 1511 restore_ioapic_entries();
1450 legacy_pic->restore_mask(); 1512 legacy_pic->restore_mask();
1451 local_irq_restore(flags); 1513 local_irq_restore(flags);
1452 1514
1453out: 1515out:
1454 if (ioapic_entries)
1455 free_ioapic_entries(ioapic_entries);
1456
1457 if (x2apic_enabled) 1516 if (x2apic_enabled)
1458 return; 1517 return;
1459 1518
@@ -1481,13 +1540,60 @@ static int __init detect_init_APIC(void)
1481 return 0; 1540 return 0;
1482} 1541}
1483#else 1542#else
1543
1544static int __init apic_verify(void)
1545{
1546 u32 features, h, l;
1547
1548 /*
1549 * The APIC feature bit should now be enabled
1550 * in `cpuid'
1551 */
1552 features = cpuid_edx(1);
1553 if (!(features & (1 << X86_FEATURE_APIC))) {
1554 pr_warning("Could not enable APIC!\n");
1555 return -1;
1556 }
1557 set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
1558 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
1559
1560 /* The BIOS may have set up the APIC at some other address */
1561 rdmsr(MSR_IA32_APICBASE, l, h);
1562 if (l & MSR_IA32_APICBASE_ENABLE)
1563 mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
1564
1565 pr_info("Found and enabled local APIC!\n");
1566 return 0;
1567}
1568
1569int __init apic_force_enable(unsigned long addr)
1570{
1571 u32 h, l;
1572
1573 if (disable_apic)
1574 return -1;
1575
1576 /*
1577 * Some BIOSes disable the local APIC in the APIC_BASE
1578 * MSR. This can only be done in software for Intel P6 or later
1579 * and AMD K7 (Model > 1) or later.
1580 */
1581 rdmsr(MSR_IA32_APICBASE, l, h);
1582 if (!(l & MSR_IA32_APICBASE_ENABLE)) {
1583 pr_info("Local APIC disabled by BIOS -- reenabling.\n");
1584 l &= ~MSR_IA32_APICBASE_BASE;
1585 l |= MSR_IA32_APICBASE_ENABLE | addr;
1586 wrmsr(MSR_IA32_APICBASE, l, h);
1587 enabled_via_apicbase = 1;
1588 }
1589 return apic_verify();
1590}
1591
1484/* 1592/*
1485 * Detect and initialize APIC 1593 * Detect and initialize APIC
1486 */ 1594 */
1487static int __init detect_init_APIC(void) 1595static int __init detect_init_APIC(void)
1488{ 1596{
1489 u32 h, l, features;
1490
1491 /* Disabled by kernel option? */ 1597 /* Disabled by kernel option? */
1492 if (disable_apic) 1598 if (disable_apic)
1493 return -1; 1599 return -1;
@@ -1517,38 +1623,12 @@ static int __init detect_init_APIC(void)
1517 "you can enable it with \"lapic\"\n"); 1623 "you can enable it with \"lapic\"\n");
1518 return -1; 1624 return -1;
1519 } 1625 }
1520 /* 1626 if (apic_force_enable(APIC_DEFAULT_PHYS_BASE))
1521 * Some BIOSes disable the local APIC in the APIC_BASE 1627 return -1;
1522 * MSR. This can only be done in software for Intel P6 or later 1628 } else {
1523 * and AMD K7 (Model > 1) or later. 1629 if (apic_verify())
1524 */ 1630 return -1;
1525 rdmsr(MSR_IA32_APICBASE, l, h);
1526 if (!(l & MSR_IA32_APICBASE_ENABLE)) {
1527 pr_info("Local APIC disabled by BIOS -- reenabling.\n");
1528 l &= ~MSR_IA32_APICBASE_BASE;
1529 l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
1530 wrmsr(MSR_IA32_APICBASE, l, h);
1531 enabled_via_apicbase = 1;
1532 }
1533 }
1534 /*
1535 * The APIC feature bit should now be enabled
1536 * in `cpuid'
1537 */
1538 features = cpuid_edx(1);
1539 if (!(features & (1 << X86_FEATURE_APIC))) {
1540 pr_warning("Could not enable APIC!\n");
1541 return -1;
1542 } 1631 }
1543 set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
1544 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
1545
1546 /* The BIOS may have set up the APIC at some other address */
1547 rdmsr(MSR_IA32_APICBASE, l, h);
1548 if (l & MSR_IA32_APICBASE_ENABLE)
1549 mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
1550
1551 pr_info("Found and enabled local APIC!\n");
1552 1632
1553 apic_pm_activate(); 1633 apic_pm_activate();
1554 1634
@@ -1560,28 +1640,6 @@ no_apic:
1560} 1640}
1561#endif 1641#endif
1562 1642
1563#ifdef CONFIG_X86_64
1564void __init early_init_lapic_mapping(void)
1565{
1566 /*
1567 * If no local APIC can be found then go out
1568 * : it means there is no mpatable and MADT
1569 */
1570 if (!smp_found_config)
1571 return;
1572
1573 set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
1574 apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
1575 APIC_BASE, mp_lapic_addr);
1576
1577 /*
1578 * Fetch the APIC ID of the BSP in case we have a
1579 * default configuration (or the MP table is broken).
1580 */
1581 boot_cpu_physical_apicid = read_apic_id();
1582}
1583#endif
1584
1585/** 1643/**
1586 * init_apic_mappings - initialize APIC mappings 1644 * init_apic_mappings - initialize APIC mappings
1587 */ 1645 */
@@ -1607,10 +1665,7 @@ void __init init_apic_mappings(void)
1607 * acpi_register_lapic_address() 1665 * acpi_register_lapic_address()
1608 */ 1666 */
1609 if (!acpi_lapic && !smp_found_config) 1667 if (!acpi_lapic && !smp_found_config)
1610 set_fixmap_nocache(FIX_APIC_BASE, apic_phys); 1668 register_lapic_address(apic_phys);
1611
1612 apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
1613 APIC_BASE, apic_phys);
1614 } 1669 }
1615 1670
1616 /* 1671 /*
@@ -1632,11 +1687,27 @@ void __init init_apic_mappings(void)
1632 } 1687 }
1633} 1688}
1634 1689
1690void __init register_lapic_address(unsigned long address)
1691{
1692 mp_lapic_addr = address;
1693
1694 if (!x2apic_mode) {
1695 set_fixmap_nocache(FIX_APIC_BASE, address);
1696 apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
1697 APIC_BASE, mp_lapic_addr);
1698 }
1699 if (boot_cpu_physical_apicid == -1U) {
1700 boot_cpu_physical_apicid = read_apic_id();
1701 apic_version[boot_cpu_physical_apicid] =
1702 GET_APIC_VERSION(apic_read(APIC_LVR));
1703 }
1704}
1705
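Editor's note: register_lapic_address() now centralises the fixmap mapping and BSP-id capture that early_init_lapic_mapping() and init_apic_mappings() used to open-code. A hedged sketch of a caller (the function name below is hypothetical; the real callers are the MP-table/ACPI parsers):

static void __init example_register_lapic(unsigned long lapic_phys)
{
	/* fall back to the architectural default, 0xfee00000 */
	if (!lapic_phys)
		lapic_phys = APIC_DEFAULT_PHYS_BASE;

	/* maps the LAPIC (unless x2apic is in use) and records the BSP's apic id */
	register_lapic_address(lapic_phys);
}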
1635/* 1706/*
1636 * This initializes the IO-APIC and APIC hardware if this is 1707 * This initializes the IO-APIC and APIC hardware if this is
1637 * a UP kernel. 1708 * a UP kernel.
1638 */ 1709 */
1639int apic_version[MAX_APICS]; 1710int apic_version[MAX_LOCAL_APIC];
1640 1711
1641int __init APIC_init_uniprocessor(void) 1712int __init APIC_init_uniprocessor(void)
1642{ 1713{
@@ -1665,10 +1736,7 @@ int __init APIC_init_uniprocessor(void)
1665 } 1736 }
1666#endif 1737#endif
1667 1738
1668#ifndef CONFIG_SMP
1669 enable_IR_x2apic();
1670 default_setup_apic_routing(); 1739 default_setup_apic_routing();
1671#endif
1672 1740
1673 verify_local_APIC(); 1741 verify_local_APIC();
1674 connect_bsp_APIC(); 1742 connect_bsp_APIC();
@@ -1697,24 +1765,17 @@ int __init APIC_init_uniprocessor(void)
1697 enable_IO_APIC(); 1765 enable_IO_APIC();
1698#endif 1766#endif
1699 1767
1700 end_local_APIC_setup(); 1768 bsp_end_local_APIC_setup();
1701 1769
1702#ifdef CONFIG_X86_IO_APIC 1770#ifdef CONFIG_X86_IO_APIC
1703 if (smp_found_config && !skip_ioapic_setup && nr_ioapics) 1771 if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
1704 setup_IO_APIC(); 1772 setup_IO_APIC();
1705 else { 1773 else {
1706 nr_ioapics = 0; 1774 nr_ioapics = 0;
1707 localise_nmi_watchdog();
1708 } 1775 }
1709#else
1710 localise_nmi_watchdog();
1711#endif 1776#endif
1712 1777
1713 x86_init.timers.setup_percpu_clockev(); 1778 x86_init.timers.setup_percpu_clockev();
1714#ifdef CONFIG_X86_64
1715 check_nmi_watchdog();
1716#endif
1717
1718 return 0; 1779 return 0;
1719} 1780}
1720 1781
@@ -1753,30 +1814,41 @@ void smp_spurious_interrupt(struct pt_regs *regs)
1753 */ 1814 */
1754void smp_error_interrupt(struct pt_regs *regs) 1815void smp_error_interrupt(struct pt_regs *regs)
1755{ 1816{
1756 u32 v, v1; 1817 u32 v0, v1;
1818 u32 i = 0;
1819 static const char * const error_interrupt_reason[] = {
1820 "Send CS error", /* APIC Error Bit 0 */
1821 "Receive CS error", /* APIC Error Bit 1 */
1822 "Send accept error", /* APIC Error Bit 2 */
1823 "Receive accept error", /* APIC Error Bit 3 */
1824 "Redirectable IPI", /* APIC Error Bit 4 */
1825 "Send illegal vector", /* APIC Error Bit 5 */
1826 "Received illegal vector", /* APIC Error Bit 6 */
1827 "Illegal register address", /* APIC Error Bit 7 */
1828 };
1757 1829
1758 exit_idle(); 1830 exit_idle();
1759 irq_enter(); 1831 irq_enter();
1760 /* First tickle the hardware, only then report what went on. -- REW */ 1832 /* First tickle the hardware, only then report what went on. -- REW */
1761 v = apic_read(APIC_ESR); 1833 v0 = apic_read(APIC_ESR);
1762 apic_write(APIC_ESR, 0); 1834 apic_write(APIC_ESR, 0);
1763 v1 = apic_read(APIC_ESR); 1835 v1 = apic_read(APIC_ESR);
1764 ack_APIC_irq(); 1836 ack_APIC_irq();
1765 atomic_inc(&irq_err_count); 1837 atomic_inc(&irq_err_count);
1766 1838
1767 /* 1839 apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x(%02x)",
1768 * Here is what the APIC error bits mean: 1840 smp_processor_id(), v0 , v1);
1769 * 0: Send CS error 1841
1770 * 1: Receive CS error 1842 v1 = v1 & 0xff;
1771 * 2: Send accept error 1843 while (v1) {
1772 * 3: Receive accept error 1844 if (v1 & 0x1)
1773 * 4: Reserved 1845 apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]);
1774 * 5: Send illegal vector 1846 i++;
1775 * 6: Received illegal vector 1847 v1 >>= 1;
1776 * 7: Illegal register address 1848 };
1777 */ 1849
1778 pr_debug("APIC error on CPU%d: %02x(%02x)\n", 1850 apic_printk(APIC_DEBUG, KERN_CONT "\n");
1779 smp_processor_id(), v , v1); 1851
1780 irq_exit(); 1852 irq_exit();
1781} 1853}
1782 1854
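Editor's note: the rewritten smp_error_interrupt() above walks the low byte of the ESR and prints a human-readable name for each set bit. A condensed, standalone sketch of that decode (helper name hypothetical):

#include <linux/kernel.h>

static void example_decode_esr(u32 esr)
{
	static const char * const reason[] = {
		"Send CS error", "Receive CS error",
		"Send accept error", "Receive accept error",
		"Redirectable IPI", "Send illegal vector",
		"Received illegal vector", "Illegal register address",
	};
	unsigned int bit;

	for (bit = 0; bit < 8; bit++)
		if (esr & (1u << bit))
			pr_cont(" : %s", reason[bit]);
	pr_cont("\n");
}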
@@ -1873,17 +1945,6 @@ void __cpuinit generic_processor_info(int apicid, int version)
1873{ 1945{
1874 int cpu; 1946 int cpu;
1875 1947
1876 /*
1877 * Validate version
1878 */
1879 if (version == 0x0) {
1880 pr_warning("BIOS bug, APIC version is 0 for CPU#%d! "
1881 "fixing up to 0x10. (tell your hw vendor)\n",
1882 version);
1883 version = 0x10;
1884 }
1885 apic_version[apicid] = version;
1886
1887 if (num_processors >= nr_cpu_ids) { 1948 if (num_processors >= nr_cpu_ids) {
1888 int max = nr_cpu_ids; 1949 int max = nr_cpu_ids;
1889 int thiscpu = max + disabled_cpus; 1950 int thiscpu = max + disabled_cpus;
@@ -1897,22 +1958,34 @@ void __cpuinit generic_processor_info(int apicid, int version)
1897 } 1958 }
1898 1959
1899 num_processors++; 1960 num_processors++;
1900 cpu = cpumask_next_zero(-1, cpu_present_mask);
1901
1902 if (version != apic_version[boot_cpu_physical_apicid])
1903 WARN_ONCE(1,
1904 "ACPI: apic version mismatch, bootcpu: %x cpu %d: %x\n",
1905 apic_version[boot_cpu_physical_apicid], cpu, version);
1906
1907 physid_set(apicid, phys_cpu_present_map);
1908 if (apicid == boot_cpu_physical_apicid) { 1961 if (apicid == boot_cpu_physical_apicid) {
1909 /* 1962 /*
1910 * x86_bios_cpu_apicid is required to have processors listed 1963 * x86_bios_cpu_apicid is required to have processors listed
1911 * in same order as logical cpu numbers. Hence the first 1964 * in same order as logical cpu numbers. Hence the first
1912 * entry is BSP, and so on. 1965 * entry is BSP, and so on.
 1966 * boot_cpu_init() already holds bit 0 in cpu_present_mask
1967 * for BSP.
1913 */ 1968 */
1914 cpu = 0; 1969 cpu = 0;
1970 } else
1971 cpu = cpumask_next_zero(-1, cpu_present_mask);
1972
1973 /*
1974 * Validate version
1975 */
1976 if (version == 0x0) {
1977 pr_warning("BIOS bug: APIC version is 0 for CPU %d/0x%x, fixing up to 0x10\n",
1978 cpu, apicid);
1979 version = 0x10;
1915 } 1980 }
1981 apic_version[apicid] = version;
1982
1983 if (version != apic_version[boot_cpu_physical_apicid]) {
1984 pr_warning("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n",
1985 apic_version[boot_cpu_physical_apicid], cpu, version);
1986 }
1987
1988 physid_set(apicid, phys_cpu_present_map);
1916 if (apicid > max_physical_apicid) 1989 if (apicid > max_physical_apicid)
1917 max_physical_apicid = apicid; 1990 max_physical_apicid = apicid;
1918 1991
@@ -1920,7 +1993,10 @@ void __cpuinit generic_processor_info(int apicid, int version)
1920 early_per_cpu(x86_cpu_to_apicid, cpu) = apicid; 1993 early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
1921 early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid; 1994 early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
1922#endif 1995#endif
1923 1996#ifdef CONFIG_X86_32
1997 early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
1998 apic->x86_32_early_logical_apicid(cpu);
1999#endif
1924 set_cpu_possible(cpu, true); 2000 set_cpu_possible(cpu, true);
1925 set_cpu_present(cpu, true); 2001 set_cpu_present(cpu, true);
1926} 2002}
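Editor's note: the reordered generic_processor_info() now picks the cpu number before validating the APIC version: the BSP keeps cpu 0 (boot_cpu_init() has already marked it present), everyone else takes the first free present-mask slot. A condensed sketch of just that selection:

static int example_pick_cpu_number(int apicid)
{
	if (apicid == boot_cpu_physical_apicid)
		return 0;	/* BSP: slot 0 is already reserved */

	return cpumask_next_zero(-1, cpu_present_mask);
}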
@@ -1940,17 +2016,6 @@ void default_init_apic_ldr(void)
1940 apic_write(APIC_LDR, val); 2016 apic_write(APIC_LDR, val);
1941} 2017}
1942 2018
1943#ifdef CONFIG_X86_32
1944int default_apicid_to_node(int logical_apicid)
1945{
1946#ifdef CONFIG_SMP
1947 return apicid_2_node[hard_smp_processor_id()];
1948#else
1949 return 0;
1950#endif
1951}
1952#endif
1953
1954/* 2019/*
1955 * Power management 2020 * Power management
1956 */ 2021 */
@@ -1979,7 +2044,7 @@ static struct {
1979 unsigned int apic_thmr; 2044 unsigned int apic_thmr;
1980} apic_pm_state; 2045} apic_pm_state;
1981 2046
1982static int lapic_suspend(struct sys_device *dev, pm_message_t state) 2047static int lapic_suspend(void)
1983{ 2048{
1984 unsigned long flags; 2049 unsigned long flags;
1985 int maxlvt; 2050 int maxlvt;
@@ -2017,34 +2082,24 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
2017 return 0; 2082 return 0;
2018} 2083}
2019 2084
2020static int lapic_resume(struct sys_device *dev) 2085static void lapic_resume(void)
2021{ 2086{
2022 unsigned int l, h; 2087 unsigned int l, h;
2023 unsigned long flags; 2088 unsigned long flags;
2024 int maxlvt; 2089 int maxlvt;
2025 int ret = 0;
2026 struct IO_APIC_route_entry **ioapic_entries = NULL;
2027 2090
2028 if (!apic_pm_state.active) 2091 if (!apic_pm_state.active)
2029 return 0; 2092 return;
2030 2093
2031 local_irq_save(flags); 2094 local_irq_save(flags);
2032 if (intr_remapping_enabled) { 2095 if (intr_remapping_enabled) {
2033 ioapic_entries = alloc_ioapic_entries(); 2096 /*
2034 if (!ioapic_entries) { 2097 * IO-APIC and PIC have their own resume routines.
2035 WARN(1, "Alloc ioapic_entries in lapic resume failed."); 2098 * We just mask them here to make sure the interrupt
2036 ret = -ENOMEM; 2099 * subsystem is completely quiet while we enable x2apic
2037 goto restore; 2100 * and interrupt-remapping.
2038 } 2101 */
2039 2102 mask_ioapic_entries();
2040 ret = save_IO_APIC_setup(ioapic_entries);
2041 if (ret) {
2042 WARN(1, "Saving IO-APIC state failed: %d\n", ret);
2043 free_ioapic_entries(ioapic_entries);
2044 goto restore;
2045 }
2046
2047 mask_IO_APIC_setup(ioapic_entries);
2048 legacy_pic->mask_all(); 2103 legacy_pic->mask_all();
2049 } 2104 }
2050 2105
@@ -2087,16 +2142,10 @@ static int lapic_resume(struct sys_device *dev)
2087 apic_write(APIC_ESR, 0); 2142 apic_write(APIC_ESR, 0);
2088 apic_read(APIC_ESR); 2143 apic_read(APIC_ESR);
2089 2144
2090 if (intr_remapping_enabled) { 2145 if (intr_remapping_enabled)
2091 reenable_intr_remapping(x2apic_mode); 2146 reenable_intr_remapping(x2apic_mode);
2092 legacy_pic->restore_mask();
2093 restore_IO_APIC_setup(ioapic_entries);
2094 free_ioapic_entries(ioapic_entries);
2095 }
2096restore:
2097 local_irq_restore(flags);
2098 2147
2099 return ret; 2148 local_irq_restore(flags);
2100} 2149}
2101 2150
2102/* 2151/*
@@ -2104,17 +2153,11 @@ restore:
2104 * are needed on every CPU up until machine_halt/restart/poweroff. 2153 * are needed on every CPU up until machine_halt/restart/poweroff.
2105 */ 2154 */
2106 2155
2107static struct sysdev_class lapic_sysclass = { 2156static struct syscore_ops lapic_syscore_ops = {
2108 .name = "lapic",
2109 .resume = lapic_resume, 2157 .resume = lapic_resume,
2110 .suspend = lapic_suspend, 2158 .suspend = lapic_suspend,
2111}; 2159};
2112 2160
2113static struct sys_device device_lapic = {
2114 .id = 0,
2115 .cls = &lapic_sysclass,
2116};
2117
2118static void __cpuinit apic_pm_activate(void) 2161static void __cpuinit apic_pm_activate(void)
2119{ 2162{
2120 apic_pm_state.active = 1; 2163 apic_pm_state.active = 1;
@@ -2122,16 +2165,11 @@ static void __cpuinit apic_pm_activate(void)
2122 2165
2123static int __init init_lapic_sysfs(void) 2166static int __init init_lapic_sysfs(void)
2124{ 2167{
2125 int error;
2126
2127 if (!cpu_has_apic)
2128 return 0;
2129 /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ 2168 /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
2169 if (cpu_has_apic)
2170 register_syscore_ops(&lapic_syscore_ops);
2130 2171
2131 error = sysdev_class_register(&lapic_sysclass); 2172 return 0;
2132 if (!error)
2133 error = sysdev_register(&device_lapic);
2134 return error;
2135} 2173}
2136 2174
2137/* local apic needs to resume before other devices access its registers. */ 2175/* local apic needs to resume before other devices access its registers. */
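Editor's note: the suspend/resume path above moves from the old sysdev class to syscore_ops. A minimal sketch of that pattern, assuming nothing beyond <linux/syscore_ops.h>: suspend returns an int (non-zero aborts suspend), resume returns void, and both run with interrupts disabled on the boot CPU:

#include <linux/init.h>
#include <linux/syscore_ops.h>

static int example_suspend(void)
{
	/* save hardware state here */
	return 0;
}

static void example_resume(void)
{
	/* restore hardware state here; cannot fail */
}

static struct syscore_ops example_syscore_ops = {
	.suspend = example_suspend,
	.resume  = example_resume,
};

static int __init example_init(void)
{
	register_syscore_ops(&example_syscore_ops);
	return 0;
}
device_initcall(example_init);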
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 09d3b17ce0c2..f7a41e4cae47 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -16,6 +16,7 @@
16#include <linux/ctype.h> 16#include <linux/ctype.h>
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/hardirq.h> 18#include <linux/hardirq.h>
19#include <linux/module.h>
19#include <asm/smp.h> 20#include <asm/smp.h>
20#include <asm/apic.h> 21#include <asm/apic.h>
21#include <asm/ipi.h> 22#include <asm/ipi.h>
@@ -24,6 +25,12 @@
24#include <acpi/acpi_bus.h> 25#include <acpi/acpi_bus.h>
25#endif 26#endif
26 27
28static struct apic apic_physflat;
29static struct apic apic_flat;
30
31struct apic __read_mostly *apic = &apic_flat;
32EXPORT_SYMBOL_GPL(apic);
33
27static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 34static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
28{ 35{
29 return 1; 36 return 1;
@@ -164,7 +171,7 @@ static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
164 return initial_apic_id >> index_msb; 171 return initial_apic_id >> index_msb;
165} 172}
166 173
167struct apic apic_flat = { 174static struct apic apic_flat = {
168 .name = "flat", 175 .name = "flat",
169 .probe = NULL, 176 .probe = NULL,
170 .acpi_madt_oem_check = flat_acpi_madt_oem_check, 177 .acpi_madt_oem_check = flat_acpi_madt_oem_check,
@@ -185,8 +192,6 @@ struct apic apic_flat = {
185 .ioapic_phys_id_map = NULL, 192 .ioapic_phys_id_map = NULL,
186 .setup_apic_routing = NULL, 193 .setup_apic_routing = NULL,
187 .multi_timer_check = NULL, 194 .multi_timer_check = NULL,
188 .apicid_to_node = NULL,
189 .cpu_to_logical_apicid = NULL,
190 .cpu_present_to_apicid = default_cpu_present_to_apicid, 195 .cpu_present_to_apicid = default_cpu_present_to_apicid,
191 .apicid_to_cpu_present = NULL, 196 .apicid_to_cpu_present = NULL,
192 .setup_portio_remap = NULL, 197 .setup_portio_remap = NULL,
@@ -314,10 +319,18 @@ physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
314 return per_cpu(x86_cpu_to_apicid, cpu); 319 return per_cpu(x86_cpu_to_apicid, cpu);
315} 320}
316 321
317struct apic apic_physflat = { 322static int physflat_probe(void)
323{
324 if (apic == &apic_physflat || num_possible_cpus() > 8)
325 return 1;
326
327 return 0;
328}
329
330static struct apic apic_physflat = {
318 331
319 .name = "physical flat", 332 .name = "physical flat",
320 .probe = NULL, 333 .probe = physflat_probe,
321 .acpi_madt_oem_check = physflat_acpi_madt_oem_check, 334 .acpi_madt_oem_check = physflat_acpi_madt_oem_check,
322 .apic_id_registered = flat_apic_id_registered, 335 .apic_id_registered = flat_apic_id_registered,
323 336
@@ -337,8 +350,6 @@ struct apic apic_physflat = {
337 .ioapic_phys_id_map = NULL, 350 .ioapic_phys_id_map = NULL,
338 .setup_apic_routing = NULL, 351 .setup_apic_routing = NULL,
339 .multi_timer_check = NULL, 352 .multi_timer_check = NULL,
340 .apicid_to_node = NULL,
341 .cpu_to_logical_apicid = NULL,
342 .cpu_present_to_apicid = default_cpu_present_to_apicid, 353 .cpu_present_to_apicid = default_cpu_present_to_apicid,
343 .apicid_to_cpu_present = NULL, 354 .apicid_to_cpu_present = NULL,
344 .setup_portio_remap = NULL, 355 .setup_portio_remap = NULL,
@@ -373,3 +384,8 @@ struct apic apic_physflat = {
373 .wait_icr_idle = native_apic_wait_icr_idle, 384 .wait_icr_idle = native_apic_wait_icr_idle,
374 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 385 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
375}; 386};
387
388/*
389 * We need to check for physflat first, so this order is important.
390 */
391apic_drivers(apic_physflat, apic_flat);
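Editor's note: apic_flat and apic_physflat are no longer exported globals; they are registered through apic_drivers(), and physflat_probe() claims the machine when more than 8 CPUs are possible. A rough sketch of how a probe loop could walk such a driver table -- the section symbols below (__apicdrivers/__apicdrivers_end) are an assumption, shown only to illustrate why listing apic_physflat first matters (the first successful ->probe() wins):

extern struct apic *__apicdrivers[], *__apicdrivers_end[];

static struct apic * __init example_probe_apic(void)
{
	struct apic **drv;

	for (drv = __apicdrivers; drv < __apicdrivers_end; drv++)
		if ((*drv)->probe && (*drv)->probe())
			return *drv;	/* first match wins */

	return NULL;
}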
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index e31b9ffe25f5..775b82bc655c 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -54,11 +54,6 @@ static u64 noop_apic_icr_read(void)
54 return 0; 54 return 0;
55} 55}
56 56
57static int noop_cpu_to_logical_apicid(int cpu)
58{
59 return 0;
60}
61
62static int noop_phys_pkg_id(int cpuid_apic, int index_msb) 57static int noop_phys_pkg_id(int cpuid_apic, int index_msb)
63{ 58{
64 return 0; 59 return 0;
@@ -113,12 +108,6 @@ static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask)
113 cpumask_set_cpu(cpu, retmask); 108 cpumask_set_cpu(cpu, retmask);
114} 109}
115 110
116int noop_apicid_to_node(int logical_apicid)
117{
118 /* we're always on node 0 */
119 return 0;
120}
121
122static u32 noop_apic_read(u32 reg) 111static u32 noop_apic_read(u32 reg)
123{ 112{
124 WARN_ON_ONCE((cpu_has_apic && !disable_apic)); 113 WARN_ON_ONCE((cpu_has_apic && !disable_apic));
@@ -153,9 +142,7 @@ struct apic apic_noop = {
153 .ioapic_phys_id_map = default_ioapic_phys_id_map, 142 .ioapic_phys_id_map = default_ioapic_phys_id_map,
154 .setup_apic_routing = NULL, 143 .setup_apic_routing = NULL,
155 .multi_timer_check = NULL, 144 .multi_timer_check = NULL,
156 .apicid_to_node = noop_apicid_to_node,
157 145
158 .cpu_to_logical_apicid = noop_cpu_to_logical_apicid,
159 .cpu_present_to_apicid = default_cpu_present_to_apicid, 146 .cpu_present_to_apicid = default_cpu_present_to_apicid,
160 .apicid_to_cpu_present = physid_set_mask_of_physid, 147 .apicid_to_cpu_present = physid_set_mask_of_physid,
161 148
@@ -197,4 +184,8 @@ struct apic apic_noop = {
197 .icr_write = noop_apic_icr_write, 184 .icr_write = noop_apic_icr_write,
198 .wait_icr_idle = noop_apic_wait_icr_idle, 185 .wait_icr_idle = noop_apic_wait_icr_idle,
199 .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle, 186 .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle,
187
188#ifdef CONFIG_X86_32
189 .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid,
190#endif
200}; 191};
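Editor's note: each 32-bit apic driver now supplies an ->x86_32_early_logical_apicid() callback in place of apicid_to_node/cpu_to_logical_apicid; generic_processor_info() (see the apic.c hunk earlier) caches its result in the early per-cpu map. A small sketch of how that cached value is read back later (headers omitted; early_per_cpu() and the map come from the arch headers):

static int example_logical_apicid(int cpu)
{
	if (cpu >= nr_cpu_ids)
		return BAD_APICID;

	return early_per_cpu(x86_cpu_to_logical_apicid, cpu);
}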
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index cb804c5091b9..efd737e827f4 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -45,6 +45,12 @@ static unsigned long bigsmp_check_apicid_present(int bit)
45 return 1; 45 return 1;
46} 46}
47 47
48static int bigsmp_early_logical_apicid(int cpu)
49{
50 /* on bigsmp, logical apicid is the same as physical */
51 return early_per_cpu(x86_cpu_to_apicid, cpu);
52}
53
48static inline unsigned long calculate_ldr(int cpu) 54static inline unsigned long calculate_ldr(int cpu)
49{ 55{
50 unsigned long val, id; 56 unsigned long val, id;
@@ -80,11 +86,6 @@ static void bigsmp_setup_apic_routing(void)
80 nr_ioapics); 86 nr_ioapics);
81} 87}
82 88
83static int bigsmp_apicid_to_node(int logical_apicid)
84{
85 return apicid_2_node[hard_smp_processor_id()];
86}
87
88static int bigsmp_cpu_present_to_apicid(int mps_cpu) 89static int bigsmp_cpu_present_to_apicid(int mps_cpu)
89{ 90{
90 if (mps_cpu < nr_cpu_ids) 91 if (mps_cpu < nr_cpu_ids)
@@ -93,14 +94,6 @@ static int bigsmp_cpu_present_to_apicid(int mps_cpu)
93 return BAD_APICID; 94 return BAD_APICID;
94} 95}
95 96
96/* Mapping from cpu number to logical apicid */
97static inline int bigsmp_cpu_to_logical_apicid(int cpu)
98{
99 if (cpu >= nr_cpu_ids)
100 return BAD_APICID;
101 return cpu_physical_id(cpu);
102}
103
104static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) 97static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
105{ 98{
106 /* For clustered we don't have a good way to do this yet - hack */ 99 /* For clustered we don't have a good way to do this yet - hack */
@@ -115,7 +108,11 @@ static int bigsmp_check_phys_apicid_present(int phys_apicid)
115/* As we are using single CPU as destination, pick only one CPU here */ 108/* As we are using single CPU as destination, pick only one CPU here */
116static unsigned int bigsmp_cpu_mask_to_apicid(const struct cpumask *cpumask) 109static unsigned int bigsmp_cpu_mask_to_apicid(const struct cpumask *cpumask)
117{ 110{
118 return bigsmp_cpu_to_logical_apicid(cpumask_first(cpumask)); 111 int cpu = cpumask_first(cpumask);
112
113 if (cpu < nr_cpu_ids)
114 return cpu_physical_id(cpu);
115 return BAD_APICID;
119} 116}
120 117
121static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask, 118static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
@@ -129,9 +126,9 @@ static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
129 */ 126 */
130 for_each_cpu_and(cpu, cpumask, andmask) { 127 for_each_cpu_and(cpu, cpumask, andmask) {
131 if (cpumask_test_cpu(cpu, cpu_online_mask)) 128 if (cpumask_test_cpu(cpu, cpu_online_mask))
132 break; 129 return cpu_physical_id(cpu);
133 } 130 }
134 return bigsmp_cpu_to_logical_apicid(cpu); 131 return BAD_APICID;
135} 132}
136 133
137static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) 134static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
@@ -196,7 +193,7 @@ static int probe_bigsmp(void)
196 return dmi_bigsmp; 193 return dmi_bigsmp;
197} 194}
198 195
199struct apic apic_bigsmp = { 196static struct apic apic_bigsmp = {
200 197
201 .name = "bigsmp", 198 .name = "bigsmp",
202 .probe = probe_bigsmp, 199 .probe = probe_bigsmp,
@@ -219,8 +216,6 @@ struct apic apic_bigsmp = {
219 .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map, 216 .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map,
220 .setup_apic_routing = bigsmp_setup_apic_routing, 217 .setup_apic_routing = bigsmp_setup_apic_routing,
221 .multi_timer_check = NULL, 218 .multi_timer_check = NULL,
222 .apicid_to_node = bigsmp_apicid_to_node,
223 .cpu_to_logical_apicid = bigsmp_cpu_to_logical_apicid,
224 .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid, 219 .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid,
225 .apicid_to_cpu_present = physid_set_mask_of_physid, 220 .apicid_to_cpu_present = physid_set_mask_of_physid,
226 .setup_portio_remap = NULL, 221 .setup_portio_remap = NULL,
@@ -256,4 +251,16 @@ struct apic apic_bigsmp = {
256 .icr_write = native_apic_icr_write, 251 .icr_write = native_apic_icr_write,
257 .wait_icr_idle = native_apic_wait_icr_idle, 252 .wait_icr_idle = native_apic_wait_icr_idle,
258 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 253 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
254
255 .x86_32_early_logical_apicid = bigsmp_early_logical_apicid,
259}; 256};
257
258struct apic * __init generic_bigsmp_probe(void)
259{
260 if (probe_bigsmp())
261 return &apic_bigsmp;
262
263 return NULL;
264}
265
266apic_driver(apic_bigsmp);
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 8593582d8022..9536b3fe43f8 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -460,6 +460,12 @@ static unsigned long es7000_check_apicid_present(int bit)
460 return physid_isset(bit, phys_cpu_present_map); 460 return physid_isset(bit, phys_cpu_present_map);
461} 461}
462 462
463static int es7000_early_logical_apicid(int cpu)
464{
465 /* on es7000, logical apicid is the same as physical */
466 return early_per_cpu(x86_bios_cpu_apicid, cpu);
467}
468
463static unsigned long calculate_ldr(int cpu) 469static unsigned long calculate_ldr(int cpu)
464{ 470{
465 unsigned long id = per_cpu(x86_bios_cpu_apicid, cpu); 471 unsigned long id = per_cpu(x86_bios_cpu_apicid, cpu);
@@ -504,12 +510,6 @@ static void es7000_setup_apic_routing(void)
504 nr_ioapics, cpumask_bits(es7000_target_cpus())[0]); 510 nr_ioapics, cpumask_bits(es7000_target_cpus())[0]);
505} 511}
506 512
507static int es7000_apicid_to_node(int logical_apicid)
508{
509 return 0;
510}
511
512
513static int es7000_cpu_present_to_apicid(int mps_cpu) 513static int es7000_cpu_present_to_apicid(int mps_cpu)
514{ 514{
515 if (!mps_cpu) 515 if (!mps_cpu)
@@ -528,18 +528,6 @@ static void es7000_apicid_to_cpu_present(int phys_apicid, physid_mask_t *retmap)
528 ++cpu_id; 528 ++cpu_id;
529} 529}
530 530
531/* Mapping from cpu number to logical apicid */
532static int es7000_cpu_to_logical_apicid(int cpu)
533{
534#ifdef CONFIG_SMP
535 if (cpu >= nr_cpu_ids)
536 return BAD_APICID;
537 return cpu_2_logical_apicid[cpu];
538#else
539 return logical_smp_processor_id();
540#endif
541}
542
543static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) 531static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
544{ 532{
545 /* For clustered we don't have a good way to do this yet - hack */ 533 /* For clustered we don't have a good way to do this yet - hack */
@@ -561,7 +549,7 @@ static unsigned int es7000_cpu_mask_to_apicid(const struct cpumask *cpumask)
561 * The cpus in the mask must all be on the apic cluster. 549 * The cpus in the mask must all be on the apic cluster.
562 */ 550 */
563 for_each_cpu(cpu, cpumask) { 551 for_each_cpu(cpu, cpumask) {
564 int new_apicid = es7000_cpu_to_logical_apicid(cpu); 552 int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
565 553
566 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { 554 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
567 WARN(1, "Not a valid mask!"); 555 WARN(1, "Not a valid mask!");
@@ -578,7 +566,7 @@ static unsigned int
578es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask, 566es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask,
579 const struct cpumask *andmask) 567 const struct cpumask *andmask)
580{ 568{
581 int apicid = es7000_cpu_to_logical_apicid(0); 569 int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
582 cpumask_var_t cpumask; 570 cpumask_var_t cpumask;
583 571
584 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) 572 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
@@ -632,7 +620,7 @@ static int es7000_mps_oem_check_cluster(struct mpc_table *mpc, char *oem,
632} 620}
633 621
 634/* We've been warned by a false positive warning. Use __refdata to keep calm. */ 622/* We've been warned by a false positive warning. Use __refdata to keep calm. */
635struct apic __refdata apic_es7000_cluster = { 623static struct apic __refdata apic_es7000_cluster = {
636 624
637 .name = "es7000", 625 .name = "es7000",
638 .probe = probe_es7000, 626 .probe = probe_es7000,
@@ -655,8 +643,6 @@ struct apic __refdata apic_es7000_cluster = {
655 .ioapic_phys_id_map = es7000_ioapic_phys_id_map, 643 .ioapic_phys_id_map = es7000_ioapic_phys_id_map,
656 .setup_apic_routing = es7000_setup_apic_routing, 644 .setup_apic_routing = es7000_setup_apic_routing,
657 .multi_timer_check = NULL, 645 .multi_timer_check = NULL,
658 .apicid_to_node = es7000_apicid_to_node,
659 .cpu_to_logical_apicid = es7000_cpu_to_logical_apicid,
660 .cpu_present_to_apicid = es7000_cpu_present_to_apicid, 646 .cpu_present_to_apicid = es7000_cpu_present_to_apicid,
661 .apicid_to_cpu_present = es7000_apicid_to_cpu_present, 647 .apicid_to_cpu_present = es7000_apicid_to_cpu_present,
662 .setup_portio_remap = NULL, 648 .setup_portio_remap = NULL,
@@ -695,9 +681,11 @@ struct apic __refdata apic_es7000_cluster = {
695 .icr_write = native_apic_icr_write, 681 .icr_write = native_apic_icr_write,
696 .wait_icr_idle = native_apic_wait_icr_idle, 682 .wait_icr_idle = native_apic_wait_icr_idle,
697 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 683 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
684
685 .x86_32_early_logical_apicid = es7000_early_logical_apicid,
698}; 686};
699 687
700struct apic __refdata apic_es7000 = { 688static struct apic __refdata apic_es7000 = {
701 689
702 .name = "es7000", 690 .name = "es7000",
703 .probe = probe_es7000, 691 .probe = probe_es7000,
@@ -720,8 +708,6 @@ struct apic __refdata apic_es7000 = {
720 .ioapic_phys_id_map = es7000_ioapic_phys_id_map, 708 .ioapic_phys_id_map = es7000_ioapic_phys_id_map,
721 .setup_apic_routing = es7000_setup_apic_routing, 709 .setup_apic_routing = es7000_setup_apic_routing,
722 .multi_timer_check = NULL, 710 .multi_timer_check = NULL,
723 .apicid_to_node = es7000_apicid_to_node,
724 .cpu_to_logical_apicid = es7000_cpu_to_logical_apicid,
725 .cpu_present_to_apicid = es7000_cpu_present_to_apicid, 711 .cpu_present_to_apicid = es7000_cpu_present_to_apicid,
726 .apicid_to_cpu_present = es7000_apicid_to_cpu_present, 712 .apicid_to_cpu_present = es7000_apicid_to_cpu_present,
727 .setup_portio_remap = NULL, 713 .setup_portio_remap = NULL,
@@ -758,4 +744,12 @@ struct apic __refdata apic_es7000 = {
758 .icr_write = native_apic_icr_write, 744 .icr_write = native_apic_icr_write,
759 .wait_icr_idle = native_apic_wait_icr_idle, 745 .wait_icr_idle = native_apic_wait_icr_idle,
760 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 746 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
747
748 .x86_32_early_logical_apicid = es7000_early_logical_apicid,
761}; 749};
750
751/*
752 * Need to check for es7000 followed by es7000_cluster, so this order
753 * in apic_drivers is important.
754 */
755apic_drivers(apic_es7000, apic_es7000_cluster);
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index cefd6942f0e9..d5e57db0f7be 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -16,20 +16,33 @@
16#include <linux/kprobes.h> 16#include <linux/kprobes.h>
17#include <linux/nmi.h> 17#include <linux/nmi.h>
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/delay.h>
19 20
21#ifdef CONFIG_HARDLOCKUP_DETECTOR
22u64 hw_nmi_get_sample_period(int watchdog_thresh)
23{
24 return (u64)(cpu_khz) * 1000 * watchdog_thresh;
25}
26#endif
27
28#ifdef arch_trigger_all_cpu_backtrace
20/* For reliability, we're prepared to waste bits here. */ 29/* For reliability, we're prepared to waste bits here. */
21static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; 30static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
22 31
23u64 hw_nmi_get_sample_period(void) 32/* "in progress" flag of arch_trigger_all_cpu_backtrace */
24{ 33static unsigned long backtrace_flag;
25 return (u64)(cpu_khz) * 1000 * 60;
26}
27 34
28#ifdef ARCH_HAS_NMI_WATCHDOG
29void arch_trigger_all_cpu_backtrace(void) 35void arch_trigger_all_cpu_backtrace(void)
30{ 36{
31 int i; 37 int i;
32 38
39 if (test_and_set_bit(0, &backtrace_flag))
40 /*
41 * If there is already a trigger_all_cpu_backtrace() in progress
42 * (backtrace_flag == 1), don't output double cpu dump infos.
43 */
44 return;
45
33 cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); 46 cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
34 47
35 printk(KERN_INFO "sending NMI to all CPUs:\n"); 48 printk(KERN_INFO "sending NMI to all CPUs:\n");
@@ -41,6 +54,9 @@ void arch_trigger_all_cpu_backtrace(void)
41 break; 54 break;
42 mdelay(1); 55 mdelay(1);
43 } 56 }
57
58 clear_bit(0, &backtrace_flag);
59 smp_mb__after_clear_bit();
44} 60}
45 61
46static int __kprobes 62static int __kprobes
@@ -49,11 +65,10 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
49{ 65{
50 struct die_args *args = __args; 66 struct die_args *args = __args;
51 struct pt_regs *regs; 67 struct pt_regs *regs;
52 int cpu = smp_processor_id(); 68 int cpu;
53 69
54 switch (cmd) { 70 switch (cmd) {
55 case DIE_NMI: 71 case DIE_NMI:
56 case DIE_NMI_IPI:
57 break; 72 break;
58 73
59 default: 74 default:
@@ -61,6 +76,7 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
61 } 76 }
62 77
63 regs = args->regs; 78 regs = args->regs;
79 cpu = smp_processor_id();
64 80
65 if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { 81 if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
66 static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED; 82 static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED;
@@ -68,7 +84,6 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
68 arch_spin_lock(&lock); 84 arch_spin_lock(&lock);
69 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); 85 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
70 show_regs(regs); 86 show_regs(regs);
71 dump_stack();
72 arch_spin_unlock(&lock); 87 arch_spin_unlock(&lock);
73 cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); 88 cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
74 return NOTIFY_STOP; 89 return NOTIFY_STOP;
@@ -80,7 +95,7 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
80static __read_mostly struct notifier_block backtrace_notifier = { 95static __read_mostly struct notifier_block backtrace_notifier = {
81 .notifier_call = arch_trigger_all_cpu_backtrace_handler, 96 .notifier_call = arch_trigger_all_cpu_backtrace_handler,
82 .next = NULL, 97 .next = NULL,
83 .priority = 1 98 .priority = NMI_LOCAL_LOW_PRIOR,
84}; 99};
85 100
86static int __init register_trigger_all_cpu_backtrace(void) 101static int __init register_trigger_all_cpu_backtrace(void)
@@ -90,18 +105,3 @@ static int __init register_trigger_all_cpu_backtrace(void)
90} 105}
91early_initcall(register_trigger_all_cpu_backtrace); 106early_initcall(register_trigger_all_cpu_backtrace);
92#endif 107#endif
93
94/* STUB calls to mimic old nmi_watchdog behaviour */
95#if defined(CONFIG_X86_LOCAL_APIC)
96unsigned int nmi_watchdog = NMI_NONE;
97EXPORT_SYMBOL(nmi_watchdog);
98void acpi_nmi_enable(void) { return; }
99void acpi_nmi_disable(void) { return; }
100#endif
101atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
102EXPORT_SYMBOL(nmi_active);
103int unknown_nmi_panic;
104void cpu_nmi_set_wd_enabled(void) { return; }
105void stop_apic_nmi_watchdog(void *unused) { return; }
106void setup_apic_nmi_watchdog(void *unused) { return; }
107int __init check_nmi_watchdog(void) { return 0; }
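Editor's note: besides dropping the old nmi_watchdog stubs, hw_nmi.c now guards arch_trigger_all_cpu_backtrace() with a single "in progress" bit so concurrent callers do not interleave dumps. A minimal sketch of that guard pattern:

#include <linux/bitops.h>

static unsigned long example_flag;

static void example_run_exclusively(void)
{
	if (test_and_set_bit(0, &example_flag))
		return;			/* someone else is already dumping */

	/* ... send the NMIs and wait for the backtrace mask to drain ... */

	clear_bit(0, &example_flag);
	smp_mb__after_clear_bit();	/* order the clear after the work above */
}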
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 5c5b8f3dddb5..e5293394b548 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -30,7 +30,7 @@
30#include <linux/compiler.h> 30#include <linux/compiler.h>
31#include <linux/acpi.h> 31#include <linux/acpi.h>
32#include <linux/module.h> 32#include <linux/module.h>
33#include <linux/sysdev.h> 33#include <linux/syscore_ops.h>
34#include <linux/msi.h> 34#include <linux/msi.h>
35#include <linux/htirq.h> 35#include <linux/htirq.h>
36#include <linux/freezer.h> 36#include <linux/freezer.h>
@@ -54,7 +54,6 @@
54#include <asm/dma.h> 54#include <asm/dma.h>
55#include <asm/timer.h> 55#include <asm/timer.h>
56#include <asm/i8259.h> 56#include <asm/i8259.h>
57#include <asm/nmi.h>
58#include <asm/msidef.h> 57#include <asm/msidef.h>
59#include <asm/hypertransport.h> 58#include <asm/hypertransport.h>
60#include <asm/setup.h> 59#include <asm/setup.h>
@@ -77,17 +76,40 @@ int sis_apic_bug = -1;
77static DEFINE_RAW_SPINLOCK(ioapic_lock); 76static DEFINE_RAW_SPINLOCK(ioapic_lock);
78static DEFINE_RAW_SPINLOCK(vector_lock); 77static DEFINE_RAW_SPINLOCK(vector_lock);
79 78
80/* 79static struct ioapic {
81 * # of IRQ routing registers 80 /*
82 */ 81 * # of IRQ routing registers
83int nr_ioapic_registers[MAX_IO_APICS]; 82 */
83 int nr_registers;
84 /*
85 * Saved state during suspend/resume, or while enabling intr-remap.
86 */
87 struct IO_APIC_route_entry *saved_registers;
88 /* I/O APIC config */
89 struct mpc_ioapic mp_config;
90 /* IO APIC gsi routing info */
91 struct mp_ioapic_gsi gsi_config;
92 DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
93} ioapics[MAX_IO_APICS];
84 94
85/* I/O APIC entries */ 95#define mpc_ioapic_ver(id) ioapics[id].mp_config.apicver
86struct mpc_ioapic mp_ioapics[MAX_IO_APICS];
87int nr_ioapics;
88 96
89/* IO APIC gsi routing info */ 97int mpc_ioapic_id(int id)
90struct mp_ioapic_gsi mp_gsi_routing[MAX_IO_APICS]; 98{
99 return ioapics[id].mp_config.apicid;
100}
101
102unsigned int mpc_ioapic_addr(int id)
103{
104 return ioapics[id].mp_config.apicaddr;
105}
106
107struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int id)
108{
109 return &ioapics[id].gsi_config;
110}
111
112int nr_ioapics;
91 113
92/* The one past the highest gsi number used */ 114/* The one past the highest gsi number used */
93u32 gsi_top; 115u32 gsi_top;
@@ -109,7 +131,10 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
109 131
110int skip_ioapic_setup; 132int skip_ioapic_setup;
111 133
112void arch_disable_smp_support(void) 134/**
135 * disable_ioapic_support() - disables ioapic support at runtime
136 */
137void disable_ioapic_support(void)
113{ 138{
114#ifdef CONFIG_PCI 139#ifdef CONFIG_PCI
115 noioapicquirk = 1; 140 noioapicquirk = 1;
@@ -121,25 +146,45 @@ void arch_disable_smp_support(void)
121static int __init parse_noapic(char *str) 146static int __init parse_noapic(char *str)
122{ 147{
123 /* disable IO-APIC */ 148 /* disable IO-APIC */
124 arch_disable_smp_support(); 149 disable_ioapic_support();
125 return 0; 150 return 0;
126} 151}
127early_param("noapic", parse_noapic); 152early_param("noapic", parse_noapic);
128 153
154static int io_apic_setup_irq_pin(unsigned int irq, int node,
155 struct io_apic_irq_attr *attr);
156
157/* Will be called in mpparse/acpi/sfi codes for saving IRQ info */
158void mp_save_irq(struct mpc_intsrc *m)
159{
160 int i;
161
162 apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
163 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
164 m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus,
165 m->srcbusirq, m->dstapic, m->dstirq);
166
167 for (i = 0; i < mp_irq_entries; i++) {
168 if (!memcmp(&mp_irqs[i], m, sizeof(*m)))
169 return;
170 }
171
172 memcpy(&mp_irqs[mp_irq_entries], m, sizeof(*m));
173 if (++mp_irq_entries == MAX_IRQ_SOURCES)
174 panic("Max # of irq sources exceeded!!\n");
175}
176
129struct irq_pin_list { 177struct irq_pin_list {
130 int apic, pin; 178 int apic, pin;
131 struct irq_pin_list *next; 179 struct irq_pin_list *next;
132}; 180};
133 181
134static struct irq_pin_list *get_one_free_irq_2_pin(int node) 182static struct irq_pin_list *alloc_irq_pin_list(int node)
135{ 183{
136 struct irq_pin_list *pin; 184 return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node);
137
138 pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node);
139
140 return pin;
141} 185}
142 186
187
143/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ 188/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
144#ifdef CONFIG_SPARSE_IRQ 189#ifdef CONFIG_SPARSE_IRQ
145static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY]; 190static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY];
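Editor's note: mp_save_irq() above gives the mpparse/ACPI/SFI paths one common, de-duplicating way to record interrupt sources. A hypothetical caller (names and field values below are illustrative only; the declarations are assumed to come from the mpspec headers):

#include <asm/mpspec.h>
#include <asm/mpspec_def.h>

static void __init example_save_isa_irq(int bus, int irq, int ioapic_id, int pin)
{
	struct mpc_intsrc mp_irq;

	mp_irq.type	 = MP_INTSRC;
	mp_irq.irqtype	 = mp_INT;
	mp_irq.irqflag	 = 0;		/* conforming polarity/trigger */
	mp_irq.srcbus	 = bus;
	mp_irq.srcbusirq = irq;
	mp_irq.dstapic	 = ioapic_id;
	mp_irq.dstirq	 = pin;

	mp_save_irq(&mp_irq);
}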
@@ -150,25 +195,32 @@ static struct irq_cfg irq_cfgx[NR_IRQS];
150int __init arch_early_irq_init(void) 195int __init arch_early_irq_init(void)
151{ 196{
152 struct irq_cfg *cfg; 197 struct irq_cfg *cfg;
153 struct irq_desc *desc; 198 int count, node, i;
154 int count;
155 int node;
156 int i;
157 199
158 if (!legacy_pic->nr_legacy_irqs) { 200 if (!legacy_pic->nr_legacy_irqs) {
159 nr_irqs_gsi = 0; 201 nr_irqs_gsi = 0;
160 io_apic_irqs = ~0UL; 202 io_apic_irqs = ~0UL;
161 } 203 }
162 204
205 for (i = 0; i < nr_ioapics; i++) {
206 ioapics[i].saved_registers =
207 kzalloc(sizeof(struct IO_APIC_route_entry) *
208 ioapics[i].nr_registers, GFP_KERNEL);
209 if (!ioapics[i].saved_registers)
210 pr_err("IOAPIC %d: suspend/resume impossible!\n", i);
211 }
212
163 cfg = irq_cfgx; 213 cfg = irq_cfgx;
164 count = ARRAY_SIZE(irq_cfgx); 214 count = ARRAY_SIZE(irq_cfgx);
165 node= cpu_to_node(boot_cpu_id); 215 node = cpu_to_node(0);
216
217 /* Make sure the legacy interrupts are marked in the bitmap */
218 irq_reserve_irqs(0, legacy_pic->nr_legacy_irqs);
166 219
167 for (i = 0; i < count; i++) { 220 for (i = 0; i < count; i++) {
168 desc = irq_to_desc(i); 221 irq_set_chip_data(i, &cfg[i]);
169 desc->chip_data = &cfg[i]; 222 zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node);
170 zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); 223 zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node);
171 zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
172 /* 224 /*
173 * For legacy IRQ's, start with assigning irq0 to irq15 to 225 * For legacy IRQ's, start with assigning irq0 to irq15 to
174 * IRQ0_VECTOR to IRQ15_VECTOR on cpu 0. 226 * IRQ0_VECTOR to IRQ15_VECTOR on cpu 0.
@@ -183,170 +235,88 @@ int __init arch_early_irq_init(void)
183} 235}
184 236
185#ifdef CONFIG_SPARSE_IRQ 237#ifdef CONFIG_SPARSE_IRQ
186struct irq_cfg *irq_cfg(unsigned int irq) 238static struct irq_cfg *irq_cfg(unsigned int irq)
187{ 239{
188 struct irq_cfg *cfg = NULL; 240 return irq_get_chip_data(irq);
189 struct irq_desc *desc;
190
191 desc = irq_to_desc(irq);
192 if (desc)
193 cfg = desc->chip_data;
194
195 return cfg;
196} 241}
197 242
198static struct irq_cfg *get_one_free_irq_cfg(int node) 243static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node)
199{ 244{
200 struct irq_cfg *cfg; 245 struct irq_cfg *cfg;
201 246
202 cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); 247 cfg = kzalloc_node(sizeof(*cfg), GFP_KERNEL, node);
203 if (cfg) { 248 if (!cfg)
204 if (!zalloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) { 249 return NULL;
205 kfree(cfg); 250 if (!zalloc_cpumask_var_node(&cfg->domain, GFP_KERNEL, node))
206 cfg = NULL; 251 goto out_cfg;
207 } else if (!zalloc_cpumask_var_node(&cfg->old_domain, 252 if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_KERNEL, node))
208 GFP_ATOMIC, node)) { 253 goto out_domain;
209 free_cpumask_var(cfg->domain);
210 kfree(cfg);
211 cfg = NULL;
212 }
213 }
214
215 return cfg; 254 return cfg;
255out_domain:
256 free_cpumask_var(cfg->domain);
257out_cfg:
258 kfree(cfg);
259 return NULL;
216} 260}
217 261
218int arch_init_chip_data(struct irq_desc *desc, int node) 262static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg)
219{
220 struct irq_cfg *cfg;
221
222 cfg = desc->chip_data;
223 if (!cfg) {
224 desc->chip_data = get_one_free_irq_cfg(node);
225 if (!desc->chip_data) {
226 printk(KERN_ERR "can not alloc irq_cfg\n");
227 BUG_ON(1);
228 }
229 }
230
231 return 0;
232}
233
234/* for move_irq_desc */
235static void
236init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int node)
237{ 263{
238 struct irq_pin_list *old_entry, *head, *tail, *entry; 264 if (!cfg)
239
240 cfg->irq_2_pin = NULL;
241 old_entry = old_cfg->irq_2_pin;
242 if (!old_entry)
243 return;
244
245 entry = get_one_free_irq_2_pin(node);
246 if (!entry)
247 return; 265 return;
266 irq_set_chip_data(at, NULL);
267 free_cpumask_var(cfg->domain);
268 free_cpumask_var(cfg->old_domain);
269 kfree(cfg);
270}
248 271
249 entry->apic = old_entry->apic; 272#else
250 entry->pin = old_entry->pin;
251 head = entry;
252 tail = entry;
253 old_entry = old_entry->next;
254 while (old_entry) {
255 entry = get_one_free_irq_2_pin(node);
256 if (!entry) {
257 entry = head;
258 while (entry) {
259 head = entry->next;
260 kfree(entry);
261 entry = head;
262 }
263 /* still use the old one */
264 return;
265 }
266 entry->apic = old_entry->apic;
267 entry->pin = old_entry->pin;
268 tail->next = entry;
269 tail = entry;
270 old_entry = old_entry->next;
271 }
272 273
273 tail->next = NULL; 274struct irq_cfg *irq_cfg(unsigned int irq)
274 cfg->irq_2_pin = head; 275{
276 return irq < nr_irqs ? irq_cfgx + irq : NULL;
275} 277}
276 278
277static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg) 279static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node)
278{ 280{
279 struct irq_pin_list *entry, *next; 281 return irq_cfgx + irq;
280 282}
281 if (old_cfg->irq_2_pin == cfg->irq_2_pin)
282 return;
283 283
284 entry = old_cfg->irq_2_pin; 284static inline void free_irq_cfg(unsigned int at, struct irq_cfg *cfg) { }
285 285
286 while (entry) { 286#endif
287 next = entry->next;
288 kfree(entry);
289 entry = next;
290 }
291 old_cfg->irq_2_pin = NULL;
292}
293 287
294void arch_init_copy_chip_data(struct irq_desc *old_desc, 288static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
295 struct irq_desc *desc, int node)
296{ 289{
290 int res = irq_alloc_desc_at(at, node);
297 struct irq_cfg *cfg; 291 struct irq_cfg *cfg;
298 struct irq_cfg *old_cfg;
299 292
300 cfg = get_one_free_irq_cfg(node); 293 if (res < 0) {
301 294 if (res != -EEXIST)
302 if (!cfg) 295 return NULL;
303 return; 296 cfg = irq_get_chip_data(at);
304 297 if (cfg)
305 desc->chip_data = cfg; 298 return cfg;
306 299 }
307 old_cfg = old_desc->chip_data;
308
309 cfg->vector = old_cfg->vector;
310 cfg->move_in_progress = old_cfg->move_in_progress;
311 cpumask_copy(cfg->domain, old_cfg->domain);
312 cpumask_copy(cfg->old_domain, old_cfg->old_domain);
313
314 init_copy_irq_2_pin(old_cfg, cfg, node);
315}
316 300
317static void free_irq_cfg(struct irq_cfg *cfg) 301 cfg = alloc_irq_cfg(at, node);
318{ 302 if (cfg)
319 free_cpumask_var(cfg->domain); 303 irq_set_chip_data(at, cfg);
320 free_cpumask_var(cfg->old_domain); 304 else
321 kfree(cfg); 305 irq_free_desc(at);
306 return cfg;
322} 307}
323 308
324void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc) 309static int alloc_irq_from(unsigned int from, int node)
325{ 310{
326 struct irq_cfg *old_cfg, *cfg; 311 return irq_alloc_desc_from(from, node);
327
328 old_cfg = old_desc->chip_data;
329 cfg = desc->chip_data;
330
331 if (old_cfg == cfg)
332 return;
333
334 if (old_cfg) {
335 free_irq_2_pin(old_cfg, cfg);
336 free_irq_cfg(old_cfg);
337 old_desc->chip_data = NULL;
338 }
339} 312}
340/* end for move_irq_desc */
341 313
342#else 314static void free_irq_at(unsigned int at, struct irq_cfg *cfg)
343struct irq_cfg *irq_cfg(unsigned int irq)
344{ 315{
345 return irq < nr_irqs ? irq_cfgx + irq : NULL; 316 free_irq_cfg(at, cfg);
317 irq_free_desc(at);
346} 318}
347 319
348#endif
349
350struct io_apic { 320struct io_apic {
351 unsigned int index; 321 unsigned int index;
352 unsigned int unused[3]; 322 unsigned int unused[3];
@@ -358,7 +328,7 @@ struct io_apic {
358static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) 328static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
359{ 329{
360 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) 330 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
361 + (mp_ioapics[idx].apicaddr & ~PAGE_MASK); 331 + (mpc_ioapic_addr(idx) & ~PAGE_MASK);
362} 332}
363 333
364static inline void io_apic_eoi(unsigned int apic, unsigned int vector) 334static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
@@ -451,7 +421,7 @@ __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
451 io_apic_write(apic, 0x10 + 2*pin, eu.w1); 421 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
452} 422}
453 423
454void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) 424static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
455{ 425{
456 unsigned long flags; 426 unsigned long flags;
457 raw_spin_lock_irqsave(&ioapic_lock, flags); 427 raw_spin_lock_irqsave(&ioapic_lock, flags);
@@ -481,7 +451,7 @@ static void ioapic_mask_entry(int apic, int pin)
481 * fast in the common case, and fast for shared ISA-space IRQs. 451 * fast in the common case, and fast for shared ISA-space IRQs.
482 */ 452 */
483static int 453static int
484add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin) 454__add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
485{ 455{
486 struct irq_pin_list **last, *entry; 456 struct irq_pin_list **last, *entry;
487 457
@@ -493,7 +463,7 @@ add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin)
493 last = &entry->next; 463 last = &entry->next;
494 } 464 }
495 465
496 entry = get_one_free_irq_2_pin(node); 466 entry = alloc_irq_pin_list(node);
497 if (!entry) { 467 if (!entry) {
498 printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n", 468 printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n",
499 node, apic, pin); 469 node, apic, pin);
@@ -508,7 +478,7 @@ add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin)
508 478
509static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) 479static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
510{ 480{
511 if (add_pin_to_irq_node_nopanic(cfg, node, apic, pin)) 481 if (__add_pin_to_irq_node(cfg, node, apic, pin))
512 panic("IO-APIC: failed to add irq-pin. Can not proceed\n"); 482 panic("IO-APIC: failed to add irq-pin. Can not proceed\n");
513} 483}
514 484
@@ -571,11 +541,6 @@ static void __unmask_and_level_IO_APIC_irq(struct irq_pin_list *entry)
571 IO_APIC_REDIR_LEVEL_TRIGGER, NULL); 541 IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
572} 542}
573 543
574static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
575{
576 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
577}
578
579static void io_apic_sync(struct irq_pin_list *entry) 544static void io_apic_sync(struct irq_pin_list *entry)
580{ 545{
581 /* 546 /*
@@ -587,44 +552,37 @@ static void io_apic_sync(struct irq_pin_list *entry)
587 readl(&io_apic->data); 552 readl(&io_apic->data);
588} 553}
589 554
590static void __mask_IO_APIC_irq(struct irq_cfg *cfg) 555static void mask_ioapic(struct irq_cfg *cfg)
591{ 556{
557 unsigned long flags;
558
559 raw_spin_lock_irqsave(&ioapic_lock, flags);
592 io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); 560 io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
561 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
593} 562}
594 563
595static void mask_IO_APIC_irq_desc(struct irq_desc *desc) 564static void mask_ioapic_irq(struct irq_data *data)
596{ 565{
597 struct irq_cfg *cfg = desc->chip_data; 566 mask_ioapic(data->chip_data);
598 unsigned long flags; 567}
599
600 BUG_ON(!cfg);
601 568
602 raw_spin_lock_irqsave(&ioapic_lock, flags); 569static void __unmask_ioapic(struct irq_cfg *cfg)
603 __mask_IO_APIC_irq(cfg); 570{
604 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 571 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
605} 572}
606 573
607static void unmask_IO_APIC_irq_desc(struct irq_desc *desc) 574static void unmask_ioapic(struct irq_cfg *cfg)
608{ 575{
609 struct irq_cfg *cfg = desc->chip_data;
610 unsigned long flags; 576 unsigned long flags;
611 577
612 raw_spin_lock_irqsave(&ioapic_lock, flags); 578 raw_spin_lock_irqsave(&ioapic_lock, flags);
613 __unmask_IO_APIC_irq(cfg); 579 __unmask_ioapic(cfg);
614 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 580 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
615} 581}
616 582
617static void mask_IO_APIC_irq(unsigned int irq) 583static void unmask_ioapic_irq(struct irq_data *data)
618{ 584{
619 struct irq_desc *desc = irq_to_desc(irq); 585 unmask_ioapic(data->chip_data);
620
621 mask_IO_APIC_irq_desc(desc);
622}
623static void unmask_IO_APIC_irq(unsigned int irq)
624{
625 struct irq_desc *desc = irq_to_desc(irq);
626
627 unmask_IO_APIC_irq_desc(desc);
628} 586}
629 587
630static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) 588static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
@@ -646,7 +604,7 @@ static void clear_IO_APIC (void)
646 int apic, pin; 604 int apic, pin;
647 605
648 for (apic = 0; apic < nr_ioapics; apic++) 606 for (apic = 0; apic < nr_ioapics; apic++)
649 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) 607 for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
650 clear_IO_APIC_pin(apic, pin); 608 clear_IO_APIC_pin(apic, pin);
651} 609}
652 610
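Editor's note: per-IO-APIC state is now consolidated in the static ioapics[] array, with the mpc_ioapic_id()/mpc_ioapic_addr() accessors replacing direct pokes at mp_ioapics[] and nr_ioapic_registers[]. A small sketch of the accessor style:

static void __init example_dump_ioapics(void)
{
	int apic;

	for (apic = 0; apic < nr_ioapics; apic++)
		apic_printk(APIC_VERBOSE, "IO-APIC %d: id %d at %#x\n",
			    apic, mpc_ioapic_id(apic), mpc_ioapic_addr(apic));
}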
@@ -688,74 +646,43 @@ static int __init ioapic_pirq_setup(char *str)
688__setup("pirq=", ioapic_pirq_setup); 646__setup("pirq=", ioapic_pirq_setup);
689#endif /* CONFIG_X86_32 */ 647#endif /* CONFIG_X86_32 */
690 648
691struct IO_APIC_route_entry **alloc_ioapic_entries(void)
692{
693 int apic;
694 struct IO_APIC_route_entry **ioapic_entries;
695
696 ioapic_entries = kzalloc(sizeof(*ioapic_entries) * nr_ioapics,
697 GFP_ATOMIC);
698 if (!ioapic_entries)
699 return 0;
700
701 for (apic = 0; apic < nr_ioapics; apic++) {
702 ioapic_entries[apic] =
703 kzalloc(sizeof(struct IO_APIC_route_entry) *
704 nr_ioapic_registers[apic], GFP_ATOMIC);
705 if (!ioapic_entries[apic])
706 goto nomem;
707 }
708
709 return ioapic_entries;
710
711nomem:
712 while (--apic >= 0)
713 kfree(ioapic_entries[apic]);
714 kfree(ioapic_entries);
715
716 return 0;
717}
718
719/* 649/*
720 * Saves all the IO-APIC RTE's 650 * Saves all the IO-APIC RTE's
721 */ 651 */
722int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) 652int save_ioapic_entries(void)
723{ 653{
724 int apic, pin; 654 int apic, pin;
725 655 int err = 0;
726 if (!ioapic_entries)
727 return -ENOMEM;
728 656
729 for (apic = 0; apic < nr_ioapics; apic++) { 657 for (apic = 0; apic < nr_ioapics; apic++) {
730 if (!ioapic_entries[apic]) 658 if (!ioapics[apic].saved_registers) {
731 return -ENOMEM; 659 err = -ENOMEM;
660 continue;
661 }
732 662
733 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) 663 for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
734 ioapic_entries[apic][pin] = 664 ioapics[apic].saved_registers[pin] =
735 ioapic_read_entry(apic, pin); 665 ioapic_read_entry(apic, pin);
736 } 666 }
737 667
738 return 0; 668 return err;
739} 669}
740 670
741/* 671/*
742 * Mask all IO APIC entries. 672 * Mask all IO APIC entries.
743 */ 673 */
744void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) 674void mask_ioapic_entries(void)
745{ 675{
746 int apic, pin; 676 int apic, pin;
747 677
748 if (!ioapic_entries)
749 return;
750
751 for (apic = 0; apic < nr_ioapics; apic++) { 678 for (apic = 0; apic < nr_ioapics; apic++) {
752 if (!ioapic_entries[apic]) 679 if (!ioapics[apic].saved_registers)
753 break; 680 continue;
754 681
755 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { 682 for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
756 struct IO_APIC_route_entry entry; 683 struct IO_APIC_route_entry entry;
757 684
758 entry = ioapic_entries[apic][pin]; 685 entry = ioapics[apic].saved_registers[pin];
759 if (!entry.mask) { 686 if (!entry.mask) {
760 entry.mask = 1; 687 entry.mask = 1;
761 ioapic_write_entry(apic, pin, entry); 688 ioapic_write_entry(apic, pin, entry);
@@ -765,36 +692,23 @@ void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
765} 692}
766 693
767/* 694/*
768 * Restore IO APIC entries which were saved in ioapic_entries. 695 * Restore IO APIC entries which were saved in the ioapic structure.
769 */ 696 */
770int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) 697int restore_ioapic_entries(void)
771{ 698{
772 int apic, pin; 699 int apic, pin;
773 700
774 if (!ioapic_entries)
775 return -ENOMEM;
776
777 for (apic = 0; apic < nr_ioapics; apic++) { 701 for (apic = 0; apic < nr_ioapics; apic++) {
778 if (!ioapic_entries[apic]) 702 if (!ioapics[apic].saved_registers)
779 return -ENOMEM; 703 continue;
780 704
781 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) 705 for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
782 ioapic_write_entry(apic, pin, 706 ioapic_write_entry(apic, pin,
783 ioapic_entries[apic][pin]); 707 ioapics[apic].saved_registers[pin]);
784 } 708 }
785 return 0; 709 return 0;
786} 710}
787 711
788void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries)
789{
790 int apic;
791
792 for (apic = 0; apic < nr_ioapics; apic++)
793 kfree(ioapic_entries[apic]);
794
795 kfree(ioapic_entries);
796}
797
798/* 712/*
799 * Find the IRQ entry number of a certain pin. 713 * Find the IRQ entry number of a certain pin.
800 */ 714 */
@@ -804,7 +718,7 @@ static int find_irq_entry(int apic, int pin, int type)
804 718
805 for (i = 0; i < mp_irq_entries; i++) 719 for (i = 0; i < mp_irq_entries; i++)
806 if (mp_irqs[i].irqtype == type && 720 if (mp_irqs[i].irqtype == type &&
807 (mp_irqs[i].dstapic == mp_ioapics[apic].apicid || 721 (mp_irqs[i].dstapic == mpc_ioapic_id(apic) ||
808 mp_irqs[i].dstapic == MP_APIC_ALL) && 722 mp_irqs[i].dstapic == MP_APIC_ALL) &&
809 mp_irqs[i].dstirq == pin) 723 mp_irqs[i].dstirq == pin)
810 return i; 724 return i;
@@ -846,7 +760,7 @@ static int __init find_isa_irq_apic(int irq, int type)
846 if (i < mp_irq_entries) { 760 if (i < mp_irq_entries) {
847 int apic; 761 int apic;
848 for(apic = 0; apic < nr_ioapics; apic++) { 762 for(apic = 0; apic < nr_ioapics; apic++) {
849 if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic) 763 if (mpc_ioapic_id(apic) == mp_irqs[i].dstapic)
850 return apic; 764 return apic;
851 } 765 }
852 } 766 }
@@ -897,7 +811,7 @@ static int EISA_ELCR(unsigned int irq)
897#define default_MCA_trigger(idx) (1) 811#define default_MCA_trigger(idx) (1)
898#define default_MCA_polarity(idx) default_ISA_polarity(idx) 812#define default_MCA_polarity(idx) default_ISA_polarity(idx)
899 813
900static int MPBIOS_polarity(int idx) 814static int irq_polarity(int idx)
901{ 815{
902 int bus = mp_irqs[idx].srcbus; 816 int bus = mp_irqs[idx].srcbus;
903 int polarity; 817 int polarity;
@@ -939,7 +853,7 @@ static int MPBIOS_polarity(int idx)
939 return polarity; 853 return polarity;
940} 854}
941 855
942static int MPBIOS_trigger(int idx) 856static int irq_trigger(int idx)
943{ 857{
944 int bus = mp_irqs[idx].srcbus; 858 int bus = mp_irqs[idx].srcbus;
945 int trigger; 859 int trigger;
@@ -1011,20 +925,11 @@ static int MPBIOS_trigger(int idx)
1011 return trigger; 925 return trigger;
1012} 926}
1013 927
1014static inline int irq_polarity(int idx)
1015{
1016 return MPBIOS_polarity(idx);
1017}
1018
1019static inline int irq_trigger(int idx)
1020{
1021 return MPBIOS_trigger(idx);
1022}
1023
1024static int pin_2_irq(int idx, int apic, int pin) 928static int pin_2_irq(int idx, int apic, int pin)
1025{ 929{
1026 int irq; 930 int irq;
1027 int bus = mp_irqs[idx].srcbus; 931 int bus = mp_irqs[idx].srcbus;
932 struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(apic);
1028 933
1029 /* 934 /*
1030 * Debugging check, we are in big trouble if this message pops up! 935 * Debugging check, we are in big trouble if this message pops up!
@@ -1035,7 +940,7 @@ static int pin_2_irq(int idx, int apic, int pin)
1035 if (test_bit(bus, mp_bus_not_pci)) { 940 if (test_bit(bus, mp_bus_not_pci)) {
1036 irq = mp_irqs[idx].srcbusirq; 941 irq = mp_irqs[idx].srcbusirq;
1037 } else { 942 } else {
1038 u32 gsi = mp_gsi_routing[apic].gsi_base + pin; 943 u32 gsi = gsi_cfg->gsi_base + pin;
1039 944
1040 if (gsi >= NR_IRQS_LEGACY) 945 if (gsi >= NR_IRQS_LEGACY)
1041 irq = gsi; 946 irq = gsi;
@@ -1086,7 +991,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
1086 int lbus = mp_irqs[i].srcbus; 991 int lbus = mp_irqs[i].srcbus;
1087 992
1088 for (apic = 0; apic < nr_ioapics; apic++) 993 for (apic = 0; apic < nr_ioapics; apic++)
1089 if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic || 994 if (mpc_ioapic_id(apic) == mp_irqs[i].dstapic ||
1090 mp_irqs[i].dstapic == MP_APIC_ALL) 995 mp_irqs[i].dstapic == MP_APIC_ALL)
1091 break; 996 break;
1092 997
@@ -1259,7 +1164,6 @@ void __setup_vector_irq(int cpu)
1259 /* Initialize vector_irq on a new cpu */ 1164 /* Initialize vector_irq on a new cpu */
1260 int irq, vector; 1165 int irq, vector;
1261 struct irq_cfg *cfg; 1166 struct irq_cfg *cfg;
1262 struct irq_desc *desc;
1263 1167
1264 /* 1168 /*
1265 * vector_lock will make sure that we don't run into irq vector 1169 * vector_lock will make sure that we don't run into irq vector
@@ -1268,9 +1172,10 @@ void __setup_vector_irq(int cpu)
1268 */ 1172 */
1269 raw_spin_lock(&vector_lock); 1173 raw_spin_lock(&vector_lock);
1270 /* Mark the inuse vectors */ 1174 /* Mark the inuse vectors */
1271 for_each_irq_desc(irq, desc) { 1175 for_each_active_irq(irq) {
1272 cfg = desc->chip_data; 1176 cfg = irq_get_chip_data(irq);
1273 1177 if (!cfg)
1178 continue;
1274 /* 1179 /*
1275 * If it is a legacy IRQ handled by the legacy PIC, this cpu 1180 * If it is a legacy IRQ handled by the legacy PIC, this cpu
1276 * will be part of the irq_cfg's domain. 1181 * will be part of the irq_cfg's domain.
@@ -1299,17 +1204,13 @@ void __setup_vector_irq(int cpu)
1299static struct irq_chip ioapic_chip; 1204static struct irq_chip ioapic_chip;
1300static struct irq_chip ir_ioapic_chip; 1205static struct irq_chip ir_ioapic_chip;
1301 1206
1302#define IOAPIC_AUTO -1
1303#define IOAPIC_EDGE 0
1304#define IOAPIC_LEVEL 1
1305
1306#ifdef CONFIG_X86_32 1207#ifdef CONFIG_X86_32
1307static inline int IO_APIC_irq_trigger(int irq) 1208static inline int IO_APIC_irq_trigger(int irq)
1308{ 1209{
1309 int apic, idx, pin; 1210 int apic, idx, pin;
1310 1211
1311 for (apic = 0; apic < nr_ioapics; apic++) { 1212 for (apic = 0; apic < nr_ioapics; apic++) {
1312 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { 1213 for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
1313 idx = find_irq_entry(apic, pin, mp_INT); 1214 idx = find_irq_entry(apic, pin, mp_INT);
1314 if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) 1215 if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
1315 return irq_trigger(idx); 1216 return irq_trigger(idx);
@@ -1327,41 +1228,37 @@ static inline int IO_APIC_irq_trigger(int irq)
1327} 1228}
1328#endif 1229#endif
1329 1230
1330static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long trigger) 1231static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg,
1232 unsigned long trigger)
1331{ 1233{
1234 struct irq_chip *chip = &ioapic_chip;
1235 irq_flow_handler_t hdl;
1236 bool fasteoi;
1332 1237
1333 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || 1238 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
1334 trigger == IOAPIC_LEVEL) 1239 trigger == IOAPIC_LEVEL) {
1335 desc->status |= IRQ_LEVEL; 1240 irq_set_status_flags(irq, IRQ_LEVEL);
1336 else 1241 fasteoi = true;
1337 desc->status &= ~IRQ_LEVEL; 1242 } else {
1338 1243 irq_clear_status_flags(irq, IRQ_LEVEL);
1339 if (irq_remapped(irq)) { 1244 fasteoi = false;
1340 desc->status |= IRQ_MOVE_PCNTXT;
1341 if (trigger)
1342 set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
1343 handle_fasteoi_irq,
1344 "fasteoi");
1345 else
1346 set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
1347 handle_edge_irq, "edge");
1348 return;
1349 } 1245 }
1350 1246
1351 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || 1247 if (irq_remapped(cfg)) {
1352 trigger == IOAPIC_LEVEL) 1248 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
1353 set_irq_chip_and_handler_name(irq, &ioapic_chip, 1249 chip = &ir_ioapic_chip;
1354 handle_fasteoi_irq, 1250 fasteoi = trigger != 0;
1355 "fasteoi"); 1251 }
1356 else 1252
1357 set_irq_chip_and_handler_name(irq, &ioapic_chip, 1253 hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq;
1358 handle_edge_irq, "edge"); 1254 irq_set_chip_and_handler_name(irq, chip, hdl,
1255 fasteoi ? "fasteoi" : "edge");
1359} 1256}
1360 1257
1361int setup_ioapic_entry(int apic_id, int irq, 1258static int setup_ioapic_entry(int apic_id, int irq,
1362 struct IO_APIC_route_entry *entry, 1259 struct IO_APIC_route_entry *entry,
1363 unsigned int destination, int trigger, 1260 unsigned int destination, int trigger,
1364 int polarity, int vector, int pin) 1261 int polarity, int vector, int pin)
1365{ 1262{
1366 /* 1263 /*
1367 * add it to the IO-APIC irq-routing table: 1264 * add it to the IO-APIC irq-routing table:
@@ -1382,21 +1279,7 @@ int setup_ioapic_entry(int apic_id, int irq,
1382 if (index < 0) 1279 if (index < 0)
1383 panic("Failed to allocate IRTE for ioapic %d\n", apic_id); 1280 panic("Failed to allocate IRTE for ioapic %d\n", apic_id);
1384 1281
1385 memset(&irte, 0, sizeof(irte)); 1282 prepare_irte(&irte, vector, destination);
1386
1387 irte.present = 1;
1388 irte.dst_mode = apic->irq_dest_mode;
1389 /*
1390 * Trigger mode in the IRTE will always be edge, and the
1391 * actual level or edge trigger will be setup in the IO-APIC
1392 * RTE. This will help simplify level triggered irq migration.
1394 * For more details, see the comments above explaining IO-APIC
1394 * irq migration in the presence of interrupt-remapping.
1395 */
1396 irte.trigger_mode = 0;
1397 irte.dlvry_mode = apic->irq_delivery_mode;
1398 irte.vector = vector;
1399 irte.dest_id = IRTE_DEST(destination);
1400 1283
1401 /* Set source-id of interrupt request */ 1284 /* Set source-id of interrupt request */
1402 set_ioapic_sid(&irte, apic_id); 1285 set_ioapic_sid(&irte, apic_id);
@@ -1431,18 +1314,14 @@ int setup_ioapic_entry(int apic_id, int irq,
1431 return 0; 1314 return 0;
1432} 1315}
1433 1316
1434static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq_desc *desc, 1317static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq,
1435 int trigger, int polarity) 1318 struct irq_cfg *cfg, int trigger, int polarity)
1436{ 1319{
1437 struct irq_cfg *cfg;
1438 struct IO_APIC_route_entry entry; 1320 struct IO_APIC_route_entry entry;
1439 unsigned int dest; 1321 unsigned int dest;
1440 1322
1441 if (!IO_APIC_IRQ(irq)) 1323 if (!IO_APIC_IRQ(irq))
1442 return; 1324 return;
1443
1444 cfg = desc->chip_data;
1445
1446 /* 1325 /*
1447 * For legacy irqs, cfg->domain starts with cpu 0 for legacy 1326 * For legacy irqs, cfg->domain starts with cpu 0 for legacy
1448 * controllers like 8259. Now that IO-APIC can handle this irq, update 1327 * controllers like 8259. Now that IO-APIC can handle this irq, update
@@ -1459,58 +1338,45 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
1459 apic_printk(APIC_VERBOSE,KERN_DEBUG 1338 apic_printk(APIC_VERBOSE,KERN_DEBUG
1460 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " 1339 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
1461 "IRQ %d Mode:%i Active:%i)\n", 1340 "IRQ %d Mode:%i Active:%i)\n",
1462 apic_id, mp_ioapics[apic_id].apicid, pin, cfg->vector, 1341 apic_id, mpc_ioapic_id(apic_id), pin, cfg->vector,
1463 irq, trigger, polarity); 1342 irq, trigger, polarity);
1464 1343
1465 1344
1466 if (setup_ioapic_entry(mp_ioapics[apic_id].apicid, irq, &entry, 1345 if (setup_ioapic_entry(mpc_ioapic_id(apic_id), irq, &entry,
1467 dest, trigger, polarity, cfg->vector, pin)) { 1346 dest, trigger, polarity, cfg->vector, pin)) {
1468 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", 1347 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
1469 mp_ioapics[apic_id].apicid, pin); 1348 mpc_ioapic_id(apic_id), pin);
1470 __clear_irq_vector(irq, cfg); 1349 __clear_irq_vector(irq, cfg);
1471 return; 1350 return;
1472 } 1351 }
1473 1352
1474 ioapic_register_intr(irq, desc, trigger); 1353 ioapic_register_intr(irq, cfg, trigger);
1475 if (irq < legacy_pic->nr_legacy_irqs) 1354 if (irq < legacy_pic->nr_legacy_irqs)
1476 legacy_pic->chip->mask(irq); 1355 legacy_pic->mask(irq);
1477 1356
1478 ioapic_write_entry(apic_id, pin, entry); 1357 ioapic_write_entry(apic_id, pin, entry);
1479} 1358}
1480 1359
1481static struct { 1360static bool __init io_apic_pin_not_connected(int idx, int apic_id, int pin)
1482 DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
1483} mp_ioapic_routing[MAX_IO_APICS];
1484
1485static void __init setup_IO_APIC_irqs(void)
1486{ 1361{
1487 int apic_id, pin, idx, irq; 1362 if (idx != -1)
1488 int notcon = 0; 1363 return false;
1489 struct irq_desc *desc;
1490 struct irq_cfg *cfg;
1491 int node = cpu_to_node(boot_cpu_id);
1492 1364
1493 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); 1365 apic_printk(APIC_VERBOSE, KERN_DEBUG " apic %d pin %d not connected\n",
1366 mpc_ioapic_id(apic_id), pin);
1367 return true;
1368}
1494 1369
1495 for (apic_id = 0; apic_id < nr_ioapics; apic_id++) 1370static void __init __io_apic_setup_irqs(unsigned int apic_id)
1496 for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { 1371{
1372 int idx, node = cpu_to_node(0);
1373 struct io_apic_irq_attr attr;
1374 unsigned int pin, irq;
1375
1376 for (pin = 0; pin < ioapics[apic_id].nr_registers; pin++) {
1497 idx = find_irq_entry(apic_id, pin, mp_INT); 1377 idx = find_irq_entry(apic_id, pin, mp_INT);
1498 if (idx == -1) { 1378 if (io_apic_pin_not_connected(idx, apic_id, pin))
1499 if (!notcon) {
1500 notcon = 1;
1501 apic_printk(APIC_VERBOSE,
1502 KERN_DEBUG " %d-%d",
1503 mp_ioapics[apic_id].apicid, pin);
1504 } else
1505 apic_printk(APIC_VERBOSE, " %d-%d",
1506 mp_ioapics[apic_id].apicid, pin);
1507 continue; 1379 continue;
1508 }
1509 if (notcon) {
1510 apic_printk(APIC_VERBOSE,
1511 " (apicid-pin) not connected\n");
1512 notcon = 0;
1513 }
1514 1380
1515 irq = pin_2_irq(idx, apic_id, pin); 1381 irq = pin_2_irq(idx, apic_id, pin);
1516 1382
@@ -1522,27 +1388,24 @@ static void __init setup_IO_APIC_irqs(void)
1522 * installed and if it returns 1: 1388 * installed and if it returns 1:
1523 */ 1389 */
1524 if (apic->multi_timer_check && 1390 if (apic->multi_timer_check &&
1525 apic->multi_timer_check(apic_id, irq)) 1391 apic->multi_timer_check(apic_id, irq))
1526 continue; 1392 continue;
1527 1393
1528 desc = irq_to_desc_alloc_node(irq, node); 1394 set_io_apic_irq_attr(&attr, apic_id, pin, irq_trigger(idx),
1529 if (!desc) { 1395 irq_polarity(idx));
1530 printk(KERN_INFO "can not get irq_desc for %d\n", irq); 1396
1531 continue; 1397 io_apic_setup_irq_pin(irq, node, &attr);
1532 }
1533 cfg = desc->chip_data;
1534 add_pin_to_irq_node(cfg, node, apic_id, pin);
1535 /*
1536 * don't mark it in pin_programmed, so later acpi could
1537 * set it correctly when irq < 16
1538 */
1539 setup_IO_APIC_irq(apic_id, pin, irq, desc,
1540 irq_trigger(idx), irq_polarity(idx));
1541 } 1398 }
1399}
1542 1400
1543 if (notcon) 1401static void __init setup_IO_APIC_irqs(void)
1544 apic_printk(APIC_VERBOSE, 1402{
1545 " (apicid-pin) not connected\n"); 1403 unsigned int apic_id;
1404
1405 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
1406
1407 for (apic_id = 0; apic_id < nr_ioapics; apic_id++)
1408 __io_apic_setup_irqs(apic_id);
1546} 1409}
1547 1410
1548/* 1411/*
@@ -1552,10 +1415,8 @@ static void __init setup_IO_APIC_irqs(void)
1552 */ 1415 */
1553void setup_IO_APIC_irq_extra(u32 gsi) 1416void setup_IO_APIC_irq_extra(u32 gsi)
1554{ 1417{
1555 int apic_id = 0, pin, idx, irq; 1418 int apic_id = 0, pin, idx, irq, node = cpu_to_node(0);
1556 int node = cpu_to_node(boot_cpu_id); 1419 struct io_apic_irq_attr attr;
1557 struct irq_desc *desc;
1558 struct irq_cfg *cfg;
1559 1420
1560 /* 1421 /*
1561 * Convert 'gsi' to 'ioapic.pin'. 1422 * Convert 'gsi' to 'ioapic.pin'.
@@ -1570,29 +1431,15 @@ void setup_IO_APIC_irq_extra(u32 gsi)
1570 return; 1431 return;
1571 1432
1572 irq = pin_2_irq(idx, apic_id, pin); 1433 irq = pin_2_irq(idx, apic_id, pin);
1573#ifdef CONFIG_SPARSE_IRQ
1574 desc = irq_to_desc(irq);
1575 if (desc)
1576 return;
1577#endif
1578 desc = irq_to_desc_alloc_node(irq, node);
1579 if (!desc) {
1580 printk(KERN_INFO "can not get irq_desc for %d\n", irq);
1581 return;
1582 }
1583 1434
1584 cfg = desc->chip_data; 1435 /* Only handle the non legacy irqs on secondary ioapics */
1585 add_pin_to_irq_node(cfg, node, apic_id, pin); 1436 if (apic_id == 0 || irq < NR_IRQS_LEGACY)
1586
1587 if (test_bit(pin, mp_ioapic_routing[apic_id].pin_programmed)) {
1588 pr_debug("Pin %d-%d already programmed\n",
1589 mp_ioapics[apic_id].apicid, pin);
1590 return; 1437 return;
1591 }
1592 set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed);
1593 1438
1594 setup_IO_APIC_irq(apic_id, pin, irq, desc, 1439 set_io_apic_irq_attr(&attr, apic_id, pin, irq_trigger(idx),
1595 irq_trigger(idx), irq_polarity(idx)); 1440 irq_polarity(idx));
1441
1442 io_apic_setup_irq_pin_once(irq, node, &attr);
1596} 1443}
1597 1444
1598/* 1445/*
@@ -1624,7 +1471,8 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin,
1624 * The timer IRQ doesn't have to know that behind the 1471 * The timer IRQ doesn't have to know that behind the
1625 * scene we may have a 8259A-master in AEOI mode ... 1472 * scene we may have a 8259A-master in AEOI mode ...
1626 */ 1473 */
1627 set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); 1474 irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq,
1475 "edge");
1628 1476
1629 /* 1477 /*
1630 * Add it to the IO-APIC irq-routing table: 1478 * Add it to the IO-APIC irq-routing table:
@@ -1642,13 +1490,12 @@ __apicdebuginit(void) print_IO_APIC(void)
1642 union IO_APIC_reg_03 reg_03; 1490 union IO_APIC_reg_03 reg_03;
1643 unsigned long flags; 1491 unsigned long flags;
1644 struct irq_cfg *cfg; 1492 struct irq_cfg *cfg;
1645 struct irq_desc *desc;
1646 unsigned int irq; 1493 unsigned int irq;
1647 1494
1648 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); 1495 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
1649 for (i = 0; i < nr_ioapics; i++) 1496 for (i = 0; i < nr_ioapics; i++)
1650 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", 1497 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
1651 mp_ioapics[i].apicid, nr_ioapic_registers[i]); 1498 mpc_ioapic_id(i), ioapics[i].nr_registers);
1652 1499
1653 /* 1500 /*
1654 * We are a bit conservative about what we expect. We have to 1501 * We are a bit conservative about what we expect. We have to
@@ -1668,7 +1515,7 @@ __apicdebuginit(void) print_IO_APIC(void)
1668 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 1515 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
1669 1516
1670 printk("\n"); 1517 printk("\n");
1671 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid); 1518 printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(apic));
1672 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); 1519 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
1673 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); 1520 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
1674 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); 1521 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
@@ -1729,10 +1576,10 @@ __apicdebuginit(void) print_IO_APIC(void)
1729 } 1576 }
1730 } 1577 }
1731 printk(KERN_DEBUG "IRQ to pin mappings:\n"); 1578 printk(KERN_DEBUG "IRQ to pin mappings:\n");
1732 for_each_irq_desc(irq, desc) { 1579 for_each_active_irq(irq) {
1733 struct irq_pin_list *entry; 1580 struct irq_pin_list *entry;
1734 1581
1735 cfg = desc->chip_data; 1582 cfg = irq_get_chip_data(irq);
1736 if (!cfg) 1583 if (!cfg)
1737 continue; 1584 continue;
1738 entry = cfg->irq_2_pin; 1585 entry = cfg->irq_2_pin;
@@ -1962,7 +1809,7 @@ void __init enable_IO_APIC(void)
1962 for(apic = 0; apic < nr_ioapics; apic++) { 1809 for(apic = 0; apic < nr_ioapics; apic++) {
1963 int pin; 1810 int pin;
1964 /* See if any of the pins is in ExtINT mode */ 1811 /* See if any of the pins is in ExtINT mode */
1965 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { 1812 for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
1966 struct IO_APIC_route_entry entry; 1813 struct IO_APIC_route_entry entry;
1967 entry = ioapic_read_entry(apic, pin); 1814 entry = ioapic_read_entry(apic, pin);
1968 1815
@@ -2023,7 +1870,7 @@ void disable_IO_APIC(void)
2023 * 1870 *
2024 * With interrupt-remapping, for now we will use virtual wire A mode, 1871 * With interrupt-remapping, for now we will use virtual wire A mode,
2025 * as virtual wire B is a little complex (need to configure both 1872 * as virtual wire B is a little complex (need to configure both
2026 * IOAPIC RTE aswell as interrupt-remapping table entry). 1873 * IOAPIC RTE as well as interrupt-remapping table entry).
2027 * As this gets called during crash dump, keep this simple for now. 1874 * As this gets called during crash dump, keep this simple for now.
2028 */ 1875 */
2029 if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) { 1876 if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) {
@@ -2061,8 +1908,7 @@ void disable_IO_APIC(void)
2061 * 1908 *
2062 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 1909 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
2063 */ 1910 */
2064 1911void __init setup_ioapic_ids_from_mpc_nocheck(void)
2065void __init setup_ioapic_ids_from_mpc(void)
2066{ 1912{
2067 union IO_APIC_reg_00 reg_00; 1913 union IO_APIC_reg_00 reg_00;
2068 physid_mask_t phys_id_present_map; 1914 physid_mask_t phys_id_present_map;
@@ -2071,15 +1917,6 @@ void __init setup_ioapic_ids_from_mpc(void)
2071 unsigned char old_id; 1917 unsigned char old_id;
2072 unsigned long flags; 1918 unsigned long flags;
2073 1919
2074 if (acpi_ioapic)
2075 return;
2076 /*
2077 * Don't check I/O APIC IDs for xAPIC systems. They have
2078 * no meaning without the serial APIC bus.
2079 */
2080 if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
2081 || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
2082 return;
2083 /* 1920 /*
2084 * This is broken; anything with a real cpu count has to 1921 * This is broken; anything with a real cpu count has to
2085 * circumvent this idiocy regardless. 1922 * circumvent this idiocy regardless.
@@ -2096,14 +1933,14 @@ void __init setup_ioapic_ids_from_mpc(void)
2096 reg_00.raw = io_apic_read(apic_id, 0); 1933 reg_00.raw = io_apic_read(apic_id, 0);
2097 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 1934 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2098 1935
2099 old_id = mp_ioapics[apic_id].apicid; 1936 old_id = mpc_ioapic_id(apic_id);
2100 1937
2101 if (mp_ioapics[apic_id].apicid >= get_physical_broadcast()) { 1938 if (mpc_ioapic_id(apic_id) >= get_physical_broadcast()) {
2102 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", 1939 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
2103 apic_id, mp_ioapics[apic_id].apicid); 1940 apic_id, mpc_ioapic_id(apic_id));
2104 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", 1941 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
2105 reg_00.bits.ID); 1942 reg_00.bits.ID);
2106 mp_ioapics[apic_id].apicid = reg_00.bits.ID; 1943 ioapics[apic_id].mp_config.apicid = reg_00.bits.ID;
2107 } 1944 }
2108 1945
2109 /* 1946 /*
@@ -2112,9 +1949,9 @@ void __init setup_ioapic_ids_from_mpc(void)
2112 * 'stuck on smp_invalidate_needed IPI wait' messages. 1949 * 'stuck on smp_invalidate_needed IPI wait' messages.
2113 */ 1950 */
2114 if (apic->check_apicid_used(&phys_id_present_map, 1951 if (apic->check_apicid_used(&phys_id_present_map,
2115 mp_ioapics[apic_id].apicid)) { 1952 mpc_ioapic_id(apic_id))) {
2116 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", 1953 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
2117 apic_id, mp_ioapics[apic_id].apicid); 1954 apic_id, mpc_ioapic_id(apic_id));
2118 for (i = 0; i < get_physical_broadcast(); i++) 1955 for (i = 0; i < get_physical_broadcast(); i++)
2119 if (!physid_isset(i, phys_id_present_map)) 1956 if (!physid_isset(i, phys_id_present_map))
2120 break; 1957 break;
@@ -2123,36 +1960,39 @@ void __init setup_ioapic_ids_from_mpc(void)
2123 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", 1960 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
2124 i); 1961 i);
2125 physid_set(i, phys_id_present_map); 1962 physid_set(i, phys_id_present_map);
2126 mp_ioapics[apic_id].apicid = i; 1963 ioapics[apic_id].mp_config.apicid = i;
2127 } else { 1964 } else {
2128 physid_mask_t tmp; 1965 physid_mask_t tmp;
2129 apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid, &tmp); 1966 apic->apicid_to_cpu_present(mpc_ioapic_id(apic_id),
1967 &tmp);
2130 apic_printk(APIC_VERBOSE, "Setting %d in the " 1968 apic_printk(APIC_VERBOSE, "Setting %d in the "
2131 "phys_id_present_map\n", 1969 "phys_id_present_map\n",
2132 mp_ioapics[apic_id].apicid); 1970 mpc_ioapic_id(apic_id));
2133 physids_or(phys_id_present_map, phys_id_present_map, tmp); 1971 physids_or(phys_id_present_map, phys_id_present_map, tmp);
2134 } 1972 }
2135 1973
2136
2137 /* 1974 /*
2138 * We need to adjust the IRQ routing table 1975 * We need to adjust the IRQ routing table
2139 * if the ID changed. 1976 * if the ID changed.
2140 */ 1977 */
2141 if (old_id != mp_ioapics[apic_id].apicid) 1978 if (old_id != mpc_ioapic_id(apic_id))
2142 for (i = 0; i < mp_irq_entries; i++) 1979 for (i = 0; i < mp_irq_entries; i++)
2143 if (mp_irqs[i].dstapic == old_id) 1980 if (mp_irqs[i].dstapic == old_id)
2144 mp_irqs[i].dstapic 1981 mp_irqs[i].dstapic
2145 = mp_ioapics[apic_id].apicid; 1982 = mpc_ioapic_id(apic_id);
2146 1983
2147 /* 1984 /*
2148 * Read the right value from the MPC table and 1985 * Update the ID register according to the right value
2149 * write it into the ID register. 1986 * from the MPC table if they are different.
2150 */ 1987 */
1988 if (mpc_ioapic_id(apic_id) == reg_00.bits.ID)
1989 continue;
1990
2151 apic_printk(APIC_VERBOSE, KERN_INFO 1991 apic_printk(APIC_VERBOSE, KERN_INFO
2152 "...changing IO-APIC physical APIC ID to %d ...", 1992 "...changing IO-APIC physical APIC ID to %d ...",
2153 mp_ioapics[apic_id].apicid); 1993 mpc_ioapic_id(apic_id));
2154 1994
2155 reg_00.bits.ID = mp_ioapics[apic_id].apicid; 1995 reg_00.bits.ID = mpc_ioapic_id(apic_id);
2156 raw_spin_lock_irqsave(&ioapic_lock, flags); 1996 raw_spin_lock_irqsave(&ioapic_lock, flags);
2157 io_apic_write(apic_id, 0, reg_00.raw); 1997 io_apic_write(apic_id, 0, reg_00.raw);
2158 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 1998 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -2163,12 +2003,27 @@ void __init setup_ioapic_ids_from_mpc(void)
2163 raw_spin_lock_irqsave(&ioapic_lock, flags); 2003 raw_spin_lock_irqsave(&ioapic_lock, flags);
2164 reg_00.raw = io_apic_read(apic_id, 0); 2004 reg_00.raw = io_apic_read(apic_id, 0);
2165 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 2005 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2166 if (reg_00.bits.ID != mp_ioapics[apic_id].apicid) 2006 if (reg_00.bits.ID != mpc_ioapic_id(apic_id))
2167 printk("could not set ID!\n"); 2007 printk("could not set ID!\n");
2168 else 2008 else
2169 apic_printk(APIC_VERBOSE, " ok.\n"); 2009 apic_printk(APIC_VERBOSE, " ok.\n");
2170 } 2010 }
2171} 2011}
2012
2013void __init setup_ioapic_ids_from_mpc(void)
2014{
2015
2016 if (acpi_ioapic)
2017 return;
2018 /*
2019 * Don't check I/O APIC IDs for xAPIC systems. They have
2020 * no meaning without the serial APIC bus.
2021 */
2022 if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
2023 || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
2024 return;
2025 setup_ioapic_ids_from_mpc_nocheck();
2026}
2172#endif 2027#endif
2173 2028
2174int no_timer_check __initdata; 2029int no_timer_check __initdata;
@@ -2239,29 +2094,26 @@ static int __init timer_irq_works(void)
2239 * an edge even if it isn't on the 8259A... 2094 * an edge even if it isn't on the 8259A...
2240 */ 2095 */
2241 2096
2242static unsigned int startup_ioapic_irq(unsigned int irq) 2097static unsigned int startup_ioapic_irq(struct irq_data *data)
2243{ 2098{
2244 int was_pending = 0; 2099 int was_pending = 0, irq = data->irq;
2245 unsigned long flags; 2100 unsigned long flags;
2246 struct irq_cfg *cfg;
2247 2101
2248 raw_spin_lock_irqsave(&ioapic_lock, flags); 2102 raw_spin_lock_irqsave(&ioapic_lock, flags);
2249 if (irq < legacy_pic->nr_legacy_irqs) { 2103 if (irq < legacy_pic->nr_legacy_irqs) {
2250 legacy_pic->chip->mask(irq); 2104 legacy_pic->mask(irq);
2251 if (legacy_pic->irq_pending(irq)) 2105 if (legacy_pic->irq_pending(irq))
2252 was_pending = 1; 2106 was_pending = 1;
2253 } 2107 }
2254 cfg = irq_cfg(irq); 2108 __unmask_ioapic(data->chip_data);
2255 __unmask_IO_APIC_irq(cfg);
2256 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 2109 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2257 2110
2258 return was_pending; 2111 return was_pending;
2259} 2112}
2260 2113
2261static int ioapic_retrigger_irq(unsigned int irq) 2114static int ioapic_retrigger_irq(struct irq_data *data)
2262{ 2115{
2263 2116 struct irq_cfg *cfg = data->chip_data;
2264 struct irq_cfg *cfg = irq_cfg(irq);
2265 unsigned long flags; 2117 unsigned long flags;
2266 2118
2267 raw_spin_lock_irqsave(&vector_lock, flags); 2119 raw_spin_lock_irqsave(&vector_lock, flags);
@@ -2312,7 +2164,7 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
2312 * With interrupt-remapping, destination information comes 2164 * With interrupt-remapping, destination information comes
2313 * from interrupt-remapping table entry. 2165 * from interrupt-remapping table entry.
2314 */ 2166 */
2315 if (!irq_remapped(irq)) 2167 if (!irq_remapped(cfg))
2316 io_apic_write(apic, 0x11 + pin*2, dest); 2168 io_apic_write(apic, 0x11 + pin*2, dest);
2317 reg = io_apic_read(apic, 0x10 + pin*2); 2169 reg = io_apic_read(apic, 0x10 + pin*2);
2318 reg &= ~IO_APIC_REDIR_VECTOR_MASK; 2170 reg &= ~IO_APIC_REDIR_VECTOR_MASK;
@@ -2322,65 +2174,46 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
2322} 2174}
2323 2175
2324/* 2176/*
2325 * Either sets desc->affinity to a valid value, and returns 2177 * Either sets data->affinity to a valid value, and returns
2326 * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and 2178 * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and
2327 * leaves desc->affinity untouched. 2179 * leaves data->affinity untouched.
2328 */ 2180 */
2329unsigned int 2181int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
2330set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask, 2182 unsigned int *dest_id)
2331 unsigned int *dest_id)
2332{ 2183{
2333 struct irq_cfg *cfg; 2184 struct irq_cfg *cfg = data->chip_data;
2334 unsigned int irq;
2335 2185
2336 if (!cpumask_intersects(mask, cpu_online_mask)) 2186 if (!cpumask_intersects(mask, cpu_online_mask))
2337 return -1; 2187 return -1;
2338 2188
2339 irq = desc->irq; 2189 if (assign_irq_vector(data->irq, data->chip_data, mask))
2340 cfg = desc->chip_data;
2341 if (assign_irq_vector(irq, cfg, mask))
2342 return -1; 2190 return -1;
2343 2191
2344 cpumask_copy(desc->affinity, mask); 2192 cpumask_copy(data->affinity, mask);
2345 2193
2346 *dest_id = apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); 2194 *dest_id = apic->cpu_mask_to_apicid_and(mask, cfg->domain);
2347 return 0; 2195 return 0;
2348} 2196}
2349 2197
2350static int 2198static int
2351set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) 2199ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
2200 bool force)
2352{ 2201{
2353 struct irq_cfg *cfg; 2202 unsigned int dest, irq = data->irq;
2354 unsigned long flags; 2203 unsigned long flags;
2355 unsigned int dest; 2204 int ret;
2356 unsigned int irq;
2357 int ret = -1;
2358
2359 irq = desc->irq;
2360 cfg = desc->chip_data;
2361 2205
2362 raw_spin_lock_irqsave(&ioapic_lock, flags); 2206 raw_spin_lock_irqsave(&ioapic_lock, flags);
2363 ret = set_desc_affinity(desc, mask, &dest); 2207 ret = __ioapic_set_affinity(data, mask, &dest);
2364 if (!ret) { 2208 if (!ret) {
2365 /* Only the high 8 bits are valid. */ 2209 /* Only the high 8 bits are valid. */
2366 dest = SET_APIC_LOGICAL_ID(dest); 2210 dest = SET_APIC_LOGICAL_ID(dest);
2367 __target_IO_APIC_irq(irq, dest, cfg); 2211 __target_IO_APIC_irq(irq, dest, data->chip_data);
2368 } 2212 }
2369 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 2213 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2370
2371 return ret; 2214 return ret;
2372} 2215}
2373 2216
2374static int
2375set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
2376{
2377 struct irq_desc *desc;
2378
2379 desc = irq_to_desc(irq);
2380
2381 return set_ioapic_affinity_irq_desc(desc, mask);
2382}
2383
2384#ifdef CONFIG_INTR_REMAP 2217#ifdef CONFIG_INTR_REMAP
2385 2218
2386/* 2219/*
@@ -2395,24 +2228,21 @@ set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
2395 * the interrupt-remapping table entry. 2228 * the interrupt-remapping table entry.
2396 */ 2229 */
2397static int 2230static int
2398migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) 2231ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
2232 bool force)
2399{ 2233{
2400 struct irq_cfg *cfg; 2234 struct irq_cfg *cfg = data->chip_data;
2235 unsigned int dest, irq = data->irq;
2401 struct irte irte; 2236 struct irte irte;
2402 unsigned int dest;
2403 unsigned int irq;
2404 int ret = -1;
2405 2237
2406 if (!cpumask_intersects(mask, cpu_online_mask)) 2238 if (!cpumask_intersects(mask, cpu_online_mask))
2407 return ret; 2239 return -EINVAL;
2408 2240
2409 irq = desc->irq;
2410 if (get_irte(irq, &irte)) 2241 if (get_irte(irq, &irte))
2411 return ret; 2242 return -EBUSY;
2412 2243
2413 cfg = desc->chip_data;
2414 if (assign_irq_vector(irq, cfg, mask)) 2244 if (assign_irq_vector(irq, cfg, mask))
2415 return ret; 2245 return -EBUSY;
2416 2246
2417 dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); 2247 dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
2418 2248
@@ -2427,29 +2257,14 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2427 if (cfg->move_in_progress) 2257 if (cfg->move_in_progress)
2428 send_cleanup_vector(cfg); 2258 send_cleanup_vector(cfg);
2429 2259
2430 cpumask_copy(desc->affinity, mask); 2260 cpumask_copy(data->affinity, mask);
2431
2432 return 0; 2261 return 0;
2433} 2262}
2434 2263
2435/*
2436 * Migrates the IRQ destination in the process context.
2437 */
2438static int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
2439 const struct cpumask *mask)
2440{
2441 return migrate_ioapic_irq_desc(desc, mask);
2442}
2443static int set_ir_ioapic_affinity_irq(unsigned int irq,
2444 const struct cpumask *mask)
2445{
2446 struct irq_desc *desc = irq_to_desc(irq);
2447
2448 return set_ir_ioapic_affinity_irq_desc(desc, mask);
2449}
2450#else 2264#else
2451static inline int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, 2265static inline int
2452 const struct cpumask *mask) 2266ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
2267 bool force)
2453{ 2268{
2454 return 0; 2269 return 0;
2455} 2270}
@@ -2469,7 +2284,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2469 unsigned int irr; 2284 unsigned int irr;
2470 struct irq_desc *desc; 2285 struct irq_desc *desc;
2471 struct irq_cfg *cfg; 2286 struct irq_cfg *cfg;
2472 irq = __get_cpu_var(vector_irq)[vector]; 2287 irq = __this_cpu_read(vector_irq[vector]);
2473 2288
2474 if (irq == -1) 2289 if (irq == -1)
2475 continue; 2290 continue;
@@ -2503,7 +2318,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2503 apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR); 2318 apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
2504 goto unlock; 2319 goto unlock;
2505 } 2320 }
2506 __get_cpu_var(vector_irq)[vector] = -1; 2321 __this_cpu_write(vector_irq[vector], -1);
2507unlock: 2322unlock:
2508 raw_spin_unlock(&desc->lock); 2323 raw_spin_unlock(&desc->lock);
2509 } 2324 }
@@ -2511,10 +2326,8 @@ unlock:
2511 irq_exit(); 2326 irq_exit();
2512} 2327}
2513 2328
2514static void __irq_complete_move(struct irq_desc **descp, unsigned vector) 2329static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector)
2515{ 2330{
2516 struct irq_desc *desc = *descp;
2517 struct irq_cfg *cfg = desc->chip_data;
2518 unsigned me; 2331 unsigned me;
2519 2332
2520 if (likely(!cfg->move_in_progress)) 2333 if (likely(!cfg->move_in_progress))
@@ -2526,31 +2339,28 @@ static void __irq_complete_move(struct irq_desc **descp, unsigned vector)
2526 send_cleanup_vector(cfg); 2339 send_cleanup_vector(cfg);
2527} 2340}
2528 2341
2529static void irq_complete_move(struct irq_desc **descp) 2342static void irq_complete_move(struct irq_cfg *cfg)
2530{ 2343{
2531 __irq_complete_move(descp, ~get_irq_regs()->orig_ax); 2344 __irq_complete_move(cfg, ~get_irq_regs()->orig_ax);
2532} 2345}
2533 2346
2534void irq_force_complete_move(int irq) 2347void irq_force_complete_move(int irq)
2535{ 2348{
2536 struct irq_desc *desc = irq_to_desc(irq); 2349 struct irq_cfg *cfg = irq_get_chip_data(irq);
2537 struct irq_cfg *cfg = desc->chip_data;
2538 2350
2539 if (!cfg) 2351 if (!cfg)
2540 return; 2352 return;
2541 2353
2542 __irq_complete_move(&desc, cfg->vector); 2354 __irq_complete_move(cfg, cfg->vector);
2543} 2355}
2544#else 2356#else
2545static inline void irq_complete_move(struct irq_desc **descp) {} 2357static inline void irq_complete_move(struct irq_cfg *cfg) { }
2546#endif 2358#endif
2547 2359
2548static void ack_apic_edge(unsigned int irq) 2360static void ack_apic_edge(struct irq_data *data)
2549{ 2361{
2550 struct irq_desc *desc = irq_to_desc(irq); 2362 irq_complete_move(data->chip_data);
2551 2363 irq_move_irq(data);
2552 irq_complete_move(&desc);
2553 move_native_irq(irq);
2554 ack_APIC_irq(); 2364 ack_APIC_irq();
2555} 2365}
2556 2366
@@ -2572,19 +2382,21 @@ atomic_t irq_mis_count;
2572 * Otherwise, we simulate the EOI message manually by changing the trigger 2382 * Otherwise, we simulate the EOI message manually by changing the trigger
2573 * mode to edge and then back to level, with RTE being masked during this. 2383 * mode to edge and then back to level, with RTE being masked during this.
2574*/ 2384*/
2575static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) 2385static void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2576{ 2386{
2577 struct irq_pin_list *entry; 2387 struct irq_pin_list *entry;
2388 unsigned long flags;
2578 2389
2390 raw_spin_lock_irqsave(&ioapic_lock, flags);
2579 for_each_irq_pin(entry, cfg->irq_2_pin) { 2391 for_each_irq_pin(entry, cfg->irq_2_pin) {
2580 if (mp_ioapics[entry->apic].apicver >= 0x20) { 2392 if (mpc_ioapic_ver(entry->apic) >= 0x20) {
2581 /* 2393 /*
2582 * Intr-remapping uses pin number as the virtual vector 2394 * Intr-remapping uses pin number as the virtual vector
2583 * in the RTE. Actual vector is programmed in 2395 * in the RTE. Actual vector is programmed in
2584 * intr-remapping table entry. Hence for the io-apic 2396 * intr-remapping table entry. Hence for the io-apic
2585 * EOI we use the pin number. 2397 * EOI we use the pin number.
2586 */ 2398 */
2587 if (irq_remapped(irq)) 2399 if (irq_remapped(cfg))
2588 io_apic_eoi(entry->apic, entry->pin); 2400 io_apic_eoi(entry->apic, entry->pin);
2589 else 2401 else
2590 io_apic_eoi(entry->apic, cfg->vector); 2402 io_apic_eoi(entry->apic, cfg->vector);
@@ -2593,36 +2405,21 @@ static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2593 __unmask_and_level_IO_APIC_irq(entry); 2405 __unmask_and_level_IO_APIC_irq(entry);
2594 } 2406 }
2595 } 2407 }
2596}
2597
2598static void eoi_ioapic_irq(struct irq_desc *desc)
2599{
2600 struct irq_cfg *cfg;
2601 unsigned long flags;
2602 unsigned int irq;
2603
2604 irq = desc->irq;
2605 cfg = desc->chip_data;
2606
2607 raw_spin_lock_irqsave(&ioapic_lock, flags);
2608 __eoi_ioapic_irq(irq, cfg);
2609 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 2408 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2610} 2409}
2611 2410
2612static void ack_apic_level(unsigned int irq) 2411static void ack_apic_level(struct irq_data *data)
2613{ 2412{
2614 struct irq_desc *desc = irq_to_desc(irq); 2413 struct irq_cfg *cfg = data->chip_data;
2414 int i, do_unmask_irq = 0, irq = data->irq;
2615 unsigned long v; 2415 unsigned long v;
2616 int i;
2617 struct irq_cfg *cfg;
2618 int do_unmask_irq = 0;
2619 2416
2620 irq_complete_move(&desc); 2417 irq_complete_move(cfg);
2621#ifdef CONFIG_GENERIC_PENDING_IRQ 2418#ifdef CONFIG_GENERIC_PENDING_IRQ
2622 /* If we are moving the irq we need to mask it */ 2419 /* If we are moving the irq we need to mask it */
2623 if (unlikely(desc->status & IRQ_MOVE_PENDING)) { 2420 if (unlikely(irqd_is_setaffinity_pending(data))) {
2624 do_unmask_irq = 1; 2421 do_unmask_irq = 1;
2625 mask_IO_APIC_irq_desc(desc); 2422 mask_ioapic(cfg);
2626 } 2423 }
2627#endif 2424#endif
2628 2425
@@ -2658,7 +2455,6 @@ static void ack_apic_level(unsigned int irq)
2658 * we use the above logic (mask+edge followed by unmask+level) from 2455 * we use the above logic (mask+edge followed by unmask+level) from
2659 * Manfred Spraul to clear the remote IRR. 2456 * Manfred Spraul to clear the remote IRR.
2660 */ 2457 */
2661 cfg = desc->chip_data;
2662 i = cfg->vector; 2458 i = cfg->vector;
2663 v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); 2459 v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
2664 2460
@@ -2678,7 +2474,7 @@ static void ack_apic_level(unsigned int irq)
2678 if (!(v & (1 << (i & 0x1f)))) { 2474 if (!(v & (1 << (i & 0x1f)))) {
2679 atomic_inc(&irq_mis_count); 2475 atomic_inc(&irq_mis_count);
2680 2476
2681 eoi_ioapic_irq(desc); 2477 eoi_ioapic_irq(irq, cfg);
2682 } 2478 }
2683 2479
2684 /* Now we can move and re-enable the irq */ 2480 /* Now we can move and re-enable the irq */
@@ -2709,61 +2505,57 @@ static void ack_apic_level(unsigned int irq)
2709 * accurate and is causing problems then it is a hardware bug 2505 * accurate and is causing problems then it is a hardware bug
2710 * and you can go talk to the chipset vendor about it. 2506 * and you can go talk to the chipset vendor about it.
2711 */ 2507 */
2712 cfg = desc->chip_data;
2713 if (!io_apic_level_ack_pending(cfg)) 2508 if (!io_apic_level_ack_pending(cfg))
2714 move_masked_irq(irq); 2509 irq_move_masked_irq(data);
2715 unmask_IO_APIC_irq_desc(desc); 2510 unmask_ioapic(cfg);
2716 } 2511 }
2717} 2512}
2718 2513
2719#ifdef CONFIG_INTR_REMAP 2514#ifdef CONFIG_INTR_REMAP
2720static void ir_ack_apic_edge(unsigned int irq) 2515static void ir_ack_apic_edge(struct irq_data *data)
2721{ 2516{
2722 ack_APIC_irq(); 2517 ack_APIC_irq();
2723} 2518}
2724 2519
2725static void ir_ack_apic_level(unsigned int irq) 2520static void ir_ack_apic_level(struct irq_data *data)
2726{ 2521{
2727 struct irq_desc *desc = irq_to_desc(irq);
2728
2729 ack_APIC_irq(); 2522 ack_APIC_irq();
2730 eoi_ioapic_irq(desc); 2523 eoi_ioapic_irq(data->irq, data->chip_data);
2731} 2524}
2732#endif /* CONFIG_INTR_REMAP */ 2525#endif /* CONFIG_INTR_REMAP */
2733 2526
2734static struct irq_chip ioapic_chip __read_mostly = { 2527static struct irq_chip ioapic_chip __read_mostly = {
2735 .name = "IO-APIC", 2528 .name = "IO-APIC",
2736 .startup = startup_ioapic_irq, 2529 .irq_startup = startup_ioapic_irq,
2737 .mask = mask_IO_APIC_irq, 2530 .irq_mask = mask_ioapic_irq,
2738 .unmask = unmask_IO_APIC_irq, 2531 .irq_unmask = unmask_ioapic_irq,
2739 .ack = ack_apic_edge, 2532 .irq_ack = ack_apic_edge,
2740 .eoi = ack_apic_level, 2533 .irq_eoi = ack_apic_level,
2741#ifdef CONFIG_SMP 2534#ifdef CONFIG_SMP
2742 .set_affinity = set_ioapic_affinity_irq, 2535 .irq_set_affinity = ioapic_set_affinity,
2743#endif 2536#endif
2744 .retrigger = ioapic_retrigger_irq, 2537 .irq_retrigger = ioapic_retrigger_irq,
2745}; 2538};
2746 2539
2747static struct irq_chip ir_ioapic_chip __read_mostly = { 2540static struct irq_chip ir_ioapic_chip __read_mostly = {
2748 .name = "IR-IO-APIC", 2541 .name = "IR-IO-APIC",
2749 .startup = startup_ioapic_irq, 2542 .irq_startup = startup_ioapic_irq,
2750 .mask = mask_IO_APIC_irq, 2543 .irq_mask = mask_ioapic_irq,
2751 .unmask = unmask_IO_APIC_irq, 2544 .irq_unmask = unmask_ioapic_irq,
2752#ifdef CONFIG_INTR_REMAP 2545#ifdef CONFIG_INTR_REMAP
2753 .ack = ir_ack_apic_edge, 2546 .irq_ack = ir_ack_apic_edge,
2754 .eoi = ir_ack_apic_level, 2547 .irq_eoi = ir_ack_apic_level,
2755#ifdef CONFIG_SMP 2548#ifdef CONFIG_SMP
2756 .set_affinity = set_ir_ioapic_affinity_irq, 2549 .irq_set_affinity = ir_ioapic_set_affinity,
2757#endif 2550#endif
2758#endif 2551#endif
2759 .retrigger = ioapic_retrigger_irq, 2552 .irq_retrigger = ioapic_retrigger_irq,
2760}; 2553};
2761 2554
2762static inline void init_IO_APIC_traps(void) 2555static inline void init_IO_APIC_traps(void)
2763{ 2556{
2764 int irq;
2765 struct irq_desc *desc;
2766 struct irq_cfg *cfg; 2557 struct irq_cfg *cfg;
2558 unsigned int irq;
2767 2559
2768 /* 2560 /*
2769 * NOTE! The local APIC isn't very good at handling 2561 * NOTE! The local APIC isn't very good at handling
@@ -2776,8 +2568,8 @@ static inline void init_IO_APIC_traps(void)
2776 * Also, we've got to be careful not to trash gate 2568 * Also, we've got to be careful not to trash gate
2777 * 0x80, because int 0x80 is hm, kind of importantish. ;) 2569 * 0x80, because int 0x80 is hm, kind of importantish. ;)
2778 */ 2570 */
2779 for_each_irq_desc(irq, desc) { 2571 for_each_active_irq(irq) {
2780 cfg = desc->chip_data; 2572 cfg = irq_get_chip_data(irq);
2781 if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) { 2573 if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
2782 /* 2574 /*
2783 * Hmm.. We don't have an entry for this, 2575 * Hmm.. We don't have an entry for this,
@@ -2788,7 +2580,7 @@ static inline void init_IO_APIC_traps(void)
2788 legacy_pic->make_irq(irq); 2580 legacy_pic->make_irq(irq);
2789 else 2581 else
2790 /* Strange. Oh, well.. */ 2582 /* Strange. Oh, well.. */
2791 desc->chip = &no_irq_chip; 2583 irq_set_chip(irq, &no_irq_chip);
2792 } 2584 }
2793 } 2585 }
2794} 2586}
@@ -2797,7 +2589,7 @@ static inline void init_IO_APIC_traps(void)
2797 * The local APIC irq-chip implementation: 2589 * The local APIC irq-chip implementation:
2798 */ 2590 */
2799 2591
2800static void mask_lapic_irq(unsigned int irq) 2592static void mask_lapic_irq(struct irq_data *data)
2801{ 2593{
2802 unsigned long v; 2594 unsigned long v;
2803 2595
@@ -2805,7 +2597,7 @@ static void mask_lapic_irq(unsigned int irq)
2805 apic_write(APIC_LVT0, v | APIC_LVT_MASKED); 2597 apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
2806} 2598}
2807 2599
2808static void unmask_lapic_irq(unsigned int irq) 2600static void unmask_lapic_irq(struct irq_data *data)
2809{ 2601{
2810 unsigned long v; 2602 unsigned long v;
2811 2603
@@ -2813,43 +2605,25 @@ static void unmask_lapic_irq(unsigned int irq)
2813 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); 2605 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
2814} 2606}
2815 2607
2816static void ack_lapic_irq(unsigned int irq) 2608static void ack_lapic_irq(struct irq_data *data)
2817{ 2609{
2818 ack_APIC_irq(); 2610 ack_APIC_irq();
2819} 2611}
2820 2612
2821static struct irq_chip lapic_chip __read_mostly = { 2613static struct irq_chip lapic_chip __read_mostly = {
2822 .name = "local-APIC", 2614 .name = "local-APIC",
2823 .mask = mask_lapic_irq, 2615 .irq_mask = mask_lapic_irq,
2824 .unmask = unmask_lapic_irq, 2616 .irq_unmask = unmask_lapic_irq,
2825 .ack = ack_lapic_irq, 2617 .irq_ack = ack_lapic_irq,
2826}; 2618};
2827 2619
2828static void lapic_register_intr(int irq, struct irq_desc *desc) 2620static void lapic_register_intr(int irq)
2829{ 2621{
2830 desc->status &= ~IRQ_LEVEL; 2622 irq_clear_status_flags(irq, IRQ_LEVEL);
2831 set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, 2623 irq_set_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
2832 "edge"); 2624 "edge");
2833} 2625}
2834 2626
2835static void __init setup_nmi(void)
2836{
2837 /*
2838 * Dirty trick to enable the NMI watchdog ...
2839 * We put the 8259A master into AEOI mode and
2840 * unmask on all local APICs LVT0 as NMI.
2841 *
2842 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
2843 * is from Maciej W. Rozycki - so we do not have to EOI from
2844 * the NMI handler or the timer interrupt.
2845 */
2846 apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
2847
2848 enable_NMI_through_LVT0();
2849
2850 apic_printk(APIC_VERBOSE, " done.\n");
2851}
2852
2853/* 2627/*
2854 * This looks a bit hackish but it's about the only way of sending 2628 * This looks a bit hackish but it's about the only way of sending
2855 * a few INTA cycles to 8259As and any associated glue logic. ICR does 2629 * a few INTA cycles to 8259As and any associated glue logic. ICR does
@@ -2930,9 +2704,8 @@ int timer_through_8259 __initdata;
2930 */ 2704 */
2931static inline void __init check_timer(void) 2705static inline void __init check_timer(void)
2932{ 2706{
2933 struct irq_desc *desc = irq_to_desc(0); 2707 struct irq_cfg *cfg = irq_get_chip_data(0);
2934 struct irq_cfg *cfg = desc->chip_data; 2708 int node = cpu_to_node(0);
2935 int node = cpu_to_node(boot_cpu_id);
2936 int apic1, pin1, apic2, pin2; 2709 int apic1, pin1, apic2, pin2;
2937 unsigned long flags; 2710 unsigned long flags;
2938 int no_pin1 = 0; 2711 int no_pin1 = 0;
@@ -2942,7 +2715,7 @@ static inline void __init check_timer(void)
2942 /* 2715 /*
2943 * get/set the timer IRQ vector: 2716 * get/set the timer IRQ vector:
2944 */ 2717 */
2945 legacy_pic->chip->mask(0); 2718 legacy_pic->mask(0);
2946 assign_irq_vector(0, cfg, apic->target_cpus()); 2719 assign_irq_vector(0, cfg, apic->target_cpus());
2947 2720
2948 /* 2721 /*
@@ -2956,15 +2729,6 @@ static inline void __init check_timer(void)
2956 */ 2729 */
2957 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); 2730 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
2958 legacy_pic->init(1); 2731 legacy_pic->init(1);
2959#ifdef CONFIG_X86_32
2960 {
2961 unsigned int ver;
2962
2963 ver = apic_read(APIC_LVR);
2964 ver = GET_APIC_VERSION(ver);
2965 timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
2966 }
2967#endif
2968 2732
2969 pin1 = find_isa_irq_pin(0, mp_INT); 2733 pin1 = find_isa_irq_pin(0, mp_INT);
2970 apic1 = find_isa_irq_apic(0, mp_INT); 2734 apic1 = find_isa_irq_apic(0, mp_INT);
@@ -3001,7 +2765,7 @@ static inline void __init check_timer(void)
3001 add_pin_to_irq_node(cfg, node, apic1, pin1); 2765 add_pin_to_irq_node(cfg, node, apic1, pin1);
3002 setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); 2766 setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
3003 } else { 2767 } else {
3004 /* for edge trigger, setup_IO_APIC_irq already 2768 /* for edge trigger, setup_ioapic_irq already
3005 * leave it unmasked. 2769 * leave it unmasked.
3006 * so only need to unmask if it is level-trigger 2770 * so only need to unmask if it is level-trigger
3007 * do we really have level trigger timer? 2771 * do we really have level trigger timer?
@@ -3009,13 +2773,9 @@ static inline void __init check_timer(void)
3009 int idx; 2773 int idx;
3010 idx = find_irq_entry(apic1, pin1, mp_INT); 2774 idx = find_irq_entry(apic1, pin1, mp_INT);
3011 if (idx != -1 && irq_trigger(idx)) 2775 if (idx != -1 && irq_trigger(idx))
3012 unmask_IO_APIC_irq_desc(desc); 2776 unmask_ioapic(cfg);
3013 } 2777 }
3014 if (timer_irq_works()) { 2778 if (timer_irq_works()) {
3015 if (nmi_watchdog == NMI_IO_APIC) {
3016 setup_nmi();
3017 legacy_pic->chip->unmask(0);
3018 }
3019 if (disable_timer_pin_1 > 0) 2779 if (disable_timer_pin_1 > 0)
3020 clear_IO_APIC_pin(0, pin1); 2780 clear_IO_APIC_pin(0, pin1);
3021 goto out; 2781 goto out;
@@ -3037,48 +2797,34 @@ static inline void __init check_timer(void)
3037 */ 2797 */
3038 replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2); 2798 replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2);
3039 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); 2799 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
3040 legacy_pic->chip->unmask(0); 2800 legacy_pic->unmask(0);
3041 if (timer_irq_works()) { 2801 if (timer_irq_works()) {
3042 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); 2802 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
3043 timer_through_8259 = 1; 2803 timer_through_8259 = 1;
3044 if (nmi_watchdog == NMI_IO_APIC) {
3045 legacy_pic->chip->mask(0);
3046 setup_nmi();
3047 legacy_pic->chip->unmask(0);
3048 }
3049 goto out; 2804 goto out;
3050 } 2805 }
3051 /* 2806 /*
3052 * Cleanup, just in case ... 2807 * Cleanup, just in case ...
3053 */ 2808 */
3054 local_irq_disable(); 2809 local_irq_disable();
3055 legacy_pic->chip->mask(0); 2810 legacy_pic->mask(0);
3056 clear_IO_APIC_pin(apic2, pin2); 2811 clear_IO_APIC_pin(apic2, pin2);
3057 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); 2812 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
3058 } 2813 }
3059 2814
3060 if (nmi_watchdog == NMI_IO_APIC) {
3061 apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
3062 "through the IO-APIC - disabling NMI Watchdog!\n");
3063 nmi_watchdog = NMI_NONE;
3064 }
3065#ifdef CONFIG_X86_32
3066 timer_ack = 0;
3067#endif
3068
3069 apic_printk(APIC_QUIET, KERN_INFO 2815 apic_printk(APIC_QUIET, KERN_INFO
3070 "...trying to set up timer as Virtual Wire IRQ...\n"); 2816 "...trying to set up timer as Virtual Wire IRQ...\n");
3071 2817
3072 lapic_register_intr(0, desc); 2818 lapic_register_intr(0);
3073 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ 2819 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
3074 legacy_pic->chip->unmask(0); 2820 legacy_pic->unmask(0);
3075 2821
3076 if (timer_irq_works()) { 2822 if (timer_irq_works()) {
3077 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); 2823 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
3078 goto out; 2824 goto out;
3079 } 2825 }
3080 local_irq_disable(); 2826 local_irq_disable();
3081 legacy_pic->chip->mask(0); 2827 legacy_pic->mask(0);
3082 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); 2828 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
3083 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); 2829 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
3084 2830
@@ -3144,7 +2890,7 @@ void __init setup_IO_APIC(void)
3144} 2890}
3145 2891
3146/* 2892/*
3147 * Called after all the initialization is done. If we didnt find any 2893 * Called after all the initialization is done. If we didn't find any
3148 * APIC bugs then we can allow the modify fast path 2894 * APIC bugs then we can allow the modify fast path
3149 */ 2895 */
3150 2896
@@ -3157,136 +2903,84 @@ static int __init io_apic_bug_finalize(void)
3157 2903
3158late_initcall(io_apic_bug_finalize); 2904late_initcall(io_apic_bug_finalize);
3159 2905
3160struct sysfs_ioapic_data { 2906static void resume_ioapic_id(int ioapic_id)
3161 struct sys_device dev;
3162 struct IO_APIC_route_entry entry[0];
3163};
3164static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
3165
3166static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
3167{ 2907{
3168 struct IO_APIC_route_entry *entry;
3169 struct sysfs_ioapic_data *data;
3170 int i;
3171
3172 data = container_of(dev, struct sysfs_ioapic_data, dev);
3173 entry = data->entry;
3174 for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ )
3175 *entry = ioapic_read_entry(dev->id, i);
3176
3177 return 0;
3178}
3179
3180static int ioapic_resume(struct sys_device *dev)
3181{
3182 struct IO_APIC_route_entry *entry;
3183 struct sysfs_ioapic_data *data;
3184 unsigned long flags; 2908 unsigned long flags;
3185 union IO_APIC_reg_00 reg_00; 2909 union IO_APIC_reg_00 reg_00;
3186 int i;
3187 2910
3188 data = container_of(dev, struct sysfs_ioapic_data, dev);
3189 entry = data->entry;
3190 2911
3191 raw_spin_lock_irqsave(&ioapic_lock, flags); 2912 raw_spin_lock_irqsave(&ioapic_lock, flags);
3192 reg_00.raw = io_apic_read(dev->id, 0); 2913 reg_00.raw = io_apic_read(ioapic_id, 0);
3193 if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) { 2914 if (reg_00.bits.ID != mpc_ioapic_id(ioapic_id)) {
3194 reg_00.bits.ID = mp_ioapics[dev->id].apicid; 2915 reg_00.bits.ID = mpc_ioapic_id(ioapic_id);
3195 io_apic_write(dev->id, 0, reg_00.raw); 2916 io_apic_write(ioapic_id, 0, reg_00.raw);
3196 } 2917 }
3197 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 2918 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
3198 for (i = 0; i < nr_ioapic_registers[dev->id]; i++) 2919}
3199 ioapic_write_entry(dev->id, i, entry[i]);
3200 2920
3201 return 0; 2921static void ioapic_resume(void)
2922{
2923 int ioapic_id;
2924
2925 for (ioapic_id = nr_ioapics - 1; ioapic_id >= 0; ioapic_id--)
2926 resume_ioapic_id(ioapic_id);
2927
2928 restore_ioapic_entries();
3202} 2929}
3203 2930
3204static struct sysdev_class ioapic_sysdev_class = { 2931static struct syscore_ops ioapic_syscore_ops = {
3205 .name = "ioapic", 2932 .suspend = save_ioapic_entries,
3206 .suspend = ioapic_suspend,
3207 .resume = ioapic_resume, 2933 .resume = ioapic_resume,
3208}; 2934};
3209 2935
3210static int __init ioapic_init_sysfs(void) 2936static int __init ioapic_init_ops(void)
3211{ 2937{
3212 struct sys_device * dev; 2938 register_syscore_ops(&ioapic_syscore_ops);
3213 int i, size, error;
3214
3215 error = sysdev_class_register(&ioapic_sysdev_class);
3216 if (error)
3217 return error;
3218
3219 for (i = 0; i < nr_ioapics; i++ ) {
3220 size = sizeof(struct sys_device) + nr_ioapic_registers[i]
3221 * sizeof(struct IO_APIC_route_entry);
3222 mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL);
3223 if (!mp_ioapic_data[i]) {
3224 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
3225 continue;
3226 }
3227 dev = &mp_ioapic_data[i]->dev;
3228 dev->id = i;
3229 dev->cls = &ioapic_sysdev_class;
3230 error = sysdev_register(dev);
3231 if (error) {
3232 kfree(mp_ioapic_data[i]);
3233 mp_ioapic_data[i] = NULL;
3234 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
3235 continue;
3236 }
3237 }
3238 2939
3239 return 0; 2940 return 0;
3240} 2941}
3241 2942
3242device_initcall(ioapic_init_sysfs); 2943device_initcall(ioapic_init_ops);
3243 2944
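For reference, the suspend/resume path above now follows the generic syscore_ops pattern (one global hook set, invoked with only the boot CPU running and interrupts off) instead of a per-device sysdev object. A minimal sketch of that pattern, with placeholder names not taken from the patch:

        #include <linux/init.h>
        #include <linux/syscore_ops.h>

        static int example_suspend(void)
        {
                /* snapshot hardware state; must not sleep or take sleeping locks */
                return 0;
        }

        static void example_resume(void)
        {
                /* reprogram the hardware from the snapshot taken in example_suspend() */
        }

        static struct syscore_ops example_syscore_ops = {
                .suspend        = example_suspend,
                .resume         = example_resume,
        };

        static int __init example_init(void)
        {
                register_syscore_ops(&example_syscore_ops);
                return 0;
        }
        device_initcall(example_init);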
3244/* 2945/*
3245 * Dynamic irq allocate and deallocation 2946 * Dynamic irq allocate and deallocation
3246 */ 2947 */
3247unsigned int create_irq_nr(unsigned int irq_want, int node) 2948unsigned int create_irq_nr(unsigned int from, int node)
3248{ 2949{
3249 /* Allocate an unused irq */ 2950 struct irq_cfg *cfg;
3250 unsigned int irq;
3251 unsigned int new;
3252 unsigned long flags; 2951 unsigned long flags;
3253 struct irq_cfg *cfg_new = NULL; 2952 unsigned int ret = 0;
3254 struct irq_desc *desc_new = NULL; 2953 int irq;
3255
3256 irq = 0;
3257 if (irq_want < nr_irqs_gsi)
3258 irq_want = nr_irqs_gsi;
3259
3260 raw_spin_lock_irqsave(&vector_lock, flags);
3261 for (new = irq_want; new < nr_irqs; new++) {
3262 desc_new = irq_to_desc_alloc_node(new, node);
3263 if (!desc_new) {
3264 printk(KERN_INFO "can not get irq_desc for %d\n", new);
3265 continue;
3266 }
3267 cfg_new = desc_new->chip_data;
3268
3269 if (cfg_new->vector != 0)
3270 continue;
3271 2954
3272 desc_new = move_irq_desc(desc_new, node); 2955 if (from < nr_irqs_gsi)
3273 cfg_new = desc_new->chip_data; 2956 from = nr_irqs_gsi;
3274 2957
3275 if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) 2958 irq = alloc_irq_from(from, node);
3276 irq = new; 2959 if (irq < 0)
3277 break; 2960 return 0;
2961 cfg = alloc_irq_cfg(irq, node);
2962 if (!cfg) {
2963 free_irq_at(irq, NULL);
2964 return 0;
3278 } 2965 }
3279 raw_spin_unlock_irqrestore(&vector_lock, flags);
3280 2966
3281 if (irq > 0) 2967 raw_spin_lock_irqsave(&vector_lock, flags);
3282 dynamic_irq_init_keep_chip_data(irq); 2968 if (!__assign_irq_vector(irq, cfg, apic->target_cpus()))
2969 ret = irq;
2970 raw_spin_unlock_irqrestore(&vector_lock, flags);
3283 2971
3284 return irq; 2972 if (ret) {
2973 irq_set_chip_data(irq, cfg);
2974 irq_clear_status_flags(irq, IRQ_NOREQUEST);
2975 } else {
2976 free_irq_at(irq, cfg);
2977 }
2978 return ret;
3285} 2979}
3286 2980
3287int create_irq(void) 2981int create_irq(void)
3288{ 2982{
3289 int node = cpu_to_node(boot_cpu_id); 2983 int node = cpu_to_node(0);
3290 unsigned int irq_want; 2984 unsigned int irq_want;
3291 int irq; 2985 int irq;
3292 2986
@@ -3301,14 +2995,17 @@ int create_irq(void)
3301 2995
3302void destroy_irq(unsigned int irq) 2996void destroy_irq(unsigned int irq)
3303{ 2997{
2998 struct irq_cfg *cfg = irq_get_chip_data(irq);
3304 unsigned long flags; 2999 unsigned long flags;
3305 3000
3306 dynamic_irq_cleanup_keep_chip_data(irq); 3001 irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE);
3307 3002
3308 free_irte(irq); 3003 if (irq_remapped(cfg))
3004 free_irte(irq);
3309 raw_spin_lock_irqsave(&vector_lock, flags); 3005 raw_spin_lock_irqsave(&vector_lock, flags);
3310 __clear_irq_vector(irq, get_irq_chip_data(irq)); 3006 __clear_irq_vector(irq, cfg);
3311 raw_spin_unlock_irqrestore(&vector_lock, flags); 3007 raw_spin_unlock_irqrestore(&vector_lock, flags);
3008 free_irq_at(irq, cfg);
3312} 3009}
3313 3010
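The create_irq()/destroy_irq() pair above is the dynamic IRQ allocation API introduced earlier in this file; a hypothetical caller pairs them like this (assuming the usual convention that a negative return from create_irq() means no free vector was available):

        static int example_grab_irq(void)
        {
                int irq = create_irq();         /* allocate an irq number plus vector */

                if (irq < 0)
                        return irq;             /* allocation failed */

                /* ... request_irq(), program the interrupt source, use it ... */

                destroy_irq(irq);               /* clear the vector, free the irq again */
                return 0;
        }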
3314/* 3011/*
@@ -3332,7 +3029,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
3332 3029
3333 dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); 3030 dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
3334 3031
3335 if (irq_remapped(irq)) { 3032 if (irq_remapped(cfg)) {
3336 struct irte irte; 3033 struct irte irte;
3337 int ir_index; 3034 int ir_index;
3338 u16 sub_handle; 3035 u16 sub_handle;
@@ -3340,14 +3037,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
3340 ir_index = map_irq_to_irte_handle(irq, &sub_handle); 3037 ir_index = map_irq_to_irte_handle(irq, &sub_handle);
3341 BUG_ON(ir_index == -1); 3038 BUG_ON(ir_index == -1);
3342 3039
3343 memset (&irte, 0, sizeof(irte)); 3040 prepare_irte(&irte, cfg->vector, dest);
3344
3345 irte.present = 1;
3346 irte.dst_mode = apic->irq_dest_mode;
3347 irte.trigger_mode = 0; /* edge */
3348 irte.dlvry_mode = apic->irq_delivery_mode;
3349 irte.vector = cfg->vector;
3350 irte.dest_id = IRTE_DEST(dest);
3351 3041
3352 /* Set source-id of interrupt request */ 3042 /* Set source-id of interrupt request */
3353 if (pdev) 3043 if (pdev)
@@ -3392,26 +3082,24 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
3392} 3082}
3393 3083
3394#ifdef CONFIG_SMP 3084#ifdef CONFIG_SMP
3395static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) 3085static int
3086msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
3396{ 3087{
3397 struct irq_desc *desc = irq_to_desc(irq); 3088 struct irq_cfg *cfg = data->chip_data;
3398 struct irq_cfg *cfg;
3399 struct msi_msg msg; 3089 struct msi_msg msg;
3400 unsigned int dest; 3090 unsigned int dest;
3401 3091
3402 if (set_desc_affinity(desc, mask, &dest)) 3092 if (__ioapic_set_affinity(data, mask, &dest))
3403 return -1; 3093 return -1;
3404 3094
3405 cfg = desc->chip_data; 3095 __get_cached_msi_msg(data->msi_desc, &msg);
3406
3407 get_cached_msi_msg_desc(desc, &msg);
3408 3096
3409 msg.data &= ~MSI_DATA_VECTOR_MASK; 3097 msg.data &= ~MSI_DATA_VECTOR_MASK;
3410 msg.data |= MSI_DATA_VECTOR(cfg->vector); 3098 msg.data |= MSI_DATA_VECTOR(cfg->vector);
3411 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; 3099 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
3412 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3100 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3413 3101
3414 write_msi_msg_desc(desc, &msg); 3102 __write_msi_msg(data->msi_desc, &msg);
3415 3103
3416 return 0; 3104 return 0;
3417} 3105}
@@ -3421,17 +3109,17 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3421 * done in the process context using interrupt-remapping hardware. 3109 * done in the process context using interrupt-remapping hardware.
3422 */ 3110 */
3423static int 3111static int
3424ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) 3112ir_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
3113 bool force)
3425{ 3114{
3426 struct irq_desc *desc = irq_to_desc(irq); 3115 struct irq_cfg *cfg = data->chip_data;
3427 struct irq_cfg *cfg = desc->chip_data; 3116 unsigned int dest, irq = data->irq;
3428 unsigned int dest;
3429 struct irte irte; 3117 struct irte irte;
3430 3118
3431 if (get_irte(irq, &irte)) 3119 if (get_irte(irq, &irte))
3432 return -1; 3120 return -1;
3433 3121
3434 if (set_desc_affinity(desc, mask, &dest)) 3122 if (__ioapic_set_affinity(data, mask, &dest))
3435 return -1; 3123 return -1;
3436 3124
3437 irte.vector = cfg->vector; 3125 irte.vector = cfg->vector;
@@ -3461,27 +3149,27 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3461 * which implement the MSI or MSI-X Capability Structure. 3149 * which implement the MSI or MSI-X Capability Structure.
3462 */ 3150 */
3463static struct irq_chip msi_chip = { 3151static struct irq_chip msi_chip = {
3464 .name = "PCI-MSI", 3152 .name = "PCI-MSI",
3465 .unmask = unmask_msi_irq, 3153 .irq_unmask = unmask_msi_irq,
3466 .mask = mask_msi_irq, 3154 .irq_mask = mask_msi_irq,
3467 .ack = ack_apic_edge, 3155 .irq_ack = ack_apic_edge,
3468#ifdef CONFIG_SMP 3156#ifdef CONFIG_SMP
3469 .set_affinity = set_msi_irq_affinity, 3157 .irq_set_affinity = msi_set_affinity,
3470#endif 3158#endif
3471 .retrigger = ioapic_retrigger_irq, 3159 .irq_retrigger = ioapic_retrigger_irq,
3472}; 3160};
3473 3161
3474static struct irq_chip msi_ir_chip = { 3162static struct irq_chip msi_ir_chip = {
3475 .name = "IR-PCI-MSI", 3163 .name = "IR-PCI-MSI",
3476 .unmask = unmask_msi_irq, 3164 .irq_unmask = unmask_msi_irq,
3477 .mask = mask_msi_irq, 3165 .irq_mask = mask_msi_irq,
3478#ifdef CONFIG_INTR_REMAP 3166#ifdef CONFIG_INTR_REMAP
3479 .ack = ir_ack_apic_edge, 3167 .irq_ack = ir_ack_apic_edge,
3480#ifdef CONFIG_SMP 3168#ifdef CONFIG_SMP
3481 .set_affinity = ir_set_msi_irq_affinity, 3169 .irq_set_affinity = ir_msi_set_affinity,
3482#endif 3170#endif
3483#endif 3171#endif
3484 .retrigger = ioapic_retrigger_irq, 3172 .irq_retrigger = ioapic_retrigger_irq,
3485}; 3173};
3486 3174
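The two chips above illustrate the irq_chip conversion applied throughout this patch: the callbacks gain an irq_ prefix and take a struct irq_data instead of a bare IRQ number, so per-IRQ state is read from data->chip_data rather than looked up through irq_to_desc(). A stripped-down sketch of the new-style chip, with illustrative names:

        static void example_irq_mask(struct irq_data *data)
        {
                struct irq_cfg *cfg = data->chip_data; /* installed via irq_set_chip_data() */

                pr_debug("masking irq %u (vector %u)\n", data->irq, cfg->vector);
                /* ... actually mask the interrupt source here ... */
        }

        static int example_irq_set_affinity(struct irq_data *data,
                                            const struct cpumask *mask, bool force)
        {
                /* pick a destination CPU from 'mask', reprogram the route, 0 on success */
                return 0;
        }

        static struct irq_chip example_chip = {
                .name                   = "EXAMPLE",
                .irq_mask               = example_irq_mask,
                .irq_set_affinity       = example_irq_set_affinity,
        };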
3487/* 3175/*
@@ -3513,40 +3201,35 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
3513 3201
3514static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) 3202static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3515{ 3203{
3516 int ret; 3204 struct irq_chip *chip = &msi_chip;
3517 struct msi_msg msg; 3205 struct msi_msg msg;
3206 int ret;
3518 3207
3519 ret = msi_compose_msg(dev, irq, &msg, -1); 3208 ret = msi_compose_msg(dev, irq, &msg, -1);
3520 if (ret < 0) 3209 if (ret < 0)
3521 return ret; 3210 return ret;
3522 3211
3523 set_irq_msi(irq, msidesc); 3212 irq_set_msi_desc(irq, msidesc);
3524 write_msi_msg(irq, &msg); 3213 write_msi_msg(irq, &msg);
3525 3214
3526 if (irq_remapped(irq)) { 3215 if (irq_remapped(irq_get_chip_data(irq))) {
3527 struct irq_desc *desc = irq_to_desc(irq); 3216 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
3528 /* 3217 chip = &msi_ir_chip;
3529 * irq migration in process context 3218 }
3530 */ 3219
3531 desc->status |= IRQ_MOVE_PCNTXT; 3220 irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
3532 set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge");
3533 } else
3534 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
3535 3221
3536 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq); 3222 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
3537 3223
3538 return 0; 3224 return 0;
3539} 3225}
3540 3226
3541int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) 3227int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3542{ 3228{
3543 unsigned int irq; 3229 int node, ret, sub_handle, index = 0;
3544 int ret, sub_handle; 3230 unsigned int irq, irq_want;
3545 struct msi_desc *msidesc; 3231 struct msi_desc *msidesc;
3546 unsigned int irq_want;
3547 struct intel_iommu *iommu = NULL; 3232 struct intel_iommu *iommu = NULL;
3548 int index = 0;
3549 int node;
3550 3233
3551 /* x86 doesn't support multiple MSI yet */ 3234 /* x86 doesn't support multiple MSI yet */
3552 if (type == PCI_CAP_ID_MSI && nvec > 1) 3235 if (type == PCI_CAP_ID_MSI && nvec > 1)
@@ -3599,31 +3282,31 @@ error:
3599 return ret; 3282 return ret;
3600} 3283}
3601 3284
3602void arch_teardown_msi_irq(unsigned int irq) 3285void native_teardown_msi_irq(unsigned int irq)
3603{ 3286{
3604 destroy_irq(irq); 3287 destroy_irq(irq);
3605} 3288}
3606 3289
3607#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP) 3290#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
3608#ifdef CONFIG_SMP 3291#ifdef CONFIG_SMP
3609static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) 3292static int
3293dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
3294 bool force)
3610{ 3295{
3611 struct irq_desc *desc = irq_to_desc(irq); 3296 struct irq_cfg *cfg = data->chip_data;
3612 struct irq_cfg *cfg; 3297 unsigned int dest, irq = data->irq;
3613 struct msi_msg msg; 3298 struct msi_msg msg;
3614 unsigned int dest;
3615 3299
3616 if (set_desc_affinity(desc, mask, &dest)) 3300 if (__ioapic_set_affinity(data, mask, &dest))
3617 return -1; 3301 return -1;
3618 3302
3619 cfg = desc->chip_data;
3620
3621 dmar_msi_read(irq, &msg); 3303 dmar_msi_read(irq, &msg);
3622 3304
3623 msg.data &= ~MSI_DATA_VECTOR_MASK; 3305 msg.data &= ~MSI_DATA_VECTOR_MASK;
3624 msg.data |= MSI_DATA_VECTOR(cfg->vector); 3306 msg.data |= MSI_DATA_VECTOR(cfg->vector);
3625 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; 3307 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
3626 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3308 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3309 msg.address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(dest);
3627 3310
3628 dmar_msi_write(irq, &msg); 3311 dmar_msi_write(irq, &msg);
3629 3312
@@ -3633,14 +3316,14 @@ static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3633#endif /* CONFIG_SMP */ 3316#endif /* CONFIG_SMP */
3634 3317
3635static struct irq_chip dmar_msi_type = { 3318static struct irq_chip dmar_msi_type = {
3636 .name = "DMAR_MSI", 3319 .name = "DMAR_MSI",
3637 .unmask = dmar_msi_unmask, 3320 .irq_unmask = dmar_msi_unmask,
3638 .mask = dmar_msi_mask, 3321 .irq_mask = dmar_msi_mask,
3639 .ack = ack_apic_edge, 3322 .irq_ack = ack_apic_edge,
3640#ifdef CONFIG_SMP 3323#ifdef CONFIG_SMP
3641 .set_affinity = dmar_msi_set_affinity, 3324 .irq_set_affinity = dmar_msi_set_affinity,
3642#endif 3325#endif
3643 .retrigger = ioapic_retrigger_irq, 3326 .irq_retrigger = ioapic_retrigger_irq,
3644}; 3327};
3645 3328
3646int arch_setup_dmar_msi(unsigned int irq) 3329int arch_setup_dmar_msi(unsigned int irq)
@@ -3652,8 +3335,8 @@ int arch_setup_dmar_msi(unsigned int irq)
3652 if (ret < 0) 3335 if (ret < 0)
3653 return ret; 3336 return ret;
3654 dmar_msi_write(irq, &msg); 3337 dmar_msi_write(irq, &msg);
3655 set_irq_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, 3338 irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq,
3656 "edge"); 3339 "edge");
3657 return 0; 3340 return 0;
3658} 3341}
3659#endif 3342#endif
@@ -3661,26 +3344,24 @@ int arch_setup_dmar_msi(unsigned int irq)
3661#ifdef CONFIG_HPET_TIMER 3344#ifdef CONFIG_HPET_TIMER
3662 3345
3663#ifdef CONFIG_SMP 3346#ifdef CONFIG_SMP
3664static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) 3347static int hpet_msi_set_affinity(struct irq_data *data,
3348 const struct cpumask *mask, bool force)
3665{ 3349{
3666 struct irq_desc *desc = irq_to_desc(irq); 3350 struct irq_cfg *cfg = data->chip_data;
3667 struct irq_cfg *cfg;
3668 struct msi_msg msg; 3351 struct msi_msg msg;
3669 unsigned int dest; 3352 unsigned int dest;
3670 3353
3671 if (set_desc_affinity(desc, mask, &dest)) 3354 if (__ioapic_set_affinity(data, mask, &dest))
3672 return -1; 3355 return -1;
3673 3356
3674 cfg = desc->chip_data; 3357 hpet_msi_read(data->handler_data, &msg);
3675
3676 hpet_msi_read(irq, &msg);
3677 3358
3678 msg.data &= ~MSI_DATA_VECTOR_MASK; 3359 msg.data &= ~MSI_DATA_VECTOR_MASK;
3679 msg.data |= MSI_DATA_VECTOR(cfg->vector); 3360 msg.data |= MSI_DATA_VECTOR(cfg->vector);
3680 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; 3361 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
3681 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3362 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3682 3363
3683 hpet_msi_write(irq, &msg); 3364 hpet_msi_write(data->handler_data, &msg);
3684 3365
3685 return 0; 3366 return 0;
3686} 3367}
@@ -3688,34 +3369,34 @@ static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3688#endif /* CONFIG_SMP */ 3369#endif /* CONFIG_SMP */
3689 3370
3690static struct irq_chip ir_hpet_msi_type = { 3371static struct irq_chip ir_hpet_msi_type = {
3691 .name = "IR-HPET_MSI", 3372 .name = "IR-HPET_MSI",
3692 .unmask = hpet_msi_unmask, 3373 .irq_unmask = hpet_msi_unmask,
3693 .mask = hpet_msi_mask, 3374 .irq_mask = hpet_msi_mask,
3694#ifdef CONFIG_INTR_REMAP 3375#ifdef CONFIG_INTR_REMAP
3695 .ack = ir_ack_apic_edge, 3376 .irq_ack = ir_ack_apic_edge,
3696#ifdef CONFIG_SMP 3377#ifdef CONFIG_SMP
3697 .set_affinity = ir_set_msi_irq_affinity, 3378 .irq_set_affinity = ir_msi_set_affinity,
3698#endif 3379#endif
3699#endif 3380#endif
3700 .retrigger = ioapic_retrigger_irq, 3381 .irq_retrigger = ioapic_retrigger_irq,
3701}; 3382};
3702 3383
3703static struct irq_chip hpet_msi_type = { 3384static struct irq_chip hpet_msi_type = {
3704 .name = "HPET_MSI", 3385 .name = "HPET_MSI",
3705 .unmask = hpet_msi_unmask, 3386 .irq_unmask = hpet_msi_unmask,
3706 .mask = hpet_msi_mask, 3387 .irq_mask = hpet_msi_mask,
3707 .ack = ack_apic_edge, 3388 .irq_ack = ack_apic_edge,
3708#ifdef CONFIG_SMP 3389#ifdef CONFIG_SMP
3709 .set_affinity = hpet_msi_set_affinity, 3390 .irq_set_affinity = hpet_msi_set_affinity,
3710#endif 3391#endif
3711 .retrigger = ioapic_retrigger_irq, 3392 .irq_retrigger = ioapic_retrigger_irq,
3712}; 3393};
3713 3394
3714int arch_setup_hpet_msi(unsigned int irq, unsigned int id) 3395int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
3715{ 3396{
3716 int ret; 3397 struct irq_chip *chip = &hpet_msi_type;
3717 struct msi_msg msg; 3398 struct msi_msg msg;
3718 struct irq_desc *desc = irq_to_desc(irq); 3399 int ret;
3719 3400
3720 if (intr_remapping_enabled) { 3401 if (intr_remapping_enabled) {
3721 struct intel_iommu *iommu = map_hpet_to_ir(id); 3402 struct intel_iommu *iommu = map_hpet_to_ir(id);
@@ -3733,15 +3414,12 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
3733 if (ret < 0) 3414 if (ret < 0)
3734 return ret; 3415 return ret;
3735 3416
3736 hpet_msi_write(irq, &msg); 3417 hpet_msi_write(irq_get_handler_data(irq), &msg);
3737 desc->status |= IRQ_MOVE_PCNTXT; 3418 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
3738 if (irq_remapped(irq)) 3419 if (irq_remapped(irq_get_chip_data(irq)))
3739 set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type, 3420 chip = &ir_hpet_msi_type;
3740 handle_edge_irq, "edge");
3741 else
3742 set_irq_chip_and_handler_name(irq, &hpet_msi_type,
3743 handle_edge_irq, "edge");
3744 3421
3422 irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
3745 return 0; 3423 return 0;
3746} 3424}
3747#endif 3425#endif
@@ -3768,33 +3446,30 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
3768 write_ht_irq_msg(irq, &msg); 3446 write_ht_irq_msg(irq, &msg);
3769} 3447}
3770 3448
3771static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) 3449static int
3450ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
3772{ 3451{
3773 struct irq_desc *desc = irq_to_desc(irq); 3452 struct irq_cfg *cfg = data->chip_data;
3774 struct irq_cfg *cfg;
3775 unsigned int dest; 3453 unsigned int dest;
3776 3454
3777 if (set_desc_affinity(desc, mask, &dest)) 3455 if (__ioapic_set_affinity(data, mask, &dest))
3778 return -1; 3456 return -1;
3779 3457
3780 cfg = desc->chip_data; 3458 target_ht_irq(data->irq, dest, cfg->vector);
3781
3782 target_ht_irq(irq, dest, cfg->vector);
3783
3784 return 0; 3459 return 0;
3785} 3460}
3786 3461
3787#endif 3462#endif
3788 3463
3789static struct irq_chip ht_irq_chip = { 3464static struct irq_chip ht_irq_chip = {
3790 .name = "PCI-HT", 3465 .name = "PCI-HT",
3791 .mask = mask_ht_irq, 3466 .irq_mask = mask_ht_irq,
3792 .unmask = unmask_ht_irq, 3467 .irq_unmask = unmask_ht_irq,
3793 .ack = ack_apic_edge, 3468 .irq_ack = ack_apic_edge,
3794#ifdef CONFIG_SMP 3469#ifdef CONFIG_SMP
3795 .set_affinity = set_ht_irq_affinity, 3470 .irq_set_affinity = ht_set_affinity,
3796#endif 3471#endif
3797 .retrigger = ioapic_retrigger_irq, 3472 .irq_retrigger = ioapic_retrigger_irq,
3798}; 3473};
3799 3474
3800int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) 3475int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
@@ -3831,7 +3506,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3831 3506
3832 write_ht_irq_msg(irq, &msg); 3507 write_ht_irq_msg(irq, &msg);
3833 3508
3834 set_irq_chip_and_handler_name(irq, &ht_irq_chip, 3509 irq_set_chip_and_handler_name(irq, &ht_irq_chip,
3835 handle_edge_irq, "edge"); 3510 handle_edge_irq, "edge");
3836 3511
3837 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq); 3512 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq);
@@ -3840,7 +3515,40 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3840} 3515}
3841#endif /* CONFIG_HT_IRQ */ 3516#endif /* CONFIG_HT_IRQ */
3842 3517
3843int __init io_apic_get_redir_entries (int ioapic) 3518static int
3519io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr)
3520{
3521 struct irq_cfg *cfg = alloc_irq_and_cfg_at(irq, node);
3522 int ret;
3523
3524 if (!cfg)
3525 return -EINVAL;
3526 ret = __add_pin_to_irq_node(cfg, node, attr->ioapic, attr->ioapic_pin);
3527 if (!ret)
3528 setup_ioapic_irq(attr->ioapic, attr->ioapic_pin, irq, cfg,
3529 attr->trigger, attr->polarity);
3530 return ret;
3531}
3532
3533int io_apic_setup_irq_pin_once(unsigned int irq, int node,
3534 struct io_apic_irq_attr *attr)
3535{
3536 unsigned int id = attr->ioapic, pin = attr->ioapic_pin;
3537 int ret;
3538
3539 /* Avoid redundant programming */
3540 if (test_bit(pin, ioapics[id].pin_programmed)) {
3541 pr_debug("Pin %d-%d already programmed\n",
3542 mpc_ioapic_id(id), pin);
3543 return 0;
3544 }
3545 ret = io_apic_setup_irq_pin(irq, node, attr);
3546 if (!ret)
3547 set_bit(pin, ioapics[id].pin_programmed);
3548 return ret;
3549}
3550
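io_apic_setup_irq_pin_once() above funnels pin programming through one place and uses the per-IO-APIC pin_programmed bitmap so that redundant PRT entries (same pin, different PCI device) do not reprogram a pin twice. A hedged sketch of a call site; the attribute values are illustrative and use the IO-APIC encoding (0 = edge trigger, 0 = active-high polarity):

        static int example_route_pin(unsigned int irq, int node, int ioapic, int pin)
        {
                struct io_apic_irq_attr attr = {
                        .ioapic         = ioapic,
                        .ioapic_pin     = pin,
                        .trigger        = 0,    /* edge */
                        .polarity       = 0,    /* active high */
                };

                /* safe to call repeatedly: already-programmed pins are skipped */
                return io_apic_setup_irq_pin_once(irq, node, &attr);
        }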
3551static int __init io_apic_get_redir_entries(int ioapic)
3844{ 3552{
3845 union IO_APIC_reg_01 reg_01; 3553 union IO_APIC_reg_01 reg_01;
3846 unsigned long flags; 3554 unsigned long flags;
@@ -3856,7 +3564,7 @@ int __init io_apic_get_redir_entries (int ioapic)
3856 return reg_01.bits.entries + 1; 3564 return reg_01.bits.entries + 1;
3857} 3565}
3858 3566
3859void __init probe_nr_irqs_gsi(void) 3567static void __init probe_nr_irqs_gsi(void)
3860{ 3568{
3861 int nr; 3569 int nr;
3862 3570
@@ -3867,6 +3575,11 @@ void __init probe_nr_irqs_gsi(void)
3867 printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi); 3575 printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
3868} 3576}
3869 3577
3578int get_nr_irqs_gsi(void)
3579{
3580 return nr_irqs_gsi;
3581}
3582
3870#ifdef CONFIG_SPARSE_IRQ 3583#ifdef CONFIG_SPARSE_IRQ
3871int __init arch_probe_nr_irqs(void) 3584int __init arch_probe_nr_irqs(void)
3872{ 3585{
@@ -3885,104 +3598,28 @@ int __init arch_probe_nr_irqs(void)
3885 if (nr < nr_irqs) 3598 if (nr < nr_irqs)
3886 nr_irqs = nr; 3599 nr_irqs = nr;
3887 3600
3888 return 0; 3601 return NR_IRQS_LEGACY;
3889} 3602}
3890#endif 3603#endif
3891 3604
3892static int __io_apic_set_pci_routing(struct device *dev, int irq, 3605int io_apic_set_pci_routing(struct device *dev, int irq,
3893 struct io_apic_irq_attr *irq_attr) 3606 struct io_apic_irq_attr *irq_attr)
3894{ 3607{
3895 struct irq_desc *desc;
3896 struct irq_cfg *cfg;
3897 int node; 3608 int node;
3898 int ioapic, pin;
3899 int trigger, polarity;
3900 3609
3901 ioapic = irq_attr->ioapic;
3902 if (!IO_APIC_IRQ(irq)) { 3610 if (!IO_APIC_IRQ(irq)) {
3903 apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", 3611 apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
3904 ioapic); 3612 irq_attr->ioapic);
3905 return -EINVAL; 3613 return -EINVAL;
3906 } 3614 }
3907 3615
3908 if (dev) 3616 node = dev ? dev_to_node(dev) : cpu_to_node(0);
3909 node = dev_to_node(dev);
3910 else
3911 node = cpu_to_node(boot_cpu_id);
3912
3913 desc = irq_to_desc_alloc_node(irq, node);
3914 if (!desc) {
3915 printk(KERN_INFO "can not get irq_desc %d\n", irq);
3916 return 0;
3917 }
3918
3919 pin = irq_attr->ioapic_pin;
3920 trigger = irq_attr->trigger;
3921 polarity = irq_attr->polarity;
3922 3617
3923 /* 3618 return io_apic_setup_irq_pin_once(irq, node, irq_attr);
3924 * IRQs < 16 are already in the irq_2_pin[] map
3925 */
3926 if (irq >= legacy_pic->nr_legacy_irqs) {
3927 cfg = desc->chip_data;
3928 if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) {
3929 printk(KERN_INFO "can not add pin %d for irq %d\n",
3930 pin, irq);
3931 return 0;
3932 }
3933 }
3934
3935 setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity);
3936
3937 return 0;
3938}
3939
3940int io_apic_set_pci_routing(struct device *dev, int irq,
3941 struct io_apic_irq_attr *irq_attr)
3942{
3943 int ioapic, pin;
3944 /*
3945 * Avoid pin reprogramming. PRTs typically include entries
3946 * with redundant pin->gsi mappings (but unique PCI devices);
3947 * we only program the IOAPIC on the first.
3948 */
3949 ioapic = irq_attr->ioapic;
3950 pin = irq_attr->ioapic_pin;
3951 if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) {
3952 pr_debug("Pin %d-%d already programmed\n",
3953 mp_ioapics[ioapic].apicid, pin);
3954 return 0;
3955 }
3956 set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed);
3957
3958 return __io_apic_set_pci_routing(dev, irq, irq_attr);
3959}
3960
3961u8 __init io_apic_unique_id(u8 id)
3962{
3963#ifdef CONFIG_X86_32
3964 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
3965 !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
3966 return io_apic_get_unique_id(nr_ioapics, id);
3967 else
3968 return id;
3969#else
3970 int i;
3971 DECLARE_BITMAP(used, 256);
3972
3973 bitmap_zero(used, 256);
3974 for (i = 0; i < nr_ioapics; i++) {
3975 struct mpc_ioapic *ia = &mp_ioapics[i];
3976 __set_bit(ia->apicid, used);
3977 }
3978 if (!test_bit(id, used))
3979 return id;
3980 return find_first_zero_bit(used, 256);
3981#endif
3982} 3619}
3983 3620
3984#ifdef CONFIG_X86_32 3621#ifdef CONFIG_X86_32
3985int __init io_apic_get_unique_id(int ioapic, int apic_id) 3622static int __init io_apic_get_unique_id(int ioapic, int apic_id)
3986{ 3623{
3987 union IO_APIC_reg_00 reg_00; 3624 union IO_APIC_reg_00 reg_00;
3988 static physid_mask_t apic_id_map = PHYSID_MASK_NONE; 3625 static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
@@ -4055,9 +3692,32 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
4055 3692
4056 return apic_id; 3693 return apic_id;
4057} 3694}
3695
3696static u8 __init io_apic_unique_id(u8 id)
3697{
3698 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
3699 !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
3700 return io_apic_get_unique_id(nr_ioapics, id);
3701 else
3702 return id;
3703}
3704#else
3705static u8 __init io_apic_unique_id(u8 id)
3706{
3707 int i;
3708 DECLARE_BITMAP(used, 256);
3709
3710 bitmap_zero(used, 256);
3711 for (i = 0; i < nr_ioapics; i++) {
3712 __set_bit(mpc_ioapic_id(i), used);
3713 }
3714 if (!test_bit(id, used))
3715 return id;
3716 return find_first_zero_bit(used, 256);
3717}
4058#endif 3718#endif
4059 3719
4060int __init io_apic_get_version(int ioapic) 3720static int __init io_apic_get_version(int ioapic)
4061{ 3721{
4062 union IO_APIC_reg_01 reg_01; 3722 union IO_APIC_reg_01 reg_01;
4063 unsigned long flags; 3723 unsigned long flags;
@@ -4102,14 +3762,14 @@ int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity)
4102void __init setup_ioapic_dest(void) 3762void __init setup_ioapic_dest(void)
4103{ 3763{
4104 int pin, ioapic, irq, irq_entry; 3764 int pin, ioapic, irq, irq_entry;
4105 struct irq_desc *desc;
4106 const struct cpumask *mask; 3765 const struct cpumask *mask;
3766 struct irq_data *idata;
4107 3767
4108 if (skip_ioapic_setup == 1) 3768 if (skip_ioapic_setup == 1)
4109 return; 3769 return;
4110 3770
4111 for (ioapic = 0; ioapic < nr_ioapics; ioapic++) 3771 for (ioapic = 0; ioapic < nr_ioapics; ioapic++)
4112 for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { 3772 for (pin = 0; pin < ioapics[ioapic].nr_registers; pin++) {
4113 irq_entry = find_irq_entry(ioapic, pin, mp_INT); 3773 irq_entry = find_irq_entry(ioapic, pin, mp_INT);
4114 if (irq_entry == -1) 3774 if (irq_entry == -1)
4115 continue; 3775 continue;
@@ -4118,21 +3778,20 @@ void __init setup_ioapic_dest(void)
4118 if ((ioapic > 0) && (irq > 16)) 3778 if ((ioapic > 0) && (irq > 16))
4119 continue; 3779 continue;
4120 3780
4121 desc = irq_to_desc(irq); 3781 idata = irq_get_irq_data(irq);
4122 3782
4123 /* 3783 /*
4124 * Honour affinities which have been set in early boot 3784 * Honour affinities which have been set in early boot
4125 */ 3785 */
4126 if (desc->status & 3786 if (!irqd_can_balance(idata) || irqd_affinity_was_set(idata))
4127 (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) 3787 mask = idata->affinity;
4128 mask = desc->affinity;
4129 else 3788 else
4130 mask = apic->target_cpus(); 3789 mask = apic->target_cpus();
4131 3790
4132 if (intr_remapping_enabled) 3791 if (intr_remapping_enabled)
4133 set_ir_ioapic_affinity_irq_desc(desc, mask); 3792 ir_ioapic_set_affinity(idata, mask, false);
4134 else 3793 else
4135 set_ioapic_affinity_irq_desc(desc, mask); 3794 ioapic_set_affinity(idata, mask, false);
4136 } 3795 }
4137 3796
4138} 3797}
@@ -4172,7 +3831,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics)
4172 return res; 3831 return res;
4173} 3832}
4174 3833
4175void __init ioapic_init_mappings(void) 3834void __init ioapic_and_gsi_init(void)
4176{ 3835{
4177 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; 3836 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
4178 struct resource *ioapic_res; 3837 struct resource *ioapic_res;
@@ -4181,7 +3840,7 @@ void __init ioapic_init_mappings(void)
4181 ioapic_res = ioapic_setup_resources(nr_ioapics); 3840 ioapic_res = ioapic_setup_resources(nr_ioapics);
4182 for (i = 0; i < nr_ioapics; i++) { 3841 for (i = 0; i < nr_ioapics; i++) {
4183 if (smp_found_config) { 3842 if (smp_found_config) {
4184 ioapic_phys = mp_ioapics[i].apicaddr; 3843 ioapic_phys = mpc_ioapic_addr(i);
4185#ifdef CONFIG_X86_32 3844#ifdef CONFIG_X86_32
4186 if (!ioapic_phys) { 3845 if (!ioapic_phys) {
4187 printk(KERN_ERR 3846 printk(KERN_ERR
@@ -4210,6 +3869,8 @@ fake_ioapic_page:
4210 ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1; 3869 ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1;
4211 ioapic_res++; 3870 ioapic_res++;
4212 } 3871 }
3872
3873 probe_nr_irqs_gsi();
4213} 3874}
4214 3875
4215void __init ioapic_insert_resources(void) 3876void __init ioapic_insert_resources(void)
@@ -4234,10 +3895,14 @@ int mp_find_ioapic(u32 gsi)
4234{ 3895{
4235 int i = 0; 3896 int i = 0;
4236 3897
3898 if (nr_ioapics == 0)
3899 return -1;
3900
4237 /* Find the IOAPIC that manages this GSI. */ 3901 /* Find the IOAPIC that manages this GSI. */
4238 for (i = 0; i < nr_ioapics; i++) { 3902 for (i = 0; i < nr_ioapics; i++) {
4239 if ((gsi >= mp_gsi_routing[i].gsi_base) 3903 struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(i);
4240 && (gsi <= mp_gsi_routing[i].gsi_end)) 3904 if ((gsi >= gsi_cfg->gsi_base)
3905 && (gsi <= gsi_cfg->gsi_end))
4241 return i; 3906 return i;
4242 } 3907 }
4243 3908
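mp_find_ioapic() and mp_find_ioapic_pin() together map a GSI to an (IO-APIC index, pin) pair using the gsi_base/gsi_end ranges registered for each IO-APIC at boot. A small illustrative helper, not part of the patch:

        static int example_gsi_to_ioapic_pin(u32 gsi, int *pin)
        {
                int ioapic = mp_find_ioapic(gsi);

                if (ioapic < 0)                 /* no IO-APIC covers this GSI */
                        return -ENODEV;

                *pin = mp_find_ioapic_pin(ioapic, gsi); /* offset from that IO-APIC's gsi_base */
                return ioapic;
        }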
@@ -4247,18 +3912,22 @@ int mp_find_ioapic(u32 gsi)
4247 3912
4248int mp_find_ioapic_pin(int ioapic, u32 gsi) 3913int mp_find_ioapic_pin(int ioapic, u32 gsi)
4249{ 3914{
3915 struct mp_ioapic_gsi *gsi_cfg;
3916
4250 if (WARN_ON(ioapic == -1)) 3917 if (WARN_ON(ioapic == -1))
4251 return -1; 3918 return -1;
4252 if (WARN_ON(gsi > mp_gsi_routing[ioapic].gsi_end)) 3919
3920 gsi_cfg = mp_ioapic_gsi_routing(ioapic);
3921 if (WARN_ON(gsi > gsi_cfg->gsi_end))
4253 return -1; 3922 return -1;
4254 3923
4255 return gsi - mp_gsi_routing[ioapic].gsi_base; 3924 return gsi - gsi_cfg->gsi_base;
4256} 3925}
4257 3926
4258static int bad_ioapic(unsigned long address) 3927static __init int bad_ioapic(unsigned long address)
4259{ 3928{
4260 if (nr_ioapics >= MAX_IO_APICS) { 3929 if (nr_ioapics >= MAX_IO_APICS) {
4261 printk(KERN_WARNING "WARING: Max # of I/O APICs (%d) exceeded " 3930 printk(KERN_WARNING "WARNING: Max # of I/O APICs (%d) exceeded "
4262 "(found %d), skipping\n", MAX_IO_APICS, nr_ioapics); 3931 "(found %d), skipping\n", MAX_IO_APICS, nr_ioapics);
4263 return 1; 3932 return 1;
4264 } 3933 }
@@ -4274,40 +3943,42 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
4274{ 3943{
4275 int idx = 0; 3944 int idx = 0;
4276 int entries; 3945 int entries;
3946 struct mp_ioapic_gsi *gsi_cfg;
4277 3947
4278 if (bad_ioapic(address)) 3948 if (bad_ioapic(address))
4279 return; 3949 return;
4280 3950
4281 idx = nr_ioapics; 3951 idx = nr_ioapics;
4282 3952
4283 mp_ioapics[idx].type = MP_IOAPIC; 3953 ioapics[idx].mp_config.type = MP_IOAPIC;
4284 mp_ioapics[idx].flags = MPC_APIC_USABLE; 3954 ioapics[idx].mp_config.flags = MPC_APIC_USABLE;
4285 mp_ioapics[idx].apicaddr = address; 3955 ioapics[idx].mp_config.apicaddr = address;
4286 3956
4287 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); 3957 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
4288 mp_ioapics[idx].apicid = io_apic_unique_id(id); 3958 ioapics[idx].mp_config.apicid = io_apic_unique_id(id);
4289 mp_ioapics[idx].apicver = io_apic_get_version(idx); 3959 ioapics[idx].mp_config.apicver = io_apic_get_version(idx);
4290 3960
4291 /* 3961 /*
4292 * Build basic GSI lookup table to facilitate gsi->io_apic lookups 3962 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
4293 * and to prevent reprogramming of IOAPIC pins (PCI GSIs). 3963 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
4294 */ 3964 */
4295 entries = io_apic_get_redir_entries(idx); 3965 entries = io_apic_get_redir_entries(idx);
4296 mp_gsi_routing[idx].gsi_base = gsi_base; 3966 gsi_cfg = mp_ioapic_gsi_routing(idx);
4297 mp_gsi_routing[idx].gsi_end = gsi_base + entries - 1; 3967 gsi_cfg->gsi_base = gsi_base;
3968 gsi_cfg->gsi_end = gsi_base + entries - 1;
4298 3969
4299 /* 3970 /*
4300 * The number of IO-APIC IRQ registers (== #pins): 3971 * The number of IO-APIC IRQ registers (== #pins):
4301 */ 3972 */
4302 nr_ioapic_registers[idx] = entries; 3973 ioapics[idx].nr_registers = entries;
4303 3974
4304 if (mp_gsi_routing[idx].gsi_end >= gsi_top) 3975 if (gsi_cfg->gsi_end >= gsi_top)
4305 gsi_top = mp_gsi_routing[idx].gsi_end + 1; 3976 gsi_top = gsi_cfg->gsi_end + 1;
4306 3977
4307 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " 3978 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
4308 "GSI %d-%d\n", idx, mp_ioapics[idx].apicid, 3979 "GSI %d-%d\n", idx, mpc_ioapic_id(idx),
4309 mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr, 3980 mpc_ioapic_ver(idx), mpc_ioapic_addr(idx),
4310 mp_gsi_routing[idx].gsi_base, mp_gsi_routing[idx].gsi_end); 3981 gsi_cfg->gsi_base, gsi_cfg->gsi_end);
4311 3982
4312 nr_ioapics++; 3983 nr_ioapics++;
4313} 3984}
@@ -4315,20 +3986,16 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
4315/* Enable IOAPIC early just for system timer */ 3986/* Enable IOAPIC early just for system timer */
4316void __init pre_init_apic_IRQ0(void) 3987void __init pre_init_apic_IRQ0(void)
4317{ 3988{
4318 struct irq_cfg *cfg; 3989 struct io_apic_irq_attr attr = { 0, 0, 0, 0 };
4319 struct irq_desc *desc;
4320 3990
4321 printk(KERN_INFO "Early APIC setup for system timer0\n"); 3991 printk(KERN_INFO "Early APIC setup for system timer0\n");
4322#ifndef CONFIG_SMP 3992#ifndef CONFIG_SMP
4323 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); 3993 physid_set_mask_of_physid(boot_cpu_physical_apicid,
3994 &phys_cpu_present_map);
4324#endif 3995#endif
4325 desc = irq_to_desc_alloc_node(0, 0);
4326
4327 setup_local_APIC(); 3996 setup_local_APIC();
4328 3997
4329 cfg = irq_cfg(0); 3998 io_apic_setup_irq_pin(0, 0, &attr);
4330 add_pin_to_irq_node(cfg, 0, 0, 0); 3999 irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq,
4331 set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); 4000 "edge");
4332
4333 setup_IO_APIC_irq(0, 0, 0, desc, 0, 0);
4334} 4001}
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index 08385e090a6f..cce91bf26676 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -56,6 +56,8 @@ void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
56 local_irq_restore(flags); 56 local_irq_restore(flags);
57} 57}
58 58
59#ifdef CONFIG_X86_32
60
59void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, 61void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
60 int vector) 62 int vector)
61{ 63{
@@ -71,8 +73,8 @@ void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
71 local_irq_save(flags); 73 local_irq_save(flags);
72 for_each_cpu(query_cpu, mask) 74 for_each_cpu(query_cpu, mask)
73 __default_send_IPI_dest_field( 75 __default_send_IPI_dest_field(
74 apic->cpu_to_logical_apicid(query_cpu), vector, 76 early_per_cpu(x86_cpu_to_logical_apicid, query_cpu),
75 apic->dest_logical); 77 vector, apic->dest_logical);
76 local_irq_restore(flags); 78 local_irq_restore(flags);
77} 79}
78 80
@@ -90,14 +92,12 @@ void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
90 if (query_cpu == this_cpu) 92 if (query_cpu == this_cpu)
91 continue; 93 continue;
92 __default_send_IPI_dest_field( 94 __default_send_IPI_dest_field(
93 apic->cpu_to_logical_apicid(query_cpu), vector, 95 early_per_cpu(x86_cpu_to_logical_apicid, query_cpu),
94 apic->dest_logical); 96 vector, apic->dest_logical);
95 } 97 }
96 local_irq_restore(flags); 98 local_irq_restore(flags);
97} 99}
98 100
99#ifdef CONFIG_X86_32
100
101/* 101/*
102 * This is only used on smaller machines. 102 * This is only used on smaller machines.
103 */ 103 */
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
deleted file mode 100644
index a43f71cb30f8..000000000000
--- a/arch/x86/kernel/apic/nmi.c
+++ /dev/null
@@ -1,567 +0,0 @@
1/*
2 * NMI watchdog support on APIC systems
3 *
4 * Started by Ingo Molnar <mingo@redhat.com>
5 *
6 * Fixes:
7 * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog.
8 * Mikael Pettersson : Power Management for local APIC NMI watchdog.
9 * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog.
10 * Pavel Machek and
11 * Mikael Pettersson : PM converted to driver model. Disable/enable API.
12 */
13
14#include <asm/apic.h>
15
16#include <linux/nmi.h>
17#include <linux/mm.h>
18#include <linux/delay.h>
19#include <linux/interrupt.h>
20#include <linux/module.h>
21#include <linux/slab.h>
22#include <linux/sysdev.h>
23#include <linux/sysctl.h>
24#include <linux/percpu.h>
25#include <linux/kprobes.h>
26#include <linux/cpumask.h>
27#include <linux/kernel_stat.h>
28#include <linux/kdebug.h>
29#include <linux/smp.h>
30
31#include <asm/i8259.h>
32#include <asm/io_apic.h>
33#include <asm/proto.h>
34#include <asm/timer.h>
35
36#include <asm/mce.h>
37
38#include <asm/mach_traps.h>
39
40int unknown_nmi_panic;
41int nmi_watchdog_enabled;
42
43/* For reliability, we're prepared to waste bits here. */
44static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
45
46/* nmi_active:
47 * >0: the lapic NMI watchdog is active, but can be disabled
48 * <0: the lapic NMI watchdog has not been set up, and cannot
49 * be enabled
50 * 0: the lapic NMI watchdog is disabled, but can be enabled
51 */
52atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
53EXPORT_SYMBOL(nmi_active);
54
55unsigned int nmi_watchdog = NMI_NONE;
56EXPORT_SYMBOL(nmi_watchdog);
57
58static int panic_on_timeout;
59
60static unsigned int nmi_hz = HZ;
61static DEFINE_PER_CPU(short, wd_enabled);
62static int endflag __initdata;
63
64static inline unsigned int get_nmi_count(int cpu)
65{
66 return per_cpu(irq_stat, cpu).__nmi_count;
67}
68
69static inline int mce_in_progress(void)
70{
71#if defined(CONFIG_X86_MCE)
72 return atomic_read(&mce_entry) > 0;
73#endif
74 return 0;
75}
76
77/*
78 * Take the local apic timer and PIT/HPET into account. We don't
79 * know which one is active, when we have highres/dyntick on
80 */
81static inline unsigned int get_timer_irqs(int cpu)
82{
83 return per_cpu(irq_stat, cpu).apic_timer_irqs +
84 per_cpu(irq_stat, cpu).irq0_irqs;
85}
86
87#ifdef CONFIG_SMP
88/*
89 * The performance counters used by NMI_LOCAL_APIC don't trigger when
90 * the CPU is idle. To make sure the NMI watchdog really ticks on all
91 * CPUs during the test make them busy.
92 */
93static __init void nmi_cpu_busy(void *data)
94{
95 local_irq_enable_in_hardirq();
96 /*
97 * Intentionally don't use cpu_relax here. This is
98 * to make sure that the performance counter really ticks,
99 * even if there is a simulator or similar that catches the
100 * pause instruction. On a real HT machine this is fine because
101 * all other CPUs are busy with "useless" delay loops and don't
102 * care if they get somewhat less cycles.
103 */
104 while (endflag == 0)
105 mb();
106}
107#endif
108
109static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count)
110{
111 printk(KERN_CONT "\n");
112
113 printk(KERN_WARNING
114 "WARNING: CPU#%d: NMI appears to be stuck (%d->%d)!\n",
115 cpu, prev_nmi_count[cpu], get_nmi_count(cpu));
116
117 printk(KERN_WARNING
118 "Please report this to bugzilla.kernel.org,\n");
119 printk(KERN_WARNING
120 "and attach the output of the 'dmesg' command.\n");
121
122 per_cpu(wd_enabled, cpu) = 0;
123 atomic_dec(&nmi_active);
124}
125
126static void __acpi_nmi_disable(void *__unused)
127{
128 apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
129}
130
131int __init check_nmi_watchdog(void)
132{
133 unsigned int *prev_nmi_count;
134 int cpu;
135
136 if (!nmi_watchdog_active() || !atomic_read(&nmi_active))
137 return 0;
138
139 prev_nmi_count = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL);
140 if (!prev_nmi_count)
141 goto error;
142
143 printk(KERN_INFO "Testing NMI watchdog ... ");
144
145#ifdef CONFIG_SMP
146 if (nmi_watchdog == NMI_LOCAL_APIC)
147 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0);
148#endif
149
150 for_each_possible_cpu(cpu)
151 prev_nmi_count[cpu] = get_nmi_count(cpu);
152 local_irq_enable();
153 mdelay((20 * 1000) / nmi_hz); /* wait 20 ticks */
154
155 for_each_online_cpu(cpu) {
156 if (!per_cpu(wd_enabled, cpu))
157 continue;
158 if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5)
159 report_broken_nmi(cpu, prev_nmi_count);
160 }
161 endflag = 1;
162 if (!atomic_read(&nmi_active)) {
163 kfree(prev_nmi_count);
164 atomic_set(&nmi_active, -1);
165 goto error;
166 }
167 printk("OK.\n");
168
169 /*
170 * now that we know it works we can reduce NMI frequency to
171 * something more reasonable; makes a difference in some configs
172 */
173 if (nmi_watchdog == NMI_LOCAL_APIC)
174 nmi_hz = lapic_adjust_nmi_hz(1);
175
176 kfree(prev_nmi_count);
177 return 0;
178error:
179 if (nmi_watchdog == NMI_IO_APIC) {
180 if (!timer_through_8259)
181 legacy_pic->chip->mask(0);
182 on_each_cpu(__acpi_nmi_disable, NULL, 1);
183 }
184
185#ifdef CONFIG_X86_32
186 timer_ack = 0;
187#endif
188 return -1;
189}
190
191static int __init setup_nmi_watchdog(char *str)
192{
193 unsigned int nmi;
194
195 if (!strncmp(str, "panic", 5)) {
196 panic_on_timeout = 1;
197 str = strchr(str, ',');
198 if (!str)
199 return 1;
200 ++str;
201 }
202
203 if (!strncmp(str, "lapic", 5))
204 nmi_watchdog = NMI_LOCAL_APIC;
205 else if (!strncmp(str, "ioapic", 6))
206 nmi_watchdog = NMI_IO_APIC;
207 else {
208 get_option(&str, &nmi);
209 if (nmi >= NMI_INVALID)
210 return 0;
211 nmi_watchdog = nmi;
212 }
213
214 return 1;
215}
216__setup("nmi_watchdog=", setup_nmi_watchdog);
217
218/*
219 * Suspend/resume support
220 */
221#ifdef CONFIG_PM
222
223static int nmi_pm_active; /* nmi_active before suspend */
224
225static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
226{
227 /* only CPU0 goes here, other CPUs should be offline */
228 nmi_pm_active = atomic_read(&nmi_active);
229 stop_apic_nmi_watchdog(NULL);
230 BUG_ON(atomic_read(&nmi_active) != 0);
231 return 0;
232}
233
234static int lapic_nmi_resume(struct sys_device *dev)
235{
236 /* only CPU0 goes here, other CPUs should be offline */
237 if (nmi_pm_active > 0) {
238 setup_apic_nmi_watchdog(NULL);
239 touch_nmi_watchdog();
240 }
241 return 0;
242}
243
244static struct sysdev_class nmi_sysclass = {
245 .name = "lapic_nmi",
246 .resume = lapic_nmi_resume,
247 .suspend = lapic_nmi_suspend,
248};
249
250static struct sys_device device_lapic_nmi = {
251 .id = 0,
252 .cls = &nmi_sysclass,
253};
254
255static int __init init_lapic_nmi_sysfs(void)
256{
257 int error;
258
259 /*
260 * should really be a BUG_ON but b/c this is an
261 * init call, it just doesn't work. -dcz
262 */
263 if (nmi_watchdog != NMI_LOCAL_APIC)
264 return 0;
265
266 if (atomic_read(&nmi_active) < 0)
267 return 0;
268
269 error = sysdev_class_register(&nmi_sysclass);
270 if (!error)
271 error = sysdev_register(&device_lapic_nmi);
272 return error;
273}
274
275/* must come after the local APIC's device_initcall() */
276late_initcall(init_lapic_nmi_sysfs);
277
278#endif /* CONFIG_PM */
279
280static void __acpi_nmi_enable(void *__unused)
281{
282 apic_write(APIC_LVT0, APIC_DM_NMI);
283}
284
285/*
286 * Enable timer based NMIs on all CPUs:
287 */
288void acpi_nmi_enable(void)
289{
290 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
291 on_each_cpu(__acpi_nmi_enable, NULL, 1);
292}
293
294/*
295 * Disable timer based NMIs on all CPUs:
296 */
297void acpi_nmi_disable(void)
298{
299 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
300 on_each_cpu(__acpi_nmi_disable, NULL, 1);
301}
302
303/*
304 * This function is called as soon the LAPIC NMI watchdog driver has everything
305 * in place and it's ready to check if the NMIs belong to the NMI watchdog
306 */
307void cpu_nmi_set_wd_enabled(void)
308{
309 __get_cpu_var(wd_enabled) = 1;
310}
311
312void setup_apic_nmi_watchdog(void *unused)
313{
314 if (__get_cpu_var(wd_enabled))
315 return;
316
317 /* cheap hack to support suspend/resume */
318 /* if cpu0 is not active neither should the other cpus */
319 if (smp_processor_id() != 0 && atomic_read(&nmi_active) <= 0)
320 return;
321
322 switch (nmi_watchdog) {
323 case NMI_LOCAL_APIC:
324 if (lapic_watchdog_init(nmi_hz) < 0) {
325 __get_cpu_var(wd_enabled) = 0;
326 return;
327 }
328 /* FALL THROUGH */
329 case NMI_IO_APIC:
330 __get_cpu_var(wd_enabled) = 1;
331 atomic_inc(&nmi_active);
332 }
333}
334
335void stop_apic_nmi_watchdog(void *unused)
336{
337 /* only support LOCAL and IO APICs for now */
338 if (!nmi_watchdog_active())
339 return;
340 if (__get_cpu_var(wd_enabled) == 0)
341 return;
342 if (nmi_watchdog == NMI_LOCAL_APIC)
343 lapic_watchdog_stop();
344 else
345 __acpi_nmi_disable(NULL);
346 __get_cpu_var(wd_enabled) = 0;
347 atomic_dec(&nmi_active);
348}
349
350/*
351 * the best way to detect whether a CPU has a 'hard lockup' problem
 352 * is to check its local APIC timer IRQ counts. If they are not
353 * changing then that CPU has some problem.
354 *
355 * as these watchdog NMI IRQs are generated on every CPU, we only
356 * have to check the current processor.
357 *
358 * since NMIs don't listen to _any_ locks, we have to be extremely
359 * careful not to rely on unsafe variables. The printk might lock
360 * up though, so we have to break up any console locks first ...
361 * [when there will be more tty-related locks, break them up here too!]
362 */
363
364static DEFINE_PER_CPU(unsigned, last_irq_sum);
365static DEFINE_PER_CPU(long, alert_counter);
366static DEFINE_PER_CPU(int, nmi_touch);
367
368void touch_nmi_watchdog(void)
369{
370 if (nmi_watchdog_active()) {
371 unsigned cpu;
372
373 /*
374 * Tell other CPUs to reset their alert counters. We cannot
375 * do it ourselves because the alert count increase is not
376 * atomic.
377 */
378 for_each_present_cpu(cpu) {
379 if (per_cpu(nmi_touch, cpu) != 1)
380 per_cpu(nmi_touch, cpu) = 1;
381 }
382 }
383
384 /*
385 * Tickle the softlockup detector too:
386 */
387 touch_softlockup_watchdog();
388}
389EXPORT_SYMBOL(touch_nmi_watchdog);
390
391notrace __kprobes int
392nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
393{
394 /*
395 * Since current_thread_info()-> is always on the stack, and we
396 * always switch the stack NMI-atomically, it's safe to use
397 * smp_processor_id().
398 */
399 unsigned int sum;
400 int touched = 0;
401 int cpu = smp_processor_id();
402 int rc = 0;
403
404 sum = get_timer_irqs(cpu);
405
406 if (__get_cpu_var(nmi_touch)) {
407 __get_cpu_var(nmi_touch) = 0;
408 touched = 1;
409 }
410
411 /* We can be called before check_nmi_watchdog, hence NULL check. */
412 if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
413 static DEFINE_RAW_SPINLOCK(lock); /* Serialise the printks */
414
415 raw_spin_lock(&lock);
416 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
417 show_regs(regs);
418 dump_stack();
419 raw_spin_unlock(&lock);
420 cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
421
422 rc = 1;
423 }
424
425 /* Could check oops_in_progress here too, but it's safer not to */
426 if (mce_in_progress())
427 touched = 1;
428
 429 /* if none of the timers is firing, this cpu isn't doing much */
430 if (!touched && __get_cpu_var(last_irq_sum) == sum) {
431 /*
432 * Ayiee, looks like this CPU is stuck ...
433 * wait a few IRQs (5 seconds) before doing the oops ...
434 */
435 __this_cpu_inc(alert_counter);
436 if (__this_cpu_read(alert_counter) == 5 * nmi_hz)
437 /*
438 * die_nmi will return ONLY if NOTIFY_STOP happens..
439 */
440 die_nmi("BUG: NMI Watchdog detected LOCKUP",
441 regs, panic_on_timeout);
442 } else {
443 __get_cpu_var(last_irq_sum) = sum;
444 __this_cpu_write(alert_counter, 0);
445 }
446
447 /* see if the nmi watchdog went off */
448 if (!__get_cpu_var(wd_enabled))
449 return rc;
450 switch (nmi_watchdog) {
451 case NMI_LOCAL_APIC:
452 rc |= lapic_wd_event(nmi_hz);
453 break;
454 case NMI_IO_APIC:
455 /*
456 * don't know how to accurately check for this.
457 * just assume it was a watchdog timer interrupt
458 * This matches the old behaviour.
459 */
460 rc = 1;
461 break;
462 }
463 return rc;
464}
465
466#ifdef CONFIG_SYSCTL
467
468static void enable_ioapic_nmi_watchdog_single(void *unused)
469{
470 __get_cpu_var(wd_enabled) = 1;
471 atomic_inc(&nmi_active);
472 __acpi_nmi_enable(NULL);
473}
474
475static void enable_ioapic_nmi_watchdog(void)
476{
477 on_each_cpu(enable_ioapic_nmi_watchdog_single, NULL, 1);
478 touch_nmi_watchdog();
479}
480
481static void disable_ioapic_nmi_watchdog(void)
482{
483 on_each_cpu(stop_apic_nmi_watchdog, NULL, 1);
484}
485
486static int __init setup_unknown_nmi_panic(char *str)
487{
488 unknown_nmi_panic = 1;
489 return 1;
490}
491__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
492
493static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
494{
495 unsigned char reason = get_nmi_reason();
496 char buf[64];
497
498 sprintf(buf, "NMI received for unknown reason %02x\n", reason);
499 die_nmi(buf, regs, 1); /* Always panic here */
500 return 0;
501}
502
503/*
504 * proc handler for /proc/sys/kernel/nmi
505 */
506int proc_nmi_enabled(struct ctl_table *table, int write,
507 void __user *buffer, size_t *length, loff_t *ppos)
508{
509 int old_state;
510
511 nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
512 old_state = nmi_watchdog_enabled;
513 proc_dointvec(table, write, buffer, length, ppos);
514 if (!!old_state == !!nmi_watchdog_enabled)
515 return 0;
516
517 if (atomic_read(&nmi_active) < 0 || !nmi_watchdog_active()) {
518 printk(KERN_WARNING
519 "NMI watchdog is permanently disabled\n");
520 return -EIO;
521 }
522
523 if (nmi_watchdog == NMI_LOCAL_APIC) {
524 if (nmi_watchdog_enabled)
525 enable_lapic_nmi_watchdog();
526 else
527 disable_lapic_nmi_watchdog();
528 } else if (nmi_watchdog == NMI_IO_APIC) {
529 if (nmi_watchdog_enabled)
530 enable_ioapic_nmi_watchdog();
531 else
532 disable_ioapic_nmi_watchdog();
533 } else {
534 printk(KERN_WARNING
535 "NMI watchdog doesn't know what hardware to touch\n");
536 return -EIO;
537 }
538 return 0;
539}
540
541#endif /* CONFIG_SYSCTL */
542
543int do_nmi_callback(struct pt_regs *regs, int cpu)
544{
545#ifdef CONFIG_SYSCTL
546 if (unknown_nmi_panic)
547 return unknown_nmi_panic_callback(regs, cpu);
548#endif
549 return 0;
550}
551
552void arch_trigger_all_cpu_backtrace(void)
553{
554 int i;
555
556 cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
557
558 printk(KERN_INFO "sending NMI to all CPUs:\n");
559 apic->send_IPI_all(NMI_VECTOR);
560
561 /* Wait for up to 10 seconds for all CPUs to do the backtrace */
562 for (i = 0; i < 10 * 1000; i++) {
563 if (cpumask_empty(to_cpumask(backtrace_mask)))
564 break;
565 mdelay(1);
566 }
567}
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 3e28401f161c..c4a61ca1349a 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -26,6 +26,7 @@
26#include <linux/nodemask.h> 26#include <linux/nodemask.h>
27#include <linux/topology.h> 27#include <linux/topology.h>
28#include <linux/bootmem.h> 28#include <linux/bootmem.h>
29#include <linux/memblock.h>
29#include <linux/threads.h> 30#include <linux/threads.h>
30#include <linux/cpumask.h> 31#include <linux/cpumask.h>
31#include <linux/kernel.h> 32#include <linux/kernel.h>
@@ -47,8 +48,6 @@
47#include <asm/e820.h> 48#include <asm/e820.h>
48#include <asm/ipi.h> 49#include <asm/ipi.h>
49 50
50#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
51
52int found_numaq; 51int found_numaq;
53 52
54/* 53/*
@@ -78,31 +77,20 @@ int quad_local_to_mp_bus_id[NR_CPUS/4][4];
78static inline void numaq_register_node(int node, struct sys_cfg_data *scd) 77static inline void numaq_register_node(int node, struct sys_cfg_data *scd)
79{ 78{
80 struct eachquadmem *eq = scd->eq + node; 79 struct eachquadmem *eq = scd->eq + node;
80 u64 start = (u64)(eq->hi_shrd_mem_start - eq->priv_mem_size) << 20;
81 u64 end = (u64)(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size) << 20;
82 int ret;
81 83
82 node_set_online(node); 84 node_set(node, numa_nodes_parsed);
83 85 ret = numa_add_memblk(node, start, end);
84 /* Convert to pages */ 86 BUG_ON(ret < 0);
85 node_start_pfn[node] =
86 MB_TO_PAGES(eq->hi_shrd_mem_start - eq->priv_mem_size);
87
88 node_end_pfn[node] =
89 MB_TO_PAGES(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
90
91 e820_register_active_regions(node, node_start_pfn[node],
92 node_end_pfn[node]);
93
94 memory_present(node, node_start_pfn[node], node_end_pfn[node]);
95
96 node_remap_size[node] = node_memmap_size_bytes(node,
97 node_start_pfn[node],
98 node_end_pfn[node]);
99} 87}
100 88
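The unit change in numaq_register_node() above is easy to misread: the quad config table stores megabyte values, so the new code shifts by 20 to hand byte addresses to numa_add_memblk(), whereas the removed MB_TO_PAGES() shifted by (20 - PAGE_SHIFT) to produce page frame numbers. A quick worked check, assuming 4 KiB pages (PAGE_SHIFT == 12):

        u64 start_bytes   = (u64)256 << 20;        /* 256 MB -> 0x10000000 bytes */
        unsigned long pfn = 256 << (20 - 12);      /* the same boundary as a PFN: 0x10000 */
        /* 0x10000 pages * 4096 bytes/page == 0x10000000 bytes, so the two forms agree */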
101/* 89/*
102 * Function: smp_dump_qct() 90 * Function: smp_dump_qct()
103 * 91 *
104 * Description: gets memory layout from the quad config table. This 92 * Description: gets memory layout from the quad config table. This
105 * function also updates node_online_map with the nodes (quads) present. 93 * function also updates numa_nodes_parsed with the nodes (quads) present.
106 */ 94 */
107static void __init smp_dump_qct(void) 95static void __init smp_dump_qct(void)
108{ 96{
@@ -111,7 +99,6 @@ static void __init smp_dump_qct(void)
111 99
112 scd = (void *)__va(SYS_CFG_DATA_PRIV_ADDR); 100 scd = (void *)__va(SYS_CFG_DATA_PRIV_ADDR);
113 101
114 nodes_clear(node_online_map);
115 for_each_node(node) { 102 for_each_node(node) {
116 if (scd->quads_present31_0 & (1 << node)) 103 if (scd->quads_present31_0 & (1 << node))
117 numaq_register_node(node, scd); 104 numaq_register_node(node, scd);
@@ -281,14 +268,14 @@ static __init void early_check_numaq(void)
281 } 268 }
282} 269}
283 270
284int __init get_memcfg_numaq(void) 271int __init numaq_numa_init(void)
285{ 272{
286 early_check_numaq(); 273 early_check_numaq();
287 if (!found_numaq) 274 if (!found_numaq)
288 return 0; 275 return -ENOENT;
289 smp_dump_qct(); 276 smp_dump_qct();
290 277
291 return 1; 278 return 0;
292} 279}
293 280
294#define NUMAQ_APIC_DFR_VALUE (APIC_DFR_CLUSTER) 281#define NUMAQ_APIC_DFR_VALUE (APIC_DFR_CLUSTER)
@@ -372,13 +359,6 @@ static inline void numaq_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask
372 return physids_promote(0xFUL, retmap); 359 return physids_promote(0xFUL, retmap);
373} 360}
374 361
375static inline int numaq_cpu_to_logical_apicid(int cpu)
376{
377 if (cpu >= nr_cpu_ids)
378 return BAD_APICID;
379 return cpu_2_logical_apicid[cpu];
380}
381
382/* 362/*
383 * Supporting over 60 cpus on NUMA-Q requires a locality-dependent 363 * Supporting over 60 cpus on NUMA-Q requires a locality-dependent
384 * cpu to APIC ID relation to properly interact with the intelligent 364 * cpu to APIC ID relation to properly interact with the intelligent
@@ -397,6 +377,15 @@ static inline int numaq_apicid_to_node(int logical_apicid)
397 return logical_apicid >> 4; 377 return logical_apicid >> 4;
398} 378}
399 379
380static int numaq_numa_cpu_node(int cpu)
381{
382 int logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
383
384 if (logical_apicid != BAD_APICID)
385 return numaq_apicid_to_node(logical_apicid);
386 return NUMA_NO_NODE;
387}
388
400static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap) 389static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap)
401{ 390{
402 int node = numaq_apicid_to_node(logical_apicid); 391 int node = numaq_apicid_to_node(logical_apicid);
@@ -483,8 +472,8 @@ static void numaq_setup_portio_remap(void)
483 (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD); 472 (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD);
484} 473}
485 474
486/* Use __refdata to keep false positive warning calm. */ 475/* Use __refdata to keep false positive warning calm. */
487struct apic __refdata apic_numaq = { 476static struct apic __refdata apic_numaq = {
488 477
489 .name = "NUMAQ", 478 .name = "NUMAQ",
490 .probe = probe_numaq, 479 .probe = probe_numaq,
@@ -507,8 +496,6 @@ struct apic __refdata apic_numaq = {
507 .ioapic_phys_id_map = numaq_ioapic_phys_id_map, 496 .ioapic_phys_id_map = numaq_ioapic_phys_id_map,
508 .setup_apic_routing = numaq_setup_apic_routing, 497 .setup_apic_routing = numaq_setup_apic_routing,
509 .multi_timer_check = numaq_multi_timer_check, 498 .multi_timer_check = numaq_multi_timer_check,
510 .apicid_to_node = numaq_apicid_to_node,
511 .cpu_to_logical_apicid = numaq_cpu_to_logical_apicid,
512 .cpu_present_to_apicid = numaq_cpu_present_to_apicid, 499 .cpu_present_to_apicid = numaq_cpu_present_to_apicid,
513 .apicid_to_cpu_present = numaq_apicid_to_cpu_present, 500 .apicid_to_cpu_present = numaq_apicid_to_cpu_present,
514 .setup_portio_remap = numaq_setup_portio_remap, 501 .setup_portio_remap = numaq_setup_portio_remap,
@@ -546,4 +533,9 @@ struct apic __refdata apic_numaq = {
546 .icr_write = native_apic_icr_write, 533 .icr_write = native_apic_icr_write,
547 .wait_icr_idle = native_apic_wait_icr_idle, 534 .wait_icr_idle = native_apic_wait_icr_idle,
548 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 535 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
536
537 .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid,
538 .x86_32_numa_cpu_node = numaq_numa_cpu_node,
549}; 539};
540
541apic_driver(apic_numaq);
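Note on the numaq_32.c hunk above: the rewritten numaq_register_node() stops tracking start/end pfns by hand and instead hands a byte range straight to numa_add_memblk(), converting the quad config table's megabyte-granular fields with a 64-bit shift rather than the removed MB_TO_PAGES() macro. A minimal standalone sketch of that conversion, with an illustrative struct and made-up values rather than a real QCT entry:

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for the per-quad memory fields read from the QCT. */
struct quad_mem {
	uint32_t hi_shrd_mem_start;	/* in MB */
	uint32_t hi_shrd_mem_size;	/* in MB */
	uint32_t priv_mem_size;		/* in MB */
};

int main(void)
{
	struct quad_mem eq = {
		.hi_shrd_mem_start = 4096,
		.hi_shrd_mem_size  = 2048,
		.priv_mem_size     = 256,
	};

	/* Promote to 64 bits before shifting so ranges above 4 GiB survive. */
	uint64_t start = (uint64_t)(eq.hi_shrd_mem_start - eq.priv_mem_size) << 20;
	uint64_t end   = (uint64_t)(eq.hi_shrd_mem_start + eq.hi_shrd_mem_size) << 20;

	printf("node memory: [%#llx, %#llx)\n",
	       (unsigned long long)start, (unsigned long long)end);
	return 0;
}

Promoting to u64 before the shift matters because shared memory above 4 GiB would otherwise overflow a 32-bit intermediate.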
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 99d2fe016084..b5254ad044ab 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -52,29 +52,9 @@ static int __init print_ipi_mode(void)
52} 52}
53late_initcall(print_ipi_mode); 53late_initcall(print_ipi_mode);
54 54
55void __init default_setup_apic_routing(void) 55static int default_x86_32_early_logical_apicid(int cpu)
56{ 56{
57 int version = apic_version[boot_cpu_physical_apicid]; 57 return 1 << cpu;
58
59 if (num_possible_cpus() > 8) {
60 switch (boot_cpu_data.x86_vendor) {
61 case X86_VENDOR_INTEL:
62 if (!APIC_XAPIC(version)) {
63 def_to_bigsmp = 0;
64 break;
65 }
66 /* If P4 and above fall through */
67 case X86_VENDOR_AMD:
68 def_to_bigsmp = 1;
69 }
70 }
71
72#ifdef CONFIG_X86_BIGSMP
73 generic_bigsmp_probe();
74#endif
75
76 if (apic->setup_apic_routing)
77 apic->setup_apic_routing();
78} 58}
79 59
80static void setup_apic_flat_routing(void) 60static void setup_apic_flat_routing(void)
@@ -107,7 +87,7 @@ static int probe_default(void)
107 return 1; 87 return 1;
108} 88}
109 89
110struct apic apic_default = { 90static struct apic apic_default = {
111 91
112 .name = "default", 92 .name = "default",
113 .probe = probe_default, 93 .probe = probe_default,
@@ -130,8 +110,6 @@ struct apic apic_default = {
130 .ioapic_phys_id_map = default_ioapic_phys_id_map, 110 .ioapic_phys_id_map = default_ioapic_phys_id_map,
131 .setup_apic_routing = setup_apic_flat_routing, 111 .setup_apic_routing = setup_apic_flat_routing,
132 .multi_timer_check = NULL, 112 .multi_timer_check = NULL,
133 .apicid_to_node = default_apicid_to_node,
134 .cpu_to_logical_apicid = default_cpu_to_logical_apicid,
135 .cpu_present_to_apicid = default_cpu_present_to_apicid, 113 .cpu_present_to_apicid = default_cpu_present_to_apicid,
136 .apicid_to_cpu_present = physid_set_mask_of_physid, 114 .apicid_to_cpu_present = physid_set_mask_of_physid,
137 .setup_portio_remap = NULL, 115 .setup_portio_remap = NULL,
@@ -167,46 +145,26 @@ struct apic apic_default = {
167 .icr_write = native_apic_icr_write, 145 .icr_write = native_apic_icr_write,
168 .wait_icr_idle = native_apic_wait_icr_idle, 146 .wait_icr_idle = native_apic_wait_icr_idle,
169 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 147 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
148
149 .x86_32_early_logical_apicid = default_x86_32_early_logical_apicid,
170}; 150};
171 151
172extern struct apic apic_numaq; 152apic_driver(apic_default);
173extern struct apic apic_summit;
174extern struct apic apic_bigsmp;
175extern struct apic apic_es7000;
176extern struct apic apic_es7000_cluster;
177 153
178struct apic *apic = &apic_default; 154struct apic *apic = &apic_default;
179EXPORT_SYMBOL_GPL(apic); 155EXPORT_SYMBOL_GPL(apic);
180 156
181static struct apic *apic_probe[] __initdata = {
182#ifdef CONFIG_X86_NUMAQ
183 &apic_numaq,
184#endif
185#ifdef CONFIG_X86_SUMMIT
186 &apic_summit,
187#endif
188#ifdef CONFIG_X86_BIGSMP
189 &apic_bigsmp,
190#endif
191#ifdef CONFIG_X86_ES7000
192 &apic_es7000,
193 &apic_es7000_cluster,
194#endif
195 &apic_default, /* must be last */
196 NULL,
197};
198
199static int cmdline_apic __initdata; 157static int cmdline_apic __initdata;
200static int __init parse_apic(char *arg) 158static int __init parse_apic(char *arg)
201{ 159{
202 int i; 160 struct apic **drv;
203 161
204 if (!arg) 162 if (!arg)
205 return -EINVAL; 163 return -EINVAL;
206 164
207 for (i = 0; apic_probe[i]; i++) { 165 for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
208 if (!strcmp(apic_probe[i]->name, arg)) { 166 if (!strcmp((*drv)->name, arg)) {
209 apic = apic_probe[i]; 167 apic = *drv;
210 cmdline_apic = 1; 168 cmdline_apic = 1;
211 return 0; 169 return 0;
212 } 170 }
@@ -217,38 +175,58 @@ static int __init parse_apic(char *arg)
217} 175}
218early_param("apic", parse_apic); 176early_param("apic", parse_apic);
219 177
220void __init generic_bigsmp_probe(void) 178void __init default_setup_apic_routing(void)
221{ 179{
180 int version = apic_version[boot_cpu_physical_apicid];
181
182 if (num_possible_cpus() > 8) {
183 switch (boot_cpu_data.x86_vendor) {
184 case X86_VENDOR_INTEL:
185 if (!APIC_XAPIC(version)) {
186 def_to_bigsmp = 0;
187 break;
188 }
189 /* If P4 and above fall through */
190 case X86_VENDOR_AMD:
191 def_to_bigsmp = 1;
192 }
193 }
194
222#ifdef CONFIG_X86_BIGSMP 195#ifdef CONFIG_X86_BIGSMP
223 /* 196 /*
224 * This routine is used to switch to bigsmp mode when 197 * This is used to switch to bigsmp mode when
225 * - There is no apic= option specified by the user 198 * - There is no apic= option specified by the user
226 * - generic_apic_probe() has chosen apic_default as the sub_arch 199 * - generic_apic_probe() has chosen apic_default as the sub_arch
227 * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support 200 * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support
228 */ 201 */
229 202
230 if (!cmdline_apic && apic == &apic_default) { 203 if (!cmdline_apic && apic == &apic_default) {
231 if (apic_bigsmp.probe()) { 204 struct apic *bigsmp = generic_bigsmp_probe();
232 apic = &apic_bigsmp; 205 if (bigsmp) {
206 apic = bigsmp;
233 printk(KERN_INFO "Overriding APIC driver with %s\n", 207 printk(KERN_INFO "Overriding APIC driver with %s\n",
234 apic->name); 208 apic->name);
235 } 209 }
236 } 210 }
237#endif 211#endif
212
213 if (apic->setup_apic_routing)
214 apic->setup_apic_routing();
238} 215}
239 216
240void __init generic_apic_probe(void) 217void __init generic_apic_probe(void)
241{ 218{
242 if (!cmdline_apic) { 219 if (!cmdline_apic) {
243 int i; 220 struct apic **drv;
244 for (i = 0; apic_probe[i]; i++) { 221
245 if (apic_probe[i]->probe()) { 222 for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
246 apic = apic_probe[i]; 223 if ((*drv)->probe()) {
224 apic = *drv;
247 break; 225 break;
248 } 226 }
249 } 227 }
250 /* Not visible without early console */ 228 /* Not visible without early console */
251 if (!apic_probe[i]) 229 if (drv == __apicdrivers_end)
252 panic("Didn't find an APIC driver"); 230 panic("Didn't find an APIC driver");
253 } 231 }
254 printk(KERN_INFO "Using APIC driver %s\n", apic->name); 232 printk(KERN_INFO "Using APIC driver %s\n", apic->name);
@@ -259,16 +237,16 @@ void __init generic_apic_probe(void)
259int __init 237int __init
260generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid) 238generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
261{ 239{
262 int i; 240 struct apic **drv;
263 241
264 for (i = 0; apic_probe[i]; ++i) { 242 for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
265 if (!apic_probe[i]->mps_oem_check) 243 if (!((*drv)->mps_oem_check))
266 continue; 244 continue;
267 if (!apic_probe[i]->mps_oem_check(mpc, oem, productid)) 245 if (!(*drv)->mps_oem_check(mpc, oem, productid))
268 continue; 246 continue;
269 247
270 if (!cmdline_apic) { 248 if (!cmdline_apic) {
271 apic = apic_probe[i]; 249 apic = *drv;
272 printk(KERN_INFO "Switched to APIC driver `%s'.\n", 250 printk(KERN_INFO "Switched to APIC driver `%s'.\n",
273 apic->name); 251 apic->name);
274 } 252 }
@@ -279,16 +257,16 @@ generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
279 257
280int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 258int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
281{ 259{
282 int i; 260 struct apic **drv;
283 261
284 for (i = 0; apic_probe[i]; ++i) { 262 for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
285 if (!apic_probe[i]->acpi_madt_oem_check) 263 if (!(*drv)->acpi_madt_oem_check)
286 continue; 264 continue;
287 if (!apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) 265 if (!(*drv)->acpi_madt_oem_check(oem_id, oem_table_id))
288 continue; 266 continue;
289 267
290 if (!cmdline_apic) { 268 if (!cmdline_apic) {
291 apic = apic_probe[i]; 269 apic = *drv;
292 printk(KERN_INFO "Switched to APIC driver `%s'.\n", 270 printk(KERN_INFO "Switched to APIC driver `%s'.\n",
293 apic->name); 271 apic->name);
294 } 272 }
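Note on the probe_32.c hunk above: the hand-maintained apic_probe[] array is gone; each driver now registers itself with apic_driver(), the registrations land in a linker-built table bounded by __apicdrivers and __apicdrivers_end, and generic_apic_probe() takes the first driver whose ->probe() returns nonzero. A hedged userspace sketch of that first-match table walk, using an ordinary array in place of the linker section (all names here are illustrative):

#include <stdio.h>

struct drv {
	const char *name;
	int (*probe)(void);
};

static int probe_fancy(void)   { return 0; }	/* hardware not present */
static int probe_default(void) { return 1; }	/* always matches, kept last */

/* Stand-in for the __apicdrivers..__apicdrivers_end linker-built table. */
static struct drv drivers[] = {
	{ "fancy",   probe_fancy   },
	{ "default", probe_default },
};

int main(void)
{
	struct drv *drv, *end = drivers + sizeof(drivers) / sizeof(drivers[0]);
	struct drv *chosen = NULL;

	for (drv = drivers; drv < end; drv++) {
		if (drv->probe()) {		/* first successful probe wins */
			chosen = drv;
			break;
		}
	}
	if (!chosen) {
		fprintf(stderr, "no driver found\n");
		return 1;
	}
	printf("Using driver %s\n", chosen->name);
	return 0;
}

The kernel keeps a driver whose probe always succeeds (apic_default, whose probe_default() returns 1), so the panic path in the hunk is only reached if the table itself is broken.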
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index 83e9be4778e2..3fe986698929 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -23,27 +23,6 @@
23#include <asm/ipi.h> 23#include <asm/ipi.h>
24#include <asm/setup.h> 24#include <asm/setup.h>
25 25
26extern struct apic apic_flat;
27extern struct apic apic_physflat;
28extern struct apic apic_x2xpic_uv_x;
29extern struct apic apic_x2apic_phys;
30extern struct apic apic_x2apic_cluster;
31
32struct apic __read_mostly *apic = &apic_flat;
33EXPORT_SYMBOL_GPL(apic);
34
35static struct apic *apic_probe[] __initdata = {
36#ifdef CONFIG_X86_UV
37 &apic_x2apic_uv_x,
38#endif
39#ifdef CONFIG_X86_X2APIC
40 &apic_x2apic_phys,
41 &apic_x2apic_cluster,
42#endif
43 &apic_physflat,
44 NULL,
45};
46
47static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) 26static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
48{ 27{
49 return hard_smp_processor_id() >> index_msb; 28 return hard_smp_processor_id() >> index_msb;
@@ -54,35 +33,25 @@ static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
54 */ 33 */
55void __init default_setup_apic_routing(void) 34void __init default_setup_apic_routing(void)
56{ 35{
57#ifdef CONFIG_X86_X2APIC 36 struct apic **drv;
58 if (x2apic_mode
59#ifdef CONFIG_X86_UV
60 && apic != &apic_x2apic_uv_x
61#endif
62 ) {
63 if (x2apic_phys)
64 apic = &apic_x2apic_phys;
65 else
66 apic = &apic_x2apic_cluster;
67 }
68#endif
69 37
70 if (apic == &apic_flat && num_possible_cpus() > 8) 38 enable_IR_x2apic();
71 apic = &apic_physflat;
72 39
73 printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); 40 for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
41 if ((*drv)->probe && (*drv)->probe()) {
42 if (apic != *drv) {
43 apic = *drv;
44 pr_info("Switched APIC routing to %s.\n",
45 apic->name);
46 }
47 break;
48 }
49 }
74 50
75 if (is_vsmp_box()) { 51 if (is_vsmp_box()) {
76 /* need to update phys_pkg_id */ 52 /* need to update phys_pkg_id */
77 apic->phys_pkg_id = apicid_phys_pkg_id; 53 apic->phys_pkg_id = apicid_phys_pkg_id;
78 } 54 }
79
80 /*
81 * Now that apic routing model is selected, configure the
82 * fault handling for intr remapping.
83 */
84 if (intr_remapping_enabled)
85 enable_drhd_fault_handling();
86} 55}
87 56
88/* Same for both flat and physical. */ 57/* Same for both flat and physical. */
@@ -94,13 +63,15 @@ void apic_send_IPI_self(int vector)
94 63
95int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 64int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
96{ 65{
97 int i; 66 struct apic **drv;
98 67
99 for (i = 0; apic_probe[i]; ++i) { 68 for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
100 if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) { 69 if ((*drv)->acpi_madt_oem_check(oem_id, oem_table_id)) {
101 apic = apic_probe[i]; 70 if (apic != *drv) {
102 printk(KERN_INFO "Setting APIC routing to %s.\n", 71 apic = *drv;
103 apic->name); 72 pr_info("Setting APIC routing to %s.\n",
73 apic->name);
74 }
104 return 1; 75 return 1;
105 } 76 }
106 } 77 }
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 9b419263d90d..19114423c58c 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -194,11 +194,10 @@ static unsigned long summit_check_apicid_present(int bit)
194 return 1; 194 return 1;
195} 195}
196 196
197static void summit_init_apic_ldr(void) 197static int summit_early_logical_apicid(int cpu)
198{ 198{
199 unsigned long val, id;
200 int count = 0; 199 int count = 0;
201 u8 my_id = (u8)hard_smp_processor_id(); 200 u8 my_id = early_per_cpu(x86_cpu_to_apicid, cpu);
202 u8 my_cluster = APIC_CLUSTER(my_id); 201 u8 my_cluster = APIC_CLUSTER(my_id);
203#ifdef CONFIG_SMP 202#ifdef CONFIG_SMP
204 u8 lid; 203 u8 lid;
@@ -206,7 +205,7 @@ static void summit_init_apic_ldr(void)
206 205
207 /* Create logical APIC IDs by counting CPUs already in cluster. */ 206 /* Create logical APIC IDs by counting CPUs already in cluster. */
208 for (count = 0, i = nr_cpu_ids; --i >= 0; ) { 207 for (count = 0, i = nr_cpu_ids; --i >= 0; ) {
209 lid = cpu_2_logical_apicid[i]; 208 lid = early_per_cpu(x86_cpu_to_logical_apicid, i);
210 if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster) 209 if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster)
211 ++count; 210 ++count;
212 } 211 }
@@ -214,7 +213,15 @@ static void summit_init_apic_ldr(void)
214 /* We only have a 4 wide bitmap in cluster mode. If a deranged 213 /* We only have a 4 wide bitmap in cluster mode. If a deranged
215 * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */ 214 * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */
216 BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT); 215 BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT);
217 id = my_cluster | (1UL << count); 216 return my_cluster | (1UL << count);
217}
218
219static void summit_init_apic_ldr(void)
220{
221 int cpu = smp_processor_id();
222 unsigned long id = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
223 unsigned long val;
224
218 apic_write(APIC_DFR, SUMMIT_APIC_DFR_VALUE); 225 apic_write(APIC_DFR, SUMMIT_APIC_DFR_VALUE);
219 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; 226 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
220 val |= SET_APIC_LOGICAL_ID(id); 227 val |= SET_APIC_LOGICAL_ID(id);
@@ -232,27 +239,6 @@ static void summit_setup_apic_routing(void)
232 nr_ioapics); 239 nr_ioapics);
233} 240}
234 241
235static int summit_apicid_to_node(int logical_apicid)
236{
237#ifdef CONFIG_SMP
238 return apicid_2_node[hard_smp_processor_id()];
239#else
240 return 0;
241#endif
242}
243
244/* Mapping from cpu number to logical apicid */
245static inline int summit_cpu_to_logical_apicid(int cpu)
246{
247#ifdef CONFIG_SMP
248 if (cpu >= nr_cpu_ids)
249 return BAD_APICID;
250 return cpu_2_logical_apicid[cpu];
251#else
252 return logical_smp_processor_id();
253#endif
254}
255
256static int summit_cpu_present_to_apicid(int mps_cpu) 242static int summit_cpu_present_to_apicid(int mps_cpu)
257{ 243{
258 if (mps_cpu < nr_cpu_ids) 244 if (mps_cpu < nr_cpu_ids)
@@ -286,7 +272,7 @@ static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask)
286 * The cpus in the mask must all be on the apic cluster. 272 * The cpus in the mask must all be on the apic cluster.
287 */ 273 */
288 for_each_cpu(cpu, cpumask) { 274 for_each_cpu(cpu, cpumask) {
289 int new_apicid = summit_cpu_to_logical_apicid(cpu); 275 int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
290 276
291 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { 277 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
292 printk("%s: Not a valid mask!\n", __func__); 278 printk("%s: Not a valid mask!\n", __func__);
@@ -301,7 +287,7 @@ static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask)
301static unsigned int summit_cpu_mask_to_apicid_and(const struct cpumask *inmask, 287static unsigned int summit_cpu_mask_to_apicid_and(const struct cpumask *inmask,
302 const struct cpumask *andmask) 288 const struct cpumask *andmask)
303{ 289{
304 int apicid = summit_cpu_to_logical_apicid(0); 290 int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
305 cpumask_var_t cpumask; 291 cpumask_var_t cpumask;
306 292
307 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) 293 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
@@ -505,7 +491,7 @@ void setup_summit(void)
505} 491}
506#endif 492#endif
507 493
508struct apic apic_summit = { 494static struct apic apic_summit = {
509 495
510 .name = "summit", 496 .name = "summit",
511 .probe = probe_summit, 497 .probe = probe_summit,
@@ -528,8 +514,6 @@ struct apic apic_summit = {
528 .ioapic_phys_id_map = summit_ioapic_phys_id_map, 514 .ioapic_phys_id_map = summit_ioapic_phys_id_map,
529 .setup_apic_routing = summit_setup_apic_routing, 515 .setup_apic_routing = summit_setup_apic_routing,
530 .multi_timer_check = NULL, 516 .multi_timer_check = NULL,
531 .apicid_to_node = summit_apicid_to_node,
532 .cpu_to_logical_apicid = summit_cpu_to_logical_apicid,
533 .cpu_present_to_apicid = summit_cpu_present_to_apicid, 517 .cpu_present_to_apicid = summit_cpu_present_to_apicid,
534 .apicid_to_cpu_present = summit_apicid_to_cpu_present, 518 .apicid_to_cpu_present = summit_apicid_to_cpu_present,
535 .setup_portio_remap = NULL, 519 .setup_portio_remap = NULL,
@@ -565,4 +549,8 @@ struct apic apic_summit = {
565 .icr_write = native_apic_icr_write, 549 .icr_write = native_apic_icr_write,
566 .wait_icr_idle = native_apic_wait_icr_idle, 550 .wait_icr_idle = native_apic_wait_icr_idle,
567 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 551 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
552
553 .x86_32_early_logical_apicid = summit_early_logical_apicid,
568}; 554};
555
556apic_driver(apic_summit);
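Note on the summit_32.c hunk above: summit_early_logical_apicid() computes a CPU's logical APIC ID by counting how many CPUs already occupy the same APIC cluster and claiming the next bit of the 4-wide in-cluster bitmap (my_cluster | (1UL << count)). A small self-contained sketch of that counting scheme over a toy per-CPU table (the 0xF0 cluster mask and the IDs are illustrative stand-ins):

#include <stdio.h>

#define BAD_ID		0xFF
#define CLUSTER(id)	((id) & 0xF0)	/* upper nibble = cluster (assumed here) */

/* Toy per-CPU logical APIC ID table, all unassigned to start with. */
static unsigned char logical_id[8] = {
	BAD_ID, BAD_ID, BAD_ID, BAD_ID, BAD_ID, BAD_ID, BAD_ID, BAD_ID,
};

static unsigned char assign_logical_id(int cpu, unsigned char phys_id)
{
	unsigned char cluster = CLUSTER(phys_id);
	int count = 0, i;

	/* Count CPUs already placed in this cluster... */
	for (i = 0; i < 8; i++)
		if (logical_id[i] != BAD_ID && CLUSTER(logical_id[i]) == cluster)
			count++;

	/* ...and claim the next bit of the 4-wide in-cluster bitmap. */
	logical_id[cpu] = cluster | (1u << count);
	return logical_id[cpu];
}

int main(void)
{
	printf("cpu0 -> %#x\n", (unsigned)assign_logical_id(0, 0x11)); /* 0x11 */
	printf("cpu1 -> %#x\n", (unsigned)assign_logical_id(1, 0x12)); /* 0x12 */
	printf("cpu2 -> %#x\n", (unsigned)assign_logical_id(2, 0x21)); /* 0x21 */
	return 0;
}

A fifth CPU landing in one cluster would overflow the 4-wide bitmap, which is exactly the condition the BUG_ON in the hunk guards against.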
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index cf69c59f4910..500795875827 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -5,118 +5,95 @@
5#include <linux/ctype.h> 5#include <linux/ctype.h>
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/dmar.h> 7#include <linux/dmar.h>
8#include <linux/cpu.h>
8 9
9#include <asm/smp.h> 10#include <asm/smp.h>
10#include <asm/apic.h> 11#include <asm/x2apic.h>
11#include <asm/ipi.h>
12 12
13static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); 13static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
14static DEFINE_PER_CPU(cpumask_var_t, cpus_in_cluster);
15static DEFINE_PER_CPU(cpumask_var_t, ipi_mask);
14 16
15static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 17static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
16{ 18{
17 return x2apic_enabled(); 19 return x2apic_enabled();
18} 20}
19 21
20/* 22static inline u32 x2apic_cluster(int cpu)
21 * need to use more than cpu 0, because we need more vectors when
22 * MSI-X are used.
23 */
24static const struct cpumask *x2apic_target_cpus(void)
25{ 23{
26 return cpu_online_mask; 24 return per_cpu(x86_cpu_to_logical_apicid, cpu) >> 16;
27}
28
29/*
30 * for now each logical cpu is in its own vector allocation domain.
31 */
32static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
33{
34 cpumask_clear(retmask);
35 cpumask_set_cpu(cpu, retmask);
36} 25}
37 26
38static void 27static void
39 __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest) 28__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
40{ 29{
41 unsigned long cfg; 30 struct cpumask *cpus_in_cluster_ptr;
31 struct cpumask *ipi_mask_ptr;
32 unsigned int cpu, this_cpu;
33 unsigned long flags;
34 u32 dest;
35
36 x2apic_wrmsr_fence();
37
38 local_irq_save(flags);
42 39
43 cfg = __prepare_ICR(0, vector, dest); 40 this_cpu = smp_processor_id();
44 41
45 /* 42 /*
 46 * send the IPI. 43 * Since we modify the mask, we need our own copy
 44 * and must manipulate it with irqs off.
47 */ 45 */
48 native_x2apic_icr_write(cfg, apicid); 46 ipi_mask_ptr = __raw_get_cpu_var(ipi_mask);
49} 47 cpumask_copy(ipi_mask_ptr, mask);
50 48
51/* 49 /*
52 * for now, we send the IPI's one by one in the cpumask. 50 * The idea is to send one IPI per cluster.
53 * TBD: Based on the cpu mask, we can send the IPI's to the cluster group 51 */
54 * at once. We have 16 cpu's in a cluster. This will minimize IPI register 52 for_each_cpu(cpu, ipi_mask_ptr) {
55 * writes. 53 unsigned long i;
56 */
57static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
58{
59 unsigned long query_cpu;
60 unsigned long flags;
61 54
62 x2apic_wrmsr_fence(); 55 cpus_in_cluster_ptr = per_cpu(cpus_in_cluster, cpu);
56 dest = 0;
63 57
64 local_irq_save(flags); 58 /* Collect cpus in cluster. */
65 for_each_cpu(query_cpu, mask) { 59 for_each_cpu_and(i, ipi_mask_ptr, cpus_in_cluster_ptr) {
66 __x2apic_send_IPI_dest( 60 if (apic_dest == APIC_DEST_ALLINC || i != this_cpu)
67 per_cpu(x86_cpu_to_logical_apicid, query_cpu), 61 dest |= per_cpu(x86_cpu_to_logical_apicid, i);
68 vector, apic->dest_logical); 62 }
63
64 if (!dest)
65 continue;
66
67 __x2apic_send_IPI_dest(dest, vector, apic->dest_logical);
68 /*
69 * Cluster sibling cpus should be discared now so
70 * we would not send IPI them second time.
71 */
72 cpumask_andnot(ipi_mask_ptr, ipi_mask_ptr, cpus_in_cluster_ptr);
69 } 73 }
74
70 local_irq_restore(flags); 75 local_irq_restore(flags);
71} 76}
72 77
78static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
79{
80 __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC);
81}
82
73static void 83static void
74 x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) 84 x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
75{ 85{
76 unsigned long this_cpu = smp_processor_id(); 86 __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
77 unsigned long query_cpu;
78 unsigned long flags;
79
80 x2apic_wrmsr_fence();
81
82 local_irq_save(flags);
83 for_each_cpu(query_cpu, mask) {
84 if (query_cpu == this_cpu)
85 continue;
86 __x2apic_send_IPI_dest(
87 per_cpu(x86_cpu_to_logical_apicid, query_cpu),
88 vector, apic->dest_logical);
89 }
90 local_irq_restore(flags);
91} 87}
92 88
93static void x2apic_send_IPI_allbutself(int vector) 89static void x2apic_send_IPI_allbutself(int vector)
94{ 90{
95 unsigned long this_cpu = smp_processor_id(); 91 __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT);
96 unsigned long query_cpu;
97 unsigned long flags;
98
99 x2apic_wrmsr_fence();
100
101 local_irq_save(flags);
102 for_each_online_cpu(query_cpu) {
103 if (query_cpu == this_cpu)
104 continue;
105 __x2apic_send_IPI_dest(
106 per_cpu(x86_cpu_to_logical_apicid, query_cpu),
107 vector, apic->dest_logical);
108 }
109 local_irq_restore(flags);
110} 92}
111 93
112static void x2apic_send_IPI_all(int vector) 94static void x2apic_send_IPI_all(int vector)
113{ 95{
114 x2apic_send_IPI_mask(cpu_online_mask, vector); 96 __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
115}
116
117static int x2apic_apic_id_registered(void)
118{
119 return 1;
120} 97}
121 98
122static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) 99static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
@@ -151,43 +128,90 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
151 return per_cpu(x86_cpu_to_logical_apicid, cpu); 128 return per_cpu(x86_cpu_to_logical_apicid, cpu);
152} 129}
153 130
154static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x) 131static void init_x2apic_ldr(void)
155{ 132{
156 unsigned int id; 133 unsigned int this_cpu = smp_processor_id();
134 unsigned int cpu;
157 135
158 id = x; 136 per_cpu(x86_cpu_to_logical_apicid, this_cpu) = apic_read(APIC_LDR);
159 return id; 137
138 __cpu_set(this_cpu, per_cpu(cpus_in_cluster, this_cpu));
139 for_each_online_cpu(cpu) {
140 if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
141 continue;
142 __cpu_set(this_cpu, per_cpu(cpus_in_cluster, cpu));
143 __cpu_set(cpu, per_cpu(cpus_in_cluster, this_cpu));
144 }
160} 145}
161 146
162static unsigned long set_apic_id(unsigned int id) 147 /*
148 * At CPU state changes, update the x2apic cluster sibling info.
149 */
150static int __cpuinit
151update_clusterinfo(struct notifier_block *nfb, unsigned long action, void *hcpu)
163{ 152{
164 unsigned long x; 153 unsigned int this_cpu = (unsigned long)hcpu;
154 unsigned int cpu;
155 int err = 0;
156
157 switch (action) {
158 case CPU_UP_PREPARE:
159 if (!zalloc_cpumask_var(&per_cpu(cpus_in_cluster, this_cpu),
160 GFP_KERNEL)) {
161 err = -ENOMEM;
162 } else if (!zalloc_cpumask_var(&per_cpu(ipi_mask, this_cpu),
163 GFP_KERNEL)) {
164 free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu));
165 err = -ENOMEM;
166 }
167 break;
168 case CPU_UP_CANCELED:
169 case CPU_UP_CANCELED_FROZEN:
170 case CPU_DEAD:
171 for_each_online_cpu(cpu) {
172 if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
173 continue;
174 __cpu_clear(this_cpu, per_cpu(cpus_in_cluster, cpu));
175 __cpu_clear(cpu, per_cpu(cpus_in_cluster, this_cpu));
176 }
177 free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu));
178 free_cpumask_var(per_cpu(ipi_mask, this_cpu));
179 break;
180 }
165 181
166 x = id; 182 return notifier_from_errno(err);
167 return x;
168} 183}
169 184
170static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb) 185static struct notifier_block __refdata x2apic_cpu_notifier = {
171{ 186 .notifier_call = update_clusterinfo,
172 return initial_apicid >> index_msb; 187};
173}
174 188
175static void x2apic_send_IPI_self(int vector) 189static int x2apic_init_cpu_notifier(void)
176{ 190{
177 apic_write(APIC_SELF_IPI, vector); 191 int cpu = smp_processor_id();
192
193 zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL);
194 zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL);
195
196 BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu));
197
198 __cpu_set(cpu, per_cpu(cpus_in_cluster, cpu));
199 register_hotcpu_notifier(&x2apic_cpu_notifier);
200 return 1;
178} 201}
179 202
180static void init_x2apic_ldr(void) 203static int x2apic_cluster_probe(void)
181{ 204{
182 int cpu = smp_processor_id(); 205 if (x2apic_mode)
183 206 return x2apic_init_cpu_notifier();
184 per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR); 207 else
208 return 0;
185} 209}
186 210
187struct apic apic_x2apic_cluster = { 211static struct apic apic_x2apic_cluster = {
188 212
189 .name = "cluster x2apic", 213 .name = "cluster x2apic",
190 .probe = NULL, 214 .probe = x2apic_cluster_probe,
191 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, 215 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
192 .apic_id_registered = x2apic_apic_id_registered, 216 .apic_id_registered = x2apic_apic_id_registered,
193 217
@@ -206,18 +230,16 @@ struct apic apic_x2apic_cluster = {
206 .ioapic_phys_id_map = NULL, 230 .ioapic_phys_id_map = NULL,
207 .setup_apic_routing = NULL, 231 .setup_apic_routing = NULL,
208 .multi_timer_check = NULL, 232 .multi_timer_check = NULL,
209 .apicid_to_node = NULL,
210 .cpu_to_logical_apicid = NULL,
211 .cpu_present_to_apicid = default_cpu_present_to_apicid, 233 .cpu_present_to_apicid = default_cpu_present_to_apicid,
212 .apicid_to_cpu_present = NULL, 234 .apicid_to_cpu_present = NULL,
213 .setup_portio_remap = NULL, 235 .setup_portio_remap = NULL,
214 .check_phys_apicid_present = default_check_phys_apicid_present, 236 .check_phys_apicid_present = default_check_phys_apicid_present,
215 .enable_apic_mode = NULL, 237 .enable_apic_mode = NULL,
216 .phys_pkg_id = x2apic_cluster_phys_pkg_id, 238 .phys_pkg_id = x2apic_phys_pkg_id,
217 .mps_oem_check = NULL, 239 .mps_oem_check = NULL,
218 240
219 .get_apic_id = x2apic_cluster_phys_get_apic_id, 241 .get_apic_id = x2apic_get_apic_id,
220 .set_apic_id = set_apic_id, 242 .set_apic_id = x2apic_set_apic_id,
221 .apic_id_mask = 0xFFFFFFFFu, 243 .apic_id_mask = 0xFFFFFFFFu,
222 244
223 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, 245 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
@@ -242,3 +264,5 @@ struct apic apic_x2apic_cluster = {
242 .wait_icr_idle = native_x2apic_wait_icr_idle, 264 .wait_icr_idle = native_x2apic_wait_icr_idle,
243 .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle, 265 .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
244}; 266};
267
268apic_driver(apic_x2apic_cluster);
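Note on the x2apic_cluster.c hunk above: the rewritten __x2apic_send_IPI_mask() sends one IPI per x2APIC cluster instead of one per CPU. For each CPU still left in a private copy of the mask it ORs the logical IDs of every requested CPU in the same cluster into a single destination (skipping the sender in the ALLBUT case), sends once, and then removes that whole cluster from the working mask. A simplified userspace sketch of the grouping, with plain bitmasks standing in for cpumasks and made-up logical IDs (cluster number in bits 16 and up, one member bit per CPU below):

#include <stdint.h>
#include <stdio.h>

#define NCPUS 8

/* Illustrative logical x2APIC IDs: cluster in bits 16+, member bit below. */
static const uint32_t logical_apicid[NCPUS] = {
	0x00010001, 0x00010002, 0x00010004,	/* CPUs 0-2: cluster 1 */
	0x00020001, 0x00020002,			/* CPUs 3-4: cluster 2 */
	0x00030001, 0x00030002, 0x00030004,	/* CPUs 5-7: cluster 3 */
};

static uint32_t cluster_of(int cpu)
{
	return logical_apicid[cpu] >> 16;
}

/* Send one "IPI" per cluster covering every CPU set in mask. */
static void send_ipi_mask(uint32_t mask)
{
	while (mask) {
		int first = __builtin_ctz(mask); /* lowest remaining CPU (GCC/Clang builtin) */
		uint32_t cluster = cluster_of(first);
		uint32_t dest = 0, covered = 0;

		/* Collect every requested CPU that shares this cluster. */
		for (int i = 0; i < NCPUS; i++) {
			if ((mask & (1u << i)) && cluster_of(i) == cluster) {
				dest |= logical_apicid[i] & 0xFFFF;
				covered |= 1u << i;
			}
		}
		printf("IPI to cluster %u, member bits %#x\n", cluster, dest);

		mask &= ~covered;	/* don't IPI these CPUs a second time */
	}
}

int main(void)
{
	send_ipi_mask(0xFF);	/* all eight toy CPUs: three IPIs, not eight */
	return 0;
}

The per-cluster sibling masks maintained by the hotplug notifier in the hunk serve the same purpose as the cluster_of() scan here, only precomputed so the send path stays cheap.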
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 8972f38c5ced..f5373dfde21e 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -7,11 +7,12 @@
7#include <linux/dmar.h> 7#include <linux/dmar.h>
8 8
9#include <asm/smp.h> 9#include <asm/smp.h>
10#include <asm/apic.h> 10#include <asm/x2apic.h>
11#include <asm/ipi.h>
12 11
13int x2apic_phys; 12int x2apic_phys;
14 13
14static struct apic apic_x2apic_phys;
15
15static int set_x2apic_phys_mode(char *arg) 16static int set_x2apic_phys_mode(char *arg)
16{ 17{
17 x2apic_phys = 1; 18 x2apic_phys = 1;
@@ -27,94 +28,46 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
27 return 0; 28 return 0;
28} 29}
29 30
30/* 31static void
31 * need to use more than cpu 0, because we need more vectors when 32__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
32 * MSI-X are used.
33 */
34static const struct cpumask *x2apic_target_cpus(void)
35{
36 return cpu_online_mask;
37}
38
39static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
40{
41 cpumask_clear(retmask);
42 cpumask_set_cpu(cpu, retmask);
43}
44
45static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
46 unsigned int dest)
47{
48 unsigned long cfg;
49
50 cfg = __prepare_ICR(0, vector, dest);
51
52 /*
53 * send the IPI.
54 */
55 native_x2apic_icr_write(cfg, apicid);
56}
57
58static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
59{ 33{
60 unsigned long query_cpu; 34 unsigned long query_cpu;
35 unsigned long this_cpu;
61 unsigned long flags; 36 unsigned long flags;
62 37
63 x2apic_wrmsr_fence(); 38 x2apic_wrmsr_fence();
64 39
65 local_irq_save(flags); 40 local_irq_save(flags);
41
42 this_cpu = smp_processor_id();
66 for_each_cpu(query_cpu, mask) { 43 for_each_cpu(query_cpu, mask) {
44 if (apic_dest == APIC_DEST_ALLBUT && this_cpu == query_cpu)
45 continue;
67 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu), 46 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
68 vector, APIC_DEST_PHYSICAL); 47 vector, APIC_DEST_PHYSICAL);
69 } 48 }
70 local_irq_restore(flags); 49 local_irq_restore(flags);
71} 50}
72 51
52static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
53{
54 __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC);
55}
56
73static void 57static void
74 x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) 58 x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
75{ 59{
76 unsigned long this_cpu = smp_processor_id(); 60 __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
77 unsigned long query_cpu;
78 unsigned long flags;
79
80 x2apic_wrmsr_fence();
81
82 local_irq_save(flags);
83 for_each_cpu(query_cpu, mask) {
84 if (query_cpu != this_cpu)
85 __x2apic_send_IPI_dest(
86 per_cpu(x86_cpu_to_apicid, query_cpu),
87 vector, APIC_DEST_PHYSICAL);
88 }
89 local_irq_restore(flags);
90} 61}
91 62
92static void x2apic_send_IPI_allbutself(int vector) 63static void x2apic_send_IPI_allbutself(int vector)
93{ 64{
94 unsigned long this_cpu = smp_processor_id(); 65 __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT);
95 unsigned long query_cpu;
96 unsigned long flags;
97
98 x2apic_wrmsr_fence();
99
100 local_irq_save(flags);
101 for_each_online_cpu(query_cpu) {
102 if (query_cpu == this_cpu)
103 continue;
104 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
105 vector, APIC_DEST_PHYSICAL);
106 }
107 local_irq_restore(flags);
108} 66}
109 67
110static void x2apic_send_IPI_all(int vector) 68static void x2apic_send_IPI_all(int vector)
111{ 69{
112 x2apic_send_IPI_mask(cpu_online_mask, vector); 70 __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
113}
114
115static int x2apic_apic_id_registered(void)
116{
117 return 1;
118} 71}
119 72
120static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) 73static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
@@ -149,34 +102,22 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
149 return per_cpu(x86_cpu_to_apicid, cpu); 102 return per_cpu(x86_cpu_to_apicid, cpu);
150} 103}
151 104
152static unsigned int x2apic_phys_get_apic_id(unsigned long x) 105static void init_x2apic_ldr(void)
153{
154 return x;
155}
156
157static unsigned long set_apic_id(unsigned int id)
158{
159 return id;
160}
161
162static int x2apic_phys_pkg_id(int initial_apicid, int index_msb)
163{ 106{
164 return initial_apicid >> index_msb;
165} 107}
166 108
167static void x2apic_send_IPI_self(int vector) 109static int x2apic_phys_probe(void)
168{ 110{
169 apic_write(APIC_SELF_IPI, vector); 111 if (x2apic_mode && x2apic_phys)
170} 112 return 1;
171 113
172static void init_x2apic_ldr(void) 114 return apic == &apic_x2apic_phys;
173{
174} 115}
175 116
176struct apic apic_x2apic_phys = { 117static struct apic apic_x2apic_phys = {
177 118
178 .name = "physical x2apic", 119 .name = "physical x2apic",
179 .probe = NULL, 120 .probe = x2apic_phys_probe,
180 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, 121 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
181 .apic_id_registered = x2apic_apic_id_registered, 122 .apic_id_registered = x2apic_apic_id_registered,
182 123
@@ -195,8 +136,6 @@ struct apic apic_x2apic_phys = {
195 .ioapic_phys_id_map = NULL, 136 .ioapic_phys_id_map = NULL,
196 .setup_apic_routing = NULL, 137 .setup_apic_routing = NULL,
197 .multi_timer_check = NULL, 138 .multi_timer_check = NULL,
198 .apicid_to_node = NULL,
199 .cpu_to_logical_apicid = NULL,
200 .cpu_present_to_apicid = default_cpu_present_to_apicid, 139 .cpu_present_to_apicid = default_cpu_present_to_apicid,
201 .apicid_to_cpu_present = NULL, 140 .apicid_to_cpu_present = NULL,
202 .setup_portio_remap = NULL, 141 .setup_portio_remap = NULL,
@@ -205,8 +144,8 @@ struct apic apic_x2apic_phys = {
205 .phys_pkg_id = x2apic_phys_pkg_id, 144 .phys_pkg_id = x2apic_phys_pkg_id,
206 .mps_oem_check = NULL, 145 .mps_oem_check = NULL,
207 146
208 .get_apic_id = x2apic_phys_get_apic_id, 147 .get_apic_id = x2apic_get_apic_id,
209 .set_apic_id = set_apic_id, 148 .set_apic_id = x2apic_set_apic_id,
210 .apic_id_mask = 0xFFFFFFFFu, 149 .apic_id_mask = 0xFFFFFFFFu,
211 150
212 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, 151 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
@@ -231,3 +170,5 @@ struct apic apic_x2apic_phys = {
231 .wait_icr_idle = native_x2apic_wait_icr_idle, 170 .wait_icr_idle = native_x2apic_wait_icr_idle,
232 .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle, 171 .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
233}; 172};
173
174apic_driver(apic_x2apic_phys);
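Note on the x2apic_phys.c hunk above: three nearly identical send loops collapse into one __x2apic_send_IPI_mask() helper that takes a destination flag and skips the sending CPU only in the ALLBUT case. A small sketch of that consolidation pattern (the enum and names are illustrative, not the kernel's):

#include <stdio.h>

enum dest_mode { DEST_ALLINC, DEST_ALLBUT };

static void send_one(int cpu, int vector)
{
	printf("IPI vector %#x -> cpu %d\n", vector, cpu);
}

/* One helper replaces separate "mask", "all-but-self" and "all" loops. */
static void send_mask(const int *cpus, int ncpus, int vector,
		      enum dest_mode mode, int this_cpu)
{
	for (int i = 0; i < ncpus; i++) {
		if (mode == DEST_ALLBUT && cpus[i] == this_cpu)
			continue;
		send_one(cpus[i], vector);
	}
}

int main(void)
{
	int online[] = { 0, 1, 2, 3 };

	send_mask(online, 4, 0xfd, DEST_ALLINC, 0);	/* includes cpu 0 */
	send_mask(online, 4, 0xfd, DEST_ALLBUT, 0);	/* skips cpu 0    */
	return 0;
}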
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index f744f54cb248..adc66c3a1fef 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -5,7 +5,7 @@
5 * 5 *
6 * SGI UV APIC functions (note: not an Intel compatible APIC) 6 * SGI UV APIC functions (note: not an Intel compatible APIC)
7 * 7 *
8 * Copyright (C) 2007-2009 Silicon Graphics, Inc. All rights reserved. 8 * Copyright (C) 2007-2010 Silicon Graphics, Inc. All rights reserved.
9 */ 9 */
10#include <linux/cpumask.h> 10#include <linux/cpumask.h>
11#include <linux/hardirq.h> 11#include <linux/hardirq.h>
@@ -23,6 +23,8 @@
23#include <linux/io.h> 23#include <linux/io.h>
24#include <linux/pci.h> 24#include <linux/pci.h>
25#include <linux/kdebug.h> 25#include <linux/kdebug.h>
26#include <linux/delay.h>
27#include <linux/crash_dump.h>
26 28
27#include <asm/uv/uv_mmrs.h> 29#include <asm/uv/uv_mmrs.h>
28#include <asm/uv/uv_hub.h> 30#include <asm/uv/uv_hub.h>
@@ -34,6 +36,14 @@
34#include <asm/ipi.h> 36#include <asm/ipi.h>
35#include <asm/smp.h> 37#include <asm/smp.h>
36#include <asm/x86_init.h> 38#include <asm/x86_init.h>
39#include <asm/emergency-restart.h>
40#include <asm/nmi.h>
41
 42/* The BMC sets a bit in this MMR to non-zero before sending an NMI */
43#define UVH_NMI_MMR UVH_SCRATCH5
44#define UVH_NMI_MMR_CLEAR (UVH_NMI_MMR + 8)
45#define UV_NMI_PENDING_MASK (1UL << 63)
46DEFINE_PER_CPU(unsigned long, cpu_last_nmi_count);
37 47
38DEFINE_PER_CPU(int, x2apic_extra_bits); 48DEFINE_PER_CPU(int, x2apic_extra_bits);
39 49
@@ -41,10 +51,25 @@ DEFINE_PER_CPU(int, x2apic_extra_bits);
41 51
42static enum uv_system_type uv_system_type; 52static enum uv_system_type uv_system_type;
43static u64 gru_start_paddr, gru_end_paddr; 53static u64 gru_start_paddr, gru_end_paddr;
54static union uvh_apicid uvh_apicid;
44int uv_min_hub_revision_id; 55int uv_min_hub_revision_id;
45EXPORT_SYMBOL_GPL(uv_min_hub_revision_id); 56EXPORT_SYMBOL_GPL(uv_min_hub_revision_id);
57unsigned int uv_apicid_hibits;
58EXPORT_SYMBOL_GPL(uv_apicid_hibits);
46static DEFINE_SPINLOCK(uv_nmi_lock); 59static DEFINE_SPINLOCK(uv_nmi_lock);
47 60
61static struct apic apic_x2apic_uv_x;
62
63static unsigned long __init uv_early_read_mmr(unsigned long addr)
64{
65 unsigned long val, *mmr;
66
67 mmr = early_ioremap(UV_LOCAL_MMR_BASE | addr, sizeof(*mmr));
68 val = *mmr;
69 early_iounmap(mmr, sizeof(*mmr));
70 return val;
71}
72
48static inline bool is_GRU_range(u64 start, u64 end) 73static inline bool is_GRU_range(u64 start, u64 end)
49{ 74{
50 return start >= gru_start_paddr && end <= gru_end_paddr; 75 return start >= gru_start_paddr && end <= gru_end_paddr;
@@ -55,27 +80,63 @@ static bool uv_is_untracked_pat_range(u64 start, u64 end)
55 return is_ISA_range(start, end) || is_GRU_range(start, end); 80 return is_ISA_range(start, end) || is_GRU_range(start, end);
56} 81}
57 82
58static int early_get_nodeid(void) 83static int __init early_get_pnodeid(void)
59{ 84{
60 union uvh_node_id_u node_id; 85 union uvh_node_id_u node_id;
61 unsigned long *mmr; 86 union uvh_rh_gam_config_mmr_u m_n_config;
62 87 int pnode;
63 mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr));
64 node_id.v = *mmr;
65 early_iounmap(mmr, sizeof(*mmr));
66 88
67 /* Currently, all blades have same revision number */ 89 /* Currently, all blades have same revision number */
90 node_id.v = uv_early_read_mmr(UVH_NODE_ID);
91 m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_CONFIG_MMR);
68 uv_min_hub_revision_id = node_id.s.revision; 92 uv_min_hub_revision_id = node_id.s.revision;
69 93
70 return node_id.s.node_id; 94 if (node_id.s.part_number == UV2_HUB_PART_NUMBER)
95 uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1;
96
97 uv_hub_info->hub_revision = uv_min_hub_revision_id;
98 pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1);
99 return pnode;
100}
101
102static void __init early_get_apic_pnode_shift(void)
103{
104 uvh_apicid.v = uv_early_read_mmr(UVH_APICID);
105 if (!uvh_apicid.v)
106 /*
107 * Old bios, use default value
108 */
109 uvh_apicid.s.pnode_shift = UV_APIC_PNODE_SHIFT;
110}
111
112/*
113 * Add an extra bit as dictated by bios to the destination apicid of
114 * interrupts potentially passing through the UV HUB. This prevents
115 * a deadlock between interrupts and IO port operations.
116 */
117static void __init uv_set_apicid_hibit(void)
118{
119 union uv1h_lb_target_physical_apic_id_mask_u apicid_mask;
120
121 if (is_uv1_hub()) {
122 apicid_mask.v =
123 uv_early_read_mmr(UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK);
124 uv_apicid_hibits =
125 apicid_mask.s1.bit_enables & UV_APICID_HIBIT_MASK;
126 }
71} 127}
72 128
73static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 129static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
74{ 130{
75 int nodeid; 131 int pnodeid, is_uv1, is_uv2;
76 132
77 if (!strcmp(oem_id, "SGI")) { 133 is_uv1 = !strcmp(oem_id, "SGI");
78 nodeid = early_get_nodeid(); 134 is_uv2 = !strcmp(oem_id, "SGI2");
135 if (is_uv1 || is_uv2) {
136 uv_hub_info->hub_revision =
137 is_uv1 ? UV1_HUB_REVISION_BASE : UV2_HUB_REVISION_BASE;
138 pnodeid = early_get_pnodeid();
139 early_get_apic_pnode_shift();
79 x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; 140 x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
80 x86_platform.nmi_init = uv_nmi_init; 141 x86_platform.nmi_init = uv_nmi_init;
81 if (!strcmp(oem_table_id, "UVL")) 142 if (!strcmp(oem_table_id, "UVL"))
@@ -83,9 +144,10 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
83 else if (!strcmp(oem_table_id, "UVX")) 144 else if (!strcmp(oem_table_id, "UVX"))
84 uv_system_type = UV_X2APIC; 145 uv_system_type = UV_X2APIC;
85 else if (!strcmp(oem_table_id, "UVH")) { 146 else if (!strcmp(oem_table_id, "UVH")) {
86 __get_cpu_var(x2apic_extra_bits) = 147 __this_cpu_write(x2apic_extra_bits,
87 nodeid << (UV_APIC_PNODE_SHIFT - 1); 148 pnodeid << uvh_apicid.s.pnode_shift);
88 uv_system_type = UV_NON_UNIQUE_APIC; 149 uv_system_type = UV_NON_UNIQUE_APIC;
150 uv_set_apicid_hibit();
89 return 1; 151 return 1;
90 } 152 }
91 } 153 }
@@ -139,6 +201,7 @@ static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_ri
139 int pnode; 201 int pnode;
140 202
141 pnode = uv_apicid_to_pnode(phys_apicid); 203 pnode = uv_apicid_to_pnode(phys_apicid);
204 phys_apicid |= uv_apicid_hibits;
142 val = (1UL << UVH_IPI_INT_SEND_SHFT) | 205 val = (1UL << UVH_IPI_INT_SEND_SHFT) |
143 (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | 206 (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
144 ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | 207 ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
@@ -220,7 +283,7 @@ static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask)
220 int cpu = cpumask_first(cpumask); 283 int cpu = cpumask_first(cpumask);
221 284
222 if ((unsigned)cpu < nr_cpu_ids) 285 if ((unsigned)cpu < nr_cpu_ids)
223 return per_cpu(x86_cpu_to_apicid, cpu); 286 return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits;
224 else 287 else
225 return BAD_APICID; 288 return BAD_APICID;
226} 289}
@@ -239,7 +302,7 @@ uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
239 if (cpumask_test_cpu(cpu, cpu_online_mask)) 302 if (cpumask_test_cpu(cpu, cpu_online_mask))
240 break; 303 break;
241 } 304 }
242 return per_cpu(x86_cpu_to_apicid, cpu); 305 return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits;
243} 306}
244 307
245static unsigned int x2apic_get_apic_id(unsigned long x) 308static unsigned int x2apic_get_apic_id(unsigned long x)
@@ -247,7 +310,7 @@ static unsigned int x2apic_get_apic_id(unsigned long x)
247 unsigned int id; 310 unsigned int id;
248 311
249 WARN_ON(preemptible() && num_online_cpus() > 1); 312 WARN_ON(preemptible() && num_online_cpus() > 1);
250 id = x | __get_cpu_var(x2apic_extra_bits); 313 id = x | __this_cpu_read(x2apic_extra_bits);
251 314
252 return id; 315 return id;
253} 316}
@@ -277,10 +340,15 @@ static void uv_send_IPI_self(int vector)
277 apic_write(APIC_SELF_IPI, vector); 340 apic_write(APIC_SELF_IPI, vector);
278} 341}
279 342
280struct apic __refdata apic_x2apic_uv_x = { 343static int uv_probe(void)
344{
345 return apic == &apic_x2apic_uv_x;
346}
347
348static struct apic __refdata apic_x2apic_uv_x = {
281 349
282 .name = "UV large system", 350 .name = "UV large system",
283 .probe = NULL, 351 .probe = uv_probe,
284 .acpi_madt_oem_check = uv_acpi_madt_oem_check, 352 .acpi_madt_oem_check = uv_acpi_madt_oem_check,
285 .apic_id_registered = uv_apic_id_registered, 353 .apic_id_registered = uv_apic_id_registered,
286 354
@@ -299,8 +367,6 @@ struct apic __refdata apic_x2apic_uv_x = {
299 .ioapic_phys_id_map = NULL, 367 .ioapic_phys_id_map = NULL,
300 .setup_apic_routing = NULL, 368 .setup_apic_routing = NULL,
301 .multi_timer_check = NULL, 369 .multi_timer_check = NULL,
302 .apicid_to_node = NULL,
303 .cpu_to_logical_apicid = NULL,
304 .cpu_present_to_apicid = default_cpu_present_to_apicid, 370 .cpu_present_to_apicid = default_cpu_present_to_apicid,
305 .apicid_to_cpu_present = NULL, 371 .apicid_to_cpu_present = NULL,
306 .setup_portio_remap = NULL, 372 .setup_portio_remap = NULL,
@@ -339,7 +405,7 @@ struct apic __refdata apic_x2apic_uv_x = {
339 405
340static __cpuinit void set_x2apic_extra_bits(int pnode) 406static __cpuinit void set_x2apic_extra_bits(int pnode)
341{ 407{
342 __get_cpu_var(x2apic_extra_bits) = (pnode << 6); 408 __this_cpu_write(x2apic_extra_bits, pnode << uvh_apicid.s.pnode_shift);
343} 409}
344 410
345/* 411/*
@@ -363,14 +429,14 @@ struct redir_addr {
363#define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 429#define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT
364 430
365static __initdata struct redir_addr redir_addrs[] = { 431static __initdata struct redir_addr redir_addrs[] = {
366 {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_SI_ALIAS0_OVERLAY_CONFIG}, 432 {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR},
367 {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_SI_ALIAS1_OVERLAY_CONFIG}, 433 {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR},
368 {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_SI_ALIAS2_OVERLAY_CONFIG}, 434 {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR},
369}; 435};
370 436
371static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) 437static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
372{ 438{
373 union uvh_si_alias0_overlay_config_u alias; 439 union uvh_rh_gam_alias210_overlay_config_2_mmr_u alias;
374 union uvh_rh_gam_alias210_redirect_config_2_mmr_u redirect; 440 union uvh_rh_gam_alias210_redirect_config_2_mmr_u redirect;
375 int i; 441 int i;
376 442
@@ -430,12 +496,19 @@ static __init void map_mmr_high(int max_pnode)
430static __init void map_mmioh_high(int max_pnode) 496static __init void map_mmioh_high(int max_pnode)
431{ 497{
432 union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; 498 union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
433 int shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; 499 int shift;
434 500
435 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); 501 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
436 if (mmioh.s.enable) 502 if (is_uv1_hub() && mmioh.s1.enable) {
437 map_high("MMIOH", mmioh.s.base, shift, mmioh.s.m_io, 503 shift = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
504 map_high("MMIOH", mmioh.s1.base, shift, mmioh.s1.m_io,
438 max_pnode, map_uc); 505 max_pnode, map_uc);
506 }
507 if (is_uv2_hub() && mmioh.s2.enable) {
508 shift = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
509 map_high("MMIOH", mmioh.s2.base, shift, mmioh.s2.m_io,
510 max_pnode, map_uc);
511 }
439} 512}
440 513
441static __init void map_low_mmrs(void) 514static __init void map_low_mmrs(void)
@@ -559,14 +632,14 @@ late_initcall(uv_init_heartbeat);
559 632
560/* Direct Legacy VGA I/O traffic to designated IOH */ 633/* Direct Legacy VGA I/O traffic to designated IOH */
561int uv_set_vga_state(struct pci_dev *pdev, bool decode, 634int uv_set_vga_state(struct pci_dev *pdev, bool decode,
562 unsigned int command_bits, bool change_bridge) 635 unsigned int command_bits, u32 flags)
563{ 636{
564 int domain, bus, rc; 637 int domain, bus, rc;
565 638
566 PR_DEVEL("devfn %x decode %d cmd %x chg_brdg %d\n", 639 PR_DEVEL("devfn %x decode %d cmd %x flags %d\n",
567 pdev->devfn, decode, command_bits, change_bridge); 640 pdev->devfn, decode, command_bits, flags);
568 641
569 if (!change_bridge) 642 if (!(flags & PCI_VGA_STATE_CHANGE_BRIDGE))
570 return 0; 643 return 0;
571 644
572 if ((command_bits & PCI_COMMAND_IO) == 0) 645 if ((command_bits & PCI_COMMAND_IO) == 0)
@@ -602,18 +675,46 @@ void __cpuinit uv_cpu_init(void)
602 */ 675 */
603int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data) 676int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
604{ 677{
605 if (reason != DIE_NMI_IPI) 678 unsigned long real_uv_nmi;
679 int bid;
680
681 if (reason != DIE_NMIUNKNOWN)
606 return NOTIFY_OK; 682 return NOTIFY_OK;
607 683
608 if (in_crash_kexec) 684 if (in_crash_kexec)
609 /* do nothing if entering the crash kernel */ 685 /* do nothing if entering the crash kernel */
610 return NOTIFY_OK; 686 return NOTIFY_OK;
687
688 /*
689 * Each blade has an MMR that indicates when an NMI has been sent
690 * to cpus on the blade. If an NMI is detected, atomically
691 * clear the MMR and update a per-blade NMI count used to
692 * cause each cpu on the blade to notice a new NMI.
693 */
694 bid = uv_numa_blade_id();
695 real_uv_nmi = (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK);
696
697 if (unlikely(real_uv_nmi)) {
698 spin_lock(&uv_blade_info[bid].nmi_lock);
699 real_uv_nmi = (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK);
700 if (real_uv_nmi) {
701 uv_blade_info[bid].nmi_count++;
702 uv_write_local_mmr(UVH_NMI_MMR_CLEAR, UV_NMI_PENDING_MASK);
703 }
704 spin_unlock(&uv_blade_info[bid].nmi_lock);
705 }
706
707 if (likely(__get_cpu_var(cpu_last_nmi_count) == uv_blade_info[bid].nmi_count))
708 return NOTIFY_DONE;
709
710 __get_cpu_var(cpu_last_nmi_count) = uv_blade_info[bid].nmi_count;
711
611 /* 712 /*
612 * Use a lock so only one cpu prints at a time 713 * Use a lock so only one cpu prints at a time.
613 * to prevent intermixed output. 714 * This prevents intermixed output.
614 */ 715 */
615 spin_lock(&uv_nmi_lock); 716 spin_lock(&uv_nmi_lock);
616 pr_info("NMI stack dump cpu %u:\n", smp_processor_id()); 717 pr_info("UV NMI stack dump cpu %u:\n", smp_processor_id());
617 dump_stack(); 718 dump_stack();
618 spin_unlock(&uv_nmi_lock); 719 spin_unlock(&uv_nmi_lock);
619 720
@@ -621,7 +722,8 @@ int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
621} 722}
622 723
623static struct notifier_block uv_dump_stack_nmi_nb = { 724static struct notifier_block uv_dump_stack_nmi_nb = {
624 .notifier_call = uv_handle_nmi 725 .notifier_call = uv_handle_nmi,
726 .priority = NMI_LOCAL_LOW_PRIOR - 1,
625}; 727};
626 728
627void uv_register_nmi_notifier(void) 729void uv_register_nmi_notifier(void)
@@ -644,28 +746,34 @@ void uv_nmi_init(void)
644 746
645void __init uv_system_init(void) 747void __init uv_system_init(void)
646{ 748{
647 union uvh_si_addr_map_config_u m_n_config; 749 union uvh_rh_gam_config_mmr_u m_n_config;
750 union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
648 union uvh_node_id_u node_id; 751 union uvh_node_id_u node_id;
649 unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; 752 unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
650 int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; 753 int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val, n_io;
651 int gnode_extra, max_pnode = 0; 754 int gnode_extra, max_pnode = 0;
652 unsigned long mmr_base, present, paddr; 755 unsigned long mmr_base, present, paddr;
653 unsigned short pnode_mask; 756 unsigned short pnode_mask, pnode_io_mask;
654 757
758 printk(KERN_INFO "UV: Found %s hub\n", is_uv1_hub() ? "UV1" : "UV2");
655 map_low_mmrs(); 759 map_low_mmrs();
656 760
657 m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); 761 m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR );
658 m_val = m_n_config.s.m_skt; 762 m_val = m_n_config.s.m_skt;
659 n_val = m_n_config.s.n_skt; 763 n_val = m_n_config.s.n_skt;
764 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
765 n_io = is_uv1_hub() ? mmioh.s1.n_io : mmioh.s2.n_io;
660 mmr_base = 766 mmr_base =
661 uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & 767 uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
662 ~UV_MMR_ENABLE; 768 ~UV_MMR_ENABLE;
663 pnode_mask = (1 << n_val) - 1; 769 pnode_mask = (1 << n_val) - 1;
770 pnode_io_mask = (1 << n_io) - 1;
771
664 node_id.v = uv_read_local_mmr(UVH_NODE_ID); 772 node_id.v = uv_read_local_mmr(UVH_NODE_ID);
665 gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1; 773 gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1;
666 gnode_upper = ((unsigned long)gnode_extra << m_val); 774 gnode_upper = ((unsigned long)gnode_extra << m_val);
667 printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n", 775 printk(KERN_INFO "UV: N %d, M %d, N_IO: %d, gnode_upper 0x%lx, gnode_extra 0x%x, pnode_mask 0x%x, pnode_io_mask 0x%x\n",
668 n_val, m_val, gnode_upper, gnode_extra); 776 n_val, m_val, n_io, gnode_upper, gnode_extra, pnode_mask, pnode_io_mask);
669 777
670 printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base); 778 printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base);
671 779
@@ -675,8 +783,9 @@ void __init uv_system_init(void)
675 printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades()); 783 printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades());
676 784
677 bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); 785 bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
678 uv_blade_info = kmalloc(bytes, GFP_KERNEL); 786 uv_blade_info = kzalloc(bytes, GFP_KERNEL);
679 BUG_ON(!uv_blade_info); 787 BUG_ON(!uv_blade_info);
788
680 for (blade = 0; blade < uv_num_possible_blades(); blade++) 789 for (blade = 0; blade < uv_num_possible_blades(); blade++)
681 uv_blade_info[blade].memory_nid = -1; 790 uv_blade_info[blade].memory_nid = -1;
682 791
@@ -698,10 +807,11 @@ void __init uv_system_init(void)
698 for (j = 0; j < 64; j++) { 807 for (j = 0; j < 64; j++) {
699 if (!test_bit(j, &present)) 808 if (!test_bit(j, &present))
700 continue; 809 continue;
701 pnode = (i * 64 + j); 810 pnode = (i * 64 + j) & pnode_mask;
702 uv_blade_info[blade].pnode = pnode; 811 uv_blade_info[blade].pnode = pnode;
703 uv_blade_info[blade].nr_possible_cpus = 0; 812 uv_blade_info[blade].nr_possible_cpus = 0;
704 uv_blade_info[blade].nr_online_cpus = 0; 813 uv_blade_info[blade].nr_online_cpus = 0;
814 spin_lock_init(&uv_blade_info[blade].nmi_lock);
705 max_pnode = max(pnode, max_pnode); 815 max_pnode = max(pnode, max_pnode);
706 blade++; 816 blade++;
707 } 817 }
@@ -716,6 +826,13 @@ void __init uv_system_init(void)
716 int apicid = per_cpu(x86_cpu_to_apicid, cpu); 826 int apicid = per_cpu(x86_cpu_to_apicid, cpu);
717 827
718 nid = cpu_to_node(cpu); 828 nid = cpu_to_node(cpu);
829 /*
830 * apic_pnode_shift must be set before calling uv_apicid_to_pnode();
831 */
832 uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
833 uv_cpu_hub_info(cpu)->apic_pnode_shift = uvh_apicid.s.pnode_shift;
834 uv_cpu_hub_info(cpu)->hub_revision = uv_hub_info->hub_revision;
835
719 pnode = uv_apicid_to_pnode(apicid); 836 pnode = uv_apicid_to_pnode(apicid);
720 blade = boot_pnode_to_blade(pnode); 837 blade = boot_pnode_to_blade(pnode);
721 lcpu = uv_blade_info[blade].nr_possible_cpus; 838 lcpu = uv_blade_info[blade].nr_possible_cpus;
@@ -731,7 +848,6 @@ void __init uv_system_init(void)
731 uv_cpu_hub_info(cpu)->numa_blade_id = blade; 848 uv_cpu_hub_info(cpu)->numa_blade_id = blade;
732 uv_cpu_hub_info(cpu)->blade_processor_id = lcpu; 849 uv_cpu_hub_info(cpu)->blade_processor_id = lcpu;
733 uv_cpu_hub_info(cpu)->pnode = pnode; 850 uv_cpu_hub_info(cpu)->pnode = pnode;
734 uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
735 uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1; 851 uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1;
736 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; 852 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
737 uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; 853 uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra;
@@ -755,7 +871,7 @@ void __init uv_system_init(void)
755 871
756 map_gru_high(max_pnode); 872 map_gru_high(max_pnode);
757 map_mmr_high(max_pnode); 873 map_mmr_high(max_pnode);
758 map_mmioh_high(max_pnode); 874 map_mmioh_high(max_pnode & pnode_io_mask);
759 875
760 uv_cpu_init(); 876 uv_cpu_init();
761 uv_scir_register_cpu_notifier(); 877 uv_scir_register_cpu_notifier();
@@ -764,4 +880,13 @@ void __init uv_system_init(void)
764 880
765 /* register Legacy VGA I/O redirection handler */ 881 /* register Legacy VGA I/O redirection handler */
766 pci_register_set_vga_state(uv_set_vga_state); 882 pci_register_set_vga_state(uv_set_vga_state);
883
884 /*
885 * For a kdump kernel the reset must be BOOT_ACPI, not BOOT_EFI, as
886 * EFI is not enabled in the kdump kernel.
887 */
888 if (is_kdump_kernel())
889 reboot_type = BOOT_ACPI;
767} 890}
891
892apic_driver(apic_x2apic_uv_x);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 4c9c67bf09b7..965a7666c283 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -66,7 +66,7 @@
66 * 1.5: Fix segment register reloading (in case of bad segments saved 66 * 1.5: Fix segment register reloading (in case of bad segments saved
67 * across BIOS call). 67 * across BIOS call).
68 * Stephen Rothwell 68 * Stephen Rothwell
69 * 1.6: Cope with complier/assembler differences. 69 * 1.6: Cope with compiler/assembler differences.
70 * Only try to turn off the first display device. 70 * Only try to turn off the first display device.
71 * Fix OOPS at power off with no APM BIOS by Jan Echternach 71 * Fix OOPS at power off with no APM BIOS by Jan Echternach
72 * <echter@informatik.uni-rostock.de> 72 * <echter@informatik.uni-rostock.de>
@@ -189,8 +189,8 @@
189 * Intel Order Number 241704-001. Microsoft Part Number 781-110-X01. 189 * Intel Order Number 241704-001. Microsoft Part Number 781-110-X01.
190 * 190 *
191 * [This document is available free from Intel by calling 800.628.8686 (fax 191 * [This document is available free from Intel by calling 800.628.8686 (fax
192 * 916.356.6100) or 800.548.4725; or via anonymous ftp from 192 * 916.356.6100) or 800.548.4725; or from
193 * ftp://ftp.intel.com/pub/IAL/software_specs/apmv11.doc. It is also 193 * http://www.microsoft.com/whdc/archive/amp_12.mspx It is also
194 * available from Microsoft by calling 206.882.8080.] 194 * available from Microsoft by calling 206.882.8080.]
195 * 195 *
196 * APM 1.2 Reference: 196 * APM 1.2 Reference:
@@ -227,6 +227,8 @@
227#include <linux/suspend.h> 227#include <linux/suspend.h>
228#include <linux/kthread.h> 228#include <linux/kthread.h>
229#include <linux/jiffies.h> 229#include <linux/jiffies.h>
230#include <linux/acpi.h>
231#include <linux/syscore_ops.h>
230 232
231#include <asm/system.h> 233#include <asm/system.h>
232#include <asm/uaccess.h> 234#include <asm/uaccess.h>
@@ -359,6 +361,7 @@ struct apm_user {
359 * idle percentage above which bios idle calls are done 361 * idle percentage above which bios idle calls are done
360 */ 362 */
361#ifdef CONFIG_APM_CPU_IDLE 363#ifdef CONFIG_APM_CPU_IDLE
364#warning deprecated CONFIG_APM_CPU_IDLE will be deleted in 2012
362#define DEFAULT_IDLE_THRESHOLD 95 365#define DEFAULT_IDLE_THRESHOLD 95
363#else 366#else
364#define DEFAULT_IDLE_THRESHOLD 100 367#define DEFAULT_IDLE_THRESHOLD 100
@@ -902,6 +905,7 @@ static void apm_cpu_idle(void)
902 unsigned int jiffies_since_last_check = jiffies - last_jiffies; 905 unsigned int jiffies_since_last_check = jiffies - last_jiffies;
903 unsigned int bucket; 906 unsigned int bucket;
904 907
908 WARN_ONCE(1, "deprecated apm_cpu_idle will be deleted in 2012");
905recalc: 909recalc:
906 if (jiffies_since_last_check > IDLE_CALC_LIMIT) { 910 if (jiffies_since_last_check > IDLE_CALC_LIMIT) {
907 use_apm_idle = 0; 911 use_apm_idle = 0;
@@ -975,20 +979,10 @@ recalc:
975 979
976static void apm_power_off(void) 980static void apm_power_off(void)
977{ 981{
978 unsigned char po_bios_call[] = {
979 0xb8, 0x00, 0x10, /* movw $0x1000,ax */
980 0x8e, 0xd0, /* movw ax,ss */
981 0xbc, 0x00, 0xf0, /* movw $0xf000,sp */
982 0xb8, 0x07, 0x53, /* movw $0x5307,ax */
983 0xbb, 0x01, 0x00, /* movw $0x0001,bx */
984 0xb9, 0x03, 0x00, /* movw $0x0003,cx */
985 0xcd, 0x15 /* int $0x15 */
986 };
987
988 /* Some bioses don't like being called from CPU != 0 */ 982 /* Some bioses don't like being called from CPU != 0 */
989 if (apm_info.realmode_power_off) { 983 if (apm_info.realmode_power_off) {
990 set_cpus_allowed_ptr(current, cpumask_of(0)); 984 set_cpus_allowed_ptr(current, cpumask_of(0));
991 machine_real_restart(po_bios_call, sizeof(po_bios_call)); 985 machine_real_restart(MRR_APM);
992 } else { 986 } else {
993 (void)set_system_power_state(APM_STATE_OFF); 987 (void)set_system_power_state(APM_STATE_OFF);
994 } 988 }
@@ -1246,7 +1240,7 @@ static int suspend(int vetoable)
1246 dpm_suspend_noirq(PMSG_SUSPEND); 1240 dpm_suspend_noirq(PMSG_SUSPEND);
1247 1241
1248 local_irq_disable(); 1242 local_irq_disable();
1249 sysdev_suspend(PMSG_SUSPEND); 1243 syscore_suspend();
1250 1244
1251 local_irq_enable(); 1245 local_irq_enable();
1252 1246
@@ -1264,7 +1258,7 @@ static int suspend(int vetoable)
1264 apm_error("suspend", err); 1258 apm_error("suspend", err);
1265 err = (err == APM_SUCCESS) ? 0 : -EIO; 1259 err = (err == APM_SUCCESS) ? 0 : -EIO;
1266 1260
1267 sysdev_resume(); 1261 syscore_resume();
1268 local_irq_enable(); 1262 local_irq_enable();
1269 1263
1270 dpm_resume_noirq(PMSG_RESUME); 1264 dpm_resume_noirq(PMSG_RESUME);
@@ -1288,7 +1282,7 @@ static void standby(void)
1288 dpm_suspend_noirq(PMSG_SUSPEND); 1282 dpm_suspend_noirq(PMSG_SUSPEND);
1289 1283
1290 local_irq_disable(); 1284 local_irq_disable();
1291 sysdev_suspend(PMSG_SUSPEND); 1285 syscore_suspend();
1292 local_irq_enable(); 1286 local_irq_enable();
1293 1287
1294 err = set_system_power_state(APM_STATE_STANDBY); 1288 err = set_system_power_state(APM_STATE_STANDBY);
@@ -1296,7 +1290,7 @@ static void standby(void)
1296 apm_error("standby", err); 1290 apm_error("standby", err);
1297 1291
1298 local_irq_disable(); 1292 local_irq_disable();
1299 sysdev_resume(); 1293 syscore_resume();
1300 local_irq_enable(); 1294 local_irq_enable();
1301 1295
1302 dpm_resume_noirq(PMSG_RESUME); 1296 dpm_resume_noirq(PMSG_RESUME);
@@ -1926,6 +1920,7 @@ static const struct file_operations apm_bios_fops = {
1926 .unlocked_ioctl = do_ioctl, 1920 .unlocked_ioctl = do_ioctl,
1927 .open = do_open, 1921 .open = do_open,
1928 .release = do_release, 1922 .release = do_release,
1923 .llseek = noop_llseek,
1929}; 1924};
1930 1925
1931static struct miscdevice apm_device = { 1926static struct miscdevice apm_device = {
@@ -2330,12 +2325,11 @@ static int __init apm_init(void)
2330 apm_info.disabled = 1; 2325 apm_info.disabled = 1;
2331 return -ENODEV; 2326 return -ENODEV;
2332 } 2327 }
2333 if (pm_flags & PM_ACPI) { 2328 if (!acpi_disabled) {
2334 printk(KERN_NOTICE "apm: overridden by ACPI.\n"); 2329 printk(KERN_NOTICE "apm: overridden by ACPI.\n");
2335 apm_info.disabled = 1; 2330 apm_info.disabled = 1;
2336 return -ENODEV; 2331 return -ENODEV;
2337 } 2332 }
2338 pm_flags |= PM_APM;
2339 2333
2340 /* 2334 /*
2341 * Set up the long jump entry point to the APM BIOS, which is called 2335 * Set up the long jump entry point to the APM BIOS, which is called
@@ -2427,7 +2421,6 @@ static void __exit apm_exit(void)
2427 kthread_stop(kapmd_task); 2421 kthread_stop(kapmd_task);
2428 kapmd_task = NULL; 2422 kapmd_task = NULL;
2429 } 2423 }
2430 pm_flags &= ~PM_APM;
2431} 2424}
2432 2425
2433module_init(apm_init); 2426module_init(apm_init);
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index cfa82c899f47..4f13fafc5264 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -1,5 +1,70 @@
1/*
2 * Generate definitions needed by assembly language modules.
3 * This code generates raw asm output which is post-processed to extract
4 * and format the required data.
5 */
6#define COMPILE_OFFSETS
7
8#include <linux/crypto.h>
9#include <linux/sched.h>
10#include <linux/stddef.h>
11#include <linux/hardirq.h>
12#include <linux/suspend.h>
13#include <linux/kbuild.h>
14#include <asm/processor.h>
15#include <asm/thread_info.h>
16#include <asm/sigframe.h>
17#include <asm/bootparam.h>
18#include <asm/suspend.h>
19
20#ifdef CONFIG_XEN
21#include <xen/interface/xen.h>
22#endif
23
1#ifdef CONFIG_X86_32 24#ifdef CONFIG_X86_32
2# include "asm-offsets_32.c" 25# include "asm-offsets_32.c"
3#else 26#else
4# include "asm-offsets_64.c" 27# include "asm-offsets_64.c"
5#endif 28#endif
29
30void common(void) {
31 BLANK();
32 OFFSET(TI_flags, thread_info, flags);
33 OFFSET(TI_status, thread_info, status);
34 OFFSET(TI_addr_limit, thread_info, addr_limit);
35 OFFSET(TI_preempt_count, thread_info, preempt_count);
36
37 BLANK();
38 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
39
40 BLANK();
41 OFFSET(pbe_address, pbe, address);
42 OFFSET(pbe_orig_address, pbe, orig_address);
43 OFFSET(pbe_next, pbe, next);
44
45#ifdef CONFIG_PARAVIRT
46 BLANK();
47 OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
48 OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
49 OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
50 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
51 OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
52 OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
53 OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
54 OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
55 OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
56#endif
57
58#ifdef CONFIG_XEN
59 BLANK();
60 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
61 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
62#endif
63
64 BLANK();
65 OFFSET(BP_scratch, boot_params, scratch);
66 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
67 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
68 OFFSET(BP_version, boot_params, hdr.version);
69 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
70}
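[Editor's aside: the OFFSET()/DEFINE() helpers consolidated into common() above come from include/linux/kbuild.h; they work by emitting marker strings into the compiler's assembly output, which the build then post-processes into the generated asm-offsets header, exactly as the file's own comment says ("raw asm output which is post-processed"). A minimal user-space sketch of that mechanism follows; the struct layout and symbol names here are illustrative, not the kernel's.]

    #include <stddef.h>

    /* A stand-in structure; the kernel uses thread_info, pt_regs, etc. */
    struct thread_info {
            unsigned long flags;
            int preempt_count;
    };

    /* Emit "->SYM VALUE comment" markers into the compiler's asm output. */
    #define DEFINE(sym, val) \
            asm volatile("\n->" #sym " %0 " #val : : "i" (val))
    #define BLANK() asm volatile("\n->" : : )
    #define OFFSET(sym, str, mem) DEFINE(sym, offsetof(struct str, mem))

    void common_offsets(void)
    {
            BLANK();
            OFFSET(TI_flags, thread_info, flags);
            OFFSET(TI_preempt_count, thread_info, preempt_count);
    }

    /*
     * Compiling this with "gcc -S" and grepping for "->" yields lines such as
     * "->TI_flags $0 offsetof(struct thread_info, flags)", which a small sed
     * pass rewrites into "#define TI_flags 0" in the generated header.
     */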
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index dfdbf6403895..c29d631af6fc 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -1,26 +1,4 @@
1/*
2 * Generate definitions needed by assembly language modules.
3 * This code generates raw asm output which is post-processed
4 * to extract and format the required data.
5 */
6
7#include <linux/crypto.h>
8#include <linux/sched.h>
9#include <linux/signal.h>
10#include <linux/personality.h>
11#include <linux/suspend.h>
12#include <linux/kbuild.h>
13#include <asm/ucontext.h> 1#include <asm/ucontext.h>
14#include <asm/sigframe.h>
15#include <asm/pgtable.h>
16#include <asm/fixmap.h>
17#include <asm/processor.h>
18#include <asm/thread_info.h>
19#include <asm/bootparam.h>
20#include <asm/elf.h>
21#include <asm/suspend.h>
22
23#include <xen/interface/xen.h>
24 2
25#include <linux/lguest.h> 3#include <linux/lguest.h>
26#include "../../../drivers/lguest/lg.h" 4#include "../../../drivers/lguest/lg.h"
@@ -51,21 +29,10 @@ void foo(void)
51 OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id); 29 OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id);
52 BLANK(); 30 BLANK();
53 31
54 OFFSET(TI_task, thread_info, task);
55 OFFSET(TI_exec_domain, thread_info, exec_domain);
56 OFFSET(TI_flags, thread_info, flags);
57 OFFSET(TI_status, thread_info, status);
58 OFFSET(TI_preempt_count, thread_info, preempt_count);
59 OFFSET(TI_addr_limit, thread_info, addr_limit);
60 OFFSET(TI_restart_block, thread_info, restart_block);
61 OFFSET(TI_sysenter_return, thread_info, sysenter_return); 32 OFFSET(TI_sysenter_return, thread_info, sysenter_return);
62 OFFSET(TI_cpu, thread_info, cpu); 33 OFFSET(TI_cpu, thread_info, cpu);
63 BLANK(); 34 BLANK();
64 35
65 OFFSET(GDS_size, desc_ptr, size);
66 OFFSET(GDS_address, desc_ptr, address);
67 BLANK();
68
69 OFFSET(PT_EBX, pt_regs, bx); 36 OFFSET(PT_EBX, pt_regs, bx);
70 OFFSET(PT_ECX, pt_regs, cx); 37 OFFSET(PT_ECX, pt_regs, cx);
71 OFFSET(PT_EDX, pt_regs, dx); 38 OFFSET(PT_EDX, pt_regs, dx);
@@ -85,44 +52,13 @@ void foo(void)
85 OFFSET(PT_OLDSS, pt_regs, ss); 52 OFFSET(PT_OLDSS, pt_regs, ss);
86 BLANK(); 53 BLANK();
87 54
88 OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
89 OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext); 55 OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
90 BLANK(); 56 BLANK();
91 57
92 OFFSET(pbe_address, pbe, address);
93 OFFSET(pbe_orig_address, pbe, orig_address);
94 OFFSET(pbe_next, pbe, next);
95
96 /* Offset from the sysenter stack to tss.sp0 */ 58 /* Offset from the sysenter stack to tss.sp0 */
97 DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - 59 DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
98 sizeof(struct tss_struct)); 60 sizeof(struct tss_struct));
99 61
100 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
101 DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT);
102 DEFINE(PTRS_PER_PTE, PTRS_PER_PTE);
103 DEFINE(PTRS_PER_PMD, PTRS_PER_PMD);
104 DEFINE(PTRS_PER_PGD, PTRS_PER_PGD);
105
106 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
107
108#ifdef CONFIG_PARAVIRT
109 BLANK();
110 OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
111 OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
112 OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
113 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
114 OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
115 OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
116 OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
117 OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
118#endif
119
120#ifdef CONFIG_XEN
121 BLANK();
122 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
123 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
124#endif
125
126#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) 62#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
127 BLANK(); 63 BLANK();
128 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); 64 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
@@ -141,11 +77,4 @@ void foo(void)
141 OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode); 77 OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
142 OFFSET(LGUEST_PAGES_regs, lguest_pages, regs); 78 OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
143#endif 79#endif
144
145 BLANK();
146 OFFSET(BP_scratch, boot_params, scratch);
147 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
148 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
149 OFFSET(BP_version, boot_params, hdr.version);
150 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
151} 80}
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 4a6aeedcd965..e72a1194af22 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -1,27 +1,4 @@
1/*
2 * Generate definitions needed by assembly language modules.
3 * This code generates raw asm output which is post-processed to extract
4 * and format the required data.
5 */
6#define COMPILE_OFFSETS
7
8#include <linux/crypto.h>
9#include <linux/sched.h>
10#include <linux/stddef.h>
11#include <linux/errno.h>
12#include <linux/hardirq.h>
13#include <linux/suspend.h>
14#include <linux/kbuild.h>
15#include <asm/processor.h>
16#include <asm/segment.h>
17#include <asm/thread_info.h>
18#include <asm/ia32.h> 1#include <asm/ia32.h>
19#include <asm/bootparam.h>
20#include <asm/suspend.h>
21
22#include <xen/interface/xen.h>
23
24#include <asm/sigframe.h>
25 2
26#define __NO_STUBS 1 3#define __NO_STUBS 1
27#undef __SYSCALL 4#undef __SYSCALL
@@ -33,41 +10,19 @@ static char syscalls[] = {
33 10
34int main(void) 11int main(void)
35{ 12{
36#define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry))
37 ENTRY(state);
38 ENTRY(flags);
39 ENTRY(pid);
40 BLANK();
41#undef ENTRY
42#define ENTRY(entry) DEFINE(TI_ ## entry, offsetof(struct thread_info, entry))
43 ENTRY(flags);
44 ENTRY(addr_limit);
45 ENTRY(preempt_count);
46 ENTRY(status);
47#ifdef CONFIG_IA32_EMULATION
48 ENTRY(sysenter_return);
49#endif
50 BLANK();
51#undef ENTRY
52#ifdef CONFIG_PARAVIRT 13#ifdef CONFIG_PARAVIRT
53 BLANK();
54 OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
55 OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
56 OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
57 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
58 OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
59 OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame); 14 OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame);
60 OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
61 OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32); 15 OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32);
62 OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64); 16 OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
63 OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
64 OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); 17 OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
65 OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); 18 BLANK();
66#endif 19#endif
67 20
68
69#ifdef CONFIG_IA32_EMULATION 21#ifdef CONFIG_IA32_EMULATION
70#define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry)) 22 OFFSET(TI_sysenter_return, thread_info, sysenter_return);
23 BLANK();
24
25#define ENTRY(entry) OFFSET(IA32_SIGCONTEXT_ ## entry, sigcontext_ia32, entry)
71 ENTRY(ax); 26 ENTRY(ax);
72 ENTRY(bx); 27 ENTRY(bx);
73 ENTRY(cx); 28 ENTRY(cx);
@@ -79,15 +34,12 @@ int main(void)
79 ENTRY(ip); 34 ENTRY(ip);
80 BLANK(); 35 BLANK();
81#undef ENTRY 36#undef ENTRY
82 DEFINE(IA32_RT_SIGFRAME_sigcontext, 37
83 offsetof (struct rt_sigframe_ia32, uc.uc_mcontext)); 38 OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext);
84 BLANK(); 39 BLANK();
85#endif 40#endif
86 DEFINE(pbe_address, offsetof(struct pbe, address)); 41
87 DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address)); 42#define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry)
88 DEFINE(pbe_next, offsetof(struct pbe, next));
89 BLANK();
90#define ENTRY(entry) DEFINE(pt_regs_ ## entry, offsetof(struct pt_regs, entry))
91 ENTRY(bx); 43 ENTRY(bx);
92 ENTRY(bx); 44 ENTRY(bx);
93 ENTRY(cx); 45 ENTRY(cx);
@@ -107,7 +59,8 @@ int main(void)
107 ENTRY(flags); 59 ENTRY(flags);
108 BLANK(); 60 BLANK();
109#undef ENTRY 61#undef ENTRY
110#define ENTRY(entry) DEFINE(saved_context_ ## entry, offsetof(struct saved_context, entry)) 62
63#define ENTRY(entry) OFFSET(saved_context_ ## entry, saved_context, entry)
111 ENTRY(cr0); 64 ENTRY(cr0);
112 ENTRY(cr2); 65 ENTRY(cr2);
113 ENTRY(cr3); 66 ENTRY(cr3);
@@ -115,26 +68,11 @@ int main(void)
115 ENTRY(cr8); 68 ENTRY(cr8);
116 BLANK(); 69 BLANK();
117#undef ENTRY 70#undef ENTRY
118 DEFINE(TSS_ist, offsetof(struct tss_struct, x86_tss.ist));
119 BLANK();
120 DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx));
121 BLANK();
122 DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
123 71
72 OFFSET(TSS_ist, tss_struct, x86_tss.ist);
124 BLANK(); 73 BLANK();
125 OFFSET(BP_scratch, boot_params, scratch);
126 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
127 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
128 OFFSET(BP_version, boot_params, hdr.version);
129 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
130 74
131 BLANK(); 75 DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
132 DEFINE(PAGE_SIZE_asm, PAGE_SIZE); 76
133#ifdef CONFIG_XEN
134 BLANK();
135 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
136 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
137#undef ENTRY
138#endif
139 return 0; 77 return 0;
140} 78}
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
deleted file mode 100644
index 8bc57baaa9ad..000000000000
--- a/arch/x86/kernel/bios_uv.c
+++ /dev/null
@@ -1,215 +0,0 @@
1/*
2 * BIOS run time interface routines.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Copyright (c) 2008-2009 Silicon Graphics, Inc. All Rights Reserved.
19 * Copyright (c) Russ Anderson <rja@sgi.com>
20 */
21
22#include <linux/efi.h>
23#include <asm/efi.h>
24#include <linux/io.h>
25#include <asm/uv/bios.h>
26#include <asm/uv/uv_hub.h>
27
28static struct uv_systab uv_systab;
29
30s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
31{
32 struct uv_systab *tab = &uv_systab;
33 s64 ret;
34
35 if (!tab->function)
36 /*
37 * BIOS does not support UV systab
38 */
39 return BIOS_STATUS_UNIMPLEMENTED;
40
41 ret = efi_call6((void *)__va(tab->function), (u64)which,
42 a1, a2, a3, a4, a5);
43 return ret;
44}
45EXPORT_SYMBOL_GPL(uv_bios_call);
46
47s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
48 u64 a4, u64 a5)
49{
50 unsigned long bios_flags;
51 s64 ret;
52
53 local_irq_save(bios_flags);
54 ret = uv_bios_call(which, a1, a2, a3, a4, a5);
55 local_irq_restore(bios_flags);
56
57 return ret;
58}
59
60s64 uv_bios_call_reentrant(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
61 u64 a4, u64 a5)
62{
63 s64 ret;
64
65 preempt_disable();
66 ret = uv_bios_call(which, a1, a2, a3, a4, a5);
67 preempt_enable();
68
69 return ret;
70}
71
72
73long sn_partition_id;
74EXPORT_SYMBOL_GPL(sn_partition_id);
75long sn_coherency_id;
76EXPORT_SYMBOL_GPL(sn_coherency_id);
77long sn_region_size;
78EXPORT_SYMBOL_GPL(sn_region_size);
79long system_serial_number;
80EXPORT_SYMBOL_GPL(system_serial_number);
81int uv_type;
82EXPORT_SYMBOL_GPL(uv_type);
83
84
85s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher,
86 long *region, long *ssn)
87{
88 s64 ret;
89 u64 v0, v1;
90 union partition_info_u part;
91
92 ret = uv_bios_call_irqsave(UV_BIOS_GET_SN_INFO, fc,
93 (u64)(&v0), (u64)(&v1), 0, 0);
94 if (ret != BIOS_STATUS_SUCCESS)
95 return ret;
96
97 part.val = v0;
98 if (uvtype)
99 *uvtype = part.hub_version;
100 if (partid)
101 *partid = part.partition_id;
102 if (coher)
103 *coher = part.coherence_id;
104 if (region)
105 *region = part.region_size;
106 if (ssn)
107 *ssn = v1;
108 return ret;
109}
110EXPORT_SYMBOL_GPL(uv_bios_get_sn_info);
111
112int
113uv_bios_mq_watchlist_alloc(unsigned long addr, unsigned int mq_size,
114 unsigned long *intr_mmr_offset)
115{
116 u64 watchlist;
117 s64 ret;
118
119 /*
120 * bios returns watchlist number or negative error number.
121 */
122 ret = (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_ALLOC, addr,
123 mq_size, (u64)intr_mmr_offset,
124 (u64)&watchlist, 0);
125 if (ret < BIOS_STATUS_SUCCESS)
126 return ret;
127
128 return watchlist;
129}
130EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_alloc);
131
132int
133uv_bios_mq_watchlist_free(int blade, int watchlist_num)
134{
135 return (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_FREE,
136 blade, watchlist_num, 0, 0, 0);
137}
138EXPORT_SYMBOL_GPL(uv_bios_mq_watchlist_free);
139
140s64
141uv_bios_change_memprotect(u64 paddr, u64 len, enum uv_memprotect perms)
142{
143 return uv_bios_call_irqsave(UV_BIOS_MEMPROTECT, paddr, len,
144 perms, 0, 0);
145}
146EXPORT_SYMBOL_GPL(uv_bios_change_memprotect);
147
148s64
149uv_bios_reserved_page_pa(u64 buf, u64 *cookie, u64 *addr, u64 *len)
150{
151 s64 ret;
152
153 ret = uv_bios_call_irqsave(UV_BIOS_GET_PARTITION_ADDR, (u64)cookie,
154 (u64)addr, buf, (u64)len, 0);
155 return ret;
156}
157EXPORT_SYMBOL_GPL(uv_bios_reserved_page_pa);
158
159s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second)
160{
161 return uv_bios_call(UV_BIOS_FREQ_BASE, clock_type,
162 (u64)ticks_per_second, 0, 0, 0);
163}
164EXPORT_SYMBOL_GPL(uv_bios_freq_base);
165
166/*
167 * uv_bios_set_legacy_vga_target - Set Legacy VGA I/O Target
168 * @decode: true to enable target, false to disable target
169 * @domain: PCI domain number
170 * @bus: PCI bus number
171 *
172 * Returns:
173 * 0: Success
174 * -EINVAL: Invalid domain or bus number
175 * -ENOSYS: Capability not available
176 * -EBUSY: Legacy VGA I/O cannot be retargeted at this time
177 */
178int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus)
179{
180 return uv_bios_call(UV_BIOS_SET_LEGACY_VGA_TARGET,
181 (u64)decode, (u64)domain, (u64)bus, 0, 0);
182}
183EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target);
184
185
186#ifdef CONFIG_EFI
187void uv_bios_init(void)
188{
189 struct uv_systab *tab;
190
191 if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) ||
192 (efi.uv_systab == (unsigned long)NULL)) {
193 printk(KERN_CRIT "No EFI UV System Table.\n");
194 uv_systab.function = (unsigned long)NULL;
195 return;
196 }
197
198 tab = (struct uv_systab *)ioremap(efi.uv_systab,
199 sizeof(struct uv_systab));
200 if (strncmp(tab->signature, "UVST", 4) != 0)
201 printk(KERN_ERR "bad signature in UV system table!");
202
203 /*
204 * Copy table to permanent spot for later use.
205 */
206 memcpy(&uv_systab, tab, sizeof(struct uv_systab));
207 iounmap(tab);
208
209 printk(KERN_INFO "EFI UV System Table Revision %d\n",
210 uv_systab.revision);
211}
212#else /* !CONFIG_EFI */
213
214void uv_bios_init(void) { }
215#endif
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index fc999e6fc46a..452932d34730 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -2,7 +2,8 @@
2#include <linux/sched.h> 2#include <linux/sched.h>
3#include <linux/kthread.h> 3#include <linux/kthread.h>
4#include <linux/workqueue.h> 4#include <linux/workqueue.h>
5#include <asm/e820.h> 5#include <linux/memblock.h>
6
6#include <asm/proto.h> 7#include <asm/proto.h>
7 8
8/* 9/*
@@ -18,10 +19,12 @@ static int __read_mostly memory_corruption_check = -1;
18static unsigned __read_mostly corruption_check_size = 64*1024; 19static unsigned __read_mostly corruption_check_size = 64*1024;
19static unsigned __read_mostly corruption_check_period = 60; /* seconds */ 20static unsigned __read_mostly corruption_check_period = 60; /* seconds */
20 21
21static struct e820entry scan_areas[MAX_SCAN_AREAS]; 22static struct scan_area {
23 u64 addr;
24 u64 size;
25} scan_areas[MAX_SCAN_AREAS];
22static int num_scan_areas; 26static int num_scan_areas;
23 27
24
25static __init int set_corruption_check(char *arg) 28static __init int set_corruption_check(char *arg)
26{ 29{
27 char *end; 30 char *end;
@@ -81,9 +84,9 @@ void __init setup_bios_corruption_check(void)
81 84
82 while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) { 85 while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) {
83 u64 size; 86 u64 size;
84 addr = find_e820_area_size(addr, &size, PAGE_SIZE); 87 addr = memblock_x86_find_in_range_size(addr, &size, PAGE_SIZE);
85 88
86 if (!(addr + 1)) 89 if (addr == MEMBLOCK_ERROR)
87 break; 90 break;
88 91
89 if (addr >= corruption_check_size) 92 if (addr >= corruption_check_size)
@@ -92,7 +95,7 @@ void __init setup_bios_corruption_check(void)
92 if ((addr + size) > corruption_check_size) 95 if ((addr + size) > corruption_check_size)
93 size = corruption_check_size - addr; 96 size = corruption_check_size - addr;
94 97
95 e820_update_range(addr, size, E820_RAM, E820_RESERVED); 98 memblock_x86_reserve_range(addr, addr + size, "SCAN RAM");
96 scan_areas[num_scan_areas].addr = addr; 99 scan_areas[num_scan_areas].addr = addr;
97 scan_areas[num_scan_areas].size = size; 100 scan_areas[num_scan_areas].size = size;
98 num_scan_areas++; 101 num_scan_areas++;
@@ -103,9 +106,8 @@ void __init setup_bios_corruption_check(void)
103 addr += size; 106 addr += size;
104 } 107 }
105 108
106 printk(KERN_INFO "Scanning %d areas for low memory corruption\n", 109 if (num_scan_areas)
107 num_scan_areas); 110 printk(KERN_INFO "Scanning %d areas for low memory corruption\n", num_scan_areas);
108 update_e820();
109} 111}
110 112
111 113
@@ -141,12 +143,12 @@ static void check_corruption(struct work_struct *dummy)
141{ 143{
142 check_for_bios_corruption(); 144 check_for_bios_corruption();
143 schedule_delayed_work(&bios_check_work, 145 schedule_delayed_work(&bios_check_work,
144 round_jiffies_relative(corruption_check_period*HZ)); 146 round_jiffies_relative(corruption_check_period*HZ));
145} 147}
146 148
147static int start_periodic_check_for_corruption(void) 149static int start_periodic_check_for_corruption(void)
148{ 150{
149 if (!memory_corruption_check || corruption_check_period == 0) 151 if (!num_scan_areas || !memory_corruption_check || corruption_check_period == 0)
150 return 0; 152 return 0;
151 153
152 printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n", 154 printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n",
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 3f0ebe429a01..6042981d0309 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -30,7 +30,6 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o
30 30
31obj-$(CONFIG_X86_MCE) += mcheck/ 31obj-$(CONFIG_X86_MCE) += mcheck/
32obj-$(CONFIG_MTRR) += mtrr/ 32obj-$(CONFIG_MTRR) += mtrr/
33obj-$(CONFIG_CPU_FREQ) += cpufreq/
34 33
35obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o 34obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
36 35
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index ba5f62f45f01..b13ed393dfce 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -148,7 +148,7 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
148{ 148{
149#ifdef CONFIG_SMP 149#ifdef CONFIG_SMP
150 /* calling is from identify_secondary_cpu() ? */ 150 /* calling is from identify_secondary_cpu() ? */
151 if (c->cpu_index == boot_cpu_id) 151 if (!c->cpu_index)
152 return; 152 return;
153 153
154 /* 154 /*
@@ -233,18 +233,22 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
233} 233}
234#endif 234#endif
235 235
236#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 236#ifdef CONFIG_NUMA
237/*
238 * To workaround broken NUMA config. Read the comment in
239 * srat_detect_node().
240 */
237static int __cpuinit nearby_node(int apicid) 241static int __cpuinit nearby_node(int apicid)
238{ 242{
239 int i, node; 243 int i, node;
240 244
241 for (i = apicid - 1; i >= 0; i--) { 245 for (i = apicid - 1; i >= 0; i--) {
242 node = apicid_to_node[i]; 246 node = __apicid_to_node[i];
243 if (node != NUMA_NO_NODE && node_online(node)) 247 if (node != NUMA_NO_NODE && node_online(node))
244 return node; 248 return node;
245 } 249 }
246 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { 250 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
247 node = apicid_to_node[i]; 251 node = __apicid_to_node[i];
248 if (node != NUMA_NO_NODE && node_online(node)) 252 if (node != NUMA_NO_NODE && node_online(node))
249 return node; 253 return node;
250 } 254 }
@@ -253,37 +257,55 @@ static int __cpuinit nearby_node(int apicid)
253#endif 257#endif
254 258
255/* 259/*
256 * Fixup core topology information for AMD multi-node processors. 260 * Fixup core topology information for
257 * Assumption: Number of cores in each internal node is the same. 261 * (1) AMD multi-node processors
262 * Assumption: Number of cores in each internal node is the same.
263 * (2) AMD processors supporting compute units
258 */ 264 */
259#ifdef CONFIG_X86_HT 265#ifdef CONFIG_X86_HT
260static void __cpuinit amd_fixup_dcm(struct cpuinfo_x86 *c) 266static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c)
261{ 267{
262 unsigned long long value; 268 u32 nodes, cores_per_cu = 1;
263 u32 nodes, cores_per_node; 269 u8 node_id;
264 int cpu = smp_processor_id(); 270 int cpu = smp_processor_id();
265 271
266 if (!cpu_has(c, X86_FEATURE_NODEID_MSR)) 272 /* get information required for multi-node processors */
267 return; 273 if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
268 274 u32 eax, ebx, ecx, edx;
269 /* fixup topology information only once for a core */ 275
270 if (cpu_has(c, X86_FEATURE_AMD_DCM)) 276 cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
277 nodes = ((ecx >> 8) & 7) + 1;
278 node_id = ecx & 7;
279
280 /* get compute unit information */
281 smp_num_siblings = ((ebx >> 8) & 3) + 1;
282 c->compute_unit_id = ebx & 0xff;
283 cores_per_cu += ((ebx >> 8) & 3);
284 } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
285 u64 value;
286
287 rdmsrl(MSR_FAM10H_NODE_ID, value);
288 nodes = ((value >> 3) & 7) + 1;
289 node_id = value & 7;
290 } else
271 return; 291 return;
272 292
273 rdmsrl(MSR_FAM10H_NODE_ID, value); 293 /* fixup multi-node processor information */
274 294 if (nodes > 1) {
275 nodes = ((value >> 3) & 7) + 1; 295 u32 cores_per_node;
276 if (nodes == 1) 296 u32 cus_per_node;
277 return;
278 297
279 set_cpu_cap(c, X86_FEATURE_AMD_DCM); 298 set_cpu_cap(c, X86_FEATURE_AMD_DCM);
280 cores_per_node = c->x86_max_cores / nodes; 299 cores_per_node = c->x86_max_cores / nodes;
300 cus_per_node = cores_per_node / cores_per_cu;
281 301
282 /* store NodeID, use llc_shared_map to store sibling info */ 302 /* store NodeID, use llc_shared_map to store sibling info */
283 per_cpu(cpu_llc_id, cpu) = value & 7; 303 per_cpu(cpu_llc_id, cpu) = node_id;
284 304
285 /* fixup core id to be in range from 0 to (cores_per_node - 1) */ 305 /* core id has to be in the [0 .. cores_per_node - 1] range */
286 c->cpu_core_id = c->cpu_core_id % cores_per_node; 306 c->cpu_core_id %= cores_per_node;
307 c->compute_unit_id %= cus_per_node;
308 }
287} 309}
288#endif 310#endif
289 311
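[Editor's aside: the compute-unit fields that the new amd_get_topology() reads from CPUID leaf 0x8000001e can be inspected from user space with the same bit arithmetic. A hedged sketch follows; the field widths are taken from the hunk above, the program itself is illustrative and requires a CPU advertising X86_FEATURE_TOPOEXT, otherwise the leaf is unavailable.]

    #include <stdio.h>
    #include <cpuid.h>

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx;

            /* __get_cpuid() returns 0 if leaf 0x8000001e is not supported. */
            if (!__get_cpuid(0x8000001e, &eax, &ebx, &ecx, &edx))
                    return 1;

            printf("nodes per processor    : %u\n", ((ecx >> 8) & 7) + 1);
            printf("node id                : %u\n", ecx & 7);
            printf("cores per compute unit : %u\n", ((ebx >> 8) & 3) + 1);
            printf("compute unit id        : %u\n", ebx & 0xff);
            return 0;
    }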
@@ -304,9 +326,7 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
304 c->phys_proc_id = c->initial_apicid >> bits; 326 c->phys_proc_id = c->initial_apicid >> bits;
305 /* use socket ID also for last level cache */ 327 /* use socket ID also for last level cache */
306 per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; 328 per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
307 /* fixup topology information on multi-node processors */ 329 amd_get_topology(c);
308 if ((c->x86 == 0x10) && (c->x86_model == 9))
309 amd_fixup_dcm(c);
310#endif 330#endif
311} 331}
312 332
@@ -322,31 +342,40 @@ EXPORT_SYMBOL_GPL(amd_get_nb_id);
322 342
323static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) 343static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
324{ 344{
325#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 345#ifdef CONFIG_NUMA
326 int cpu = smp_processor_id(); 346 int cpu = smp_processor_id();
327 int node; 347 int node;
328 unsigned apicid = c->apicid; 348 unsigned apicid = c->apicid;
329 349
330 node = per_cpu(cpu_llc_id, cpu); 350 node = numa_cpu_node(cpu);
351 if (node == NUMA_NO_NODE)
352 node = per_cpu(cpu_llc_id, cpu);
331 353
332 if (apicid_to_node[apicid] != NUMA_NO_NODE)
333 node = apicid_to_node[apicid];
334 if (!node_online(node)) { 354 if (!node_online(node)) {
335 /* Two possibilities here: 355 /*
336 - The CPU is missing memory and no node was created. 356 * Two possibilities here:
337 In that case try picking one from a nearby CPU 357 *
338 - The APIC IDs differ from the HyperTransport node IDs 358 * - The CPU is missing memory and no node was created. In
339 which the K8 northbridge parsing fills in. 359 * that case try picking one from a nearby CPU.
340 Assume they are all increased by a constant offset, 360 *
341 but in the same order as the HT nodeids. 361 * - The APIC IDs differ from the HyperTransport node IDs
342 If that doesn't result in a usable node fall back to the 362 * which the K8 northbridge parsing fills in. Assume
343 path for the previous case. */ 363 * they are all increased by a constant offset, but in
344 364 * the same order as the HT nodeids. If that doesn't
365 * result in a usable node fall back to the path for the
366 * previous case.
367 *
368 * This workaround operates directly on the mapping between
369 * APIC ID and NUMA node, assuming certain relationship
370 * between APIC ID, HT node ID and NUMA topology. As going
371 * through CPU mapping may alter the outcome, directly
372 * access __apicid_to_node[].
373 */
345 int ht_nodeid = c->initial_apicid; 374 int ht_nodeid = c->initial_apicid;
346 375
347 if (ht_nodeid >= 0 && 376 if (ht_nodeid >= 0 &&
348 apicid_to_node[ht_nodeid] != NUMA_NO_NODE) 377 __apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
349 node = apicid_to_node[ht_nodeid]; 378 node = __apicid_to_node[ht_nodeid];
350 /* Pick a nearby node */ 379 /* Pick a nearby node */
351 if (!node_online(node)) 380 if (!node_online(node))
352 node = nearby_node(apicid); 381 node = nearby_node(apicid);
@@ -412,6 +441,23 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
412 set_cpu_cap(c, X86_FEATURE_EXTD_APICID); 441 set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
413 } 442 }
414#endif 443#endif
444
445 /* We need to do the following only once */
446 if (c != &boot_cpu_data)
447 return;
448
449 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
450
451 if (c->x86 > 0x10 ||
452 (c->x86 == 0x10 && c->x86_model >= 0x2)) {
453 u64 val;
454
455 rdmsrl(MSR_K7_HWCR, val);
456 if (!(val & BIT(24)))
457 printk(KERN_WARNING FW_BUG "TSC doesn't count "
458 "with P0 frequency!\n");
459 }
460 }
415} 461}
416 462
417static void __cpuinit init_amd(struct cpuinfo_x86 *c) 463static void __cpuinit init_amd(struct cpuinfo_x86 *c)
@@ -523,7 +569,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
523#endif 569#endif
524 570
525 if (c->extended_cpuid_level >= 0x80000006) { 571 if (c->extended_cpuid_level >= 0x80000006) {
526 if ((c->x86 >= 0x0f) && (cpuid_edx(0x80000006) & 0xf000)) 572 if (cpuid_edx(0x80000006) & 0xf000)
527 num_cache_leaves = 4; 573 num_cache_leaves = 4;
528 else 574 else
529 num_cache_leaves = 3; 575 num_cache_leaves = 3;
@@ -565,6 +611,35 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
565 } 611 }
566 } 612 }
567#endif 613#endif
614
615 /*
616 * Family 0x12 and above processors have APIC timer
617 * running in deep C states.
618 */
619 if (c->x86 > 0x11)
620 set_cpu_cap(c, X86_FEATURE_ARAT);
621
622 /*
623 * Disable GART TLB Walk Errors on Fam10h. We do this here
624 * because this is always needed when GART is enabled, even in a
625 * kernel which has no MCE support built in.
626 */
627 if (c->x86 == 0x10) {
628 /*
629 * BIOS should disable GartTlbWlk Errors itself. If 
630 * it doesn't, do it here, as suggested by the BKDG. 
631 *
632 * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012
633 */
634 u64 mask;
635 int err;
636
637 err = rdmsrl_safe(MSR_AMD64_MCx_MASK(4), &mask);
638 if (err == 0) {
639 mask |= (1 << 10);
640 checking_wrmsrl(MSR_AMD64_MCx_MASK(4), mask);
641 }
642 }
568} 643}
569 644
570#ifdef CONFIG_X86_32 645#ifdef CONFIG_X86_32
@@ -639,7 +714,7 @@ EXPORT_SYMBOL_GPL(amd_erratum_383);
639 714
640bool cpu_has_amd_erratum(const int *erratum) 715bool cpu_has_amd_erratum(const int *erratum)
641{ 716{
642 struct cpuinfo_x86 *cpu = &current_cpu_data; 717 struct cpuinfo_x86 *cpu = __this_cpu_ptr(&cpu_info);
643 int osvw_id = *erratum++; 718 int osvw_id = *erratum++;
644 u32 range; 719 u32 range;
645 u32 ms; 720 u32 ms;
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index c39576cb3018..525514cf33c3 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -19,6 +19,7 @@
19 19
20static int __init no_halt(char *s) 20static int __init no_halt(char *s)
21{ 21{
22 WARN_ONCE(1, "\"no-hlt\" is deprecated, please use \"idle=poll\"\n");
22 boot_cpu_data.hlt_works_ok = 0; 23 boot_cpu_data.hlt_works_ok = 0;
23 return 1; 24 return 1;
24} 25}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index f2f9ac7da25c..22a073d7fbff 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -254,6 +254,25 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
254} 254}
255#endif 255#endif
256 256
257static int disable_smep __cpuinitdata;
258static __init int setup_disable_smep(char *arg)
259{
260 disable_smep = 1;
261 return 1;
262}
263__setup("nosmep", setup_disable_smep);
264
265static __cpuinit void setup_smep(struct cpuinfo_x86 *c)
266{
267 if (cpu_has(c, X86_FEATURE_SMEP)) {
268 if (unlikely(disable_smep)) {
269 setup_clear_cpu_cap(X86_FEATURE_SMEP);
270 clear_in_cr4(X86_CR4_SMEP);
271 } else
272 set_in_cr4(X86_CR4_SMEP);
273 }
274}
275
257/* 276/*
258 * Some CPU features depend on higher CPUID levels, which may not always 277 * Some CPU features depend on higher CPUID levels, which may not always
259 * be available due to CPUID level capping or broken virtualization 278 * be available due to CPUID level capping or broken virtualization
@@ -458,13 +477,6 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
458 if (smp_num_siblings <= 1) 477 if (smp_num_siblings <= 1)
459 goto out; 478 goto out;
460 479
461 if (smp_num_siblings > nr_cpu_ids) {
462 pr_warning("CPU: Unsupported number of siblings %d",
463 smp_num_siblings);
464 smp_num_siblings = 1;
465 return;
466 }
467
468 index_msb = get_count_order(smp_num_siblings); 480 index_msb = get_count_order(smp_num_siblings);
469 c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb); 481 c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
470 482
@@ -565,8 +577,7 @@ void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
565 577
566 cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx); 578 cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
567 579
568 if (eax > 0) 580 c->x86_capability[9] = ebx;
569 c->x86_capability[9] = ebx;
570 } 581 }
571 582
572 /* AMD-defined flags: level 0x80000001 */ 583 /* AMD-defined flags: level 0x80000001 */
@@ -665,9 +676,11 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
665 this_cpu->c_early_init(c); 676 this_cpu->c_early_init(c);
666 677
667#ifdef CONFIG_SMP 678#ifdef CONFIG_SMP
668 c->cpu_index = boot_cpu_id; 679 c->cpu_index = 0;
669#endif 680#endif
670 filter_cpuid_features(c, false); 681 filter_cpuid_features(c, false);
682
683 setup_smep(c);
671} 684}
672 685
673void __init early_cpu_init(void) 686void __init early_cpu_init(void)
@@ -675,7 +688,7 @@ void __init early_cpu_init(void)
675 const struct cpu_dev *const *cdev; 688 const struct cpu_dev *const *cdev;
676 int count = 0; 689 int count = 0;
677 690
678#ifdef PROCESSOR_SELECT 691#ifdef CONFIG_PROCESSOR_SELECT
679 printk(KERN_INFO "KERNEL supported cpus:\n"); 692 printk(KERN_INFO "KERNEL supported cpus:\n");
680#endif 693#endif
681 694
@@ -687,7 +700,7 @@ void __init early_cpu_init(void)
687 cpu_devs[count] = cpudev; 700 cpu_devs[count] = cpudev;
688 count++; 701 count++;
689 702
690#ifdef PROCESSOR_SELECT 703#ifdef CONFIG_PROCESSOR_SELECT
691 { 704 {
692 unsigned int j; 705 unsigned int j;
693 706
@@ -704,16 +717,21 @@ void __init early_cpu_init(void)
704} 717}
705 718
706/* 719/*
707 * The NOPL instruction is supposed to exist on all CPUs with 720 * The NOPL instruction is supposed to exist on all CPUs of family >= 6;
708 * family >= 6; unfortunately, that's not true in practice because 721 * unfortunately, that's not true in practice because of early VIA
709 * of early VIA chips and (more importantly) broken virtualizers that 722 * chips and (more importantly) broken virtualizers that are not easy
710 * are not easy to detect. In the latter case it doesn't even *fail* 723 * to detect. In the latter case it doesn't even *fail* reliably, so
711 * reliably, so probing for it doesn't even work. Disable it completely 724 * probing for it doesn't even work. Disable it completely on 32-bit
712 * unless we can find a reliable way to detect all the broken cases. 725 * unless we can find a reliable way to detect all the broken cases.
726 * Enable it explicitly on 64-bit for non-constant inputs of cpu_has().
713 */ 727 */
714static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) 728static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
715{ 729{
730#ifdef CONFIG_X86_32
716 clear_cpu_cap(c, X86_FEATURE_NOPL); 731 clear_cpu_cap(c, X86_FEATURE_NOPL);
732#else
733 set_cpu_cap(c, X86_FEATURE_NOPL);
734#endif
717} 735}
718 736
719static void __cpuinit generic_identify(struct cpuinfo_x86 *c) 737static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
@@ -748,6 +766,8 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
748#endif 766#endif
749 } 767 }
750 768
769 setup_smep(c);
770
751 get_model_name(c); /* Default name */ 771 get_model_name(c); /* Default name */
752 772
753 detect_nopl(c); 773 detect_nopl(c);
@@ -864,7 +884,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
864 884
865 select_idle_routine(c); 885 select_idle_routine(c);
866 886
867#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 887#ifdef CONFIG_NUMA
868 numa_add_cpu(smp_processor_id()); 888 numa_add_cpu(smp_processor_id());
869#endif 889#endif
870} 890}
@@ -882,14 +902,13 @@ static void vgetcpu_set_mode(void)
882void __init identify_boot_cpu(void) 902void __init identify_boot_cpu(void)
883{ 903{
884 identify_cpu(&boot_cpu_data); 904 identify_cpu(&boot_cpu_data);
885 init_c1e_mask(); 905 init_amd_e400_c1e_mask();
886#ifdef CONFIG_X86_32 906#ifdef CONFIG_X86_32
887 sysenter_setup(); 907 sysenter_setup();
888 enable_sep_cpu(); 908 enable_sep_cpu();
889#else 909#else
890 vgetcpu_set_mode(); 910 vgetcpu_set_mode();
891#endif 911#endif
892 init_hw_perf_events();
893} 912}
894 913
895void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) 914void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
@@ -1264,13 +1283,6 @@ void __cpuinit cpu_init(void)
1264 clear_all_debug_regs(); 1283 clear_all_debug_regs();
1265 dbg_restore_debug_regs(); 1284 dbg_restore_debug_regs();
1266 1285
1267 /*
1268 * Force FPU initialization:
1269 */
1270 current_thread_info()->status = 0;
1271 clear_used_math();
1272 mxcsr_feature_mask_init();
1273
1274 fpu_init(); 1286 fpu_init();
1275 xsave_init(); 1287 xsave_init();
1276} 1288}
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index f668bb1f7d43..e765633f210e 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -32,6 +32,7 @@ struct cpu_dev {
32extern const struct cpu_dev *const __x86_cpu_dev_start[], 32extern const struct cpu_dev *const __x86_cpu_dev_start[],
33 *const __x86_cpu_dev_end[]; 33 *const __x86_cpu_dev_end[];
34 34
35extern void get_cpu_cap(struct cpuinfo_x86 *c);
35extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); 36extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
36extern void get_cpu_cap(struct cpuinfo_x86 *c); 37extern void get_cpu_cap(struct cpuinfo_x86 *c);
37 38
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
deleted file mode 100644
index 870e6cc6ad28..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ /dev/null
@@ -1,266 +0,0 @@
1#
2# CPU Frequency scaling
3#
4
5menu "CPU Frequency scaling"
6
7source "drivers/cpufreq/Kconfig"
8
9if CPU_FREQ
10
11comment "CPUFreq processor drivers"
12
13config X86_PCC_CPUFREQ
14 tristate "Processor Clocking Control interface driver"
15 depends on ACPI && ACPI_PROCESSOR
16 help
17 This driver adds support for the PCC interface.
18
19 For details, take a look at:
20 <file:Documentation/cpu-freq/pcc-cpufreq.txt>.
21
22 To compile this driver as a module, choose M here: the
23 module will be called pcc-cpufreq.
24
25 If in doubt, say N.
26
27config X86_ACPI_CPUFREQ
28 tristate "ACPI Processor P-States driver"
29 select CPU_FREQ_TABLE
30 depends on ACPI_PROCESSOR
31 help
32 This driver adds a CPUFreq driver which utilizes the ACPI
33 Processor Performance States.
34 This driver also supports Intel Enhanced Speedstep.
35
36 To compile this driver as a module, choose M here: the
37 module will be called acpi-cpufreq.
38
39 For details, take a look at <file:Documentation/cpu-freq/>.
40
41 If in doubt, say N.
42
43config ELAN_CPUFREQ
44 tristate "AMD Elan SC400 and SC410"
45 select CPU_FREQ_TABLE
46 depends on X86_ELAN
47 ---help---
48 This adds the CPUFreq driver for AMD Elan SC400 and SC410
49 processors.
50
51 You need to specify the processor maximum speed as boot
52 parameter: elanfreq=maxspeed (in kHz) or as module
53 parameter "max_freq".
54
55 For details, take a look at <file:Documentation/cpu-freq/>.
56
57 If in doubt, say N.
58
59config SC520_CPUFREQ
60 tristate "AMD Elan SC520"
61 select CPU_FREQ_TABLE
62 depends on X86_ELAN
63 ---help---
64 This adds the CPUFreq driver for AMD Elan SC520 processor.
65
66 For details, take a look at <file:Documentation/cpu-freq/>.
67
68 If in doubt, say N.
69
70
71config X86_POWERNOW_K6
72 tristate "AMD Mobile K6-2/K6-3 PowerNow!"
73 select CPU_FREQ_TABLE
74 depends on X86_32
75 help
76 This adds the CPUFreq driver for mobile AMD K6-2+ and mobile
77 AMD K6-3+ processors.
78
79 For details, take a look at <file:Documentation/cpu-freq/>.
80
81 If in doubt, say N.
82
83config X86_POWERNOW_K7
84 tristate "AMD Mobile Athlon/Duron PowerNow!"
85 select CPU_FREQ_TABLE
86 depends on X86_32
87 help
88 This adds the CPUFreq driver for mobile AMD K7 mobile processors.
89
90 For details, take a look at <file:Documentation/cpu-freq/>.
91
92 If in doubt, say N.
93
94config X86_POWERNOW_K7_ACPI
95 bool
96 depends on X86_POWERNOW_K7 && ACPI_PROCESSOR
97 depends on !(X86_POWERNOW_K7 = y && ACPI_PROCESSOR = m)
98 depends on X86_32
99 default y
100
101config X86_POWERNOW_K8
102 tristate "AMD Opteron/Athlon64 PowerNow!"
103 select CPU_FREQ_TABLE
104 depends on ACPI && ACPI_PROCESSOR
105 help
106 This adds the CPUFreq driver for K8/K10 Opteron/Athlon64 processors.
107
108 To compile this driver as a module, choose M here: the
109 module will be called powernow-k8.
110
111 For details, take a look at <file:Documentation/cpu-freq/>.
112
113config X86_GX_SUSPMOD
114 tristate "Cyrix MediaGX/NatSemi Geode Suspend Modulation"
115 depends on X86_32 && PCI
116 help
117 This add the CPUFreq driver for NatSemi Geode processors which
118 support suspend modulation.
119
120 For details, take a look at <file:Documentation/cpu-freq/>.
121
122 If in doubt, say N.
123
124config X86_SPEEDSTEP_CENTRINO
125 tristate "Intel Enhanced SpeedStep (deprecated)"
126 select CPU_FREQ_TABLE
127 select X86_SPEEDSTEP_CENTRINO_TABLE if X86_32
128 depends on X86_32 || (X86_64 && ACPI_PROCESSOR)
129 help
130 This is deprecated and this functionality is now merged into
131 acpi_cpufreq (X86_ACPI_CPUFREQ). Use that driver instead of
132 speedstep_centrino.
133 This adds the CPUFreq driver for Enhanced SpeedStep enabled
134 mobile CPUs. This means Intel Pentium M (Centrino) CPUs
135 or 64bit enabled Intel Xeons.
136
137 To compile this driver as a module, choose M here: the
138 module will be called speedstep-centrino.
139
140 For details, take a look at <file:Documentation/cpu-freq/>.
141
142 If in doubt, say N.
143
144config X86_SPEEDSTEP_CENTRINO_TABLE
145 bool "Built-in tables for Banias CPUs"
146 depends on X86_32 && X86_SPEEDSTEP_CENTRINO
147 default y
148 help
149 Use built-in tables for Banias CPUs if ACPI encoding
150 is not available.
151
152 If in doubt, say N.
153
154config X86_SPEEDSTEP_ICH
155 tristate "Intel Speedstep on ICH-M chipsets (ioport interface)"
156 select CPU_FREQ_TABLE
157 depends on X86_32
158 help
159 This adds the CPUFreq driver for certain mobile Intel Pentium III
160 (Coppermine), all mobile Intel Pentium III-M (Tualatin) and all
161 mobile Intel Pentium 4 P4-M on systems which have an Intel ICH2,
162 ICH3 or ICH4 southbridge.
163
164 For details, take a look at <file:Documentation/cpu-freq/>.
165
166 If in doubt, say N.
167
168config X86_SPEEDSTEP_SMI
169 tristate "Intel SpeedStep on 440BX/ZX/MX chipsets (SMI interface)"
170 select CPU_FREQ_TABLE
171 depends on X86_32 && EXPERIMENTAL
172 help
173 This adds the CPUFreq driver for certain mobile Intel Pentium III
174 (Coppermine), all mobile Intel Pentium III-M (Tualatin)
175 on systems which have an Intel 440BX/ZX/MX southbridge.
176
177 For details, take a look at <file:Documentation/cpu-freq/>.
178
179 If in doubt, say N.
180
181config X86_P4_CLOCKMOD
182 tristate "Intel Pentium 4 clock modulation"
183 select CPU_FREQ_TABLE
184 help
185 This adds the CPUFreq driver for Intel Pentium 4 / XEON
186 processors. When enabled it will lower CPU temperature by skipping
187 clocks.
188
189 This driver should be only used in exceptional
190 circumstances when very low power is needed because it causes severe
191 slowdowns and noticeable latencies. Normally Speedstep should be used
192 instead.
193
194 To compile this driver as a module, choose M here: the
195 module will be called p4-clockmod.
196
197 For details, take a look at <file:Documentation/cpu-freq/>.
198
199 Unless you are absolutely sure say N.
200
201config X86_CPUFREQ_NFORCE2
202 tristate "nVidia nForce2 FSB changing"
203 depends on X86_32 && EXPERIMENTAL
204 help
205 This adds the CPUFreq driver for FSB changing on nVidia nForce2
206 platforms.
207
208 For details, take a look at <file:Documentation/cpu-freq/>.
209
210 If in doubt, say N.
211
212config X86_LONGRUN
213 tristate "Transmeta LongRun"
214 depends on X86_32
215 help
216 This adds the CPUFreq driver for Transmeta Crusoe and Efficeon processors
217 which support LongRun.
218
219 For details, take a look at <file:Documentation/cpu-freq/>.
220
221 If in doubt, say N.
222
223config X86_LONGHAUL
224 tristate "VIA Cyrix III Longhaul"
225 select CPU_FREQ_TABLE
226 depends on X86_32 && ACPI_PROCESSOR
227 help
228 This adds the CPUFreq driver for VIA Samuel/CyrixIII,
229 VIA Cyrix Samuel/C3, VIA Cyrix Ezra and VIA Cyrix Ezra-T
230 processors.
231
232 For details, take a look at <file:Documentation/cpu-freq/>.
233
234 If in doubt, say N.
235
236config X86_E_POWERSAVER
237 tristate "VIA C7 Enhanced PowerSaver (DANGEROUS)"
238 select CPU_FREQ_TABLE
239 depends on X86_32 && EXPERIMENTAL
240 help
241 This adds the CPUFreq driver for VIA C7 processors. However, this driver
242 does not have any safeguards to prevent operating the CPU out of spec
243 and is thus considered dangerous. Please use the regular ACPI cpufreq
244 driver, enabled by CONFIG_X86_ACPI_CPUFREQ.
245
246 If in doubt, say N.
247
248comment "shared options"
249
250config X86_SPEEDSTEP_LIB
251 tristate
252 default (X86_SPEEDSTEP_ICH || X86_SPEEDSTEP_SMI || X86_P4_CLOCKMOD)
253
254config X86_SPEEDSTEP_RELAXED_CAP_CHECK
255 bool "Relaxed speedstep capability checks"
256 depends on X86_32 && (X86_SPEEDSTEP_SMI || X86_SPEEDSTEP_ICH)
257 help
258 Don't perform all checks for a speedstep capable system which would
259 normally be done. Some ancient or strange systems, though speedstep
260 capable, don't always indicate that they are speedstep capable. This
261 option lets the probing code bypass some of those checks if the
262 parameter "relaxed_check=1" is passed to the module.
263
264endif # CPU_FREQ
265
266endmenu
diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile
deleted file mode 100644
index bd54bf67e6fb..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/Makefile
+++ /dev/null
@@ -1,21 +0,0 @@
1# Link order matters. K8 is preferred to ACPI because of firmware bugs in early
2# K8 systems. ACPI is preferred to all other hardware-specific drivers.
3# speedstep-* is preferred over p4-clockmod.
4
5obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o mperf.o
6obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o mperf.o
7obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o
8obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o
9obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o
10obj-$(CONFIG_X86_LONGHAUL) += longhaul.o
11obj-$(CONFIG_X86_E_POWERSAVER) += e_powersaver.o
12obj-$(CONFIG_ELAN_CPUFREQ) += elanfreq.o
13obj-$(CONFIG_SC520_CPUFREQ) += sc520_freq.o
14obj-$(CONFIG_X86_LONGRUN) += longrun.o
15obj-$(CONFIG_X86_GX_SUSPMOD) += gx-suspmod.o
16obj-$(CONFIG_X86_SPEEDSTEP_ICH) += speedstep-ich.o
17obj-$(CONFIG_X86_SPEEDSTEP_LIB) += speedstep-lib.o
18obj-$(CONFIG_X86_SPEEDSTEP_SMI) += speedstep-smi.o
19obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO) += speedstep-centrino.o
20obj-$(CONFIG_X86_P4_CLOCKMOD) += p4-clockmod.o
21obj-$(CONFIG_X86_CPUFREQ_NFORCE2) += cpufreq-nforce2.o
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
deleted file mode 100644
index cd8da247dda1..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ /dev/null
@@ -1,775 +0,0 @@
1/*
2 * acpi-cpufreq.c - ACPI Processor P-States Driver
3 *
4 * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com>
5 * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
6 * Copyright (C) 2002 - 2004 Dominik Brodowski <linux@brodo.de>
7 * Copyright (C) 2006 Denis Sadykov <denis.m.sadykov@intel.com>
8 *
9 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or (at
14 * your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful, but
17 * WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License along
22 * with this program; if not, write to the Free Software Foundation, Inc.,
23 * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
24 *
25 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
26 */
27
28#include <linux/kernel.h>
29#include <linux/module.h>
30#include <linux/init.h>
31#include <linux/smp.h>
32#include <linux/sched.h>
33#include <linux/cpufreq.h>
34#include <linux/compiler.h>
35#include <linux/dmi.h>
36#include <linux/slab.h>
37
38#include <linux/acpi.h>
39#include <linux/io.h>
40#include <linux/delay.h>
41#include <linux/uaccess.h>
42
43#include <acpi/processor.h>
44
45#include <asm/msr.h>
46#include <asm/processor.h>
47#include <asm/cpufeature.h>
48#include "mperf.h"
49
50#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
51 "acpi-cpufreq", msg)
52
53MODULE_AUTHOR("Paul Diefenbaugh, Dominik Brodowski");
54MODULE_DESCRIPTION("ACPI Processor P-States Driver");
55MODULE_LICENSE("GPL");
56
57enum {
58 UNDEFINED_CAPABLE = 0,
59 SYSTEM_INTEL_MSR_CAPABLE,
60 SYSTEM_IO_CAPABLE,
61};
62
63#define INTEL_MSR_RANGE (0xffff)
64
65struct acpi_cpufreq_data {
66 struct acpi_processor_performance *acpi_data;
67 struct cpufreq_frequency_table *freq_table;
68 unsigned int resume;
69 unsigned int cpu_feature;
70};
71
72static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data);
73
74/* acpi_perf_data is a pointer to percpu data. */
75static struct acpi_processor_performance __percpu *acpi_perf_data;
76
77static struct cpufreq_driver acpi_cpufreq_driver;
78
79static unsigned int acpi_pstate_strict;
80
81static int check_est_cpu(unsigned int cpuid)
82{
83 struct cpuinfo_x86 *cpu = &cpu_data(cpuid);
84
85 return cpu_has(cpu, X86_FEATURE_EST);
86}
87
88static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data)
89{
90 struct acpi_processor_performance *perf;
91 int i;
92
93 perf = data->acpi_data;
94
95 for (i = 0; i < perf->state_count; i++) {
96 if (value == perf->states[i].status)
97 return data->freq_table[i].frequency;
98 }
99 return 0;
100}
101
102static unsigned extract_msr(u32 msr, struct acpi_cpufreq_data *data)
103{
104 int i;
105 struct acpi_processor_performance *perf;
106
107 msr &= INTEL_MSR_RANGE;
108 perf = data->acpi_data;
109
110 for (i = 0; data->freq_table[i].frequency != CPUFREQ_TABLE_END; i++) {
111 if (msr == perf->states[data->freq_table[i].index].status)
112 return data->freq_table[i].frequency;
113 }
114 return data->freq_table[0].frequency;
115}
116
117static unsigned extract_freq(u32 val, struct acpi_cpufreq_data *data)
118{
119 switch (data->cpu_feature) {
120 case SYSTEM_INTEL_MSR_CAPABLE:
121 return extract_msr(val, data);
122 case SYSTEM_IO_CAPABLE:
123 return extract_io(val, data);
124 default:
125 return 0;
126 }
127}
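/*
 * Editor's note: a standalone userspace sketch (not driver code) of the
 * lookup that extract_msr()/extract_io() perform above. The table contents
 * and the raw status value are hypothetical, and the table here pairs
 * status values directly with frequencies instead of going through
 * cpufreq_frequency_table indices; it only illustrates the masking and the
 * different fallbacks of the two paths.
 */
#include <stdio.h>

struct demo_state { unsigned int status; unsigned int khz; };

static unsigned int demo_extract(unsigned int raw, int is_msr,
				 const struct demo_state *tbl, int n)
{
	int i;

	if (is_msr)
		raw &= 0xffff;			/* INTEL_MSR_RANGE */
	for (i = 0; i < n; i++)
		if (raw == tbl[i].status)
			return tbl[i].khz;
	/* extract_msr() falls back to entry 0, extract_io() to "unknown" */
	return is_msr ? tbl[0].khz : 0;
}

int main(void)
{
	/* hypothetical P-state table: status value -> frequency in kHz */
	const struct demo_state tbl[] = {
		{ 0x0a1f, 2400000 }, { 0x081c, 2000000 }, { 0x0613, 1600000 },
	};

	/* the high MSR bits are ignored, so this still resolves to 1600000 */
	printf("%u kHz\n", demo_extract(0x12340613, 1, tbl, 3));
	return 0;
}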
128
129struct msr_addr {
130 u32 reg;
131};
132
133struct io_addr {
134 u16 port;
135 u8 bit_width;
136};
137
138struct drv_cmd {
139 unsigned int type;
140 const struct cpumask *mask;
141 union {
142 struct msr_addr msr;
143 struct io_addr io;
144 } addr;
145 u32 val;
146};
147
148/* Called via smp_call_function_single(), on the target CPU */
149static void do_drv_read(void *_cmd)
150{
151 struct drv_cmd *cmd = _cmd;
152 u32 h;
153
154 switch (cmd->type) {
155 case SYSTEM_INTEL_MSR_CAPABLE:
156 rdmsr(cmd->addr.msr.reg, cmd->val, h);
157 break;
158 case SYSTEM_IO_CAPABLE:
159 acpi_os_read_port((acpi_io_address)cmd->addr.io.port,
160 &cmd->val,
161 (u32)cmd->addr.io.bit_width);
162 break;
163 default:
164 break;
165 }
166}
167
168/* Called via smp_call_function_many(), on the target CPUs */
169static void do_drv_write(void *_cmd)
170{
171 struct drv_cmd *cmd = _cmd;
172 u32 lo, hi;
173
174 switch (cmd->type) {
175 case SYSTEM_INTEL_MSR_CAPABLE:
176 rdmsr(cmd->addr.msr.reg, lo, hi);
177 lo = (lo & ~INTEL_MSR_RANGE) | (cmd->val & INTEL_MSR_RANGE);
178 wrmsr(cmd->addr.msr.reg, lo, hi);
179 break;
180 case SYSTEM_IO_CAPABLE:
181 acpi_os_write_port((acpi_io_address)cmd->addr.io.port,
182 cmd->val,
183 (u32)cmd->addr.io.bit_width);
184 break;
185 default:
186 break;
187 }
188}
189
190static void drv_read(struct drv_cmd *cmd)
191{
192 int err;
193 cmd->val = 0;
194
195 err = smp_call_function_any(cmd->mask, do_drv_read, cmd, 1);
196 WARN_ON_ONCE(err); /* smp_call_function_any() was buggy? */
197}
198
199static void drv_write(struct drv_cmd *cmd)
200{
201 int this_cpu;
202
203 this_cpu = get_cpu();
204 if (cpumask_test_cpu(this_cpu, cmd->mask))
205 do_drv_write(cmd);
206 smp_call_function_many(cmd->mask, do_drv_write, cmd, 1);
207 put_cpu();
208}
209
210static u32 get_cur_val(const struct cpumask *mask)
211{
212 struct acpi_processor_performance *perf;
213 struct drv_cmd cmd;
214
215 if (unlikely(cpumask_empty(mask)))
216 return 0;
217
218 switch (per_cpu(acfreq_data, cpumask_first(mask))->cpu_feature) {
219 case SYSTEM_INTEL_MSR_CAPABLE:
220 cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
221 cmd.addr.msr.reg = MSR_IA32_PERF_STATUS;
222 break;
223 case SYSTEM_IO_CAPABLE:
224 cmd.type = SYSTEM_IO_CAPABLE;
225 perf = per_cpu(acfreq_data, cpumask_first(mask))->acpi_data;
226 cmd.addr.io.port = perf->control_register.address;
227 cmd.addr.io.bit_width = perf->control_register.bit_width;
228 break;
229 default:
230 return 0;
231 }
232
233 cmd.mask = mask;
234 drv_read(&cmd);
235
236 dprintk("get_cur_val = %u\n", cmd.val);
237
238 return cmd.val;
239}
240
241static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
242{
243 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu);
244 unsigned int freq;
245 unsigned int cached_freq;
246
247 dprintk("get_cur_freq_on_cpu (%d)\n", cpu);
248
249 if (unlikely(data == NULL ||
250 data->acpi_data == NULL || data->freq_table == NULL)) {
251 return 0;
252 }
253
254 cached_freq = data->freq_table[data->acpi_data->state].frequency;
255 freq = extract_freq(get_cur_val(cpumask_of(cpu)), data);
256 if (freq != cached_freq) {
257 /*
258 * The dreaded BIOS frequency change behind our back.
259 * Force set the frequency on next target call.
260 */
261 data->resume = 1;
262 }
263
264 dprintk("cur freq = %u\n", freq);
265
266 return freq;
267}
268
269static unsigned int check_freqs(const struct cpumask *mask, unsigned int freq,
270 struct acpi_cpufreq_data *data)
271{
272 unsigned int cur_freq;
273 unsigned int i;
274
275 for (i = 0; i < 100; i++) {
276 cur_freq = extract_freq(get_cur_val(mask), data);
277 if (cur_freq == freq)
278 return 1;
279 udelay(10);
280 }
281 return 0;
282}
283
284static int acpi_cpufreq_target(struct cpufreq_policy *policy,
285 unsigned int target_freq, unsigned int relation)
286{
287 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
288 struct acpi_processor_performance *perf;
289 struct cpufreq_freqs freqs;
290 struct drv_cmd cmd;
291 unsigned int next_state = 0; /* Index into freq_table */
292 unsigned int next_perf_state = 0; /* Index into perf table */
293 unsigned int i;
294 int result = 0;
295
296 dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu);
297
298 if (unlikely(data == NULL ||
299 data->acpi_data == NULL || data->freq_table == NULL)) {
300 return -ENODEV;
301 }
302
303 perf = data->acpi_data;
304 result = cpufreq_frequency_table_target(policy,
305 data->freq_table,
306 target_freq,
307 relation, &next_state);
308 if (unlikely(result)) {
309 result = -ENODEV;
310 goto out;
311 }
312
313 next_perf_state = data->freq_table[next_state].index;
314 if (perf->state == next_perf_state) {
315 if (unlikely(data->resume)) {
316 dprintk("Called after resume, resetting to P%d\n",
317 next_perf_state);
318 data->resume = 0;
319 } else {
320 dprintk("Already at target state (P%d)\n",
321 next_perf_state);
322 goto out;
323 }
324 }
325
326 switch (data->cpu_feature) {
327 case SYSTEM_INTEL_MSR_CAPABLE:
328 cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
329 cmd.addr.msr.reg = MSR_IA32_PERF_CTL;
330 cmd.val = (u32) perf->states[next_perf_state].control;
331 break;
332 case SYSTEM_IO_CAPABLE:
333 cmd.type = SYSTEM_IO_CAPABLE;
334 cmd.addr.io.port = perf->control_register.address;
335 cmd.addr.io.bit_width = perf->control_register.bit_width;
336 cmd.val = (u32) perf->states[next_perf_state].control;
337 break;
338 default:
339 result = -ENODEV;
340 goto out;
341 }
342
343 /* cpufreq holds the hotplug lock, so we are safe from here on */
344 if (policy->shared_type != CPUFREQ_SHARED_TYPE_ANY)
345 cmd.mask = policy->cpus;
346 else
347 cmd.mask = cpumask_of(policy->cpu);
348
349 freqs.old = perf->states[perf->state].core_frequency * 1000;
350 freqs.new = data->freq_table[next_state].frequency;
351 for_each_cpu(i, policy->cpus) {
352 freqs.cpu = i;
353 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
354 }
355
356 drv_write(&cmd);
357
358 if (acpi_pstate_strict) {
359 if (!check_freqs(cmd.mask, freqs.new, data)) {
360 dprintk("acpi_cpufreq_target failed (%d)\n",
361 policy->cpu);
362 result = -EAGAIN;
363 goto out;
364 }
365 }
366
367 for_each_cpu(i, policy->cpus) {
368 freqs.cpu = i;
369 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
370 }
371 perf->state = next_perf_state;
372
373out:
374 return result;
375}
376
377static int acpi_cpufreq_verify(struct cpufreq_policy *policy)
378{
379 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
380
381 dprintk("acpi_cpufreq_verify\n");
382
383 return cpufreq_frequency_table_verify(policy, data->freq_table);
384}
385
386static unsigned long
387acpi_cpufreq_guess_freq(struct acpi_cpufreq_data *data, unsigned int cpu)
388{
389 struct acpi_processor_performance *perf = data->acpi_data;
390
391 if (cpu_khz) {
392 /* search the closest match to cpu_khz */
393 unsigned int i;
394 unsigned long freq;
395 unsigned long freqn = perf->states[0].core_frequency * 1000;
396
397 for (i = 0; i < (perf->state_count-1); i++) {
398 freq = freqn;
399 freqn = perf->states[i+1].core_frequency * 1000;
400 if ((2 * cpu_khz) > (freqn + freq)) {
401 perf->state = i;
402 return freq;
403 }
404 }
405 perf->state = perf->state_count-1;
406 return freqn;
407 } else {
408 /* assume CPU is at P0... */
409 perf->state = 0;
410 return perf->states[0].core_frequency * 1000;
411 }
412}
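/*
 * Editor's note: a standalone userspace sketch (not driver code) of the
 * midpoint test used in acpi_cpufreq_guess_freq() above, with hypothetical
 * numbers. With P-states at 2000000, 1500000 and 1000000 kHz and
 * cpu_khz = 1700000, the first comparison (3400000 > 3500000) fails and the
 * second (3400000 > 2500000) succeeds, so the guess settles on 1500000 kHz,
 * the entry closest to the measured TSC frequency.
 */
#include <stdio.h>

/* same "closest to cpu_khz" walk, on a plain descending array */
static unsigned long demo_guess(const unsigned long *khz_tbl, int n,
				unsigned long cpu_khz)
{
	int i;

	for (i = 0; i < n - 1; i++)
		if (2 * cpu_khz > khz_tbl[i] + khz_tbl[i + 1])
			return khz_tbl[i];	/* closer to this entry */
	return khz_tbl[n - 1];
}

int main(void)
{
	const unsigned long tbl[] = { 2000000, 1500000, 1000000 };

	printf("%lu\n", demo_guess(tbl, 3, 1700000));	/* prints 1500000 */
	return 0;
}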
413
414static void free_acpi_perf_data(void)
415{
416 unsigned int i;
417
418 /* Freeing a NULL pointer is OK, and alloc_percpu zeroes. */
419 for_each_possible_cpu(i)
420 free_cpumask_var(per_cpu_ptr(acpi_perf_data, i)
421 ->shared_cpu_map);
422 free_percpu(acpi_perf_data);
423}
424
425/*
426 * acpi_cpufreq_early_init - initialize ACPI P-States library
427 *
428 * Initialize the ACPI P-States library (drivers/acpi/processor_perflib.c)
429 * in order to determine correct frequency and voltage pairings. We can
430 * do _PDC and _PSD and find out the processor dependency for the
431 * actual init that will happen later...
432 */
433static int __init acpi_cpufreq_early_init(void)
434{
435 unsigned int i;
436 dprintk("acpi_cpufreq_early_init\n");
437
438 acpi_perf_data = alloc_percpu(struct acpi_processor_performance);
439 if (!acpi_perf_data) {
440 dprintk("Memory allocation error for acpi_perf_data.\n");
441 return -ENOMEM;
442 }
443 for_each_possible_cpu(i) {
444 if (!zalloc_cpumask_var_node(
445 &per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map,
446 GFP_KERNEL, cpu_to_node(i))) {
447
448 /* Freeing a NULL pointer is OK: alloc_percpu zeroes. */
449 free_acpi_perf_data();
450 return -ENOMEM;
451 }
452 }
453
454 /* Do initialization in ACPI core */
455 acpi_processor_preregister_performance(acpi_perf_data);
456 return 0;
457}
458
459#ifdef CONFIG_SMP
460/*
461 * Some BIOSes do SW_ANY coordination internally, either setting it up in
462 * hardware or doing it in BIOS firmware, without informing the OS. If not
463 * detected, this has the side effect of making the CPU run at a different
464 * speed than the OS intended. Detect it and handle it cleanly.
465 */
466static int bios_with_sw_any_bug;
467
468static int sw_any_bug_found(const struct dmi_system_id *d)
469{
470 bios_with_sw_any_bug = 1;
471 return 0;
472}
473
474static const struct dmi_system_id sw_any_bug_dmi_table[] = {
475 {
476 .callback = sw_any_bug_found,
477 .ident = "Supermicro Server X6DLP",
478 .matches = {
479 DMI_MATCH(DMI_SYS_VENDOR, "Supermicro"),
480 DMI_MATCH(DMI_BIOS_VERSION, "080010"),
481 DMI_MATCH(DMI_PRODUCT_NAME, "X6DLP"),
482 },
483 },
484 { }
485};
486
487static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c)
488{
489 /* Intel Xeon Processor 7100 Series Specification Update
490 * http://www.intel.com/Assets/PDF/specupdate/314554.pdf
491 * AL30: A Machine Check Exception (MCE) Occurring during an
492 * Enhanced Intel SpeedStep Technology Ratio Change May Cause
493 * Both Processor Cores to Lock Up. */
494 if (c->x86_vendor == X86_VENDOR_INTEL) {
495 if ((c->x86 == 15) &&
496 (c->x86_model == 6) &&
497 (c->x86_mask == 8)) {
498 printk(KERN_INFO "acpi-cpufreq: Intel(R) "
499 "Xeon(R) 7100 Errata AL30, processors may "
500 "lock up on frequency changes: disabling "
501 "acpi-cpufreq.\n");
502 return -ENODEV;
503 }
504 }
505 return 0;
506}
507#endif
508
509static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
510{
511 unsigned int i;
512 unsigned int valid_states = 0;
513 unsigned int cpu = policy->cpu;
514 struct acpi_cpufreq_data *data;
515 unsigned int result = 0;
516 struct cpuinfo_x86 *c = &cpu_data(policy->cpu);
517 struct acpi_processor_performance *perf;
518#ifdef CONFIG_SMP
519 static int blacklisted;
520#endif
521
522 dprintk("acpi_cpufreq_cpu_init\n");
523
524#ifdef CONFIG_SMP
525 if (blacklisted)
526 return blacklisted;
527 blacklisted = acpi_cpufreq_blacklist(c);
528 if (blacklisted)
529 return blacklisted;
530#endif
531
532 data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL);
533 if (!data)
534 return -ENOMEM;
535
536 data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu);
537 per_cpu(acfreq_data, cpu) = data;
538
539 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
540 acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS;
541
542 result = acpi_processor_register_performance(data->acpi_data, cpu);
543 if (result)
544 goto err_free;
545
546 perf = data->acpi_data;
547 policy->shared_type = perf->shared_type;
548
549 /*
550 * Will let policy->cpus know about dependency only when software
551 * coordination is required.
552 */
553 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL ||
554 policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) {
555 cpumask_copy(policy->cpus, perf->shared_cpu_map);
556 }
557 cpumask_copy(policy->related_cpus, perf->shared_cpu_map);
558
559#ifdef CONFIG_SMP
560 dmi_check_system(sw_any_bug_dmi_table);
561 if (bios_with_sw_any_bug && cpumask_weight(policy->cpus) == 1) {
562 policy->shared_type = CPUFREQ_SHARED_TYPE_ALL;
563 cpumask_copy(policy->cpus, cpu_core_mask(cpu));
564 }
565#endif
566
567 /* capability check */
568 if (perf->state_count <= 1) {
569 dprintk("No P-States\n");
570 result = -ENODEV;
571 goto err_unreg;
572 }
573
574 if (perf->control_register.space_id != perf->status_register.space_id) {
575 result = -ENODEV;
576 goto err_unreg;
577 }
578
579 switch (perf->control_register.space_id) {
580 case ACPI_ADR_SPACE_SYSTEM_IO:
581 dprintk("SYSTEM IO addr space\n");
582 data->cpu_feature = SYSTEM_IO_CAPABLE;
583 break;
584 case ACPI_ADR_SPACE_FIXED_HARDWARE:
585 dprintk("HARDWARE addr space\n");
586 if (!check_est_cpu(cpu)) {
587 result = -ENODEV;
588 goto err_unreg;
589 }
590 data->cpu_feature = SYSTEM_INTEL_MSR_CAPABLE;
591 break;
592 default:
593 dprintk("Unknown addr space %d\n",
594 (u32) (perf->control_register.space_id));
595 result = -ENODEV;
596 goto err_unreg;
597 }
598
599 data->freq_table = kmalloc(sizeof(struct cpufreq_frequency_table) *
600 (perf->state_count+1), GFP_KERNEL);
601 if (!data->freq_table) {
602 result = -ENOMEM;
603 goto err_unreg;
604 }
605
606 /* detect transition latency */
607 policy->cpuinfo.transition_latency = 0;
608 for (i = 0; i < perf->state_count; i++) {
609 if ((perf->states[i].transition_latency * 1000) >
610 policy->cpuinfo.transition_latency)
611 policy->cpuinfo.transition_latency =
612 perf->states[i].transition_latency * 1000;
613 }
614
615 /* Check for high latency (>20uS) from buggy BIOSes, like on T42 */
616 if (perf->control_register.space_id == ACPI_ADR_SPACE_FIXED_HARDWARE &&
617 policy->cpuinfo.transition_latency > 20 * 1000) {
618 policy->cpuinfo.transition_latency = 20 * 1000;
619 printk_once(KERN_INFO
620 "P-state transition latency capped at 20 uS\n");
621 }
622
623 /* table init */
624 for (i = 0; i < perf->state_count; i++) {
625 if (i > 0 && perf->states[i].core_frequency >=
626 data->freq_table[valid_states-1].frequency / 1000)
627 continue;
628
629 data->freq_table[valid_states].index = i;
630 data->freq_table[valid_states].frequency =
631 perf->states[i].core_frequency * 1000;
632 valid_states++;
633 }
634 data->freq_table[valid_states].frequency = CPUFREQ_TABLE_END;
635 perf->state = 0;
636
637 result = cpufreq_frequency_table_cpuinfo(policy, data->freq_table);
638 if (result)
639 goto err_freqfree;
640
641 if (perf->states[0].core_frequency * 1000 != policy->cpuinfo.max_freq)
642 printk(KERN_WARNING FW_WARN "P-state 0 is not max freq\n");
643
644 switch (perf->control_register.space_id) {
645 case ACPI_ADR_SPACE_SYSTEM_IO:
646 /* Current speed is unknown and not detectable by IO port */
647 policy->cur = acpi_cpufreq_guess_freq(data, policy->cpu);
648 break;
649 case ACPI_ADR_SPACE_FIXED_HARDWARE:
650 acpi_cpufreq_driver.get = get_cur_freq_on_cpu;
651 policy->cur = get_cur_freq_on_cpu(cpu);
652 break;
653 default:
654 break;
655 }
656
657 /* notify BIOS that we exist */
658 acpi_processor_notify_smm(THIS_MODULE);
659
660 /* Check for APERF/MPERF support in hardware */
661 if (cpu_has(c, X86_FEATURE_APERFMPERF))
662 acpi_cpufreq_driver.getavg = cpufreq_get_measured_perf;
663
664 dprintk("CPU%u - ACPI performance management activated.\n", cpu);
665 for (i = 0; i < perf->state_count; i++)
666 dprintk(" %cP%d: %d MHz, %d mW, %d uS\n",
667 (i == perf->state ? '*' : ' '), i,
668 (u32) perf->states[i].core_frequency,
669 (u32) perf->states[i].power,
670 (u32) perf->states[i].transition_latency);
671
672 cpufreq_frequency_table_get_attr(data->freq_table, policy->cpu);
673
674 /*
675 * the first call to ->target() should result in us actually
676 * writing something to the appropriate registers.
677 */
678 data->resume = 1;
679
680 return result;
681
682err_freqfree:
683 kfree(data->freq_table);
684err_unreg:
685 acpi_processor_unregister_performance(perf, cpu);
686err_free:
687 kfree(data);
688 per_cpu(acfreq_data, cpu) = NULL;
689
690 return result;
691}
692
693static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
694{
695 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
696
697 dprintk("acpi_cpufreq_cpu_exit\n");
698
699 if (data) {
700 cpufreq_frequency_table_put_attr(policy->cpu);
701 per_cpu(acfreq_data, policy->cpu) = NULL;
702 acpi_processor_unregister_performance(data->acpi_data,
703 policy->cpu);
704 kfree(data);
705 }
706
707 return 0;
708}
709
710static int acpi_cpufreq_resume(struct cpufreq_policy *policy)
711{
712 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
713
714 dprintk("acpi_cpufreq_resume\n");
715
716 data->resume = 1;
717
718 return 0;
719}
720
721static struct freq_attr *acpi_cpufreq_attr[] = {
722 &cpufreq_freq_attr_scaling_available_freqs,
723 NULL,
724};
725
726static struct cpufreq_driver acpi_cpufreq_driver = {
727 .verify = acpi_cpufreq_verify,
728 .target = acpi_cpufreq_target,
729 .bios_limit = acpi_processor_get_bios_limit,
730 .init = acpi_cpufreq_cpu_init,
731 .exit = acpi_cpufreq_cpu_exit,
732 .resume = acpi_cpufreq_resume,
733 .name = "acpi-cpufreq",
734 .owner = THIS_MODULE,
735 .attr = acpi_cpufreq_attr,
736};
737
738static int __init acpi_cpufreq_init(void)
739{
740 int ret;
741
742 if (acpi_disabled)
743 return 0;
744
745 dprintk("acpi_cpufreq_init\n");
746
747 ret = acpi_cpufreq_early_init();
748 if (ret)
749 return ret;
750
751 ret = cpufreq_register_driver(&acpi_cpufreq_driver);
752 if (ret)
753 free_acpi_perf_data();
754
755 return ret;
756}
757
758static void __exit acpi_cpufreq_exit(void)
759{
760 dprintk("acpi_cpufreq_exit\n");
761
762 cpufreq_unregister_driver(&acpi_cpufreq_driver);
763
764 free_percpu(acpi_perf_data);
765}
766
767module_param(acpi_pstate_strict, uint, 0644);
768MODULE_PARM_DESC(acpi_pstate_strict,
769 "value 0 or non-zero. non-zero -> strict ACPI checks are "
770 "performed during frequency changes.");
771
772late_initcall(acpi_cpufreq_init);
773module_exit(acpi_cpufreq_exit);
774
775MODULE_ALIAS("acpi");
diff --git a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
deleted file mode 100644
index 733093d60436..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
+++ /dev/null
@@ -1,446 +0,0 @@
1/*
2 * (C) 2004-2006 Sebastian Witt <se.witt@gmx.net>
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 * Based upon reverse engineered information
6 *
7 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
8 */
9
10#include <linux/kernel.h>
11#include <linux/module.h>
12#include <linux/moduleparam.h>
13#include <linux/init.h>
14#include <linux/cpufreq.h>
15#include <linux/pci.h>
16#include <linux/delay.h>
17
18#define NFORCE2_XTAL 25
19#define NFORCE2_BOOTFSB 0x48
20#define NFORCE2_PLLENABLE 0xa8
21#define NFORCE2_PLLREG 0xa4
22#define NFORCE2_PLLADR 0xa0
23#define NFORCE2_PLL(mul, div) (0x100000 | (mul << 8) | div)
24
25#define NFORCE2_MIN_FSB 50
26#define NFORCE2_SAFE_DISTANCE 50
27
28/* Delay in ms between FSB changes */
29/* #define NFORCE2_DELAY 10 */
30
31/*
32 * nforce2_chipset:
33 * FSB is changed using the chipset
34 */
35static struct pci_dev *nforce2_dev;
36
37/* fid:
38 * multiplier * 10
39 */
40static int fid;
41
42/* min_fsb, max_fsb:
43 * minimum and maximum FSB (= FSB at boot time)
44 */
45static int min_fsb;
46static int max_fsb;
47
48MODULE_AUTHOR("Sebastian Witt <se.witt@gmx.net>");
49MODULE_DESCRIPTION("nForce2 FSB changing cpufreq driver");
50MODULE_LICENSE("GPL");
51
52module_param(fid, int, 0444);
53module_param(min_fsb, int, 0444);
54
55MODULE_PARM_DESC(fid, "CPU multiplier to use (11.5 = 115)");
56MODULE_PARM_DESC(min_fsb,
57 "Minimum FSB to use, if not defined: current FSB - 50");
58
59#define PFX "cpufreq-nforce2: "
60#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
61 "cpufreq-nforce2", msg)
62
63/**
64 * nforce2_calc_fsb - calculate FSB
65 * @pll: PLL value
66 *
67 * Calculates FSB from PLL value
68 */
69static int nforce2_calc_fsb(int pll)
70{
71 unsigned char mul, div;
72
73 mul = (pll >> 8) & 0xff;
74 div = pll & 0xff;
75
76 if (div > 0)
77 return NFORCE2_XTAL * mul / div;
78
79 return 0;
80}
81
82/**
83 * nforce2_calc_pll - calculate PLL value
84 * @fsb: FSB
85 *
86 * Calculate PLL value for given FSB
87 */
88static int nforce2_calc_pll(unsigned int fsb)
89{
90 unsigned char xmul, xdiv;
91 unsigned char mul = 0, div = 0;
92 int tried = 0;
93
94 /* Try to calculate multiplier and divider up to 4 times */
95 while (((mul == 0) || (div == 0)) && (tried <= 3)) {
96 for (xdiv = 2; xdiv <= 0x80; xdiv++)
97 for (xmul = 1; xmul <= 0xfe; xmul++)
98 if (nforce2_calc_fsb(NFORCE2_PLL(xmul, xdiv)) ==
99 fsb + tried) {
100 mul = xmul;
101 div = xdiv;
102 }
103 tried++;
104 }
105
106 if ((mul == 0) || (div == 0))
107 return -1;
108
109 return NFORCE2_PLL(mul, div);
110}
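/*
 * Editor's note: a standalone userspace sketch (not driver code) of the PLL
 * encoding used by nforce2_calc_fsb()/nforce2_calc_pll() above, with
 * made-up numbers: mul=16, div=3 and the 25 MHz crystal give
 * 25 * 16 / 3 = 133 MHz.
 */
#include <stdio.h>

#define DEMO_XTAL	25			/* NFORCE2_XTAL */
#define DEMO_PLL(mul, div) (0x100000 | ((mul) << 8) | (div))

int main(void)
{
	int pll = DEMO_PLL(16, 3);
	int mul = (pll >> 8) & 0xff;
	int div = pll & 0xff;

	printf("FSB = %d MHz\n", DEMO_XTAL * mul / div);	/* 133 */
	return 0;
}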
111
112/**
113 * nforce2_write_pll - write PLL value to chipset
114 * @pll: PLL value
115 *
116 * Writes new FSB PLL value to chipset
117 */
118static void nforce2_write_pll(int pll)
119{
120 int temp;
121
122 /* Set the pll addr. to 0x00 */
123 pci_write_config_dword(nforce2_dev, NFORCE2_PLLADR, 0);
124
125 /* Now write the value in all 64 registers */
126 for (temp = 0; temp <= 0x3f; temp++)
127 pci_write_config_dword(nforce2_dev, NFORCE2_PLLREG, pll);
128
129 return;
130}
131
132/**
133 * nforce2_fsb_read - Read FSB
134 *
135 * Read FSB from chipset
136 * If bootfsb != 0, return FSB at boot-time
137 */
138static unsigned int nforce2_fsb_read(int bootfsb)
139{
140 struct pci_dev *nforce2_sub5;
141 u32 fsb, temp = 0;
142
143 /* Get chipset boot FSB from subdevice 5 (FSB at boot-time) */
144 nforce2_sub5 = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, 0x01EF,
145 PCI_ANY_ID, PCI_ANY_ID, NULL);
146 if (!nforce2_sub5)
147 return 0;
148
149 pci_read_config_dword(nforce2_sub5, NFORCE2_BOOTFSB, &fsb);
150 fsb /= 1000000;
151
152 /* Check if PLL register is already set */
153 pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
154
155 if (bootfsb || !temp)
156 return fsb;
157
158 /* Use PLL register FSB value */
159 pci_read_config_dword(nforce2_dev, NFORCE2_PLLREG, &temp);
160 fsb = nforce2_calc_fsb(temp);
161
162 return fsb;
163}
164
165/**
166 * nforce2_set_fsb - set new FSB
167 * @fsb: New FSB
168 *
169 * Sets new FSB
170 */
171static int nforce2_set_fsb(unsigned int fsb)
172{
173 u32 temp = 0;
174 unsigned int tfsb;
175 int diff;
176 int pll = 0;
177
178 if ((fsb > max_fsb) || (fsb < NFORCE2_MIN_FSB)) {
179 printk(KERN_ERR PFX "FSB %d is out of range!\n", fsb);
180 return -EINVAL;
181 }
182
183 tfsb = nforce2_fsb_read(0);
184 if (!tfsb) {
185 printk(KERN_ERR PFX "Error while reading the FSB\n");
186 return -EINVAL;
187 }
188
189 /* First write? Then set actual value */
190 pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
191 if (!temp) {
192 pll = nforce2_calc_pll(tfsb);
193
194 if (pll < 0)
195 return -EINVAL;
196
197 nforce2_write_pll(pll);
198 }
199
200 /* Enable write access */
201 temp = 0x01;
202 pci_write_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8)temp);
203
204 diff = tfsb - fsb;
205
206 if (!diff)
207 return 0;
208
209 while ((tfsb != fsb) && (tfsb <= max_fsb) && (tfsb >= min_fsb)) {
210 if (diff < 0)
211 tfsb++;
212 else
213 tfsb--;
214
215 /* Calculate the PLL reg. value */
216 pll = nforce2_calc_pll(tfsb);
217 if (pll == -1)
218 return -EINVAL;
219
220 nforce2_write_pll(pll);
221#ifdef NFORCE2_DELAY
222 mdelay(NFORCE2_DELAY);
223#endif
224 }
225
226 temp = 0x40;
227 pci_write_config_byte(nforce2_dev, NFORCE2_PLLADR, (u8)temp);
228
229 return 0;
230}
231
232/**
233 * nforce2_get - get the CPU frequency
234 * @cpu: CPU number
235 *
236 * Returns the CPU frequency
237 */
238static unsigned int nforce2_get(unsigned int cpu)
239{
240 if (cpu)
241 return 0;
242 return nforce2_fsb_read(0) * fid * 100;
243}
244
245/**
246 * nforce2_target - set a new CPUFreq policy
247 * @policy: new policy
248 * @target_freq: the target frequency
249 * @relation: how that frequency relates to achieved frequency
250 * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
251 *
252 * Sets a new CPUFreq policy.
253 */
254static int nforce2_target(struct cpufreq_policy *policy,
255 unsigned int target_freq, unsigned int relation)
256{
257/* unsigned long flags; */
258 struct cpufreq_freqs freqs;
259 unsigned int target_fsb;
260
261 if ((target_freq > policy->max) || (target_freq < policy->min))
262 return -EINVAL;
263
264 target_fsb = target_freq / (fid * 100);
265
266 freqs.old = nforce2_get(policy->cpu);
267 freqs.new = target_fsb * fid * 100;
268 freqs.cpu = 0; /* Only one CPU on nForce2 platforms */
269
270 if (freqs.old == freqs.new)
271 return 0;
272
273 dprintk("Old CPU frequency %d kHz, new %d kHz\n",
274 freqs.old, freqs.new);
275
276 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
277
278 /* Disable IRQs */
279 /* local_irq_save(flags); */
280
281 if (nforce2_set_fsb(target_fsb) < 0)
282 printk(KERN_ERR PFX "Changing FSB to %d failed\n",
283 target_fsb);
284 else
285 dprintk("Changed FSB successfully to %d\n",
286 target_fsb);
287
288 /* Enable IRQs */
289 /* local_irq_restore(flags); */
290
291 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
292
293 return 0;
294}
295
296/**
297 * nforce2_verify - verifies a new CPUFreq policy
298 * @policy: new policy
299 */
300static int nforce2_verify(struct cpufreq_policy *policy)
301{
302 unsigned int fsb_pol_max;
303
304 fsb_pol_max = policy->max / (fid * 100);
305
306 if (policy->min < (fsb_pol_max * fid * 100))
307 policy->max = (fsb_pol_max + 1) * fid * 100;
308
309 cpufreq_verify_within_limits(policy,
310 policy->cpuinfo.min_freq,
311 policy->cpuinfo.max_freq);
312 return 0;
313}
314
315static int nforce2_cpu_init(struct cpufreq_policy *policy)
316{
317 unsigned int fsb;
318 unsigned int rfid;
319
320 /* capability check */
321 if (policy->cpu != 0)
322 return -ENODEV;
323
324 /* Get current FSB */
325 fsb = nforce2_fsb_read(0);
326
327 if (!fsb)
328 return -EIO;
329
330 /* FIX: Get FID from CPU */
331 if (!fid) {
332 if (!cpu_khz) {
333 printk(KERN_WARNING PFX
334 "cpu_khz not set, can't calculate multiplier!\n");
335 return -ENODEV;
336 }
337
338 fid = cpu_khz / (fsb * 100);
339 rfid = fid % 5;
340
341 if (rfid) {
342 if (rfid > 2)
343 fid += 5 - rfid;
344 else
345 fid -= rfid;
346 }
347 }
348
349 printk(KERN_INFO PFX "FSB currently at %i MHz, FID %d.%d\n", fsb,
350 fid / 10, fid % 10);
351
352 /* Set maximum FSB to FSB at boot time */
353 max_fsb = nforce2_fsb_read(1);
354
355 if (!max_fsb)
356 return -EIO;
357
358 if (!min_fsb)
359 min_fsb = max_fsb - NFORCE2_SAFE_DISTANCE;
360
361 if (min_fsb < NFORCE2_MIN_FSB)
362 min_fsb = NFORCE2_MIN_FSB;
363
364 /* cpuinfo and default policy values */
365 policy->cpuinfo.min_freq = min_fsb * fid * 100;
366 policy->cpuinfo.max_freq = max_fsb * fid * 100;
367 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
368 policy->cur = nforce2_get(policy->cpu);
369 policy->min = policy->cpuinfo.min_freq;
370 policy->max = policy->cpuinfo.max_freq;
371
372 return 0;
373}
374
375static int nforce2_cpu_exit(struct cpufreq_policy *policy)
376{
377 return 0;
378}
379
380static struct cpufreq_driver nforce2_driver = {
381 .name = "nforce2",
382 .verify = nforce2_verify,
383 .target = nforce2_target,
384 .get = nforce2_get,
385 .init = nforce2_cpu_init,
386 .exit = nforce2_cpu_exit,
387 .owner = THIS_MODULE,
388};
389
390/**
391 * nforce2_detect_chipset - detect the Southbridge which contains FSB PLL logic
392 *
393 * Detects nForce2 A2 and C1 stepping
394 *
395 */
396static unsigned int nforce2_detect_chipset(void)
397{
398 nforce2_dev = pci_get_subsys(PCI_VENDOR_ID_NVIDIA,
399 PCI_DEVICE_ID_NVIDIA_NFORCE2,
400 PCI_ANY_ID, PCI_ANY_ID, NULL);
401
402 if (nforce2_dev == NULL)
403 return -ENODEV;
404
405 printk(KERN_INFO PFX "Detected nForce2 chipset revision %X\n",
406 nforce2_dev->revision);
407 printk(KERN_INFO PFX
408	       "FSB changing may be unstable and can lead to "
409 "crashes and data loss.\n");
410
411 return 0;
412}
413
414/**
415 * nforce2_init - initializes the nForce2 CPUFreq driver
416 *
417 * Initializes the nForce2 FSB support. Returns -ENODEV on unsupported
418 * devices, -EINVAL on problems during initialization, and zero on
419 * success.
420 */
421static int __init nforce2_init(void)
422{
423 /* TODO: do we need to detect the processor? */
424
425 /* detect chipset */
426 if (nforce2_detect_chipset()) {
427 printk(KERN_INFO PFX "No nForce2 chipset.\n");
428 return -ENODEV;
429 }
430
431 return cpufreq_register_driver(&nforce2_driver);
432}
433
434/**
435 * nforce2_exit - unregisters cpufreq module
436 *
437 * Unregisters nForce2 FSB change support.
438 */
439static void __exit nforce2_exit(void)
440{
441 cpufreq_unregister_driver(&nforce2_driver);
442}
443
444module_init(nforce2_init);
445module_exit(nforce2_exit);
446
diff --git a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
deleted file mode 100644
index 35a257dd4bb7..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
+++ /dev/null
@@ -1,367 +0,0 @@
1/*
2 * Based on documentation provided by Dave Jones. Thanks!
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 *
6 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
7 */
8
9#include <linux/kernel.h>
10#include <linux/module.h>
11#include <linux/init.h>
12#include <linux/cpufreq.h>
13#include <linux/ioport.h>
14#include <linux/slab.h>
15#include <linux/timex.h>
16#include <linux/io.h>
17#include <linux/delay.h>
18
19#include <asm/msr.h>
20#include <asm/tsc.h>
21
22#define EPS_BRAND_C7M 0
23#define EPS_BRAND_C7 1
24#define EPS_BRAND_EDEN 2
25#define EPS_BRAND_C3 3
26#define EPS_BRAND_C7D 4
27
28struct eps_cpu_data {
29 u32 fsb;
30 struct cpufreq_frequency_table freq_table[];
31};
32
33static struct eps_cpu_data *eps_cpu[NR_CPUS];
34
35
36static unsigned int eps_get(unsigned int cpu)
37{
38 struct eps_cpu_data *centaur;
39 u32 lo, hi;
40
41 if (cpu)
42 return 0;
43 centaur = eps_cpu[cpu];
44 if (centaur == NULL)
45 return 0;
46
47 /* Return current frequency */
48 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
49 return centaur->fsb * ((lo >> 8) & 0xff);
50}
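/*
 * Editor's note: a standalone userspace sketch (not driver code) of the
 * IA32_PERF_STATUS decoding used throughout this file, with a hypothetical
 * raw value. For lo = 0x0810 the multiplier field is (lo >> 8) & 0xff = 8
 * and the voltage field is lo & 0xff = 16, so a (hypothetical) 100000 kHz
 * FSB yields 800000 kHz and 16 * 16 + 700 = 956 mV.
 */
#include <stdio.h>

int main(void)
{
	unsigned int lo = 0x0810;	/* hypothetical PERF_STATUS low word */
	unsigned int fsb = 100000;	/* hypothetical FSB in kHz */

	printf("freq    = %u kHz\n", fsb * ((lo >> 8) & 0xff));	/* 800000 */
	printf("voltage = %u mV\n", (lo & 0xff) * 16 + 700);	/* 956 */
	return 0;
}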
51
52static int eps_set_state(struct eps_cpu_data *centaur,
53 unsigned int cpu,
54 u32 dest_state)
55{
56 struct cpufreq_freqs freqs;
57 u32 lo, hi;
58 int err = 0;
59 int i;
60
61 freqs.old = eps_get(cpu);
62 freqs.new = centaur->fsb * ((dest_state >> 8) & 0xff);
63 freqs.cpu = cpu;
64 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
65
66 /* Wait while CPU is busy */
67 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
68 i = 0;
69 while (lo & ((1 << 16) | (1 << 17))) {
70 udelay(16);
71 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
72 i++;
73 if (unlikely(i > 64)) {
74 err = -ENODEV;
75 goto postchange;
76 }
77 }
78 /* Set new multiplier and voltage */
79 wrmsr(MSR_IA32_PERF_CTL, dest_state & 0xffff, 0);
80 /* Wait until transition end */
81 i = 0;
82 do {
83 udelay(16);
84 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
85 i++;
86 if (unlikely(i > 64)) {
87 err = -ENODEV;
88 goto postchange;
89 }
90 } while (lo & ((1 << 16) | (1 << 17)));
91
92 /* Return current frequency */
93postchange:
94 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
95 freqs.new = centaur->fsb * ((lo >> 8) & 0xff);
96
97#ifdef DEBUG
98 {
99 u8 current_multiplier, current_voltage;
100
101 /* Print voltage and multiplier */
102 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
103 current_voltage = lo & 0xff;
104 printk(KERN_INFO "eps: Current voltage = %dmV\n",
105 current_voltage * 16 + 700);
106 current_multiplier = (lo >> 8) & 0xff;
107 printk(KERN_INFO "eps: Current multiplier = %d\n",
108 current_multiplier);
109 }
110#endif
111 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
112 return err;
113}
114
115static int eps_target(struct cpufreq_policy *policy,
116 unsigned int target_freq,
117 unsigned int relation)
118{
119 struct eps_cpu_data *centaur;
120 unsigned int newstate = 0;
121 unsigned int cpu = policy->cpu;
122 unsigned int dest_state;
123 int ret;
124
125 if (unlikely(eps_cpu[cpu] == NULL))
126 return -ENODEV;
127 centaur = eps_cpu[cpu];
128
129 if (unlikely(cpufreq_frequency_table_target(policy,
130 &eps_cpu[cpu]->freq_table[0],
131 target_freq,
132 relation,
133 &newstate))) {
134 return -EINVAL;
135 }
136
137 /* Make frequency transition */
138 dest_state = centaur->freq_table[newstate].index & 0xffff;
139 ret = eps_set_state(centaur, cpu, dest_state);
140 if (ret)
141 printk(KERN_ERR "eps: Timeout!\n");
142 return ret;
143}
144
145static int eps_verify(struct cpufreq_policy *policy)
146{
147 return cpufreq_frequency_table_verify(policy,
148 &eps_cpu[policy->cpu]->freq_table[0]);
149}
150
151static int eps_cpu_init(struct cpufreq_policy *policy)
152{
153 unsigned int i;
154 u32 lo, hi;
155 u64 val;
156 u8 current_multiplier, current_voltage;
157 u8 max_multiplier, max_voltage;
158 u8 min_multiplier, min_voltage;
159 u8 brand = 0;
160 u32 fsb;
161 struct eps_cpu_data *centaur;
162 struct cpuinfo_x86 *c = &cpu_data(0);
163 struct cpufreq_frequency_table *f_table;
164 int k, step, voltage;
165 int ret;
166 int states;
167
168 if (policy->cpu != 0)
169 return -ENODEV;
170
171 /* Check brand */
172 printk(KERN_INFO "eps: Detected VIA ");
173
174 switch (c->x86_model) {
175 case 10:
176 rdmsr(0x1153, lo, hi);
177 brand = (((lo >> 2) ^ lo) >> 18) & 3;
178 printk(KERN_CONT "Model A ");
179 break;
180 case 13:
181 rdmsr(0x1154, lo, hi);
182 brand = (((lo >> 4) ^ (lo >> 2))) & 0x000000ff;
183 printk(KERN_CONT "Model D ");
184 break;
185 }
186
187 switch (brand) {
188 case EPS_BRAND_C7M:
189 printk(KERN_CONT "C7-M\n");
190 break;
191 case EPS_BRAND_C7:
192 printk(KERN_CONT "C7\n");
193 break;
194 case EPS_BRAND_EDEN:
195 printk(KERN_CONT "Eden\n");
196 break;
197 case EPS_BRAND_C7D:
198 printk(KERN_CONT "C7-D\n");
199 break;
200 case EPS_BRAND_C3:
201 printk(KERN_CONT "C3\n");
202 return -ENODEV;
203 break;
204 }
205 /* Enable Enhanced PowerSaver */
206 rdmsrl(MSR_IA32_MISC_ENABLE, val);
207 if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
208 val |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP;
209 wrmsrl(MSR_IA32_MISC_ENABLE, val);
210 /* Can be locked at 0 */
211 rdmsrl(MSR_IA32_MISC_ENABLE, val);
212 if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
213 printk(KERN_INFO "eps: Can't enable Enhanced PowerSaver\n");
214 return -ENODEV;
215 }
216 }
217
218 /* Print voltage and multiplier */
219 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
220 current_voltage = lo & 0xff;
221 printk(KERN_INFO "eps: Current voltage = %dmV\n",
222 current_voltage * 16 + 700);
223 current_multiplier = (lo >> 8) & 0xff;
224 printk(KERN_INFO "eps: Current multiplier = %d\n", current_multiplier);
225
226 /* Print limits */
227 max_voltage = hi & 0xff;
228 printk(KERN_INFO "eps: Highest voltage = %dmV\n",
229 max_voltage * 16 + 700);
230 max_multiplier = (hi >> 8) & 0xff;
231 printk(KERN_INFO "eps: Highest multiplier = %d\n", max_multiplier);
232 min_voltage = (hi >> 16) & 0xff;
233 printk(KERN_INFO "eps: Lowest voltage = %dmV\n",
234 min_voltage * 16 + 700);
235 min_multiplier = (hi >> 24) & 0xff;
236 printk(KERN_INFO "eps: Lowest multiplier = %d\n", min_multiplier);
237
238 /* Sanity checks */
239 if (current_multiplier == 0 || max_multiplier == 0
240 || min_multiplier == 0)
241 return -EINVAL;
242 if (current_multiplier > max_multiplier
243 || max_multiplier <= min_multiplier)
244 return -EINVAL;
245 if (current_voltage > 0x1f || max_voltage > 0x1f)
246 return -EINVAL;
247 if (max_voltage < min_voltage)
248 return -EINVAL;
249
250 /* Calc FSB speed */
251 fsb = cpu_khz / current_multiplier;
252 /* Calc number of p-states supported */
253 if (brand == EPS_BRAND_C7M)
254 states = max_multiplier - min_multiplier + 1;
255 else
256 states = 2;
257
258 /* Allocate private data and frequency table for current cpu */
259 centaur = kzalloc(sizeof(struct eps_cpu_data)
260 + (states + 1) * sizeof(struct cpufreq_frequency_table),
261 GFP_KERNEL);
262 if (!centaur)
263 return -ENOMEM;
264 eps_cpu[0] = centaur;
265
266 /* Copy basic values */
267 centaur->fsb = fsb;
268
269 /* Fill frequency and MSR value table */
270 f_table = &centaur->freq_table[0];
271 if (brand != EPS_BRAND_C7M) {
272 f_table[0].frequency = fsb * min_multiplier;
273 f_table[0].index = (min_multiplier << 8) | min_voltage;
274 f_table[1].frequency = fsb * max_multiplier;
275 f_table[1].index = (max_multiplier << 8) | max_voltage;
276 f_table[2].frequency = CPUFREQ_TABLE_END;
277 } else {
278 k = 0;
279 step = ((max_voltage - min_voltage) * 256)
280 / (max_multiplier - min_multiplier);
281 for (i = min_multiplier; i <= max_multiplier; i++) {
282 voltage = (k * step) / 256 + min_voltage;
283 f_table[k].frequency = fsb * i;
284 f_table[k].index = (i << 8) | voltage;
285 k++;
286 }
287 f_table[k].frequency = CPUFREQ_TABLE_END;
288 }
289
290 policy->cpuinfo.transition_latency = 140000; /* 844mV -> 700mV in ns */
291 policy->cur = fsb * current_multiplier;
292
293 ret = cpufreq_frequency_table_cpuinfo(policy, &centaur->freq_table[0]);
294 if (ret) {
295 kfree(centaur);
296 return ret;
297 }
298
299 cpufreq_frequency_table_get_attr(&centaur->freq_table[0], policy->cpu);
300 return 0;
301}
302
303static int eps_cpu_exit(struct cpufreq_policy *policy)
304{
305 unsigned int cpu = policy->cpu;
306 struct eps_cpu_data *centaur;
307 u32 lo, hi;
308
309 if (eps_cpu[cpu] == NULL)
310 return -ENODEV;
311 centaur = eps_cpu[cpu];
312
313 /* Get max frequency */
314 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
315 /* Set max frequency */
316 eps_set_state(centaur, cpu, hi & 0xffff);
317 /* Bye */
318 cpufreq_frequency_table_put_attr(policy->cpu);
319 kfree(eps_cpu[cpu]);
320 eps_cpu[cpu] = NULL;
321 return 0;
322}
323
324static struct freq_attr *eps_attr[] = {
325 &cpufreq_freq_attr_scaling_available_freqs,
326 NULL,
327};
328
329static struct cpufreq_driver eps_driver = {
330 .verify = eps_verify,
331 .target = eps_target,
332 .init = eps_cpu_init,
333 .exit = eps_cpu_exit,
334 .get = eps_get,
335 .name = "e_powersaver",
336 .owner = THIS_MODULE,
337 .attr = eps_attr,
338};
339
340static int __init eps_init(void)
341{
342 struct cpuinfo_x86 *c = &cpu_data(0);
343
344 /* This driver will work only on Centaur C7 processors with
345 * Enhanced SpeedStep/PowerSaver registers */
346 if (c->x86_vendor != X86_VENDOR_CENTAUR
347 || c->x86 != 6 || c->x86_model < 10)
348 return -ENODEV;
349 if (!cpu_has(c, X86_FEATURE_EST))
350 return -ENODEV;
351
352 if (cpufreq_register_driver(&eps_driver))
353 return -EINVAL;
354 return 0;
355}
356
357static void __exit eps_exit(void)
358{
359 cpufreq_unregister_driver(&eps_driver);
360}
361
362MODULE_AUTHOR("Rafal Bilski <rafalbilski@interia.pl>");
363MODULE_DESCRIPTION("Enhanced PowerSaver driver for VIA C7 CPUs.");
364MODULE_LICENSE("GPL");
365
366module_init(eps_init);
367module_exit(eps_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
deleted file mode 100644
index c587db472a75..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c
+++ /dev/null
@@ -1,309 +0,0 @@
1/*
2 * elanfreq: cpufreq driver for the AMD ELAN family
3 *
4 * (c) Copyright 2002 Robert Schwebel <r.schwebel@pengutronix.de>
5 *
6 * Parts of this code are (c) Sven Geggus <sven@geggus.net>
7 *
8 * All Rights Reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 *
15 * 2002-02-13: - initial revision for 2.4.18-pre9 by Robert Schwebel
16 *
17 */
18
19#include <linux/kernel.h>
20#include <linux/module.h>
21#include <linux/init.h>
22
23#include <linux/delay.h>
24#include <linux/cpufreq.h>
25
26#include <asm/msr.h>
27#include <linux/timex.h>
28#include <linux/io.h>
29
30#define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */
31#define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */
32
33/* Module parameter */
34static int max_freq;
35
36struct s_elan_multiplier {
37 int clock; /* frequency in kHz */
38 int val40h; /* PMU Force Mode register */
39 int val80h; /* CPU Clock Speed Register */
40};
41
42/*
43 * It is important that the frequencies
44 * are listed in ascending order here!
45 */
46static struct s_elan_multiplier elan_multiplier[] = {
47 {1000, 0x02, 0x18},
48 {2000, 0x02, 0x10},
49 {4000, 0x02, 0x08},
50 {8000, 0x00, 0x00},
51 {16000, 0x00, 0x02},
52 {33000, 0x00, 0x04},
53 {66000, 0x01, 0x04},
54 {99000, 0x01, 0x05}
55};
56
57static struct cpufreq_frequency_table elanfreq_table[] = {
58 {0, 1000},
59 {1, 2000},
60 {2, 4000},
61 {3, 8000},
62 {4, 16000},
63 {5, 33000},
64 {6, 66000},
65 {7, 99000},
66 {0, CPUFREQ_TABLE_END},
67};
68
69
70/**
71 * elanfreq_get_cpu_frequency: determine current cpu speed
72 *
73 * Finds out at which frequency the CPU of the Elan SOC runs
74 * at the moment. Frequencies from 1 to 33 MHz are generated
75 * the normal way; 66 and 99 MHz are called "Hyperspeed Mode"
76 * and have the rest of the chip running at 33 MHz.
77 */
78
79static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
80{
81 u8 clockspeed_reg; /* Clock Speed Register */
82
83 local_irq_disable();
84 outb_p(0x80, REG_CSCIR);
85 clockspeed_reg = inb_p(REG_CSCDR);
86 local_irq_enable();
87
88 if ((clockspeed_reg & 0xE0) == 0xE0)
89 return 0;
90
91 /* Are we in CPU clock multiplied mode (66/99 MHz)? */
92 if ((clockspeed_reg & 0xE0) == 0xC0) {
93 if ((clockspeed_reg & 0x01) == 0)
94 return 66000;
95 else
96 return 99000;
97 }
98
99 /* 33 MHz is not 32 MHz... */
100 if ((clockspeed_reg & 0xE0) == 0xA0)
101 return 33000;
102
103 return (1<<((clockspeed_reg & 0xE0) >> 5)) * 1000;
104}
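/*
 * Editor's note: a standalone userspace sketch (not driver code) of the
 * clock speed register decode above, with made-up register values. 0x40
 * takes the generic path: (1 << ((0x40 & 0xE0) >> 5)) * 1000 = 4000 kHz,
 * i.e. a 4 MHz core clock; 0xC1 hits the Hyperspeed branch and reports
 * 99000 kHz.
 */
#include <stdio.h>

static unsigned int demo_decode(unsigned char reg)
{
	if ((reg & 0xE0) == 0xE0)
		return 0;
	if ((reg & 0xE0) == 0xC0)		/* Hyperspeed Mode */
		return (reg & 0x01) ? 99000 : 66000;
	if ((reg & 0xE0) == 0xA0)		/* 33 MHz is not 32 MHz... */
		return 33000;
	return (1 << ((reg & 0xE0) >> 5)) * 1000;
}

int main(void)
{
	printf("%u %u\n", demo_decode(0x40), demo_decode(0xC1)); /* 4000 99000 */
	return 0;
}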
105
106
107/**
108 * elanfreq_set_cpu_state: Change the CPU core frequency
109 * @state: index into the elan_multiplier table
110 * (the target frequency in kHz is elan_multiplier[state].clock)
111 *
112 * This function takes a table index and changes the CPU frequency
113 * accordingly. Note that the requested frequency has to be checked
114 * by elanfreq_verify() for correctness!
115 *
116 * There is no return value.
117 */
118
119static void elanfreq_set_cpu_state(unsigned int state)
120{
121 struct cpufreq_freqs freqs;
122
123 freqs.old = elanfreq_get_cpu_frequency(0);
124 freqs.new = elan_multiplier[state].clock;
125 freqs.cpu = 0; /* elanfreq.c is UP only driver */
126
127 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
128
129 printk(KERN_INFO "elanfreq: attempting to set frequency to %i kHz\n",
130 elan_multiplier[state].clock);
131
132
133 /*
134 * Access to the Elan's internal registers is indexed via
135 * 0x22: Chip Setup & Control Register Index Register (CSCI)
136 * 0x23: Chip Setup & Control Register Data Register (CSCD)
137 *
138 */
139
140 /*
141 * 0x40 is the Power Management Unit's Force Mode Register.
142 * Bit 6 enables Hyperspeed Mode (66/100 MHz core frequency)
143 */
144
145 local_irq_disable();
146 outb_p(0x40, REG_CSCIR); /* Disable hyperspeed mode */
147 outb_p(0x00, REG_CSCDR);
148 local_irq_enable(); /* wait till internal pipelines and */
149 udelay(1000); /* buffers have cleaned up */
150
151 local_irq_disable();
152
153 /* now, set the CPU clock speed register (0x80) */
154 outb_p(0x80, REG_CSCIR);
155 outb_p(elan_multiplier[state].val80h, REG_CSCDR);
156
157 /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */
158 outb_p(0x40, REG_CSCIR);
159 outb_p(elan_multiplier[state].val40h, REG_CSCDR);
160 udelay(10000);
161 local_irq_enable();
162
163 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
164};
165
166
167/**
168 * elanfreq_verify: test if a frequency range is valid
169 * @policy: the policy to validate
170 *
171 * This function checks if a given frequency range in kHz is valid
172 * for the hardware supported by the driver.
173 */
174
175static int elanfreq_verify(struct cpufreq_policy *policy)
176{
177 return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]);
178}
179
180static int elanfreq_target(struct cpufreq_policy *policy,
181 unsigned int target_freq,
182 unsigned int relation)
183{
184 unsigned int newstate = 0;
185
186 if (cpufreq_frequency_table_target(policy, &elanfreq_table[0],
187 target_freq, relation, &newstate))
188 return -EINVAL;
189
190 elanfreq_set_cpu_state(newstate);
191
192 return 0;
193}
194
195
196/*
197 * Module init and exit code
198 */
199
200static int elanfreq_cpu_init(struct cpufreq_policy *policy)
201{
202 struct cpuinfo_x86 *c = &cpu_data(0);
203 unsigned int i;
204 int result;
205
206 /* capability check */
207 if ((c->x86_vendor != X86_VENDOR_AMD) ||
208 (c->x86 != 4) || (c->x86_model != 10))
209 return -ENODEV;
210
211 /* max freq */
212 if (!max_freq)
213 max_freq = elanfreq_get_cpu_frequency(0);
214
215 /* table init */
216 for (i = 0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) {
217 if (elanfreq_table[i].frequency > max_freq)
218 elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID;
219 }
220
221 /* cpuinfo and default policy values */
222 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
223 policy->cur = elanfreq_get_cpu_frequency(0);
224
225 result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table);
226 if (result)
227 return result;
228
229 cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu);
230 return 0;
231}
232
233
234static int elanfreq_cpu_exit(struct cpufreq_policy *policy)
235{
236 cpufreq_frequency_table_put_attr(policy->cpu);
237 return 0;
238}
239
240
241#ifndef MODULE
242/**
243 * elanfreq_setup - elanfreq command line parameter parsing
244 *
245 * elanfreq command line parameter. Use:
246 * elanfreq=66000
247 * to set the maximum CPU frequency to 66 MHz. Note that in
248 * case you do not give this boot parameter, the maximum
249 * frequency will fall back to the _current_ CPU frequency, which
250 * might be lower. If you build this as a module, use the
251 * max_freq module parameter instead.
252 */
253static int __init elanfreq_setup(char *str)
254{
255 max_freq = simple_strtoul(str, &str, 0);
256 printk(KERN_WARNING "You're using the deprecated elanfreq command line option. Use elanfreq.max_freq instead, please!\n");
257 return 1;
258}
259__setup("elanfreq=", elanfreq_setup);
260#endif
261
262
263static struct freq_attr *elanfreq_attr[] = {
264 &cpufreq_freq_attr_scaling_available_freqs,
265 NULL,
266};
267
268
269static struct cpufreq_driver elanfreq_driver = {
270 .get = elanfreq_get_cpu_frequency,
271 .verify = elanfreq_verify,
272 .target = elanfreq_target,
273 .init = elanfreq_cpu_init,
274 .exit = elanfreq_cpu_exit,
275 .name = "elanfreq",
276 .owner = THIS_MODULE,
277 .attr = elanfreq_attr,
278};
279
280
281static int __init elanfreq_init(void)
282{
283 struct cpuinfo_x86 *c = &cpu_data(0);
284
285 /* Test if we have the right hardware */
286 if ((c->x86_vendor != X86_VENDOR_AMD) ||
287 (c->x86 != 4) || (c->x86_model != 10)) {
288 printk(KERN_INFO "elanfreq: error: no Elan processor found!\n");
289 return -ENODEV;
290 }
291 return cpufreq_register_driver(&elanfreq_driver);
292}
293
294
295static void __exit elanfreq_exit(void)
296{
297 cpufreq_unregister_driver(&elanfreq_driver);
298}
299
300
301module_param(max_freq, int, 0444);
302
303MODULE_LICENSE("GPL");
304MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, "
305 "Sven Geggus <sven@geggus.net>");
306MODULE_DESCRIPTION("cpufreq driver for AMD's Elan CPUs");
307
308module_init(elanfreq_init);
309module_exit(elanfreq_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
deleted file mode 100644
index 32974cf84232..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
+++ /dev/null
@@ -1,517 +0,0 @@
1/*
2 * Cyrix MediaGX and NatSemi Geode Suspend Modulation
3 * (C) 2002 Zwane Mwaikambo <zwane@commfireservices.com>
4 * (C) 2002 Hiroshi Miura <miura@da-cha.org>
5 * All Rights Reserved
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * version 2 as published by the Free Software Foundation
10 *
11 * The author(s) of this software shall not be held liable for damages
12 * of any nature resulting due to the use of this software. This
13 * software is provided AS-IS with no warranties.
14 *
15 * Theoretical note:
16 *
17 * (see Geode(tm) CS5530 manual (rev.4.1) page.56)
18 *
19 * CPU frequency control on NatSemi Geode GX1/GXLV processor and CS55x0
20 * are based on Suspend Modulation.
21 *
22 * Suspend Modulation works by asserting and de-asserting the SUSP# pin
23 * to the CPU (GX1/GXLV) for configurable durations. While SUSP# is
24 * asserted the CPU enters an idle state; the GX1 stops its core clock,
25 * so power consumption is reduced.
26 *
27 * Suspend Modulation's OFF/ON durations are configurable
28 * with the 'Suspend Modulation OFF Count Register'
29 * and the 'Suspend Modulation ON Count Register'.
30 * These registers are 8-bit counters that represent the number of
31 * 32us intervals for which the SUSP# pin is asserted (ON)/de-asserted
32 * (OFF) to the processor.
33 *
34 * These counters define a ratio which is the effective frequency
35 * of operation of the system.
36 *
37 * OFF Count
38 * F_eff = Fgx * ----------------------
39 * OFF Count + ON Count
40 *
41 * 0 <= On Count, Off Count <= 255
42 *
43 * From these limits, we can get register values
44 *
45 * off_duration + on_duration <= MAX_DURATION
46 * on_duration = off_duration * (stock_freq - freq) / freq
47 *
48 * off_duration = (freq * DURATION) / stock_freq
49 * on_duration = DURATION - off_duration
50 *
51 *
52 *---------------------------------------------------------------------------
53 *
54 * ChangeLog:
55 * Dec. 12, 2003 Hiroshi Miura <miura@da-cha.org>
56 * - fix on/off register mistake
57 * - fix cpu_khz calc when it stops cpu modulation.
58 *
59 * Dec. 11, 2002 Hiroshi Miura <miura@da-cha.org>
60 * - rewrite for Cyrix MediaGX Cx5510/5520 and
61 * NatSemi Geode Cs5530(A).
62 *
63 * Jul. ??, 2002 Zwane Mwaikambo <zwane@commfireservices.com>
64 * - cs5530_mod patch for 2.4.19-rc1.
65 *
66 *---------------------------------------------------------------------------
67 *
68 * Todo
69 * Test on machines with 5510, 5530, 5530A
70 */
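/*
 * Editor's note: a standalone userspace sketch (not driver code) putting
 * hypothetical numbers into the formulas above. Asking a 200000 kHz part
 * for 100000 kHz with DURATION = 255 gives off_duration = 127,
 * on_duration = 128 and an effective frequency of 200000 * 127 / 255 =
 * 99607 kHz; the integer rounding is why gx_validate_speed() below searches
 * all durations for the closest attainable value.
 */
#include <stdio.h>

int main(void)
{
	int stock_khz = 200000, want_khz = 100000, max_duration = 255;
	int off = want_khz * max_duration / stock_khz;	/* 127 */
	int on  = max_duration - off;			/* 128 */

	printf("off=%d on=%d F_eff=%d kHz\n",
	       off, on, stock_khz * off / (off + on));	/* 99607 kHz */
	return 0;
}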
71
72/************************************************************************
73 * Suspend Modulation - Definitions *
74 ************************************************************************/
75
76#include <linux/kernel.h>
77#include <linux/module.h>
78#include <linux/init.h>
79#include <linux/smp.h>
80#include <linux/cpufreq.h>
81#include <linux/pci.h>
82#include <linux/errno.h>
83#include <linux/slab.h>
84
85#include <asm/processor-cyrix.h>
86
87/* PCI config registers, all at F0 */
88#define PCI_PMER1 0x80 /* power management enable register 1 */
89#define PCI_PMER2 0x81 /* power management enable register 2 */
90#define PCI_PMER3 0x82 /* power management enable register 3 */
91#define PCI_IRQTC 0x8c /* irq speedup timer counter register:typical 2 to 4ms */
92#define PCI_VIDTC 0x8d /* video speedup timer counter register: typical 50 to 100ms */
93#define PCI_MODOFF 0x94 /* suspend modulation OFF counter register, 1 = 32us */
94#define PCI_MODON 0x95 /* suspend modulation ON counter register */
95#define PCI_SUSCFG 0x96 /* suspend configuration register */
96
97/* PMER1 bits */
98#define GPM (1<<0) /* global power management */
99#define GIT (1<<1) /* globally enable PM device idle timers */
100#define GTR (1<<2) /* globally enable IO traps */
101#define IRQ_SPDUP (1<<3) /* disable clock throttle during interrupt handling */
102#define VID_SPDUP (1<<4) /* disable clock throttle during vga video handling */
103
104/* SUSCFG bits */
105#define SUSMOD (1<<0) /* enable/disable suspend modulation */
106/* the below is supported only with cs5530 (after rev.1.2)/cs5530A */
107#define SMISPDUP (1<<1) /* select how SMI re-enable suspend modulation: */
108 /* IRQTC timer or read SMI speedup disable reg.(F1BAR[08-09h]) */
109#define SUSCFG (1<<2) /* enable powering down a GXLV processor. "Special 3Volt Suspend" mode */
110/* the below is supported only with cs5530A */
111#define PWRSVE_ISA (1<<3) /* stop ISA clock */
112#define PWRSVE (1<<4) /* active idle */
113
114struct gxfreq_params {
115 u8 on_duration;
116 u8 off_duration;
117 u8 pci_suscfg;
118 u8 pci_pmer1;
119 u8 pci_pmer2;
120 struct pci_dev *cs55x0;
121};
122
123static struct gxfreq_params *gx_params;
124static int stock_freq;
125
126/* PCI bus clock - defaults to 30.000 if cpu_khz is not available */
127static int pci_busclk;
128module_param(pci_busclk, int, 0444);
129
130/* maximum duration for which the cpu may be suspended
131 * (32us * MAX_DURATION). If no parameter is given, this defaults
132 * to 255.
133 * Note that this leads to a maximum of 8 ms(!) where the CPU clock
134 * is suspended -- processing power is just 0.39% of what it used to be,
135 * though. 781.25 kHz(!) for a 200 MHz processor -- wow. */
136static int max_duration = 255;
137module_param(max_duration, int, 0444);
138
139/* For the default policy, we want at least some processing power
140 * - let's say 5%. (min = maxfreq / POLICY_MIN_DIV)
141 */
142#define POLICY_MIN_DIV 20
143
144
145#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
146 "gx-suspmod", msg)
147
148/**
149 * we can detect a core multiplier from dir0_lsb
150 * from GX1 datasheet p.56,
151 * MULT[3:0]:
152 * 0000 = SYSCLK multiplied by 4 (test only)
153 * 0001 = SYSCLK multiplied by 10
154 * 0010 = SYSCLK multiplied by 4
155 * 0011 = SYSCLK multiplied by 6
156 * 0100 = SYSCLK multiplied by 9
157 * 0101 = SYSCLK multiplied by 5
158 * 0110 = SYSCLK multiplied by 7
159 * 0111 = SYSCLK multiplied by 8
160 * where SYSCLK is 33.3 MHz
161 **/
162static int gx_freq_mult[16] = {
163 4, 10, 4, 6, 9, 5, 7, 8,
164 0, 0, 0, 0, 0, 0, 0, 0
165};
166
167
168/****************************************************************
169 * Low Level chipset interface *
170 ****************************************************************/
171static struct pci_device_id gx_chipset_tbl[] __initdata = {
172 { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY), },
173 { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5520), },
174 { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5510), },
175 { 0, },
176};
177
178static void gx_write_byte(int reg, int value)
179{
180 pci_write_config_byte(gx_params->cs55x0, reg, value);
181}
182
183/**
184 * gx_detect_chipset:
185 * Detect which Cyrix/NatSemi companion chip (CS5510/5520/5530) is present.
186 **/
187static __init struct pci_dev *gx_detect_chipset(void)
188{
189 struct pci_dev *gx_pci = NULL;
190
191 /* check if CPU is a MediaGX or a Geode. */
192 if ((boot_cpu_data.x86_vendor != X86_VENDOR_NSC) &&
193 (boot_cpu_data.x86_vendor != X86_VENDOR_CYRIX)) {
194 dprintk("error: no MediaGX/Geode processor found!\n");
195 return NULL;
196 }
197
198 /* detect which companion chip is used */
199 for_each_pci_dev(gx_pci) {
200 if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL)
201 return gx_pci;
202 }
203
204 dprintk("error: no supported chipset found!\n");
205 return NULL;
206}
207
208/**
209 * gx_get_cpuspeed:
210 *
211 * Finds out at which effective frequency the Cyrix MediaGX/NatSemi
212 * Geode CPU runs.
213 */
214static unsigned int gx_get_cpuspeed(unsigned int cpu)
215{
216 if ((gx_params->pci_suscfg & SUSMOD) == 0)
217 return stock_freq;
218
219 return (stock_freq * gx_params->off_duration)
220 / (gx_params->on_duration + gx_params->off_duration);
221}
222
223/**
224 * gx_validate_speed:
225 * find the closest achievable modulated speed to the requested khz and
226 * the corresponding on/off durations
227 **/
228
229static unsigned int gx_validate_speed(unsigned int khz, u8 *on_duration,
230 u8 *off_duration)
231{
232 unsigned int i;
233 u8 tmp_on, tmp_off;
234 int old_tmp_freq = stock_freq;
235 int tmp_freq;
236
237 *off_duration = 1;
238 *on_duration = 0;
239
240 for (i = max_duration; i > 0; i--) {
241 tmp_off = ((khz * i) / stock_freq) & 0xff;
242 tmp_on = i - tmp_off;
243 tmp_freq = (stock_freq * tmp_off) / i;
244 /* if this relation is closer to khz, use this. If it's equal,
245 * prefer it, too - lower latency */
246 if (abs(tmp_freq - khz) <= abs(old_tmp_freq - khz)) {
247 *on_duration = tmp_on;
248 *off_duration = tmp_off;
249 old_tmp_freq = tmp_freq;
250 }
251 }
252
253 return old_tmp_freq;
254}
255
256
257/**
258 * gx_set_cpuspeed:
259 * set cpu speed in khz.
260 **/
261
262static void gx_set_cpuspeed(unsigned int khz)
263{
264 u8 suscfg, pmer1;
265 unsigned int new_khz;
266 unsigned long flags;
267 struct cpufreq_freqs freqs;
268
269 freqs.cpu = 0;
270 freqs.old = gx_get_cpuspeed(0);
271
272 new_khz = gx_validate_speed(khz, &gx_params->on_duration,
273 &gx_params->off_duration);
274
275 freqs.new = new_khz;
276
277 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
278 local_irq_save(flags);
279
280
281
282 if (new_khz != stock_freq) {
283 /* new khz == 100% of CPU speed is a special case, handled in the else branch */
284 switch (gx_params->cs55x0->device) {
285 case PCI_DEVICE_ID_CYRIX_5530_LEGACY:
286 pmer1 = gx_params->pci_pmer1 | IRQ_SPDUP | VID_SPDUP;
287 /* FIXME: need to test other values -- Zwane,Miura */
288 /* typical 2 to 4ms */
289 gx_write_byte(PCI_IRQTC, 4);
290 /* typical 50 to 100ms */
291 gx_write_byte(PCI_VIDTC, 100);
292 gx_write_byte(PCI_PMER1, pmer1);
293
294 if (gx_params->cs55x0->revision < 0x10) {
295 /* CS5530(rev 1.2, 1.3) */
296 suscfg = gx_params->pci_suscfg|SUSMOD;
297 } else {
298 /* CS5530A,B.. */
299 suscfg = gx_params->pci_suscfg|SUSMOD|PWRSVE;
300 }
301 break;
302 case PCI_DEVICE_ID_CYRIX_5520:
303 case PCI_DEVICE_ID_CYRIX_5510:
304 suscfg = gx_params->pci_suscfg | SUSMOD;
305 break;
306 default:
307 local_irq_restore(flags);
308 dprintk("fatal: try to set unknown chipset.\n");
309 return;
310 }
311 } else {
312 suscfg = gx_params->pci_suscfg & ~(SUSMOD);
313 gx_params->off_duration = 0;
314 gx_params->on_duration = 0;
315 dprintk("suspend modulation disabled: cpu runs 100%% speed.\n");
316 }
317
318 gx_write_byte(PCI_MODOFF, gx_params->off_duration);
319 gx_write_byte(PCI_MODON, gx_params->on_duration);
320
321 gx_write_byte(PCI_SUSCFG, suscfg);
322 pci_read_config_byte(gx_params->cs55x0, PCI_SUSCFG, &suscfg);
323
324 local_irq_restore(flags);
325
326 gx_params->pci_suscfg = suscfg;
327
328 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
329
330 dprintk("suspend modulation w/ duration of ON:%d us, OFF:%d us\n",
331 gx_params->on_duration * 32, gx_params->off_duration * 32);
332 dprintk("suspend modulation w/ clock speed: %d kHz.\n", freqs.new);
333}
334
335/****************************************************************
336 * High level functions *
337 ****************************************************************/
338
339/*
340 * cpufreq_gx_verify: test if frequency range is valid
341 *
342 * This function checks if a given frequency range in kHz is valid
343 * for the hardware supported by the driver.
344 */
345
346static int cpufreq_gx_verify(struct cpufreq_policy *policy)
347{
348 unsigned int tmp_freq = 0;
349 u8 tmp1, tmp2;
350
351 if (!stock_freq || !policy)
352 return -EINVAL;
353
354 policy->cpu = 0;
355 cpufreq_verify_within_limits(policy, (stock_freq / max_duration),
356 stock_freq);
357
358 /* it needs to be assured that at least one supported frequency is
359 * within policy->min and policy->max. If it is not, policy->max
360 * needs to be increased until one frequency is supported.
361 * policy->min may not be decreased, though. This way we guarantee a
362 * specific processing capacity.
363 */
364 tmp_freq = gx_validate_speed(policy->min, &tmp1, &tmp2);
365 if (tmp_freq < policy->min)
366 tmp_freq += stock_freq / max_duration;
367 policy->min = tmp_freq;
368 if (policy->min > policy->max)
369 policy->max = tmp_freq;
370 tmp_freq = gx_validate_speed(policy->max, &tmp1, &tmp2);
371 if (tmp_freq > policy->max)
372 tmp_freq -= stock_freq / max_duration;
373 policy->max = tmp_freq;
374 if (policy->max < policy->min)
375 policy->max = policy->min;
376 cpufreq_verify_within_limits(policy, (stock_freq / max_duration),
377 stock_freq);
378
379 return 0;
380}
381
382/*
383 * cpufreq_gx_target:
384 *
385 */
386static int cpufreq_gx_target(struct cpufreq_policy *policy,
387 unsigned int target_freq,
388 unsigned int relation)
389{
390 u8 tmp1, tmp2;
391 unsigned int tmp_freq;
392
393 if (!stock_freq || !policy)
394 return -EINVAL;
395
396 policy->cpu = 0;
397
398 tmp_freq = gx_validate_speed(target_freq, &tmp1, &tmp2);
399 while (tmp_freq < policy->min) {
400 tmp_freq += stock_freq / max_duration;
401 tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2);
402 }
403 while (tmp_freq > policy->max) {
404 tmp_freq -= stock_freq / max_duration;
405 tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2);
406 }
407
408 gx_set_cpuspeed(tmp_freq);
409
410 return 0;
411}
412
413static int cpufreq_gx_cpu_init(struct cpufreq_policy *policy)
414{
415 unsigned int maxfreq, curfreq;
416
417 if (!policy || policy->cpu != 0)
418 return -ENODEV;
419
420 /* determine maximum frequency */
421 if (pci_busclk)
422 maxfreq = pci_busclk * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f];
423 else if (cpu_khz)
424 maxfreq = cpu_khz;
425 else
426 maxfreq = 30000 * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f];
427
428 stock_freq = maxfreq;
429 curfreq = gx_get_cpuspeed(0);
430
431 dprintk("cpu max frequency is %d.\n", maxfreq);
432 dprintk("cpu current frequency is %dkHz.\n", curfreq);
433
434 /* setup basic struct for cpufreq API */
435 policy->cpu = 0;
436
437 if (max_duration < POLICY_MIN_DIV)
438 policy->min = maxfreq / max_duration;
439 else
440 policy->min = maxfreq / POLICY_MIN_DIV;
441 policy->max = maxfreq;
442 policy->cur = curfreq;
443 policy->cpuinfo.min_freq = maxfreq / max_duration;
444 policy->cpuinfo.max_freq = maxfreq;
445 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
446
447 return 0;
448}
449
450/*
451 * gx_suspmod_driver:
452 * cpufreq driver for MediaGX/Geode GX suspend modulation
453 */
454static struct cpufreq_driver gx_suspmod_driver = {
455 .get = gx_get_cpuspeed,
456 .verify = cpufreq_gx_verify,
457 .target = cpufreq_gx_target,
458 .init = cpufreq_gx_cpu_init,
459 .name = "gx-suspmod",
460 .owner = THIS_MODULE,
461};
462
463static int __init cpufreq_gx_init(void)
464{
465 int ret;
466 struct gxfreq_params *params;
467 struct pci_dev *gx_pci;
468
469 /* Test if we have the right hardware */
470 gx_pci = gx_detect_chipset();
471 if (gx_pci == NULL)
472 return -ENODEV;
473
474 /* check whether module parameters are sane */
475 if (max_duration > 0xff)
476 max_duration = 0xff;
477
478 dprintk("geode suspend modulation available.\n");
479
480 params = kzalloc(sizeof(struct gxfreq_params), GFP_KERNEL);
481 if (params == NULL)
482 return -ENOMEM;
483
484 params->cs55x0 = gx_pci;
485 gx_params = params;
486
487 /* keep cs55x0 configurations */
488 pci_read_config_byte(params->cs55x0, PCI_SUSCFG, &(params->pci_suscfg));
489 pci_read_config_byte(params->cs55x0, PCI_PMER1, &(params->pci_pmer1));
490 pci_read_config_byte(params->cs55x0, PCI_PMER2, &(params->pci_pmer2));
491 pci_read_config_byte(params->cs55x0, PCI_MODON, &(params->on_duration));
492 pci_read_config_byte(params->cs55x0, PCI_MODOFF,
493 &(params->off_duration));
494
495 ret = cpufreq_register_driver(&gx_suspmod_driver);
496 if (ret) {
497 kfree(params);
498 return ret; /* register error! */
499 }
500
501 return 0;
502}
503
504static void __exit cpufreq_gx_exit(void)
505{
506 cpufreq_unregister_driver(&gx_suspmod_driver);
507 pci_dev_put(gx_params->cs55x0);
508 kfree(gx_params);
509}
510
511MODULE_AUTHOR("Hiroshi Miura <miura@da-cha.org>");
512MODULE_DESCRIPTION("Cpufreq driver for Cyrix MediaGX and NatSemi Geode");
513MODULE_LICENSE("GPL");
514
515module_init(cpufreq_gx_init);
516module_exit(cpufreq_gx_exit);
517
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
deleted file mode 100644
index 03162dac6271..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ /dev/null
@@ -1,1029 +0,0 @@
1/*
2 * (C) 2001-2004 Dave Jones. <davej@redhat.com>
3 * (C) 2002 Padraig Brady. <padraig@antefacto.com>
4 *
5 * Licensed under the terms of the GNU GPL License version 2.
6 * Based upon datasheets & sample CPUs kindly provided by VIA.
7 *
8 * VIA have currently 3 different versions of Longhaul.
9 * Version 1 (Longhaul) uses the BCR2 MSR at 0x1147.
10 * It is present only in Samuel 1 (C5A), Samuel 2 (C5B) stepping 0.
11 * Version 2 of longhaul is backward compatible with v1, but adds
12 * LONGHAUL MSR for purpose of both frequency and voltage scaling.
13 * Present in Samuel 2 (steppings 1-7 only) (C5B), and Ezra (C5C).
14 * Version 3 of longhaul got renamed to Powersaver and redesigned
15 * to use only the POWERSAVER MSR at 0x110a.
16 * It is present in Ezra-T (C5M), Nehemiah (C5X) and above.
17 * Feature-wise it is pretty much the same as longhaul v2, though
18 * there is provision for scaling the FSB too; that doesn't work
19 * well in practice, so we don't even try to use it.
20 *
21 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
22 */
23
24#include <linux/kernel.h>
25#include <linux/module.h>
26#include <linux/moduleparam.h>
27#include <linux/init.h>
28#include <linux/cpufreq.h>
29#include <linux/pci.h>
30#include <linux/slab.h>
31#include <linux/string.h>
32#include <linux/delay.h>
33#include <linux/timex.h>
34#include <linux/io.h>
35#include <linux/acpi.h>
36
37#include <asm/msr.h>
38#include <acpi/processor.h>
39
40#include "longhaul.h"
41
42#define PFX "longhaul: "
43
44#define TYPE_LONGHAUL_V1 1
45#define TYPE_LONGHAUL_V2 2
46#define TYPE_POWERSAVER 3
47
48#define CPU_SAMUEL 1
49#define CPU_SAMUEL2 2
50#define CPU_EZRA 3
51#define CPU_EZRA_T 4
52#define CPU_NEHEMIAH 5
53#define CPU_NEHEMIAH_C 6
54
55/* Flags */
56#define USE_ACPI_C3 (1 << 1)
57#define USE_NORTHBRIDGE (1 << 2)
58
59static int cpu_model;
60static unsigned int numscales = 16;
61static unsigned int fsb;
62
63static const struct mV_pos *vrm_mV_table;
64static const unsigned char *mV_vrm_table;
65
66static unsigned int highest_speed, lowest_speed; /* kHz */
67static unsigned int minmult, maxmult;
68static int can_scale_voltage;
69static struct acpi_processor *pr;
70static struct acpi_processor_cx *cx;
71static u32 acpi_regs_addr;
72static u8 longhaul_flags;
73static unsigned int longhaul_index;
74
75/* Module parameters */
76static int scale_voltage;
77static int disable_acpi_c3;
78static int revid_errata;
79
80#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
81 "longhaul", msg)
82
83
84/* Clock ratios multiplied by 10 */
85static int mults[32];
86static int eblcr[32];
87static int longhaul_version;
88static struct cpufreq_frequency_table *longhaul_table;
89
90#ifdef CONFIG_CPU_FREQ_DEBUG
91static char speedbuffer[8];
92
93static char *print_speed(int speed)
94{
95 if (speed < 1000) {
96 snprintf(speedbuffer, sizeof(speedbuffer), "%dMHz", speed);
97 return speedbuffer;
98 }
99
100 if (speed%1000 == 0)
101 snprintf(speedbuffer, sizeof(speedbuffer),
102 "%dGHz", speed/1000);
103 else
104 snprintf(speedbuffer, sizeof(speedbuffer),
105 "%d.%dGHz", speed/1000, (speed%1000)/100);
106
107 return speedbuffer;
108}
109#endif
110
111
112static unsigned int calc_speed(int mult)
113{
114 int khz;
115 khz = (mult/10)*fsb;
116 if (mult%10)
117 khz += fsb/2;
118 khz *= 1000;
119 return khz;
120}
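For illustration, the same ratio-to-kHz arithmetic can be exercised standalone; the multiplier and FSB values below are invented examples, and the half-FSB term covers the .5x ratios:

#include <stdio.h>

/* Standalone copy of the calc_speed() arithmetic: mult is the clock ratio
 * times 10, fsb is in MHz, the result is in kHz. */
static unsigned int calc_speed_example(int mult, int fsb)
{
	int khz = (mult / 10) * fsb;	/* whole part of the multiplier */
	if (mult % 10)
		khz += fsb / 2;		/* add half an FSB for .5x ratios */
	return khz * 1000;		/* MHz -> kHz */
}

int main(void)
{
	/* 4.5x on a 133 MHz FSB and 6.0x on a 100 MHz FSB */
	printf("%u kHz\n", calc_speed_example(45, 133));	/* 598000 (integer math) */
	printf("%u kHz\n", calc_speed_example(60, 100));	/* 600000 */
	return 0;
}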
121
122
123static int longhaul_get_cpu_mult(void)
124{
125 unsigned long invalue = 0, lo, hi;
126
127 rdmsr(MSR_IA32_EBL_CR_POWERON, lo, hi);
128 invalue = (lo & (1<<22|1<<23|1<<24|1<<25))>>22;
129 if (longhaul_version == TYPE_LONGHAUL_V2 ||
130 longhaul_version == TYPE_POWERSAVER) {
131 if (lo & (1<<27))
132 invalue += 16;
133 }
134 return eblcr[invalue];
135}
136
137/* For processor with BCR2 MSR */
138
139static void do_longhaul1(unsigned int mults_index)
140{
141 union msr_bcr2 bcr2;
142
143 rdmsrl(MSR_VIA_BCR2, bcr2.val);
144 /* Enable software clock multiplier */
145 bcr2.bits.ESOFTBF = 1;
146 bcr2.bits.CLOCKMUL = mults_index & 0xff;
147
148 /* Sync to timer tick */
149 safe_halt();
150 /* Change frequency on next halt or sleep */
151 wrmsrl(MSR_VIA_BCR2, bcr2.val);
152 /* Invoke transition */
153 ACPI_FLUSH_CPU_CACHE();
154 halt();
155
156 /* Disable software clock multiplier */
157 local_irq_disable();
158 rdmsrl(MSR_VIA_BCR2, bcr2.val);
159 bcr2.bits.ESOFTBF = 0;
160 wrmsrl(MSR_VIA_BCR2, bcr2.val);
161}
162
163/* For processor with Longhaul MSR */
164
165static void do_powersaver(int cx_address, unsigned int mults_index,
166 unsigned int dir)
167{
168 union msr_longhaul longhaul;
169 u32 t;
170
171 rdmsrl(MSR_VIA_LONGHAUL, longhaul.val);
172 /* Setup new frequency */
173 if (!revid_errata)
174 longhaul.bits.RevisionKey = longhaul.bits.RevisionID;
175 else
176 longhaul.bits.RevisionKey = 0;
177 longhaul.bits.SoftBusRatio = mults_index & 0xf;
178 longhaul.bits.SoftBusRatio4 = (mults_index & 0x10) >> 4;
179 /* Setup new voltage */
180 if (can_scale_voltage)
181 longhaul.bits.SoftVID = (mults_index >> 8) & 0x1f;
182 /* Sync to timer tick */
183 safe_halt();
184 /* Raise voltage if necessary */
185 if (can_scale_voltage && dir) {
186 longhaul.bits.EnableSoftVID = 1;
187 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
188 /* Change voltage */
189 if (!cx_address) {
190 ACPI_FLUSH_CPU_CACHE();
191 halt();
192 } else {
193 ACPI_FLUSH_CPU_CACHE();
194 /* Invoke C3 */
195 inb(cx_address);
196 /* Dummy op - must do something useless after P_LVL3
197 * read */
198 t = inl(acpi_gbl_FADT.xpm_timer_block.address);
199 }
200 longhaul.bits.EnableSoftVID = 0;
201 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
202 }
203
204 /* Change frequency on next halt or sleep */
205 longhaul.bits.EnableSoftBusRatio = 1;
206 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
207 if (!cx_address) {
208 ACPI_FLUSH_CPU_CACHE();
209 halt();
210 } else {
211 ACPI_FLUSH_CPU_CACHE();
212 /* Invoke C3 */
213 inb(cx_address);
214 /* Dummy op - must do something useless after P_LVL3 read */
215 t = inl(acpi_gbl_FADT.xpm_timer_block.address);
216 }
217 /* Disable bus ratio bit */
218 longhaul.bits.EnableSoftBusRatio = 0;
219 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
220
221 /* Reduce voltage if necessary */
222 if (can_scale_voltage && !dir) {
223 longhaul.bits.EnableSoftVID = 1;
224 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
225 /* Change voltage */
226 if (!cx_address) {
227 ACPI_FLUSH_CPU_CACHE();
228 halt();
229 } else {
230 ACPI_FLUSH_CPU_CACHE();
231 /* Invoke C3 */
232 inb(cx_address);
233 /* Dummy op - must do something useless after P_LVL3
234 * read */
235 t = inl(acpi_gbl_FADT.xpm_timer_block.address);
236 }
237 longhaul.bits.EnableSoftVID = 0;
238 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
239 }
240}
241
242/**
243 * longhaul_setstate()
244 * @table_index : index into longhaul_table of the new frequency/multiplier.
245 *
246 * Sets a new clock ratio.
247 */
248
249static void longhaul_setstate(unsigned int table_index)
250{
251 unsigned int mults_index;
252 int speed, mult;
253 struct cpufreq_freqs freqs;
254 unsigned long flags;
255 unsigned int pic1_mask, pic2_mask;
256 u16 bm_status = 0;
257 u32 bm_timeout = 1000;
258 unsigned int dir = 0;
259
260 mults_index = longhaul_table[table_index].index;
261 /* Safety precautions */
262 mult = mults[mults_index & 0x1f];
263 if (mult == -1)
264 return;
265 speed = calc_speed(mult);
266 if ((speed > highest_speed) || (speed < lowest_speed))
267 return;
268 /* Voltage transition before frequency transition? */
269 if (can_scale_voltage && longhaul_index < table_index)
270 dir = 1;
271
272 freqs.old = calc_speed(longhaul_get_cpu_mult());
273 freqs.new = speed;
274 freqs.cpu = 0; /* longhaul.c is UP only driver */
275
276 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
277
278 dprintk("Setting to FSB:%dMHz Mult:%d.%dx (%s)\n",
279 fsb, mult/10, mult%10, print_speed(speed/1000));
280retry_loop:
281 preempt_disable();
282 local_irq_save(flags);
283
284 pic2_mask = inb(0xA1);
285 pic1_mask = inb(0x21); /* works on C3. save mask. */
286 outb(0xFF, 0xA1); /* Overkill */
287 outb(0xFE, 0x21); /* TMR0 only */
288
289 /* Wait while PCI bus is busy. */
290 if (acpi_regs_addr && (longhaul_flags & USE_NORTHBRIDGE
291 || ((pr != NULL) && pr->flags.bm_control))) {
292 bm_status = inw(acpi_regs_addr);
293 bm_status &= 1 << 4;
294 while (bm_status && bm_timeout) {
295 outw(1 << 4, acpi_regs_addr);
296 bm_timeout--;
297 bm_status = inw(acpi_regs_addr);
298 bm_status &= 1 << 4;
299 }
300 }
301
302 if (longhaul_flags & USE_NORTHBRIDGE) {
303 /* Disable AGP and PCI arbiters */
304 outb(3, 0x22);
305 } else if ((pr != NULL) && pr->flags.bm_control) {
306 /* Disable bus master arbitration */
307 acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 1);
308 }
309 switch (longhaul_version) {
310
311 /*
312 * Longhaul v1. (Samuel[C5A] and Samuel2 stepping 0[C5B])
313 * Software controlled multipliers only.
314 */
315 case TYPE_LONGHAUL_V1:
316 do_longhaul1(mults_index);
317 break;
318
319 /*
320 * Longhaul v2 appears in Samuel2 Steppings 1->7 [C5B] and Ezra [C5C]
321 *
322 * Longhaul v3 (aka Powersaver). (Ezra-T [C5M] & Nehemiah [C5N])
323 * Nehemiah can do FSB scaling too, but this has never been proven
324 * to work in practice.
325 */
326 case TYPE_LONGHAUL_V2:
327 case TYPE_POWERSAVER:
328 if (longhaul_flags & USE_ACPI_C3) {
329 /* Don't allow wakeup */
330 acpi_write_bit_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
331 do_powersaver(cx->address, mults_index, dir);
332 } else {
333 do_powersaver(0, mults_index, dir);
334 }
335 break;
336 }
337
338 if (longhaul_flags & USE_NORTHBRIDGE) {
339 /* Enable arbiters */
340 outb(0, 0x22);
341 } else if ((pr != NULL) && pr->flags.bm_control) {
342 /* Enable bus master arbitration */
343 acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 0);
344 }
345 outb(pic2_mask, 0xA1); /* restore mask */
346 outb(pic1_mask, 0x21);
347
348 local_irq_restore(flags);
349 preempt_enable();
350
351 freqs.new = calc_speed(longhaul_get_cpu_mult());
352 /* Check if requested frequency is set. */
353 if (unlikely(freqs.new != speed)) {
354 printk(KERN_INFO PFX "Failed to set requested frequency!\n");
355 /* Revision ID = 1, but the processor is expecting a revision key
356 * equal to 0. Jumpers at the bottom of the processor will change the
357 * multiplier and FSB, but will not change bits in the Longhaul
358 * MSR nor enable voltage scaling. */
359 if (!revid_errata) {
360 printk(KERN_INFO PFX "Enabling \"Ignore Revision ID\" "
361 "option.\n");
362 revid_errata = 1;
363 msleep(200);
364 goto retry_loop;
365 }
366 /* Why ACPI C3 sometimes doesn't work is a mystery to me,
367 * but it does happen: the processor enters the ACPI C3 state,
368 * yet it doesn't change frequency. I tried poking various
369 * bits in the northbridge registers, but without success. */
370 if (longhaul_flags & USE_ACPI_C3) {
371 printk(KERN_INFO PFX "Disabling ACPI C3 support.\n");
372 longhaul_flags &= ~USE_ACPI_C3;
373 if (revid_errata) {
374 printk(KERN_INFO PFX "Disabling \"Ignore "
375 "Revision ID\" option.\n");
376 revid_errata = 0;
377 }
378 msleep(200);
379 goto retry_loop;
380 }
381 /* This shouldn't happen. Longhaul ver. 2 was reported not
382 * working on processors without voltage scaling, but with
383 * RevID = 1. The RevID errata will make things right; this is
384 * just to be 100% sure. */
385 if (longhaul_version == TYPE_LONGHAUL_V2) {
386 printk(KERN_INFO PFX "Switching to Longhaul ver. 1\n");
387 longhaul_version = TYPE_LONGHAUL_V1;
388 msleep(200);
389 goto retry_loop;
390 }
391 }
392 /* Report true CPU frequency */
393 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
394
395 if (!bm_timeout)
396 printk(KERN_INFO PFX "Warning: Timeout while waiting for "
397 "idle PCI bus.\n");
398}
399
400/*
401 * Centaur decided to make life a little more tricky.
402 * Only longhaul v1 is allowed to read EBLCR BSEL[0:1].
403 * Samuel2 and above have to try and guess what the FSB is.
404 * We do this by assuming we booted at the maximum multiplier, and interpolating
405 * between that value multiplied by the possible FSBs and cpu_mhz, which
406 * was calculated at boot time. Really ugly, but there is no other way to do this.
407 */
408
409#define ROUNDING 0xf
410
411static int guess_fsb(int mult)
412{
413 int speed = cpu_khz / 1000;
414 int i;
415 int speeds[] = { 666, 1000, 1333, 2000 };
416 int f_max, f_min;
417
418 for (i = 0; i < 4; i++) {
419 f_max = ((speeds[i] * mult) + 50) / 100;
420 f_max += (ROUNDING / 2);
421 f_min = f_max - ROUNDING;
422 if ((speed <= f_max) && (speed >= f_min))
423 return speeds[i] / 10;
424 }
425 return 0;
426}
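A standalone re-run of this matching logic (the cpu_mhz and multiplier inputs are invented examples) shows how exactly one FSB candidate falls inside the ±ROUNDING window:

#include <stdio.h>

#define ROUNDING 0xf

/* Copy of the guess_fsb() matching loop; speeds[] holds the candidate
 * FSBs times 10 (66.6, 100, 133.3 and 200 MHz). */
static int guess_fsb_example(int cpu_mhz, int mult)
{
	int speeds[] = { 666, 1000, 1333, 2000 };
	int i, f_max, f_min;

	for (i = 0; i < 4; i++) {
		f_max = ((speeds[i] * mult) + 50) / 100;	/* expected MHz at this FSB */
		f_max += (ROUNDING / 2);
		f_min = f_max - ROUNDING;
		if (cpu_mhz <= f_max && cpu_mhz >= f_min)
			return speeds[i] / 10;
	}
	return 0;	/* no plausible FSB */
}

int main(void)
{
	/* an 800 MHz part booted at 6.0x should match the 133 MHz FSB */
	printf("guessed FSB: %d MHz\n", guess_fsb_example(800, 60));
	return 0;
}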
427
428
429static int __cpuinit longhaul_get_ranges(void)
430{
431 unsigned int i, j, k = 0;
432 unsigned int ratio;
433 int mult;
434
435 /* Get current frequency */
436 mult = longhaul_get_cpu_mult();
437 if (mult == -1) {
438 printk(KERN_INFO PFX "Invalid (reserved) multiplier!\n");
439 return -EINVAL;
440 }
441 fsb = guess_fsb(mult);
442 if (fsb == 0) {
443 printk(KERN_INFO PFX "Invalid (reserved) FSB!\n");
444 return -EINVAL;
445 }
446 /* Get the max multiplier - as we always did.
447 * The Longhaul MSR is useful only when voltage scaling is enabled;
448 * the C3 boots at max anyway. */
449 maxmult = mult;
450 /* Get min multiplier */
451 switch (cpu_model) {
452 case CPU_NEHEMIAH:
453 minmult = 50;
454 break;
455 case CPU_NEHEMIAH_C:
456 minmult = 40;
457 break;
458 default:
459 minmult = 30;
460 break;
461 }
462
463 dprintk("MinMult:%d.%dx MaxMult:%d.%dx\n",
464 minmult/10, minmult%10, maxmult/10, maxmult%10);
465
466 highest_speed = calc_speed(maxmult);
467 lowest_speed = calc_speed(minmult);
468 dprintk("FSB:%dMHz Lowest speed: %s Highest speed:%s\n", fsb,
469 print_speed(lowest_speed/1000),
470 print_speed(highest_speed/1000));
471
472 if (lowest_speed == highest_speed) {
473 printk(KERN_INFO PFX "highestspeed == lowest, aborting.\n");
474 return -EINVAL;
475 }
476 if (lowest_speed > highest_speed) {
477 printk(KERN_INFO PFX "nonsense! lowest (%d > %d) !\n",
478 lowest_speed, highest_speed);
479 return -EINVAL;
480 }
481
482 longhaul_table = kmalloc((numscales + 1) * sizeof(*longhaul_table),
483 GFP_KERNEL);
484 if (!longhaul_table)
485 return -ENOMEM;
486
487 for (j = 0; j < numscales; j++) {
488 ratio = mults[j];
489 if (ratio == -1)
490 continue;
491 if (ratio > maxmult || ratio < minmult)
492 continue;
493 longhaul_table[k].frequency = calc_speed(ratio);
494 longhaul_table[k].index = j;
495 k++;
496 }
497 if (k <= 1) {
498 kfree(longhaul_table);
499 return -ENODEV;
500 }
501 /* Sort */
502 for (j = 0; j < k - 1; j++) {
503 unsigned int min_f, min_i;
504 min_f = longhaul_table[j].frequency;
505 min_i = j;
506 for (i = j + 1; i < k; i++) {
507 if (longhaul_table[i].frequency < min_f) {
508 min_f = longhaul_table[i].frequency;
509 min_i = i;
510 }
511 }
512 if (min_i != j) {
513 swap(longhaul_table[j].frequency,
514 longhaul_table[min_i].frequency);
515 swap(longhaul_table[j].index,
516 longhaul_table[min_i].index);
517 }
518 }
519
520 longhaul_table[k].frequency = CPUFREQ_TABLE_END;
521
522 /* Find index we are running on */
523 for (j = 0; j < k; j++) {
524 if (mults[longhaul_table[j].index & 0x1f] == mult) {
525 longhaul_index = j;
526 break;
527 }
528 }
529 return 0;
530}
531
532
533static void __cpuinit longhaul_setup_voltagescaling(void)
534{
535 union msr_longhaul longhaul;
536 struct mV_pos minvid, maxvid, vid;
537 unsigned int j, speed, pos, kHz_step, numvscales;
538 int min_vid_speed;
539
540 rdmsrl(MSR_VIA_LONGHAUL, longhaul.val);
541 if (!(longhaul.bits.RevisionID & 1)) {
542 printk(KERN_INFO PFX "Voltage scaling not supported by CPU.\n");
543 return;
544 }
545
546 if (!longhaul.bits.VRMRev) {
547 printk(KERN_INFO PFX "VRM 8.5\n");
548 vrm_mV_table = &vrm85_mV[0];
549 mV_vrm_table = &mV_vrm85[0];
550 } else {
551 printk(KERN_INFO PFX "Mobile VRM\n");
552 if (cpu_model < CPU_NEHEMIAH)
553 return;
554 vrm_mV_table = &mobilevrm_mV[0];
555 mV_vrm_table = &mV_mobilevrm[0];
556 }
557
558 minvid = vrm_mV_table[longhaul.bits.MinimumVID];
559 maxvid = vrm_mV_table[longhaul.bits.MaximumVID];
560
561 if (minvid.mV == 0 || maxvid.mV == 0 || minvid.mV > maxvid.mV) {
562 printk(KERN_INFO PFX "Bogus values Min:%d.%03d Max:%d.%03d. "
563 "Voltage scaling disabled.\n",
564 minvid.mV/1000, minvid.mV%1000,
565 maxvid.mV/1000, maxvid.mV%1000);
566 return;
567 }
568
569 if (minvid.mV == maxvid.mV) {
570 printk(KERN_INFO PFX "Claims to support voltage scaling but "
571 "min & max are both %d.%03d. "
572 "Voltage scaling disabled\n",
573 maxvid.mV/1000, maxvid.mV%1000);
574 return;
575 }
576
577 /* How many voltage steps */
578 numvscales = maxvid.pos - minvid.pos + 1;
579 printk(KERN_INFO PFX
580 "Max VID=%d.%03d "
581 "Min VID=%d.%03d, "
582 "%d possible voltage scales\n",
583 maxvid.mV/1000, maxvid.mV%1000,
584 minvid.mV/1000, minvid.mV%1000,
585 numvscales);
586
587 /* Calculate max frequency at min voltage */
588 j = longhaul.bits.MinMHzBR;
589 if (longhaul.bits.MinMHzBR4)
590 j += 16;
591 min_vid_speed = eblcr[j];
592 if (min_vid_speed == -1)
593 return;
594 switch (longhaul.bits.MinMHzFSB) {
595 case 0:
596 min_vid_speed *= 13333;
597 break;
598 case 1:
599 min_vid_speed *= 10000;
600 break;
601 case 3:
602 min_vid_speed *= 6666;
603 break;
604 default:
605 return;
606 break;
607 }
608 if (min_vid_speed >= highest_speed)
609 return;
610 /* Calculate kHz for one voltage step */
611 kHz_step = (highest_speed - min_vid_speed) / numvscales;
612
613 j = 0;
614 while (longhaul_table[j].frequency != CPUFREQ_TABLE_END) {
615 speed = longhaul_table[j].frequency;
616 if (speed > min_vid_speed)
617 pos = (speed - min_vid_speed) / kHz_step + minvid.pos;
618 else
619 pos = minvid.pos;
620 longhaul_table[j].index |= mV_vrm_table[pos] << 8;
621 vid = vrm_mV_table[mV_vrm_table[pos]];
622 printk(KERN_INFO PFX "f: %d kHz, index: %d, vid: %d mV\n",
623 speed, j, vid.mV);
624 j++;
625 }
626
627 can_scale_voltage = 1;
628 printk(KERN_INFO PFX "Voltage scaling enabled.\n");
629}
630
631
632static int longhaul_verify(struct cpufreq_policy *policy)
633{
634 return cpufreq_frequency_table_verify(policy, longhaul_table);
635}
636
637
638static int longhaul_target(struct cpufreq_policy *policy,
639 unsigned int target_freq, unsigned int relation)
640{
641 unsigned int table_index = 0;
642 unsigned int i;
643 unsigned int dir = 0;
644 u8 vid, current_vid;
645
646 if (cpufreq_frequency_table_target(policy, longhaul_table, target_freq,
647 relation, &table_index))
648 return -EINVAL;
649
650 /* Don't set same frequency again */
651 if (longhaul_index == table_index)
652 return 0;
653
654 if (!can_scale_voltage)
655 longhaul_setstate(table_index);
656 else {
657 /* On the test system, voltage transitions exceeding a single
658 * step up or down were turning the motherboard off. Both
659 * "ondemand" and "userspace" are unsafe. The C7 does
660 * this in hardware; the C3 is old and we need to do it
661 * in software. */
662 i = longhaul_index;
663 current_vid = (longhaul_table[longhaul_index].index >> 8);
664 current_vid &= 0x1f;
665 if (table_index > longhaul_index)
666 dir = 1;
667 while (i != table_index) {
668 vid = (longhaul_table[i].index >> 8) & 0x1f;
669 if (vid != current_vid) {
670 longhaul_setstate(i);
671 current_vid = vid;
672 msleep(200);
673 }
674 if (dir)
675 i++;
676 else
677 i--;
678 }
679 longhaul_setstate(table_index);
680 }
681 longhaul_index = table_index;
682 return 0;
683}
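The single-VID-step walk above can be illustrated with a toy table; the entries and VID values below are made up, whereas real entries come from longhaul_get_ranges() and longhaul_setup_voltagescaling():

#include <stdio.h>

/* Toy version of the stepping loop in longhaul_target(): walk the table one
 * index at a time and "transition" only when the VID byte (bits 12:8 of the
 * index field) changes, then finish with the target index itself. */
int main(void)
{
	unsigned int table[] = { 0x0303, 0x0307, 0x0509, 0x0701 };	/* invented */
	unsigned int cur = 0, target = 3;
	unsigned int i = cur;
	int step = (target > cur) ? 1 : -1;
	unsigned int vid, current_vid = (table[cur] >> 8) & 0x1f;

	while (i != target) {
		vid = (table[i] >> 8) & 0x1f;
		if (vid != current_vid) {
			printf("intermediate transition at index %u (VID %u)\n", i, vid);
			current_vid = vid;
		}
		i += step;
	}
	printf("final transition to index %u\n", target);
	return 0;
}

This mirrors the intent described in the comment: walk toward the target one table entry at a time, only issuing a transition when the VID actually changes.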
684
685
686static unsigned int longhaul_get(unsigned int cpu)
687{
688 if (cpu)
689 return 0;
690 return calc_speed(longhaul_get_cpu_mult());
691}
692
693static acpi_status longhaul_walk_callback(acpi_handle obj_handle,
694 u32 nesting_level,
695 void *context, void **return_value)
696{
697 struct acpi_device *d;
698
699 if (acpi_bus_get_device(obj_handle, &d))
700 return 0;
701
702 *return_value = acpi_driver_data(d);
703 return 1;
704}
705
706/* VIA doesn't support the PM2 register, but has something similar */
707static int enable_arbiter_disable(void)
708{
709 struct pci_dev *dev;
710 int status = 1;
711 int reg;
712 u8 pci_cmd;
713
714 /* Find PLE133 host bridge */
715 reg = 0x78;
716 dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8601_0,
717 NULL);
718 /* Find PM133/VT8605 host bridge */
719 if (dev == NULL)
720 dev = pci_get_device(PCI_VENDOR_ID_VIA,
721 PCI_DEVICE_ID_VIA_8605_0, NULL);
722 /* Find CLE266 host bridge */
723 if (dev == NULL) {
724 reg = 0x76;
725 dev = pci_get_device(PCI_VENDOR_ID_VIA,
726 PCI_DEVICE_ID_VIA_862X_0, NULL);
727 /* Find CN400 V-Link host bridge */
728 if (dev == NULL)
729 dev = pci_get_device(PCI_VENDOR_ID_VIA, 0x7259, NULL);
730 }
731 if (dev != NULL) {
732 /* Enable access to port 0x22 */
733 pci_read_config_byte(dev, reg, &pci_cmd);
734 if (!(pci_cmd & 1<<7)) {
735 pci_cmd |= 1<<7;
736 pci_write_config_byte(dev, reg, pci_cmd);
737 pci_read_config_byte(dev, reg, &pci_cmd);
738 if (!(pci_cmd & 1<<7)) {
739 printk(KERN_ERR PFX
740 "Can't enable access to port 0x22.\n");
741 status = 0;
742 }
743 }
744 pci_dev_put(dev);
745 return status;
746 }
747 return 0;
748}
749
750static int longhaul_setup_southbridge(void)
751{
752 struct pci_dev *dev;
753 u8 pci_cmd;
754
755 /* Find VT8235 southbridge */
756 dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, NULL);
757 if (dev == NULL)
758 /* Find VT8237 southbridge */
759 dev = pci_get_device(PCI_VENDOR_ID_VIA,
760 PCI_DEVICE_ID_VIA_8237, NULL);
761 if (dev != NULL) {
762 /* Set transition time to max */
763 pci_read_config_byte(dev, 0xec, &pci_cmd);
764 pci_cmd &= ~(1 << 2);
765 pci_write_config_byte(dev, 0xec, pci_cmd);
766 pci_read_config_byte(dev, 0xe4, &pci_cmd);
767 pci_cmd &= ~(1 << 7);
768 pci_write_config_byte(dev, 0xe4, pci_cmd);
769 pci_read_config_byte(dev, 0xe5, &pci_cmd);
770 pci_cmd |= 1 << 7;
771 pci_write_config_byte(dev, 0xe5, pci_cmd);
772 /* Get address of ACPI registers block */
773 pci_read_config_byte(dev, 0x81, &pci_cmd);
774 if (pci_cmd & 1 << 7) {
775 pci_read_config_dword(dev, 0x88, &acpi_regs_addr);
776 acpi_regs_addr &= 0xff00;
777 printk(KERN_INFO PFX "ACPI I/O at 0x%x\n",
778 acpi_regs_addr);
779 }
780
781 pci_dev_put(dev);
782 return 1;
783 }
784 return 0;
785}
786
787static int __cpuinit longhaul_cpu_init(struct cpufreq_policy *policy)
788{
789 struct cpuinfo_x86 *c = &cpu_data(0);
790 char *cpuname = NULL;
791 int ret;
792 u32 lo, hi;
793
794 /* Check what we have on this motherboard */
795 switch (c->x86_model) {
796 case 6:
797 cpu_model = CPU_SAMUEL;
798 cpuname = "C3 'Samuel' [C5A]";
799 longhaul_version = TYPE_LONGHAUL_V1;
800 memcpy(mults, samuel1_mults, sizeof(samuel1_mults));
801 memcpy(eblcr, samuel1_eblcr, sizeof(samuel1_eblcr));
802 break;
803
804 case 7:
805 switch (c->x86_mask) {
806 case 0:
807 longhaul_version = TYPE_LONGHAUL_V1;
808 cpu_model = CPU_SAMUEL2;
809 cpuname = "C3 'Samuel 2' [C5B]";
810 /* Note, this is not a typo, early Samuel2's had
811 * Samuel1 ratios. */
812 memcpy(mults, samuel1_mults, sizeof(samuel1_mults));
813 memcpy(eblcr, samuel2_eblcr, sizeof(samuel2_eblcr));
814 break;
815 case 1 ... 15:
816 longhaul_version = TYPE_LONGHAUL_V2;
817 if (c->x86_mask < 8) {
818 cpu_model = CPU_SAMUEL2;
819 cpuname = "C3 'Samuel 2' [C5B]";
820 } else {
821 cpu_model = CPU_EZRA;
822 cpuname = "C3 'Ezra' [C5C]";
823 }
824 memcpy(mults, ezra_mults, sizeof(ezra_mults));
825 memcpy(eblcr, ezra_eblcr, sizeof(ezra_eblcr));
826 break;
827 }
828 break;
829
830 case 8:
831 cpu_model = CPU_EZRA_T;
832 cpuname = "C3 'Ezra-T' [C5M]";
833 longhaul_version = TYPE_POWERSAVER;
834 numscales = 32;
835 memcpy(mults, ezrat_mults, sizeof(ezrat_mults));
836 memcpy(eblcr, ezrat_eblcr, sizeof(ezrat_eblcr));
837 break;
838
839 case 9:
840 longhaul_version = TYPE_POWERSAVER;
841 numscales = 32;
842 memcpy(mults, nehemiah_mults, sizeof(nehemiah_mults));
843 memcpy(eblcr, nehemiah_eblcr, sizeof(nehemiah_eblcr));
844 switch (c->x86_mask) {
845 case 0 ... 1:
846 cpu_model = CPU_NEHEMIAH;
847 cpuname = "C3 'Nehemiah A' [C5XLOE]";
848 break;
849 case 2 ... 4:
850 cpu_model = CPU_NEHEMIAH;
851 cpuname = "C3 'Nehemiah B' [C5XLOH]";
852 break;
853 case 5 ... 15:
854 cpu_model = CPU_NEHEMIAH_C;
855 cpuname = "C3 'Nehemiah C' [C5P]";
856 break;
857 }
858 break;
859
860 default:
861 cpuname = "Unknown";
862 break;
863 }
864 /* Check Longhaul ver. 2 */
865 if (longhaul_version == TYPE_LONGHAUL_V2) {
866 rdmsr(MSR_VIA_LONGHAUL, lo, hi);
867 if (lo == 0 && hi == 0)
868 /* Looks like MSR isn't present */
869 longhaul_version = TYPE_LONGHAUL_V1;
870 }
871
872 printk(KERN_INFO PFX "VIA %s CPU detected. ", cpuname);
873 switch (longhaul_version) {
874 case TYPE_LONGHAUL_V1:
875 case TYPE_LONGHAUL_V2:
876 printk(KERN_CONT "Longhaul v%d supported.\n", longhaul_version);
877 break;
878 case TYPE_POWERSAVER:
879 printk(KERN_CONT "Powersaver supported.\n");
880 break;
881 }
882
883 /* Doesn't hurt */
884 longhaul_setup_southbridge();
885
886 /* Find ACPI data for processor */
887 acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
888 ACPI_UINT32_MAX, &longhaul_walk_callback, NULL,
889 NULL, (void *)&pr);
890
891 /* Check ACPI support for C3 state */
892 if (pr != NULL && longhaul_version == TYPE_POWERSAVER) {
893 cx = &pr->power.states[ACPI_STATE_C3];
894 if (cx->address > 0 && cx->latency <= 1000)
895 longhaul_flags |= USE_ACPI_C3;
896 }
897 /* Disable if it isn't working */
898 if (disable_acpi_c3)
899 longhaul_flags &= ~USE_ACPI_C3;
900 /* Check if northbridge is friendly */
901 if (enable_arbiter_disable())
902 longhaul_flags |= USE_NORTHBRIDGE;
903
904 /* Check ACPI support for bus master arbiter disable */
905 if (!(longhaul_flags & USE_ACPI_C3
906 || longhaul_flags & USE_NORTHBRIDGE)
907 && ((pr == NULL) || !(pr->flags.bm_control))) {
908 printk(KERN_ERR PFX
909 "No ACPI support. Unsupported northbridge.\n");
910 return -ENODEV;
911 }
912
913 if (longhaul_flags & USE_NORTHBRIDGE)
914 printk(KERN_INFO PFX "Using northbridge support.\n");
915 if (longhaul_flags & USE_ACPI_C3)
916 printk(KERN_INFO PFX "Using ACPI support.\n");
917
918 ret = longhaul_get_ranges();
919 if (ret != 0)
920 return ret;
921
922 if ((longhaul_version != TYPE_LONGHAUL_V1) && (scale_voltage != 0))
923 longhaul_setup_voltagescaling();
924
925 policy->cpuinfo.transition_latency = 200000; /* nsec */
926 policy->cur = calc_speed(longhaul_get_cpu_mult());
927
928 ret = cpufreq_frequency_table_cpuinfo(policy, longhaul_table);
929 if (ret)
930 return ret;
931
932 cpufreq_frequency_table_get_attr(longhaul_table, policy->cpu);
933
934 return 0;
935}
936
937static int __devexit longhaul_cpu_exit(struct cpufreq_policy *policy)
938{
939 cpufreq_frequency_table_put_attr(policy->cpu);
940 return 0;
941}
942
943static struct freq_attr *longhaul_attr[] = {
944 &cpufreq_freq_attr_scaling_available_freqs,
945 NULL,
946};
947
948static struct cpufreq_driver longhaul_driver = {
949 .verify = longhaul_verify,
950 .target = longhaul_target,
951 .get = longhaul_get,
952 .init = longhaul_cpu_init,
953 .exit = __devexit_p(longhaul_cpu_exit),
954 .name = "longhaul",
955 .owner = THIS_MODULE,
956 .attr = longhaul_attr,
957};
958
959
960static int __init longhaul_init(void)
961{
962 struct cpuinfo_x86 *c = &cpu_data(0);
963
964 if (c->x86_vendor != X86_VENDOR_CENTAUR || c->x86 != 6)
965 return -ENODEV;
966
967#ifdef CONFIG_SMP
968 if (num_online_cpus() > 1) {
969 printk(KERN_ERR PFX "More than 1 CPU detected, "
970 "longhaul disabled.\n");
971 return -ENODEV;
972 }
973#endif
974#ifdef CONFIG_X86_IO_APIC
975 if (cpu_has_apic) {
976 printk(KERN_ERR PFX "APIC detected. Longhaul is currently "
977 "broken in this configuration.\n");
978 return -ENODEV;
979 }
980#endif
981 switch (c->x86_model) {
982 case 6 ... 9:
983 return cpufreq_register_driver(&longhaul_driver);
984 case 10:
985 printk(KERN_ERR PFX "Use acpi-cpufreq driver for VIA C7\n");
986 default:
987 ;
988 }
989
990 return -ENODEV;
991}
992
993
994static void __exit longhaul_exit(void)
995{
996 int i;
997
998 for (i = 0; i < numscales; i++) {
999 if (mults[i] == maxmult) {
1000 longhaul_setstate(i);
1001 break;
1002 }
1003 }
1004
1005 cpufreq_unregister_driver(&longhaul_driver);
1006 kfree(longhaul_table);
1007}
1008
1009/* Even if the BIOS exports an ACPI C3 state, and it is used
1010 * successfully when the CPU is idle, this state doesn't
1011 * trigger the frequency transition in some cases. */
1012module_param(disable_acpi_c3, int, 0644);
1013MODULE_PARM_DESC(disable_acpi_c3, "Don't use ACPI C3 support");
1014/* Change the CPU voltage along with the frequency. Very useful for
1015 * saving power, but most VIA C3 processors don't support it. */
1016module_param(scale_voltage, int, 0644);
1017MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor");
1018/* Force the revision key to 0 for processors which don't
1019 * support voltage scaling but present themselves as if
1020 * they did. */
1021module_param(revid_errata, int, 0644);
1022MODULE_PARM_DESC(revid_errata, "Ignore CPU Revision ID");
1023
1024MODULE_AUTHOR("Dave Jones <davej@redhat.com>");
1025MODULE_DESCRIPTION("Longhaul driver for VIA Cyrix processors.");
1026MODULE_LICENSE("GPL");
1027
1028late_initcall(longhaul_init);
1029module_exit(longhaul_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.h b/arch/x86/kernel/cpu/cpufreq/longhaul.h
deleted file mode 100644
index cbf48fbca881..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.h
+++ /dev/null
@@ -1,353 +0,0 @@
1/*
2 * longhaul.h
3 * (C) 2003 Dave Jones.
4 *
5 * Licensed under the terms of the GNU GPL License version 2.
6 *
7 * VIA-specific information
8 */
9
10union msr_bcr2 {
11 struct {
12 unsigned Reserved:19, // 18:0
13 ESOFTBF:1, // 19
14 Reserved2:3, // 22:20
15 CLOCKMUL:4, // 26:23
16 Reserved3:5; // 31:27
17 } bits;
18 unsigned long val;
19};
20
21union msr_longhaul {
22 struct {
23 unsigned RevisionID:4, // 3:0
24 RevisionKey:4, // 7:4
25 EnableSoftBusRatio:1, // 8
26 EnableSoftVID:1, // 9
27 EnableSoftBSEL:1, // 10
28 Reserved:3, // 13:11
29 SoftBusRatio4:1, // 14
30 VRMRev:1, // 15
31 SoftBusRatio:4, // 19:16
32 SoftVID:5, // 24:20
33 Reserved2:3, // 27:25
34 SoftBSEL:2, // 29:28
35 Reserved3:2, // 31:30
36 MaxMHzBR:4, // 35:32
37 MaximumVID:5, // 40:36
38 MaxMHzFSB:2, // 42:41
39 MaxMHzBR4:1, // 43
40 Reserved4:4, // 47:44
41 MinMHzBR:4, // 51:48
42 MinimumVID:5, // 56:52
43 MinMHzFSB:2, // 58:57
44 MinMHzBR4:1, // 59
45 Reserved5:4; // 63:60
46 } bits;
47 unsigned long long val;
48};
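For illustration, do_powersaver() in longhaul.c splits a 5-bit table index across SoftBusRatio and SoftBusRatio4 exactly as the standalone sketch below does; it uses plain shifts and masks rather than this union (bitfield layout is compiler-dependent), and the index value is an invented example:

#include <stdio.h>

/* Show how a 5-bit multiplier index maps onto the 4-bit SoftBusRatio field
 * (MSR bits 19:16) plus the SoftBusRatio4 extension bit (MSR bit 14). */
int main(void)
{
	unsigned int mults_index = 25;	/* example index, 0b11001 */

	unsigned int soft_bus_ratio  = mults_index & 0xf;		/* -> 9 */
	unsigned int soft_bus_ratio4 = (mults_index & 0x10) >> 4;	/* -> 1 */

	printf("index %u -> SoftBusRatio=%u SoftBusRatio4=%u\n",
	       mults_index, soft_bus_ratio, soft_bus_ratio4);
	return 0;
}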
49
50/*
51 * Clock ratio tables. Div/Mod by 10 to get ratio.
52 * The eblcr values specify the ratio read from the CPU.
53 * The mults values specify what to write to the CPU.
54 */
55
56/*
57 * VIA C3 Samuel 1 & Samuel 2 (stepping 0)
58 */
59static const int __cpuinitdata samuel1_mults[16] = {
60 -1, /* 0000 -> RESERVED */
61 30, /* 0001 -> 3.0x */
62 40, /* 0010 -> 4.0x */
63 -1, /* 0011 -> RESERVED */
64 -1, /* 0100 -> RESERVED */
65 35, /* 0101 -> 3.5x */
66 45, /* 0110 -> 4.5x */
67 55, /* 0111 -> 5.5x */
68 60, /* 1000 -> 6.0x */
69 70, /* 1001 -> 7.0x */
70 80, /* 1010 -> 8.0x */
71 50, /* 1011 -> 5.0x */
72 65, /* 1100 -> 6.5x */
73 75, /* 1101 -> 7.5x */
74 -1, /* 1110 -> RESERVED */
75 -1, /* 1111 -> RESERVED */
76};
77
78static const int __cpuinitdata samuel1_eblcr[16] = {
79 50, /* 0000 -> RESERVED */
80 30, /* 0001 -> 3.0x */
81 40, /* 0010 -> 4.0x */
82 -1, /* 0011 -> RESERVED */
83 55, /* 0100 -> 5.5x */
84 35, /* 0101 -> 3.5x */
85 45, /* 0110 -> 4.5x */
86 -1, /* 0111 -> RESERVED */
87 -1, /* 1000 -> RESERVED */
88 70, /* 1001 -> 7.0x */
89 80, /* 1010 -> 8.0x */
90 60, /* 1011 -> 6.0x */
91 -1, /* 1100 -> RESERVED */
92 75, /* 1101 -> 7.5x */
93 -1, /* 1110 -> RESERVED */
94 65, /* 1111 -> 6.5x */
95};
96
97/*
98 * VIA C3 Samuel2 Stepping 1->15
99 */
100static const int __cpuinitdata samuel2_eblcr[16] = {
101 50, /* 0000 -> 5.0x */
102 30, /* 0001 -> 3.0x */
103 40, /* 0010 -> 4.0x */
104 100, /* 0011 -> 10.0x */
105 55, /* 0100 -> 5.5x */
106 35, /* 0101 -> 3.5x */
107 45, /* 0110 -> 4.5x */
108 110, /* 0111 -> 11.0x */
109 90, /* 1000 -> 9.0x */
110 70, /* 1001 -> 7.0x */
111 80, /* 1010 -> 8.0x */
112 60, /* 1011 -> 6.0x */
113 120, /* 1100 -> 12.0x */
114 75, /* 1101 -> 7.5x */
115 130, /* 1110 -> 13.0x */
116 65, /* 1111 -> 6.5x */
117};
118
119/*
120 * VIA C3 Ezra
121 */
122static const int __cpuinitdata ezra_mults[16] = {
123 100, /* 0000 -> 10.0x */
124 30, /* 0001 -> 3.0x */
125 40, /* 0010 -> 4.0x */
126 90, /* 0011 -> 9.0x */
127 95, /* 0100 -> 9.5x */
128 35, /* 0101 -> 3.5x */
129 45, /* 0110 -> 4.5x */
130 55, /* 0111 -> 5.5x */
131 60, /* 1000 -> 6.0x */
132 70, /* 1001 -> 7.0x */
133 80, /* 1010 -> 8.0x */
134 50, /* 1011 -> 5.0x */
135 65, /* 1100 -> 6.5x */
136 75, /* 1101 -> 7.5x */
137 85, /* 1110 -> 8.5x */
138 120, /* 1111 -> 12.0x */
139};
140
141static const int __cpuinitdata ezra_eblcr[16] = {
142 50, /* 0000 -> 5.0x */
143 30, /* 0001 -> 3.0x */
144 40, /* 0010 -> 4.0x */
145 100, /* 0011 -> 10.0x */
146 55, /* 0100 -> 5.5x */
147 35, /* 0101 -> 3.5x */
148 45, /* 0110 -> 4.5x */
149 95, /* 0111 -> 9.5x */
150 90, /* 1000 -> 9.0x */
151 70, /* 1001 -> 7.0x */
152 80, /* 1010 -> 8.0x */
153 60, /* 1011 -> 6.0x */
154 120, /* 1100 -> 12.0x */
155 75, /* 1101 -> 7.5x */
156 85, /* 1110 -> 8.5x */
157 65, /* 1111 -> 6.5x */
158};
159
160/*
161 * VIA C3 (Ezra-T) [C5M].
162 */
163static const int __cpuinitdata ezrat_mults[32] = {
164 100, /* 0000 -> 10.0x */
165 30, /* 0001 -> 3.0x */
166 40, /* 0010 -> 4.0x */
167 90, /* 0011 -> 9.0x */
168 95, /* 0100 -> 9.5x */
169 35, /* 0101 -> 3.5x */
170 45, /* 0110 -> 4.5x */
171 55, /* 0111 -> 5.5x */
172 60, /* 1000 -> 6.0x */
173 70, /* 1001 -> 7.0x */
174 80, /* 1010 -> 8.0x */
175 50, /* 1011 -> 5.0x */
176 65, /* 1100 -> 6.5x */
177 75, /* 1101 -> 7.5x */
178 85, /* 1110 -> 8.5x */
179 120, /* 1111 -> 12.0x */
180
181 -1, /* 0000 -> RESERVED (10.0x) */
182 110, /* 0001 -> 11.0x */
183 -1, /* 0010 -> 12.0x */
184 -1, /* 0011 -> RESERVED (9.0x)*/
185 105, /* 0100 -> 10.5x */
186 115, /* 0101 -> 11.5x */
187 125, /* 0110 -> 12.5x */
188 135, /* 0111 -> 13.5x */
189 140, /* 1000 -> 14.0x */
190 150, /* 1001 -> 15.0x */
191 160, /* 1010 -> 16.0x */
192 130, /* 1011 -> 13.0x */
193 145, /* 1100 -> 14.5x */
194 155, /* 1101 -> 15.5x */
195 -1, /* 1110 -> RESERVED (13.0x) */
196 -1, /* 1111 -> RESERVED (12.0x) */
197};
198
199static const int __cpuinitdata ezrat_eblcr[32] = {
200 50, /* 0000 -> 5.0x */
201 30, /* 0001 -> 3.0x */
202 40, /* 0010 -> 4.0x */
203 100, /* 0011 -> 10.0x */
204 55, /* 0100 -> 5.5x */
205 35, /* 0101 -> 3.5x */
206 45, /* 0110 -> 4.5x */
207 95, /* 0111 -> 9.5x */
208 90, /* 1000 -> 9.0x */
209 70, /* 1001 -> 7.0x */
210 80, /* 1010 -> 8.0x */
211 60, /* 1011 -> 6.0x */
212 120, /* 1100 -> 12.0x */
213 75, /* 1101 -> 7.5x */
214 85, /* 1110 -> 8.5x */
215 65, /* 1111 -> 6.5x */
216
217 -1, /* 0000 -> RESERVED (9.0x) */
218 110, /* 0001 -> 11.0x */
219 120, /* 0010 -> 12.0x */
220 -1, /* 0011 -> RESERVED (10.0x)*/
221 135, /* 0100 -> 13.5x */
222 115, /* 0101 -> 11.5x */
223 125, /* 0110 -> 12.5x */
224 105, /* 0111 -> 10.5x */
225 130, /* 1000 -> 13.0x */
226 150, /* 1001 -> 15.0x */
227 160, /* 1010 -> 16.0x */
228 140, /* 1011 -> 14.0x */
229 -1, /* 1100 -> RESERVED (12.0x) */
230 155, /* 1101 -> 15.5x */
231 -1, /* 1110 -> RESERVED (13.0x) */
232 145, /* 1111 -> 14.5x */
233};
234
235/*
236 * VIA C3 Nehemiah
237 */
238static const int __cpuinitdata nehemiah_mults[32] = {
239 100, /* 0000 -> 10.0x */
240 -1, /* 0001 -> 16.0x */
241 40, /* 0010 -> 4.0x */
242 90, /* 0011 -> 9.0x */
243 95, /* 0100 -> 9.5x */
244 -1, /* 0101 -> RESERVED */
245 45, /* 0110 -> 4.5x */
246 55, /* 0111 -> 5.5x */
247 60, /* 1000 -> 6.0x */
248 70, /* 1001 -> 7.0x */
249 80, /* 1010 -> 8.0x */
250 50, /* 1011 -> 5.0x */
251 65, /* 1100 -> 6.5x */
252 75, /* 1101 -> 7.5x */
253 85, /* 1110 -> 8.5x */
254 120, /* 1111 -> 12.0x */
255 -1, /* 0000 -> 10.0x */
256 110, /* 0001 -> 11.0x */
257 -1, /* 0010 -> 12.0x */
258 -1, /* 0011 -> 9.0x */
259 105, /* 0100 -> 10.5x */
260 115, /* 0101 -> 11.5x */
261 125, /* 0110 -> 12.5x */
262 135, /* 0111 -> 13.5x */
263 140, /* 1000 -> 14.0x */
264 150, /* 1001 -> 15.0x */
265 160, /* 1010 -> 16.0x */
266 130, /* 1011 -> 13.0x */
267 145, /* 1100 -> 14.5x */
268 155, /* 1101 -> 15.5x */
269 -1, /* 1110 -> RESERVED (13.0x) */
270 -1, /* 1111 -> 12.0x */
271};
272
273static const int __cpuinitdata nehemiah_eblcr[32] = {
274 50, /* 0000 -> 5.0x */
275 160, /* 0001 -> 16.0x */
276 40, /* 0010 -> 4.0x */
277 100, /* 0011 -> 10.0x */
278 55, /* 0100 -> 5.5x */
279 -1, /* 0101 -> RESERVED */
280 45, /* 0110 -> 4.5x */
281 95, /* 0111 -> 9.5x */
282 90, /* 1000 -> 9.0x */
283 70, /* 1001 -> 7.0x */
284 80, /* 1010 -> 8.0x */
285 60, /* 1011 -> 6.0x */
286 120, /* 1100 -> 12.0x */
287 75, /* 1101 -> 7.5x */
288 85, /* 1110 -> 8.5x */
289 65, /* 1111 -> 6.5x */
290 90, /* 0000 -> 9.0x */
291 110, /* 0001 -> 11.0x */
292 120, /* 0010 -> 12.0x */
293 100, /* 0011 -> 10.0x */
294 135, /* 0100 -> 13.5x */
295 115, /* 0101 -> 11.5x */
296 125, /* 0110 -> 12.5x */
297 105, /* 0111 -> 10.5x */
298 130, /* 1000 -> 13.0x */
299 150, /* 1001 -> 15.0x */
300 160, /* 1010 -> 16.0x */
301 140, /* 1011 -> 14.0x */
302 120, /* 1100 -> 12.0x */
303 155, /* 1101 -> 15.5x */
304 -1, /* 1110 -> RESERVED (13.0x) */
305 145 /* 1111 -> 14.5x */
306};
307
308/*
309 * Voltage scales. Div/Mod by 1000 to get actual voltage.
310 * Which scale to use depends on the VRM type in use.
311 */
312
313struct mV_pos {
314 unsigned short mV;
315 unsigned short pos;
316};
317
318static const struct mV_pos __cpuinitdata vrm85_mV[32] = {
319 {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2},
320 {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26},
321 {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18},
322 {1450, 16}, {1400, 14}, {1350, 12}, {1300, 10},
323 {1275, 9}, {1225, 7}, {1175, 5}, {1125, 3},
324 {1075, 1}, {1825, 31}, {1775, 29}, {1725, 27},
325 {1675, 25}, {1625, 23}, {1575, 21}, {1525, 19},
326 {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11}
327};
328
329static const unsigned char __cpuinitdata mV_vrm85[32] = {
330 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11,
331 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d,
332 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19,
333 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15
334};
335
336static const struct mV_pos __cpuinitdata mobilevrm_mV[32] = {
337 {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28},
338 {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24},
339 {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20},
340 {1150, 19}, {1100, 18}, {1050, 17}, {1000, 16},
341 {975, 15}, {950, 14}, {925, 13}, {900, 12},
342 {875, 11}, {850, 10}, {825, 9}, {800, 8},
343 {775, 7}, {750, 6}, {725, 5}, {700, 4},
344 {675, 3}, {650, 2}, {625, 1}, {600, 0}
345};
346
347static const unsigned char __cpuinitdata mV_mobilevrm[32] = {
348 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18,
349 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10,
350 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08,
351 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
352};
353
diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c
deleted file mode 100644
index fc09f142d94d..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/longrun.c
+++ /dev/null
@@ -1,327 +0,0 @@
1/*
2 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 *
6 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
7 */
8
9#include <linux/kernel.h>
10#include <linux/module.h>
11#include <linux/init.h>
12#include <linux/cpufreq.h>
13#include <linux/timex.h>
14
15#include <asm/msr.h>
16#include <asm/processor.h>
17
18#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
19 "longrun", msg)
20
21static struct cpufreq_driver longrun_driver;
22
23/**
24 * longrun_{low,high}_freq is needed for the conversion of cpufreq kHz
25 * values into per cent values. In TMTA microcode, the following is valid:
26 * performance_pctg = (current_freq - low_freq)/(high_freq - low_freq)
27 */
28static unsigned int longrun_low_freq, longrun_high_freq;
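A minimal standalone sketch of that percentage-to-kHz conversion, as longrun_get_policy() below applies it; the low/high frequencies and the MSR window percentages are invented example values:

#include <stdio.h>

/* Convert the percentage window values read from MSR_TMTA_LONGRUN_CTRL into
 * a kHz policy range, inverting the performance_pctg definition above. */
int main(void)
{
	unsigned int low_freq = 300000;		/* kHz, assumed LongRun minimum */
	unsigned int high_freq = 1000000;	/* kHz, assumed LongRun maximum */
	unsigned int msr_lo = 20, msr_hi = 80;	/* lower/upper window, percent */

	unsigned int min = low_freq + msr_lo * ((high_freq - low_freq) / 100);
	unsigned int max = low_freq + msr_hi * ((high_freq - low_freq) / 100);

	printf("policy window: %u - %u kHz\n", min, max);	/* 440000 - 860000 */
	return 0;
}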
29
30
31/**
32 * longrun_get_policy - get the current LongRun policy
33 * @policy: struct cpufreq_policy where current policy is written into
34 *
35 * Reads the current LongRun policy by access to MSR_TMTA_LONGRUN_FLAGS
36 * and MSR_TMTA_LONGRUN_CTRL
37 */
38static void __init longrun_get_policy(struct cpufreq_policy *policy)
39{
40 u32 msr_lo, msr_hi;
41
42 rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi);
43 dprintk("longrun flags are %x - %x\n", msr_lo, msr_hi);
44 if (msr_lo & 0x01)
45 policy->policy = CPUFREQ_POLICY_PERFORMANCE;
46 else
47 policy->policy = CPUFREQ_POLICY_POWERSAVE;
48
49 rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
50 dprintk("longrun ctrl is %x - %x\n", msr_lo, msr_hi);
51 msr_lo &= 0x0000007F;
52 msr_hi &= 0x0000007F;
53
54 if (longrun_high_freq <= longrun_low_freq) {
55 /* Assume degenerate Longrun table */
56 policy->min = policy->max = longrun_high_freq;
57 } else {
58 policy->min = longrun_low_freq + msr_lo *
59 ((longrun_high_freq - longrun_low_freq) / 100);
60 policy->max = longrun_low_freq + msr_hi *
61 ((longrun_high_freq - longrun_low_freq) / 100);
62 }
63 policy->cpu = 0;
64}
65
66
67/**
68 * longrun_set_policy - sets a new CPUFreq policy
69 * @policy: new policy
70 *
71 * Sets a new CPUFreq policy on LongRun-capable processors. This function
72 * has to be called with cpufreq_driver locked.
73 */
74static int longrun_set_policy(struct cpufreq_policy *policy)
75{
76 u32 msr_lo, msr_hi;
77 u32 pctg_lo, pctg_hi;
78
79 if (!policy)
80 return -EINVAL;
81
82 if (longrun_high_freq <= longrun_low_freq) {
83 /* Assume degenerate Longrun table */
84 pctg_lo = pctg_hi = 100;
85 } else {
86 pctg_lo = (policy->min - longrun_low_freq) /
87 ((longrun_high_freq - longrun_low_freq) / 100);
88 pctg_hi = (policy->max - longrun_low_freq) /
89 ((longrun_high_freq - longrun_low_freq) / 100);
90 }
91
92 if (pctg_hi > 100)
93 pctg_hi = 100;
94 if (pctg_lo > pctg_hi)
95 pctg_lo = pctg_hi;
96
97 /* performance or economy mode */
98 rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi);
99 msr_lo &= 0xFFFFFFFE;
100 switch (policy->policy) {
101 case CPUFREQ_POLICY_PERFORMANCE:
102 msr_lo |= 0x00000001;
103 break;
104 case CPUFREQ_POLICY_POWERSAVE:
105 break;
106 }
107 wrmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi);
108
109 /* lower and upper boundary */
110 rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
111 msr_lo &= 0xFFFFFF80;
112 msr_hi &= 0xFFFFFF80;
113 msr_lo |= pctg_lo;
114 msr_hi |= pctg_hi;
115 wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
116
117 return 0;
118}
119
120
121/**
122 * longrun_verify_policy - verifies a new CPUFreq policy
123 * @policy: the policy to verify
124 *
125 * Validates a new CPUFreq policy. This function has to be called with
126 * cpufreq_driver locked.
127 */
128static int longrun_verify_policy(struct cpufreq_policy *policy)
129{
130 if (!policy)
131 return -EINVAL;
132
133 policy->cpu = 0;
134 cpufreq_verify_within_limits(policy,
135 policy->cpuinfo.min_freq,
136 policy->cpuinfo.max_freq);
137
138 if ((policy->policy != CPUFREQ_POLICY_POWERSAVE) &&
139 (policy->policy != CPUFREQ_POLICY_PERFORMANCE))
140 return -EINVAL;
141
142 return 0;
143}
144
145static unsigned int longrun_get(unsigned int cpu)
146{
147 u32 eax, ebx, ecx, edx;
148
149 if (cpu)
150 return 0;
151
152 cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
153 dprintk("cpuid eax is %u\n", eax);
154
155 return eax * 1000;
156}
157
158/**
159 * longrun_determine_freqs - determines the lowest and highest possible core frequencies
160 * @low_freq: an int to put the lowest frequency into
161 * @high_freq: an int to put the highest frequency into
162 *
163 * Determines the lowest and highest possible core frequencies on this CPU.
164 * This is necessary to calculate the performance percentage according to
165 * TMTA rules:
166 * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq)
167 */
168static unsigned int __cpuinit longrun_determine_freqs(unsigned int *low_freq,
169 unsigned int *high_freq)
170{
171 u32 msr_lo, msr_hi;
172 u32 save_lo, save_hi;
173 u32 eax, ebx, ecx, edx;
174 u32 try_hi;
175 struct cpuinfo_x86 *c = &cpu_data(0);
176
177 if (!low_freq || !high_freq)
178 return -EINVAL;
179
180 if (cpu_has(c, X86_FEATURE_LRTI)) {
181 /* if the LongRun Table Interface is present, the
182 * detection is a bit easier:
183 * For minimum frequency, read out the maximum
184 * level (msr_hi), write that into "currently
185 * selected level", and read out the frequency.
186 * For maximum frequency, read out level zero.
187 */
188 /* minimum */
189 rdmsr(MSR_TMTA_LRTI_READOUT, msr_lo, msr_hi);
190 wrmsr(MSR_TMTA_LRTI_READOUT, msr_hi, msr_hi);
191 rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi);
192 *low_freq = msr_lo * 1000; /* to kHz */
193
194 /* maximum */
195 wrmsr(MSR_TMTA_LRTI_READOUT, 0, msr_hi);
196 rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi);
197 *high_freq = msr_lo * 1000; /* to kHz */
198
199 dprintk("longrun table interface told %u - %u kHz\n",
200 *low_freq, *high_freq);
201
202 if (*low_freq > *high_freq)
203 *low_freq = *high_freq;
204 return 0;
205 }
206
207 /* set the upper border to the value determined during TSC init */
208 *high_freq = (cpu_khz / 1000);
209 *high_freq = *high_freq * 1000;
210 dprintk("high frequency is %u kHz\n", *high_freq);
211
212 /* get current borders */
213 rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
214 save_lo = msr_lo & 0x0000007F;
215 save_hi = msr_hi & 0x0000007F;
216
217 /* if current perf_pctg is larger than 90%, we need to decrease the
218 * upper limit to make the calculation more accurate.
219 */
220 cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
221 /* try decreasing in 10% steps; some processors react only
222 * at certain threshold values */
223 for (try_hi = 80; try_hi > 0 && ecx > 90; try_hi -= 10) {
224 /* set to 0 to try_hi perf_pctg */
225 msr_lo &= 0xFFFFFF80;
226 msr_hi &= 0xFFFFFF80;
227 msr_hi |= try_hi;
228 wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
229
230 /* read out current core MHz and current perf_pctg */
231 cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
232
233 /* restore values */
234 wrmsr(MSR_TMTA_LONGRUN_CTRL, save_lo, save_hi);
235 }
236 dprintk("percentage is %u %%, freq is %u MHz\n", ecx, eax);
237
238 /* performance_pctg = (current_freq - low_freq)/(high_freq - low_freq)
239 * equals
240 * low_freq * (1 - perf_pctg) = (cur_freq - high_freq * perf_pctg)
241 *
242 * high_freq * perf_pctg is stored temporarily in "ebx".
243 */
244 ebx = (((cpu_khz / 1000) * ecx) / 100); /* to MHz */
245
246 if ((ecx > 95) || (ecx == 0) || (eax < ebx))
247 return -EIO;
248
249 edx = ((eax - ebx) * 100) / (100 - ecx);
250 *low_freq = edx * 1000; /* back to kHz */
251
252 dprintk("low frequency is %u kHz\n", *low_freq);
253
254 if (*low_freq > *high_freq)
255 *low_freq = *high_freq;
256
257 return 0;
258}
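The low-frequency recovery above is just the TMTA rule rearranged for low_freq; as a rough illustration, the same arithmetic in isolation might look like the following standalone sketch (the helper name is hypothetical, not part of the driver):

/* Rearranging perf_pctg = (cur - low)/(high - low) gives
 * low = (cur - high * perf_pctg) / (1 - perf_pctg); as in the driver,
 * "cur" is in MHz, perf_pctg in percent, and the result in kHz.
 */
static unsigned int longrun_low_freq_khz(unsigned int cur_mhz,
					 unsigned int pctg,
					 unsigned int high_khz)
{
	unsigned int high_part = ((high_khz / 1000) * pctg) / 100;

	if (pctg == 0 || pctg > 95 || cur_mhz < high_part)
		return 0;	/* same sanity checks as above */

	return (((cur_mhz - high_part) * 100) / (100 - pctg)) * 1000;
}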
259
260
261static int __cpuinit longrun_cpu_init(struct cpufreq_policy *policy)
262{
263 int result = 0;
264
265 /* capability check */
266 if (policy->cpu != 0)
267 return -ENODEV;
268
269 /* detect low and high frequency */
270 result = longrun_determine_freqs(&longrun_low_freq, &longrun_high_freq);
271 if (result)
272 return result;
273
274 /* cpuinfo and default policy values */
275 policy->cpuinfo.min_freq = longrun_low_freq;
276 policy->cpuinfo.max_freq = longrun_high_freq;
277 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
278 longrun_get_policy(policy);
279
280 return 0;
281}
282
283
284static struct cpufreq_driver longrun_driver = {
285 .flags = CPUFREQ_CONST_LOOPS,
286 .verify = longrun_verify_policy,
287 .setpolicy = longrun_set_policy,
288 .get = longrun_get,
289 .init = longrun_cpu_init,
290 .name = "longrun",
291 .owner = THIS_MODULE,
292};
293
294
295/**
296 * longrun_init - initializes the Transmeta Crusoe LongRun CPUFreq driver
297 *
298 * Initializes the LongRun support.
299 */
300static int __init longrun_init(void)
301{
302 struct cpuinfo_x86 *c = &cpu_data(0);
303
304 if (c->x86_vendor != X86_VENDOR_TRANSMETA ||
305 !cpu_has(c, X86_FEATURE_LONGRUN))
306 return -ENODEV;
307
308 return cpufreq_register_driver(&longrun_driver);
309}
310
311
312/**
313 * longrun_exit - unregisters LongRun support
314 */
315static void __exit longrun_exit(void)
316{
317 cpufreq_unregister_driver(&longrun_driver);
318}
319
320
321MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>");
322MODULE_DESCRIPTION("LongRun driver for Transmeta Crusoe and "
323 "Efficeon processors.");
324MODULE_LICENSE("GPL");
325
326module_init(longrun_init);
327module_exit(longrun_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.c b/arch/x86/kernel/cpu/cpufreq/mperf.c
deleted file mode 100644
index 911e193018ae..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/mperf.c
+++ /dev/null
@@ -1,51 +0,0 @@
1#include <linux/kernel.h>
2#include <linux/smp.h>
3#include <linux/module.h>
4#include <linux/init.h>
5#include <linux/cpufreq.h>
6#include <linux/slab.h>
7
8#include "mperf.h"
9
10static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
11
12/* Called via smp_call_function_single(), on the target CPU */
13static void read_measured_perf_ctrs(void *_cur)
14{
15 struct aperfmperf *am = _cur;
16
17 get_aperfmperf(am);
18}
19
20/*
21 * Return the measured active (C0) frequency on this CPU since last call
22 * to this function.
23 * Input: cpu number
24 * Return: Average CPU frequency in terms of max frequency (zero on error)
25 *
26 * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
27 * over a period of time, while CPU is in C0 state.
28 * IA32_MPERF counts at the rate of max advertised frequency
29 * IA32_APERF counts at the rate of actual CPU frequency
30 * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
31 * no meaning should be associated with absolute values of these MSRs.
32 */
33unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
34 unsigned int cpu)
35{
36 struct aperfmperf perf;
37 unsigned long ratio;
38 unsigned int retval;
39
40 if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
41 return 0;
42
43 ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
44 per_cpu(acfreq_old_perf, cpu) = perf;
45
46 retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
47
48 return retval;
49}
50EXPORT_SYMBOL_GPL(cpufreq_get_measured_perf);
51MODULE_LICENSE("GPL");
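For reference, the measurement performed by the file above reduces to scaling the advertised maximum frequency by the APERF/MPERF delta ratio; a minimal standalone sketch follows (the snapshot struct and helper name are assumptions for illustration, not the kernel's aperfmperf helpers):

struct perf_snap { unsigned long long aperf, mperf; };

/* Average C0 frequency between two snapshots:
 * freq = max_freq * d(APERF) / d(MPERF).
 */
static unsigned int estimate_khz(const struct perf_snap *old,
				 const struct perf_snap *cur,
				 unsigned int max_freq_khz)
{
	unsigned long long da = cur->aperf - old->aperf;
	unsigned long long dm = cur->mperf - old->mperf;

	if (!dm)
		return 0;	/* no C0 time elapsed, nothing to report */

	return (unsigned int)((max_freq_khz * da) / dm);
}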
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.h b/arch/x86/kernel/cpu/cpufreq/mperf.h
deleted file mode 100644
index 5dbf2950dc22..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/mperf.h
+++ /dev/null
@@ -1,9 +0,0 @@
1/*
2 * (c) 2010 Advanced Micro Devices, Inc.
3 * Your use of this code is subject to the terms and conditions of the
4 * GNU general public license version 2. See "COPYING" or
5 * http://www.gnu.org/licenses/gpl.html
6 */
7
8unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
9 unsigned int cpu);
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
deleted file mode 100644
index bd1cac747f67..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ /dev/null
@@ -1,331 +0,0 @@
1/*
2 * Pentium 4/Xeon CPU on demand clock modulation/speed scaling
3 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
4 * (C) 2002 Zwane Mwaikambo <zwane@commfireservices.com>
5 * (C) 2002 Arjan van de Ven <arjanv@redhat.com>
6 * (C) 2002 Tora T. Engstad
7 * All Rights Reserved
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 *
14 * The author(s) of this software shall not be held liable for damages
15 * of any nature resulting due to the use of this software. This
16 * software is provided AS-IS with no warranties.
17 *
18 * Date Errata Description
19 * 20020525 N44, O17 12.5% or 25% DC causes lockup
20 *
21 */
22
23#include <linux/kernel.h>
24#include <linux/module.h>
25#include <linux/init.h>
26#include <linux/smp.h>
27#include <linux/cpufreq.h>
28#include <linux/cpumask.h>
29#include <linux/timex.h>
30
31#include <asm/processor.h>
32#include <asm/msr.h>
33#include <asm/timer.h>
34
35#include "speedstep-lib.h"
36
37#define PFX "p4-clockmod: "
38#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
39 "p4-clockmod", msg)
40
41/*
 42 * Duty Cycle (3 bits). Note that DC_DISABLE is not specified in the
 43 * Intel docs; it is used here simply to mean "modulation disabled"
44 */
45enum {
46 DC_RESV, DC_DFLT, DC_25PT, DC_38PT, DC_50PT,
47 DC_64PT, DC_75PT, DC_88PT, DC_DISABLE
48};
49
50#define DC_ENTRIES 8
51
52
53static int has_N44_O17_errata[NR_CPUS];
54static unsigned int stock_freq;
55static struct cpufreq_driver p4clockmod_driver;
56static unsigned int cpufreq_p4_get(unsigned int cpu);
57
58static int cpufreq_p4_setdc(unsigned int cpu, unsigned int newstate)
59{
60 u32 l, h;
61
62 if (!cpu_online(cpu) ||
63 (newstate > DC_DISABLE) || (newstate == DC_RESV))
64 return -EINVAL;
65
66 rdmsr_on_cpu(cpu, MSR_IA32_THERM_STATUS, &l, &h);
67
68 if (l & 0x01)
69 dprintk("CPU#%d currently thermal throttled\n", cpu);
70
71 if (has_N44_O17_errata[cpu] &&
72 (newstate == DC_25PT || newstate == DC_DFLT))
73 newstate = DC_38PT;
74
75 rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h);
76 if (newstate == DC_DISABLE) {
77 dprintk("CPU#%d disabling modulation\n", cpu);
78 wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l & ~(1<<4), h);
79 } else {
80 dprintk("CPU#%d setting duty cycle to %d%%\n",
81 cpu, ((125 * newstate) / 10));
82 /* bits 63 - 5 : reserved
83 * bit 4 : enable/disable
84 * bits 3-1 : duty cycle
85 * bit 0 : reserved
86 */
87 l = (l & ~14);
88 l = l | (1<<4) | ((newstate & 0x7)<<1);
89 wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l, h);
90 }
91
92 return 0;
93}
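The MSR update in cpufreq_p4_setdc() boils down to a small bit-field encode; a rough sketch of just that step is shown below (DC_DISABLE is 8 in the enum above; the helper name is illustrative only):

/* IA32_THERM_CONTROL: bit 4 enables on-demand modulation,
 * bits 3:1 hold the duty-cycle code.
 */
static unsigned int p4_encode_dc(unsigned int old, unsigned int state)
{
	if (state == 8)				/* DC_DISABLE */
		return old & ~(1u << 4);	/* clear the enable bit only */

	old &= ~0xEu;				/* clear bits 3:1 */
	return old | (1u << 4) | ((state & 0x7) << 1);
}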
94
95
96static struct cpufreq_frequency_table p4clockmod_table[] = {
97 {DC_RESV, CPUFREQ_ENTRY_INVALID},
98 {DC_DFLT, 0},
99 {DC_25PT, 0},
100 {DC_38PT, 0},
101 {DC_50PT, 0},
102 {DC_64PT, 0},
103 {DC_75PT, 0},
104 {DC_88PT, 0},
105 {DC_DISABLE, 0},
106 {DC_RESV, CPUFREQ_TABLE_END},
107};
108
109
110static int cpufreq_p4_target(struct cpufreq_policy *policy,
111 unsigned int target_freq,
112 unsigned int relation)
113{
114 unsigned int newstate = DC_RESV;
115 struct cpufreq_freqs freqs;
116 int i;
117
118 if (cpufreq_frequency_table_target(policy, &p4clockmod_table[0],
119 target_freq, relation, &newstate))
120 return -EINVAL;
121
122 freqs.old = cpufreq_p4_get(policy->cpu);
123 freqs.new = stock_freq * p4clockmod_table[newstate].index / 8;
124
125 if (freqs.new == freqs.old)
126 return 0;
127
128 /* notifiers */
129 for_each_cpu(i, policy->cpus) {
130 freqs.cpu = i;
131 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
132 }
133
134 /* run on each logical CPU,
135 * see section 13.15.3 of IA32 Intel Architecture Software
136 * Developer's Manual, Volume 3
137 */
138 for_each_cpu(i, policy->cpus)
139 cpufreq_p4_setdc(i, p4clockmod_table[newstate].index);
140
141 /* notifiers */
142 for_each_cpu(i, policy->cpus) {
143 freqs.cpu = i;
144 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
145 }
146
147 return 0;
148}
149
150
151static int cpufreq_p4_verify(struct cpufreq_policy *policy)
152{
153 return cpufreq_frequency_table_verify(policy, &p4clockmod_table[0]);
154}
155
156
157static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
158{
159 if (c->x86 == 0x06) {
160 if (cpu_has(c, X86_FEATURE_EST))
161 printk(KERN_WARNING PFX "Warning: EST-capable CPU "
162 "detected. The acpi-cpufreq module offers "
 163					"voltage scaling in addition to frequency "
164 "scaling. You should use that instead of "
165 "p4-clockmod, if possible.\n");
166 switch (c->x86_model) {
167 case 0x0E: /* Core */
168 case 0x0F: /* Core Duo */
169 case 0x16: /* Celeron Core */
170 case 0x1C: /* Atom */
171 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
172 return speedstep_get_frequency(SPEEDSTEP_CPU_PCORE);
173 case 0x0D: /* Pentium M (Dothan) */
174 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
175 /* fall through */
176 case 0x09: /* Pentium M (Banias) */
177 return speedstep_get_frequency(SPEEDSTEP_CPU_PM);
178 }
179 }
180
181 if (c->x86 != 0xF)
182 return 0;
183
 184	/* on P-4s, the TSC runs at a constant frequency regardless of
 185	 * whether throttling is active or not. */
186 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
187
188 if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4M) {
189 printk(KERN_WARNING PFX "Warning: Pentium 4-M detected. "
190 "The speedstep-ich or acpi cpufreq modules offer "
 191			"voltage scaling in addition to frequency scaling. "
192 "You should use either one instead of p4-clockmod, "
193 "if possible.\n");
194 return speedstep_get_frequency(SPEEDSTEP_CPU_P4M);
195 }
196
197 return speedstep_get_frequency(SPEEDSTEP_CPU_P4D);
198}
199
200
201
202static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy)
203{
204 struct cpuinfo_x86 *c = &cpu_data(policy->cpu);
205 int cpuid = 0;
206 unsigned int i;
207
208#ifdef CONFIG_SMP
209 cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu));
210#endif
211
212 /* Errata workaround */
213 cpuid = (c->x86 << 8) | (c->x86_model << 4) | c->x86_mask;
214 switch (cpuid) {
215 case 0x0f07:
216 case 0x0f0a:
217 case 0x0f11:
218 case 0x0f12:
219 has_N44_O17_errata[policy->cpu] = 1;
220 dprintk("has errata -- disabling low frequencies\n");
221 }
222
223 if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4D &&
224 c->x86_model < 2) {
225 /* switch to maximum frequency and measure result */
226 cpufreq_p4_setdc(policy->cpu, DC_DISABLE);
227 recalibrate_cpu_khz();
228 }
229 /* get max frequency */
230 stock_freq = cpufreq_p4_get_frequency(c);
231 if (!stock_freq)
232 return -EINVAL;
233
234 /* table init */
235 for (i = 1; (p4clockmod_table[i].frequency != CPUFREQ_TABLE_END); i++) {
236 if ((i < 2) && (has_N44_O17_errata[policy->cpu]))
237 p4clockmod_table[i].frequency = CPUFREQ_ENTRY_INVALID;
238 else
239 p4clockmod_table[i].frequency = (stock_freq * i)/8;
240 }
241 cpufreq_frequency_table_get_attr(p4clockmod_table, policy->cpu);
242
243 /* cpuinfo and default policy values */
244
 245	/* the transition latency is set 1 ns above the maximum transition
 246	 * latency that the ondemand governor will accept */
247 policy->cpuinfo.transition_latency = 10000001;
248 policy->cur = stock_freq;
249
250 return cpufreq_frequency_table_cpuinfo(policy, &p4clockmod_table[0]);
251}
252
253
254static int cpufreq_p4_cpu_exit(struct cpufreq_policy *policy)
255{
256 cpufreq_frequency_table_put_attr(policy->cpu);
257 return 0;
258}
259
260static unsigned int cpufreq_p4_get(unsigned int cpu)
261{
262 u32 l, h;
263
264 rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h);
265
266 if (l & 0x10) {
267 l = l >> 1;
268 l &= 0x7;
269 } else
270 l = DC_DISABLE;
271
272 if (l != DC_DISABLE)
273 return stock_freq * l / 8;
274
275 return stock_freq;
276}
277
278static struct freq_attr *p4clockmod_attr[] = {
279 &cpufreq_freq_attr_scaling_available_freqs,
280 NULL,
281};
282
283static struct cpufreq_driver p4clockmod_driver = {
284 .verify = cpufreq_p4_verify,
285 .target = cpufreq_p4_target,
286 .init = cpufreq_p4_cpu_init,
287 .exit = cpufreq_p4_cpu_exit,
288 .get = cpufreq_p4_get,
289 .name = "p4-clockmod",
290 .owner = THIS_MODULE,
291 .attr = p4clockmod_attr,
292};
293
294
295static int __init cpufreq_p4_init(void)
296{
297 struct cpuinfo_x86 *c = &cpu_data(0);
298 int ret;
299
300 /*
301 * THERM_CONTROL is architectural for IA32 now, so
302 * we can rely on the capability checks
303 */
304 if (c->x86_vendor != X86_VENDOR_INTEL)
305 return -ENODEV;
306
307 if (!test_cpu_cap(c, X86_FEATURE_ACPI) ||
308 !test_cpu_cap(c, X86_FEATURE_ACC))
309 return -ENODEV;
310
311 ret = cpufreq_register_driver(&p4clockmod_driver);
312 if (!ret)
313 printk(KERN_INFO PFX "P4/Xeon(TM) CPU On-Demand Clock "
314 "Modulation available\n");
315
316 return ret;
317}
318
319
320static void __exit cpufreq_p4_exit(void)
321{
322 cpufreq_unregister_driver(&p4clockmod_driver);
323}
324
325
326MODULE_AUTHOR("Zwane Mwaikambo <zwane@commfireservices.com>");
327MODULE_DESCRIPTION("cpufreq driver for Pentium(TM) 4/Xeon(TM)");
328MODULE_LICENSE("GPL");
329
330late_initcall(cpufreq_p4_init);
331module_exit(cpufreq_p4_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
deleted file mode 100644
index 4f6f679f2799..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
+++ /dev/null
@@ -1,626 +0,0 @@
1/*
2 * pcc-cpufreq.c - Processor Clocking Control firmware cpufreq interface
3 *
4 * Copyright (C) 2009 Red Hat, Matthew Garrett <mjg@redhat.com>
5 * Copyright (C) 2009 Hewlett-Packard Development Company, L.P.
6 * Nagananda Chumbalkar <nagananda.chumbalkar@hp.com>
7 *
8 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; version 2 of the License.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or NON
17 * INFRINGEMENT. See the GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write to the Free Software Foundation, Inc.,
21 * 675 Mass Ave, Cambridge, MA 02139, USA.
22 *
23 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
24 */
25
26#include <linux/kernel.h>
27#include <linux/module.h>
28#include <linux/init.h>
29#include <linux/smp.h>
30#include <linux/sched.h>
31#include <linux/cpufreq.h>
32#include <linux/compiler.h>
33#include <linux/slab.h>
34
35#include <linux/acpi.h>
36#include <linux/io.h>
37#include <linux/spinlock.h>
38#include <linux/uaccess.h>
39
40#include <acpi/processor.h>
41
42#define PCC_VERSION "1.00.00"
43#define POLL_LOOPS 300
44
45#define CMD_COMPLETE 0x1
46#define CMD_GET_FREQ 0x0
47#define CMD_SET_FREQ 0x1
48
49#define BUF_SZ 4
50
51#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
52 "pcc-cpufreq", msg)
53
54struct pcc_register_resource {
55 u8 descriptor;
56 u16 length;
57 u8 space_id;
58 u8 bit_width;
59 u8 bit_offset;
60 u8 access_size;
61 u64 address;
62} __attribute__ ((packed));
63
64struct pcc_memory_resource {
65 u8 descriptor;
66 u16 length;
67 u8 space_id;
68 u8 resource_usage;
69 u8 type_specific;
70 u64 granularity;
71 u64 minimum;
72 u64 maximum;
73 u64 translation_offset;
74 u64 address_length;
75} __attribute__ ((packed));
76
77static struct cpufreq_driver pcc_cpufreq_driver;
78
79struct pcc_header {
80 u32 signature;
81 u16 length;
82 u8 major;
83 u8 minor;
84 u32 features;
85 u16 command;
86 u16 status;
87 u32 latency;
88 u32 minimum_time;
89 u32 maximum_time;
90 u32 nominal;
91 u32 throttled_frequency;
92 u32 minimum_frequency;
93};
94
95static void __iomem *pcch_virt_addr;
96static struct pcc_header __iomem *pcch_hdr;
97
98static DEFINE_SPINLOCK(pcc_lock);
99
100static struct acpi_generic_address doorbell;
101
102static u64 doorbell_preserve;
103static u64 doorbell_write;
104
105static u8 OSC_UUID[16] = {0x63, 0x9B, 0x2C, 0x9F, 0x70, 0x91, 0x49, 0x1f,
106 0xBB, 0x4F, 0xA5, 0x98, 0x2F, 0xA1, 0xB5, 0x46};
107
108struct pcc_cpu {
109 u32 input_offset;
110 u32 output_offset;
111};
112
113static struct pcc_cpu __percpu *pcc_cpu_info;
114
115static int pcc_cpufreq_verify(struct cpufreq_policy *policy)
116{
117 cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq,
118 policy->cpuinfo.max_freq);
119 return 0;
120}
121
122static inline void pcc_cmd(void)
123{
124 u64 doorbell_value;
125 int i;
126
127 acpi_read(&doorbell_value, &doorbell);
128 acpi_write((doorbell_value & doorbell_preserve) | doorbell_write,
129 &doorbell);
130
131 for (i = 0; i < POLL_LOOPS; i++) {
132 if (ioread16(&pcch_hdr->status) & CMD_COMPLETE)
133 break;
134 }
135}
136
137static inline void pcc_clear_mapping(void)
138{
139 if (pcch_virt_addr)
140 iounmap(pcch_virt_addr);
141 pcch_virt_addr = NULL;
142}
143
144static unsigned int pcc_get_freq(unsigned int cpu)
145{
146 struct pcc_cpu *pcc_cpu_data;
147 unsigned int curr_freq;
148 unsigned int freq_limit;
149 u16 status;
150 u32 input_buffer;
151 u32 output_buffer;
152
153 spin_lock(&pcc_lock);
154
155 dprintk("get: get_freq for CPU %d\n", cpu);
156 pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
157
158 input_buffer = 0x1;
159 iowrite32(input_buffer,
160 (pcch_virt_addr + pcc_cpu_data->input_offset));
161 iowrite16(CMD_GET_FREQ, &pcch_hdr->command);
162
163 pcc_cmd();
164
165 output_buffer =
166 ioread32(pcch_virt_addr + pcc_cpu_data->output_offset);
167
168 /* Clear the input buffer - we are done with the current command */
169 memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
170
171 status = ioread16(&pcch_hdr->status);
172 if (status != CMD_COMPLETE) {
173 dprintk("get: FAILED: for CPU %d, status is %d\n",
174 cpu, status);
175 goto cmd_incomplete;
176 }
177 iowrite16(0, &pcch_hdr->status);
178 curr_freq = (((ioread32(&pcch_hdr->nominal) * (output_buffer & 0xff))
179 / 100) * 1000);
180
181 dprintk("get: SUCCESS: (virtual) output_offset for cpu %d is "
 182		"0x%x, contains a value of: 0x%x. Speed is: %d kHz\n",
183 cpu, (pcch_virt_addr + pcc_cpu_data->output_offset),
184 output_buffer, curr_freq);
185
186 freq_limit = (output_buffer >> 8) & 0xff;
187 if (freq_limit != 0xff) {
188 dprintk("get: frequency for cpu %d is being temporarily"
189 " capped at %d\n", cpu, curr_freq);
190 }
191
192 spin_unlock(&pcc_lock);
193 return curr_freq;
194
195cmd_incomplete:
196 iowrite16(0, &pcch_hdr->status);
197 spin_unlock(&pcc_lock);
198 return -EINVAL;
199}
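The frequency decode in pcc_get_freq() is worth spelling out: the low byte of the output buffer is the current speed as a percentage of the nominal MHz value, and the result is reported in kHz. A standalone restatement (illustrative helper only):

/* e.g. nominal = 2000 MHz, low byte = 50  ->  (2000 * 50 / 100) * 1000
 * = 1000000 kHz.
 */
static unsigned int pcc_decode_freq_khz(unsigned int nominal_mhz,
					unsigned int output_buffer)
{
	return ((nominal_mhz * (output_buffer & 0xff)) / 100) * 1000;
}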
200
201static int pcc_cpufreq_target(struct cpufreq_policy *policy,
202 unsigned int target_freq,
203 unsigned int relation)
204{
205 struct pcc_cpu *pcc_cpu_data;
206 struct cpufreq_freqs freqs;
207 u16 status;
208 u32 input_buffer;
209 int cpu;
210
211 spin_lock(&pcc_lock);
212 cpu = policy->cpu;
213 pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
214
215 dprintk("target: CPU %d should go to target freq: %d "
216 "(virtual) input_offset is 0x%x\n",
217 cpu, target_freq,
218 (pcch_virt_addr + pcc_cpu_data->input_offset));
219
220 freqs.new = target_freq;
221 freqs.cpu = cpu;
222 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
223
224 input_buffer = 0x1 | (((target_freq * 100)
225 / (ioread32(&pcch_hdr->nominal) * 1000)) << 8);
226 iowrite32(input_buffer,
227 (pcch_virt_addr + pcc_cpu_data->input_offset));
228 iowrite16(CMD_SET_FREQ, &pcch_hdr->command);
229
230 pcc_cmd();
231
232 /* Clear the input buffer - we are done with the current command */
233 memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
234
235 status = ioread16(&pcch_hdr->status);
236 if (status != CMD_COMPLETE) {
237 dprintk("target: FAILED for cpu %d, with status: 0x%x\n",
238 cpu, status);
239 goto cmd_incomplete;
240 }
241 iowrite16(0, &pcch_hdr->status);
242
243 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
244 dprintk("target: was SUCCESSFUL for cpu %d\n", cpu);
245 spin_unlock(&pcc_lock);
246
247 return 0;
248
249cmd_incomplete:
250 iowrite16(0, &pcch_hdr->status);
251 spin_unlock(&pcc_lock);
252 return -EINVAL;
253}
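pcc_cpufreq_target() uses the inverse encoding: the requested kHz value is converted back to a percentage of nominal and placed in bits 15:8 of the input buffer, with bit 0 set as the command flag. A standalone restatement (illustrative helper only):

static unsigned int pcc_encode_target(unsigned int target_khz,
				      unsigned int nominal_mhz)
{
	return 0x1 | (((target_khz * 100) / (nominal_mhz * 1000)) << 8);
}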
254
255static int pcc_get_offset(int cpu)
256{
257 acpi_status status;
258 struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL};
259 union acpi_object *pccp, *offset;
260 struct pcc_cpu *pcc_cpu_data;
261 struct acpi_processor *pr;
262 int ret = 0;
263
264 pr = per_cpu(processors, cpu);
265 pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
266
267 status = acpi_evaluate_object(pr->handle, "PCCP", NULL, &buffer);
268 if (ACPI_FAILURE(status))
269 return -ENODEV;
270
271 pccp = buffer.pointer;
272 if (!pccp || pccp->type != ACPI_TYPE_PACKAGE) {
273 ret = -ENODEV;
274 goto out_free;
275 };
276
277 offset = &(pccp->package.elements[0]);
278 if (!offset || offset->type != ACPI_TYPE_INTEGER) {
279 ret = -ENODEV;
280 goto out_free;
281 }
282
283 pcc_cpu_data->input_offset = offset->integer.value;
284
285 offset = &(pccp->package.elements[1]);
286 if (!offset || offset->type != ACPI_TYPE_INTEGER) {
287 ret = -ENODEV;
288 goto out_free;
289 }
290
291 pcc_cpu_data->output_offset = offset->integer.value;
292
293 memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
294 memset_io((pcch_virt_addr + pcc_cpu_data->output_offset), 0, BUF_SZ);
295
296 dprintk("pcc_get_offset: for CPU %d: pcc_cpu_data "
297 "input_offset: 0x%x, pcc_cpu_data output_offset: 0x%x\n",
298 cpu, pcc_cpu_data->input_offset, pcc_cpu_data->output_offset);
299out_free:
300 kfree(buffer.pointer);
301 return ret;
302}
303
304static int __init pcc_cpufreq_do_osc(acpi_handle *handle)
305{
306 acpi_status status;
307 struct acpi_object_list input;
308 struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
309 union acpi_object in_params[4];
310 union acpi_object *out_obj;
311 u32 capabilities[2];
312 u32 errors;
313 u32 supported;
314 int ret = 0;
315
316 input.count = 4;
317 input.pointer = in_params;
318 input.count = 4;
319 input.pointer = in_params;
320 in_params[0].type = ACPI_TYPE_BUFFER;
321 in_params[0].buffer.length = 16;
322 in_params[0].buffer.pointer = OSC_UUID;
323 in_params[1].type = ACPI_TYPE_INTEGER;
324 in_params[1].integer.value = 1;
325 in_params[2].type = ACPI_TYPE_INTEGER;
326 in_params[2].integer.value = 2;
327 in_params[3].type = ACPI_TYPE_BUFFER;
328 in_params[3].buffer.length = 8;
329 in_params[3].buffer.pointer = (u8 *)&capabilities;
330
331 capabilities[0] = OSC_QUERY_ENABLE;
332 capabilities[1] = 0x1;
333
334 status = acpi_evaluate_object(*handle, "_OSC", &input, &output);
335 if (ACPI_FAILURE(status))
336 return -ENODEV;
337
338 if (!output.length)
339 return -ENODEV;
340
341 out_obj = output.pointer;
342 if (out_obj->type != ACPI_TYPE_BUFFER) {
343 ret = -ENODEV;
344 goto out_free;
345 }
346
347 errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
348 if (errors) {
349 ret = -ENODEV;
350 goto out_free;
351 }
352
353 supported = *((u32 *)(out_obj->buffer.pointer + 4));
354 if (!(supported & 0x1)) {
355 ret = -ENODEV;
356 goto out_free;
357 }
358
359 kfree(output.pointer);
360 capabilities[0] = 0x0;
361 capabilities[1] = 0x1;
362
363 status = acpi_evaluate_object(*handle, "_OSC", &input, &output);
364 if (ACPI_FAILURE(status))
365 return -ENODEV;
366
367 if (!output.length)
368 return -ENODEV;
369
370 out_obj = output.pointer;
371 if (out_obj->type != ACPI_TYPE_BUFFER) {
372 ret = -ENODEV;
373 goto out_free;
374 }
375
376 errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
377 if (errors) {
378 ret = -ENODEV;
379 goto out_free;
380 }
381
382 supported = *((u32 *)(out_obj->buffer.pointer + 4));
383 if (!(supported & 0x1)) {
384 ret = -ENODEV;
385 goto out_free;
386 }
387
388out_free:
389 kfree(output.pointer);
390 return ret;
391}
392
393static int __init pcc_cpufreq_probe(void)
394{
395 acpi_status status;
396 struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
397 struct pcc_memory_resource *mem_resource;
398 struct pcc_register_resource *reg_resource;
399 union acpi_object *out_obj, *member;
400 acpi_handle handle, osc_handle, pcch_handle;
401 int ret = 0;
402
403 status = acpi_get_handle(NULL, "\\_SB", &handle);
404 if (ACPI_FAILURE(status))
405 return -ENODEV;
406
407 status = acpi_get_handle(handle, "PCCH", &pcch_handle);
408 if (ACPI_FAILURE(status))
409 return -ENODEV;
410
411 status = acpi_get_handle(handle, "_OSC", &osc_handle);
412 if (ACPI_SUCCESS(status)) {
413 ret = pcc_cpufreq_do_osc(&osc_handle);
414 if (ret)
415 dprintk("probe: _OSC evaluation did not succeed\n");
416 /* Firmware's use of _OSC is optional */
417 ret = 0;
418 }
419
420 status = acpi_evaluate_object(handle, "PCCH", NULL, &output);
421 if (ACPI_FAILURE(status))
422 return -ENODEV;
423
424 out_obj = output.pointer;
425 if (out_obj->type != ACPI_TYPE_PACKAGE) {
426 ret = -ENODEV;
427 goto out_free;
428 }
429
430 member = &out_obj->package.elements[0];
431 if (member->type != ACPI_TYPE_BUFFER) {
432 ret = -ENODEV;
433 goto out_free;
434 }
435
436 mem_resource = (struct pcc_memory_resource *)member->buffer.pointer;
437
438 dprintk("probe: mem_resource descriptor: 0x%x,"
439 " length: %d, space_id: %d, resource_usage: %d,"
440 " type_specific: %d, granularity: 0x%llx,"
441 " minimum: 0x%llx, maximum: 0x%llx,"
442 " translation_offset: 0x%llx, address_length: 0x%llx\n",
443 mem_resource->descriptor, mem_resource->length,
444 mem_resource->space_id, mem_resource->resource_usage,
445 mem_resource->type_specific, mem_resource->granularity,
446 mem_resource->minimum, mem_resource->maximum,
447 mem_resource->translation_offset,
448 mem_resource->address_length);
449
450 if (mem_resource->space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) {
451 ret = -ENODEV;
452 goto out_free;
453 }
454
455 pcch_virt_addr = ioremap_nocache(mem_resource->minimum,
456 mem_resource->address_length);
457 if (pcch_virt_addr == NULL) {
458 dprintk("probe: could not map shared mem region\n");
459 goto out_free;
460 }
461 pcch_hdr = pcch_virt_addr;
462
463 dprintk("probe: PCCH header (virtual) addr: 0x%p\n", pcch_hdr);
464 dprintk("probe: PCCH header is at physical address: 0x%llx,"
465 " signature: 0x%x, length: %d bytes, major: %d, minor: %d,"
466 " supported features: 0x%x, command field: 0x%x,"
467 " status field: 0x%x, nominal latency: %d us\n",
468 mem_resource->minimum, ioread32(&pcch_hdr->signature),
469 ioread16(&pcch_hdr->length), ioread8(&pcch_hdr->major),
470 ioread8(&pcch_hdr->minor), ioread32(&pcch_hdr->features),
471 ioread16(&pcch_hdr->command), ioread16(&pcch_hdr->status),
472 ioread32(&pcch_hdr->latency));
473
474 dprintk("probe: min time between commands: %d us,"
475 " max time between commands: %d us,"
476 " nominal CPU frequency: %d MHz,"
477 " minimum CPU frequency: %d MHz,"
478 " minimum CPU frequency without throttling: %d MHz\n",
479 ioread32(&pcch_hdr->minimum_time),
480 ioread32(&pcch_hdr->maximum_time),
481 ioread32(&pcch_hdr->nominal),
482 ioread32(&pcch_hdr->throttled_frequency),
483 ioread32(&pcch_hdr->minimum_frequency));
484
485 member = &out_obj->package.elements[1];
486 if (member->type != ACPI_TYPE_BUFFER) {
487 ret = -ENODEV;
488 goto pcch_free;
489 }
490
491 reg_resource = (struct pcc_register_resource *)member->buffer.pointer;
492
493 doorbell.space_id = reg_resource->space_id;
494 doorbell.bit_width = reg_resource->bit_width;
495 doorbell.bit_offset = reg_resource->bit_offset;
496 doorbell.access_width = 64;
497 doorbell.address = reg_resource->address;
498
499 dprintk("probe: doorbell: space_id is %d, bit_width is %d, "
500 "bit_offset is %d, access_width is %d, address is 0x%llx\n",
501 doorbell.space_id, doorbell.bit_width, doorbell.bit_offset,
502 doorbell.access_width, reg_resource->address);
503
504 member = &out_obj->package.elements[2];
505 if (member->type != ACPI_TYPE_INTEGER) {
506 ret = -ENODEV;
507 goto pcch_free;
508 }
509
510 doorbell_preserve = member->integer.value;
511
512 member = &out_obj->package.elements[3];
513 if (member->type != ACPI_TYPE_INTEGER) {
514 ret = -ENODEV;
515 goto pcch_free;
516 }
517
518 doorbell_write = member->integer.value;
519
520 dprintk("probe: doorbell_preserve: 0x%llx,"
521 " doorbell_write: 0x%llx\n",
522 doorbell_preserve, doorbell_write);
523
524 pcc_cpu_info = alloc_percpu(struct pcc_cpu);
525 if (!pcc_cpu_info) {
526 ret = -ENOMEM;
527 goto pcch_free;
528 }
529
530 printk(KERN_DEBUG "pcc-cpufreq: (v%s) driver loaded with frequency"
531 " limits: %d MHz, %d MHz\n", PCC_VERSION,
532 ioread32(&pcch_hdr->minimum_frequency),
533 ioread32(&pcch_hdr->nominal));
534 kfree(output.pointer);
535 return ret;
536pcch_free:
537 pcc_clear_mapping();
538out_free:
539 kfree(output.pointer);
540 return ret;
541}
542
543static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy)
544{
545 unsigned int cpu = policy->cpu;
546 unsigned int result = 0;
547
548 if (!pcch_virt_addr) {
549 result = -1;
550 goto out;
551 }
552
553 result = pcc_get_offset(cpu);
554 if (result) {
555 dprintk("init: PCCP evaluation failed\n");
556 goto out;
557 }
558
559 policy->max = policy->cpuinfo.max_freq =
560 ioread32(&pcch_hdr->nominal) * 1000;
561 policy->min = policy->cpuinfo.min_freq =
562 ioread32(&pcch_hdr->minimum_frequency) * 1000;
563 policy->cur = pcc_get_freq(cpu);
564
565 if (!policy->cur) {
566 dprintk("init: Unable to get current CPU frequency\n");
567 result = -EINVAL;
568 goto out;
569 }
570
571 dprintk("init: policy->max is %d, policy->min is %d\n",
572 policy->max, policy->min);
573out:
574 return result;
575}
576
577static int pcc_cpufreq_cpu_exit(struct cpufreq_policy *policy)
578{
579 return 0;
580}
581
582static struct cpufreq_driver pcc_cpufreq_driver = {
583 .flags = CPUFREQ_CONST_LOOPS,
584 .get = pcc_get_freq,
585 .verify = pcc_cpufreq_verify,
586 .target = pcc_cpufreq_target,
587 .init = pcc_cpufreq_cpu_init,
588 .exit = pcc_cpufreq_cpu_exit,
589 .name = "pcc-cpufreq",
590 .owner = THIS_MODULE,
591};
592
593static int __init pcc_cpufreq_init(void)
594{
595 int ret;
596
597 if (acpi_disabled)
598 return 0;
599
600 ret = pcc_cpufreq_probe();
601 if (ret) {
602 dprintk("pcc_cpufreq_init: PCCH evaluation failed\n");
603 return ret;
604 }
605
606 ret = cpufreq_register_driver(&pcc_cpufreq_driver);
607
608 return ret;
609}
610
611static void __exit pcc_cpufreq_exit(void)
612{
613 cpufreq_unregister_driver(&pcc_cpufreq_driver);
614
615 pcc_clear_mapping();
616
617 free_percpu(pcc_cpu_info);
618}
619
620MODULE_AUTHOR("Matthew Garrett, Naga Chumbalkar");
621MODULE_VERSION(PCC_VERSION);
622MODULE_DESCRIPTION("Processor Clocking Control interface driver");
623MODULE_LICENSE("GPL");
624
625late_initcall(pcc_cpufreq_init);
626module_exit(pcc_cpufreq_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
deleted file mode 100644
index b3379d6a5c57..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
+++ /dev/null
@@ -1,261 +0,0 @@
1/*
2 * This file was based upon code in Powertweak Linux (http://powertweak.sf.net)
3 * (C) 2000-2003 Dave Jones, Arjan van de Ven, Janne Pänkälä,
4 * Dominik Brodowski.
5 *
6 * Licensed under the terms of the GNU GPL License version 2.
7 *
8 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
9 */
10
11#include <linux/kernel.h>
12#include <linux/module.h>
13#include <linux/init.h>
14#include <linux/cpufreq.h>
15#include <linux/ioport.h>
16#include <linux/timex.h>
17#include <linux/io.h>
18
19#include <asm/msr.h>
20
21#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long
22 as it is unused */
23
24#define PFX "powernow-k6: "
25static unsigned int busfreq; /* FSB, in 10 kHz */
26static unsigned int max_multiplier;
27
28
29/* Clock ratio multiplied by 10 - see table 27 in AMD#23446 */
30static struct cpufreq_frequency_table clock_ratio[] = {
31 {45, /* 000 -> 4.5x */ 0},
32 {50, /* 001 -> 5.0x */ 0},
33 {40, /* 010 -> 4.0x */ 0},
34 {55, /* 011 -> 5.5x */ 0},
35 {20, /* 100 -> 2.0x */ 0},
36 {30, /* 101 -> 3.0x */ 0},
37 {60, /* 110 -> 6.0x */ 0},
38 {35, /* 111 -> 3.5x */ 0},
39 {0, CPUFREQ_TABLE_END}
40};
41
42
43/**
44 * powernow_k6_get_cpu_multiplier - returns the current FSB multiplier
45 *
 46 * Returns the current setting of the frequency multiplier. The core
 47 * clock speed is the Front-Side Bus frequency multiplied by this value.
48 */
49static int powernow_k6_get_cpu_multiplier(void)
50{
51 u64 invalue = 0;
52 u32 msrval;
53
54 msrval = POWERNOW_IOPORT + 0x1;
55 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
56 invalue = inl(POWERNOW_IOPORT + 0x8);
57 msrval = POWERNOW_IOPORT + 0x0;
58 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
59
60 return clock_ratio[(invalue >> 5)&7].index;
61}
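Since clock_ratio[] stores the multiplier scaled by ten and busfreq is kept in units of 10 kHz, the product of the two is already a kHz value; a minimal restatement of that conversion (illustrative helper only):

/* e.g. busfreq = 10000 (a 100 MHz FSB in 10 kHz units) and a 4.5x
 * multiplier stored as 45  ->  450000 kHz.
 */
static unsigned int k6_freq_khz(unsigned int busfreq_10khz,
				unsigned int ratio_x10)
{
	return busfreq_10khz * ratio_x10;
}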
62
63
64/**
65 * powernow_k6_set_state - set the PowerNow! multiplier
66 * @best_i: clock_ratio[best_i] is the target multiplier
67 *
68 * Tries to change the PowerNow! multiplier
69 */
70static void powernow_k6_set_state(unsigned int best_i)
71{
72 unsigned long outvalue = 0, invalue = 0;
73 unsigned long msrval;
74 struct cpufreq_freqs freqs;
75
76 if (clock_ratio[best_i].index > max_multiplier) {
77 printk(KERN_ERR PFX "invalid target frequency\n");
78 return;
79 }
80
81 freqs.old = busfreq * powernow_k6_get_cpu_multiplier();
82 freqs.new = busfreq * clock_ratio[best_i].index;
83 freqs.cpu = 0; /* powernow-k6.c is UP only driver */
84
85 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
86
87 /* we now need to transform best_i to the BVC format, see AMD#23446 */
88
89 outvalue = (1<<12) | (1<<10) | (1<<9) | (best_i<<5);
90
91 msrval = POWERNOW_IOPORT + 0x1;
92 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
93 invalue = inl(POWERNOW_IOPORT + 0x8);
94 invalue = invalue & 0xf;
95 outvalue = outvalue | invalue;
96 outl(outvalue , (POWERNOW_IOPORT + 0x8));
97 msrval = POWERNOW_IOPORT + 0x0;
98 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
99
100 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
101
102 return;
103}
104
105
106/**
107 * powernow_k6_verify - verifies a new CPUfreq policy
108 * @policy: new policy
109 *
110 * Policy must be within lowest and highest possible CPU Frequency,
111 * and at least one possible state must be within min and max.
112 */
113static int powernow_k6_verify(struct cpufreq_policy *policy)
114{
115 return cpufreq_frequency_table_verify(policy, &clock_ratio[0]);
116}
117
118
119/**
120 * powernow_k6_setpolicy - sets a new CPUFreq policy
121 * @policy: new policy
122 * @target_freq: the target frequency
123 * @relation: how that frequency relates to achieved frequency
124 * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
125 *
126 * sets a new CPUFreq policy
127 */
128static int powernow_k6_target(struct cpufreq_policy *policy,
129 unsigned int target_freq,
130 unsigned int relation)
131{
132 unsigned int newstate = 0;
133
134 if (cpufreq_frequency_table_target(policy, &clock_ratio[0],
135 target_freq, relation, &newstate))
136 return -EINVAL;
137
138 powernow_k6_set_state(newstate);
139
140 return 0;
141}
142
143
144static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
145{
146 unsigned int i, f;
147 int result;
148
149 if (policy->cpu != 0)
150 return -ENODEV;
151
152 /* get frequencies */
153 max_multiplier = powernow_k6_get_cpu_multiplier();
154 busfreq = cpu_khz / max_multiplier;
155
156 /* table init */
157 for (i = 0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) {
158 f = clock_ratio[i].index;
159 if (f > max_multiplier)
160 clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID;
161 else
162 clock_ratio[i].frequency = busfreq * f;
163 }
164
165 /* cpuinfo and default policy values */
166 policy->cpuinfo.transition_latency = 200000;
167 policy->cur = busfreq * max_multiplier;
168
169 result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio);
170 if (result)
171 return result;
172
173 cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu);
174
175 return 0;
176}
177
178
179static int powernow_k6_cpu_exit(struct cpufreq_policy *policy)
180{
181 unsigned int i;
182 for (i = 0; i < 8; i++) {
183 if (i == max_multiplier)
184 powernow_k6_set_state(i);
185 }
186 cpufreq_frequency_table_put_attr(policy->cpu);
187 return 0;
188}
189
190static unsigned int powernow_k6_get(unsigned int cpu)
191{
192 unsigned int ret;
193 ret = (busfreq * powernow_k6_get_cpu_multiplier());
194 return ret;
195}
196
197static struct freq_attr *powernow_k6_attr[] = {
198 &cpufreq_freq_attr_scaling_available_freqs,
199 NULL,
200};
201
202static struct cpufreq_driver powernow_k6_driver = {
203 .verify = powernow_k6_verify,
204 .target = powernow_k6_target,
205 .init = powernow_k6_cpu_init,
206 .exit = powernow_k6_cpu_exit,
207 .get = powernow_k6_get,
208 .name = "powernow-k6",
209 .owner = THIS_MODULE,
210 .attr = powernow_k6_attr,
211};
212
213
214/**
215 * powernow_k6_init - initializes the k6 PowerNow! CPUFreq driver
216 *
 217 * Initializes the K6 PowerNow! support. Returns -ENODEV on unsupported
 218 * devices, -EIO or -EINVAL on problems during initialization, and zero
 219 * on success.
220 */
221static int __init powernow_k6_init(void)
222{
223 struct cpuinfo_x86 *c = &cpu_data(0);
224
225 if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 5) ||
226 ((c->x86_model != 12) && (c->x86_model != 13)))
227 return -ENODEV;
228
229 if (!request_region(POWERNOW_IOPORT, 16, "PowerNow!")) {
230 printk(KERN_INFO PFX "PowerNow IOPORT region already used.\n");
231 return -EIO;
232 }
233
234 if (cpufreq_register_driver(&powernow_k6_driver)) {
235 release_region(POWERNOW_IOPORT, 16);
236 return -EINVAL;
237 }
238
239 return 0;
240}
241
242
243/**
244 * powernow_k6_exit - unregisters AMD K6-2+/3+ PowerNow! support
245 *
246 * Unregisters AMD K6-2+ / K6-3+ PowerNow! support.
247 */
248static void __exit powernow_k6_exit(void)
249{
250 cpufreq_unregister_driver(&powernow_k6_driver);
251 release_region(POWERNOW_IOPORT, 16);
252}
253
254
255MODULE_AUTHOR("Arjan van de Ven, Dave Jones <davej@redhat.com>, "
256 "Dominik Brodowski <linux@brodo.de>");
257MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors.");
258MODULE_LICENSE("GPL");
259
260module_init(powernow_k6_init);
261module_exit(powernow_k6_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
deleted file mode 100644
index 4a45fd6e41ba..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ /dev/null
@@ -1,752 +0,0 @@
1/*
2 * AMD K7 Powernow driver.
3 * (C) 2003 Dave Jones on behalf of SuSE Labs.
4 * (C) 2003-2004 Dave Jones <davej@redhat.com>
5 *
6 * Licensed under the terms of the GNU GPL License version 2.
7 * Based upon datasheets & sample CPUs kindly provided by AMD.
8 *
9 * Errata 5:
10 * CPU may fail to execute a FID/VID change in presence of interrupt.
11 * - We cli/sti on stepping A0 CPUs around the FID/VID transition.
12 * Errata 15:
13 * CPU with half frequency multipliers may hang upon wakeup from disconnect.
14 * - We disable half multipliers if ACPI is used on A0 stepping CPUs.
15 */
16
17#include <linux/kernel.h>
18#include <linux/module.h>
19#include <linux/moduleparam.h>
20#include <linux/init.h>
21#include <linux/cpufreq.h>
22#include <linux/slab.h>
23#include <linux/string.h>
24#include <linux/dmi.h>
25#include <linux/timex.h>
26#include <linux/io.h>
27
28#include <asm/timer.h> /* Needed for recalibrate_cpu_khz() */
29#include <asm/msr.h>
30#include <asm/system.h>
31
32#ifdef CONFIG_X86_POWERNOW_K7_ACPI
33#include <linux/acpi.h>
34#include <acpi/processor.h>
35#endif
36
37#include "powernow-k7.h"
38
39#define PFX "powernow: "
40
41
42struct psb_s {
43 u8 signature[10];
44 u8 tableversion;
45 u8 flags;
46 u16 settlingtime;
47 u8 reserved1;
48 u8 numpst;
49};
50
51struct pst_s {
52 u32 cpuid;
53 u8 fsbspeed;
54 u8 maxfid;
55 u8 startvid;
56 u8 numpstates;
57};
58
59#ifdef CONFIG_X86_POWERNOW_K7_ACPI
60union powernow_acpi_control_t {
61 struct {
62 unsigned long fid:5,
63 vid:5,
64 sgtc:20,
65 res1:2;
66 } bits;
67 unsigned long val;
68};
69#endif
70
71#ifdef CONFIG_CPU_FREQ_DEBUG
72/* divide by 1000 to get VCore voltage in V. */
73static const int mobile_vid_table[32] = {
74 2000, 1950, 1900, 1850, 1800, 1750, 1700, 1650,
75 1600, 1550, 1500, 1450, 1400, 1350, 1300, 0,
76 1275, 1250, 1225, 1200, 1175, 1150, 1125, 1100,
77 1075, 1050, 1025, 1000, 975, 950, 925, 0,
78};
79#endif
80
81/* divide by 10 to get FID. */
82static const int fid_codes[32] = {
83 110, 115, 120, 125, 50, 55, 60, 65,
84 70, 75, 80, 85, 90, 95, 100, 105,
85 30, 190, 40, 200, 130, 135, 140, 210,
86 150, 225, 160, 165, 170, 180, -1, -1,
87};
88
 89/* This parameter forces use of ACPI instead of the legacy (PSB/PST)
 90 * method for configuration.
 91 */
92
93static int acpi_force;
94
95static struct cpufreq_frequency_table *powernow_table;
96
97static unsigned int can_scale_bus;
98static unsigned int can_scale_vid;
99static unsigned int minimum_speed = -1;
100static unsigned int maximum_speed;
101static unsigned int number_scales;
102static unsigned int fsb;
103static unsigned int latency;
104static char have_a0;
105
106#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
107 "powernow-k7", msg)
108
109static int check_fsb(unsigned int fsbspeed)
110{
111 int delta;
112 unsigned int f = fsb / 1000;
113
114 delta = (fsbspeed > f) ? fsbspeed - f : f - fsbspeed;
115 return delta < 5;
116}
117
118static int check_powernow(void)
119{
120 struct cpuinfo_x86 *c = &cpu_data(0);
121 unsigned int maxei, eax, ebx, ecx, edx;
122
123 if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 6)) {
124#ifdef MODULE
125 printk(KERN_INFO PFX "This module only works with "
126 "AMD K7 CPUs\n");
127#endif
128 return 0;
129 }
130
131 /* Get maximum capabilities */
132 maxei = cpuid_eax(0x80000000);
133 if (maxei < 0x80000007) { /* Any powernow info ? */
134#ifdef MODULE
135 printk(KERN_INFO PFX "No powernow capabilities detected\n");
136#endif
137 return 0;
138 }
139
140 if ((c->x86_model == 6) && (c->x86_mask == 0)) {
141 printk(KERN_INFO PFX "K7 660[A0] core detected, "
142 "enabling errata workarounds\n");
143 have_a0 = 1;
144 }
145
146 cpuid(0x80000007, &eax, &ebx, &ecx, &edx);
147
148 /* Check we can actually do something before we say anything.*/
149 if (!(edx & (1 << 1 | 1 << 2)))
150 return 0;
151
152 printk(KERN_INFO PFX "PowerNOW! Technology present. Can scale: ");
153
154 if (edx & 1 << 1) {
155 printk("frequency");
156 can_scale_bus = 1;
157 }
158
159 if ((edx & (1 << 1 | 1 << 2)) == 0x6)
160 printk(" and ");
161
162 if (edx & 1 << 2) {
163 printk("voltage");
164 can_scale_vid = 1;
165 }
166
167 printk(".\n");
168 return 1;
169}
170
171#ifdef CONFIG_X86_POWERNOW_K7_ACPI
172static void invalidate_entry(unsigned int entry)
173{
174 powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID;
175}
176#endif
177
178static int get_ranges(unsigned char *pst)
179{
180 unsigned int j;
181 unsigned int speed;
182 u8 fid, vid;
183
184 powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) *
185 (number_scales + 1)), GFP_KERNEL);
186 if (!powernow_table)
187 return -ENOMEM;
188
189 for (j = 0 ; j < number_scales; j++) {
190 fid = *pst++;
191
192 powernow_table[j].frequency = (fsb * fid_codes[fid]) / 10;
193 powernow_table[j].index = fid; /* lower 8 bits */
194
195 speed = powernow_table[j].frequency;
196
197 if ((fid_codes[fid] % 10) == 5) {
198#ifdef CONFIG_X86_POWERNOW_K7_ACPI
199 if (have_a0 == 1)
200 invalidate_entry(j);
201#endif
202 }
203
204 if (speed < minimum_speed)
205 minimum_speed = speed;
206 if (speed > maximum_speed)
207 maximum_speed = speed;
208
209 vid = *pst++;
210 powernow_table[j].index |= (vid << 8); /* upper 8 bits */
211
212 dprintk(" FID: 0x%x (%d.%dx [%dMHz]) "
213 "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10,
214 fid_codes[fid] % 10, speed/1000, vid,
215 mobile_vid_table[vid]/1000,
216 mobile_vid_table[vid]%1000);
217 }
218 powernow_table[number_scales].frequency = CPUFREQ_TABLE_END;
219 powernow_table[number_scales].index = 0;
220
221 return 0;
222}
223
224
225static void change_FID(int fid)
226{
227 union msr_fidvidctl fidvidctl;
228
229 rdmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
230 if (fidvidctl.bits.FID != fid) {
231 fidvidctl.bits.SGTC = latency;
232 fidvidctl.bits.FID = fid;
233 fidvidctl.bits.VIDC = 0;
234 fidvidctl.bits.FIDC = 1;
235 wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
236 }
237}
238
239
240static void change_VID(int vid)
241{
242 union msr_fidvidctl fidvidctl;
243
244 rdmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
245 if (fidvidctl.bits.VID != vid) {
246 fidvidctl.bits.SGTC = latency;
247 fidvidctl.bits.VID = vid;
248 fidvidctl.bits.FIDC = 0;
249 fidvidctl.bits.VIDC = 1;
250 wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
251 }
252}
253
254
255static void change_speed(unsigned int index)
256{
257 u8 fid, vid;
258 struct cpufreq_freqs freqs;
259 union msr_fidvidstatus fidvidstatus;
260 int cfid;
261
 262	/* the fid is stored in the lower 8 bits of the index we wrote
 263	 * into the cpufreq frequency table in powernow_decode_bios,
 264	 * the vid in the upper 8 bits.
 265	 */
266
267 fid = powernow_table[index].index & 0xFF;
268 vid = (powernow_table[index].index & 0xFF00) >> 8;
269
270 freqs.cpu = 0;
271
272 rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
273 cfid = fidvidstatus.bits.CFID;
274 freqs.old = fsb * fid_codes[cfid] / 10;
275
276 freqs.new = powernow_table[index].frequency;
277
278 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
279
280 /* Now do the magic poking into the MSRs. */
281
282 if (have_a0 == 1) /* A0 errata 5 */
283 local_irq_disable();
284
285 if (freqs.old > freqs.new) {
286 /* Going down, so change FID first */
287 change_FID(fid);
288 change_VID(vid);
289 } else {
290 /* Going up, so change VID first */
291 change_VID(vid);
292 change_FID(fid);
293 }
294
295
296 if (have_a0 == 1)
297 local_irq_enable();
298
299 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
300}
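The index packing that change_speed() unpacks above (FID in bits 7:0, VID in bits 15:8, written by get_ranges() and powernow_acpi_init()) can be restated as a pair of trivial helpers (illustrative only):

static unsigned int k7_pack_index(unsigned char fid, unsigned char vid)
{
	return fid | (vid << 8);
}

static void k7_unpack_index(unsigned int index,
			    unsigned char *fid, unsigned char *vid)
{
	*fid = index & 0xFF;
	*vid = (index & 0xFF00) >> 8;
}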
301
302
303#ifdef CONFIG_X86_POWERNOW_K7_ACPI
304
305static struct acpi_processor_performance *acpi_processor_perf;
306
307static int powernow_acpi_init(void)
308{
309 int i;
310 int retval = 0;
311 union powernow_acpi_control_t pc;
312
313 if (acpi_processor_perf != NULL && powernow_table != NULL) {
314 retval = -EINVAL;
315 goto err0;
316 }
317
318 acpi_processor_perf = kzalloc(sizeof(struct acpi_processor_performance),
319 GFP_KERNEL);
320 if (!acpi_processor_perf) {
321 retval = -ENOMEM;
322 goto err0;
323 }
324
325 if (!zalloc_cpumask_var(&acpi_processor_perf->shared_cpu_map,
326 GFP_KERNEL)) {
327 retval = -ENOMEM;
328 goto err05;
329 }
330
331 if (acpi_processor_register_performance(acpi_processor_perf, 0)) {
332 retval = -EIO;
333 goto err1;
334 }
335
336 if (acpi_processor_perf->control_register.space_id !=
337 ACPI_ADR_SPACE_FIXED_HARDWARE) {
338 retval = -ENODEV;
339 goto err2;
340 }
341
342 if (acpi_processor_perf->status_register.space_id !=
343 ACPI_ADR_SPACE_FIXED_HARDWARE) {
344 retval = -ENODEV;
345 goto err2;
346 }
347
348 number_scales = acpi_processor_perf->state_count;
349
350 if (number_scales < 2) {
351 retval = -ENODEV;
352 goto err2;
353 }
354
355 powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) *
356 (number_scales + 1)), GFP_KERNEL);
357 if (!powernow_table) {
358 retval = -ENOMEM;
359 goto err2;
360 }
361
362 pc.val = (unsigned long) acpi_processor_perf->states[0].control;
363 for (i = 0; i < number_scales; i++) {
364 u8 fid, vid;
365 struct acpi_processor_px *state =
366 &acpi_processor_perf->states[i];
367 unsigned int speed, speed_mhz;
368
369 pc.val = (unsigned long) state->control;
370 dprintk("acpi: P%d: %d MHz %d mW %d uS control %08x SGTC %d\n",
371 i,
372 (u32) state->core_frequency,
373 (u32) state->power,
374 (u32) state->transition_latency,
375 (u32) state->control,
376 pc.bits.sgtc);
377
378 vid = pc.bits.vid;
379 fid = pc.bits.fid;
380
381 powernow_table[i].frequency = fsb * fid_codes[fid] / 10;
382 powernow_table[i].index = fid; /* lower 8 bits */
383 powernow_table[i].index |= (vid << 8); /* upper 8 bits */
384
385 speed = powernow_table[i].frequency;
386 speed_mhz = speed / 1000;
387
388 /* processor_perflib will multiply the MHz value by 1000 to
 389		 * get a kHz value (e.g. 1266000). However, powernow-k7 works
 390		 * with true kHz values (e.g. 1266768). To ensure that all
 391		 * powernow frequencies are available, we must ensure that
 392		 * ACPI doesn't restrict them, so we round up the MHz value
 393		 * to ensure that perflib's computed kHz value is greater than
 394		 * or equal to powernow's kHz value.
395 */
396 if (speed % 1000 > 0)
397 speed_mhz++;
398
399 if ((fid_codes[fid] % 10) == 5) {
400 if (have_a0 == 1)
401 invalidate_entry(i);
402 }
403
404 dprintk(" FID: 0x%x (%d.%dx [%dMHz]) "
405 "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10,
406 fid_codes[fid] % 10, speed_mhz, vid,
407 mobile_vid_table[vid]/1000,
408 mobile_vid_table[vid]%1000);
409
410 if (state->core_frequency != speed_mhz) {
411 state->core_frequency = speed_mhz;
412 dprintk(" Corrected ACPI frequency to %d\n",
413 speed_mhz);
414 }
415
416 if (latency < pc.bits.sgtc)
417 latency = pc.bits.sgtc;
418
419 if (speed < minimum_speed)
420 minimum_speed = speed;
421 if (speed > maximum_speed)
422 maximum_speed = speed;
423 }
424
425 powernow_table[i].frequency = CPUFREQ_TABLE_END;
426 powernow_table[i].index = 0;
427
428 /* notify BIOS that we exist */
429 acpi_processor_notify_smm(THIS_MODULE);
430
431 return 0;
432
433err2:
434 acpi_processor_unregister_performance(acpi_processor_perf, 0);
435err1:
436 free_cpumask_var(acpi_processor_perf->shared_cpu_map);
437err05:
438 kfree(acpi_processor_perf);
439err0:
440 printk(KERN_WARNING PFX "ACPI perflib can not be used on "
441 "this platform\n");
442 acpi_processor_perf = NULL;
443 return retval;
444}
445#else
446static int powernow_acpi_init(void)
447{
448 printk(KERN_INFO PFX "no support for ACPI processor found."
 449		" Please recompile your kernel with ACPI processor support\n");
450 return -EINVAL;
451}
452#endif
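The MHz round-up in powernow_acpi_init() exists because perflib stores MHz and later multiplies by 1000; rounding up guarantees that perflib's kHz value is never below powernow's exact kHz value (1266768 kHz becomes 1267 MHz, i.e. 1267000 kHz >= 1266768 kHz). A one-line restatement (illustrative only):

static unsigned int khz_to_mhz_round_up(unsigned int khz)
{
	return khz / 1000 + (khz % 1000 ? 1 : 0);
}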
453
454static void print_pst_entry(struct pst_s *pst, unsigned int j)
455{
456 dprintk("PST:%d (@%p)\n", j, pst);
457 dprintk(" cpuid: 0x%x fsb: %d maxFID: 0x%x startvid: 0x%x\n",
458 pst->cpuid, pst->fsbspeed, pst->maxfid, pst->startvid);
459}
460
461static int powernow_decode_bios(int maxfid, int startvid)
462{
463 struct psb_s *psb;
464 struct pst_s *pst;
465 unsigned int i, j;
466 unsigned char *p;
467 unsigned int etuple;
468 unsigned int ret;
469
470 etuple = cpuid_eax(0x80000001);
471
472 for (i = 0xC0000; i < 0xffff0 ; i += 16) {
473
474 p = phys_to_virt(i);
475
476 if (memcmp(p, "AMDK7PNOW!", 10) == 0) {
477 dprintk("Found PSB header at %p\n", p);
478 psb = (struct psb_s *) p;
479 dprintk("Table version: 0x%x\n", psb->tableversion);
480 if (psb->tableversion != 0x12) {
481 printk(KERN_INFO PFX "Sorry, only v1.2 tables"
482 " supported right now\n");
483 return -ENODEV;
484 }
485
486 dprintk("Flags: 0x%x\n", psb->flags);
487 if ((psb->flags & 1) == 0)
488 dprintk("Mobile voltage regulator\n");
489 else
490 dprintk("Desktop voltage regulator\n");
491
492 latency = psb->settlingtime;
493 if (latency < 100) {
494 printk(KERN_INFO PFX "BIOS set settling time "
495 "to %d microseconds. "
496 "Should be at least 100. "
497 "Correcting.\n", latency);
498 latency = 100;
499 }
500 dprintk("Settling Time: %d microseconds.\n",
501 psb->settlingtime);
502 dprintk("Has %d PST tables. (Only dumping ones "
503 "relevant to this CPU).\n",
504 psb->numpst);
505
506 p += sizeof(struct psb_s);
507
508 pst = (struct pst_s *) p;
509
510 for (j = 0; j < psb->numpst; j++) {
511 pst = (struct pst_s *) p;
512 number_scales = pst->numpstates;
513
514 if ((etuple == pst->cpuid) &&
515 check_fsb(pst->fsbspeed) &&
516 (maxfid == pst->maxfid) &&
517 (startvid == pst->startvid)) {
518 print_pst_entry(pst, j);
519 p = (char *)pst + sizeof(struct pst_s);
520 ret = get_ranges(p);
521 return ret;
522 } else {
523 unsigned int k;
524 p = (char *)pst + sizeof(struct pst_s);
525 for (k = 0; k < number_scales; k++)
526 p += 2;
527 }
528 }
529 printk(KERN_INFO PFX "No PST tables match this cpuid "
530 "(0x%x)\n", etuple);
531 printk(KERN_INFO PFX "This is indicative of a broken "
532 "BIOS.\n");
533
534 return -EINVAL;
535 }
536 p++;
537 }
538
539 return -ENODEV;
540}
541
542
543static int powernow_target(struct cpufreq_policy *policy,
544 unsigned int target_freq,
545 unsigned int relation)
546{
547 unsigned int newstate;
548
549 if (cpufreq_frequency_table_target(policy, powernow_table, target_freq,
550 relation, &newstate))
551 return -EINVAL;
552
553 change_speed(newstate);
554
555 return 0;
556}
557
558
559static int powernow_verify(struct cpufreq_policy *policy)
560{
561 return cpufreq_frequency_table_verify(policy, powernow_table);
562}
563
 564/*
 565 * We use the fact that the bus frequency is roughly a multiple of
 566 * 100000/3 kHz, and compute SGTC according to this multiple.
 567 *
 568 * That way the calculation matches more closely how AMD intends it
 569 * to work, and we end up with the same kind of behaviour already
 570 * observed under the "well-known" other OS.
 571 */
572static int __cpuinit fixup_sgtc(void)
573{
574 unsigned int sgtc;
575 unsigned int m;
576
577 m = fsb / 3333;
578 if ((m % 10) >= 5)
579 m += 5;
580
581 m /= 10;
582
583 sgtc = 100 * m * latency;
584 sgtc = sgtc / 3;
585 if (sgtc > 0xfffff) {
586 printk(KERN_WARNING PFX "SGTC too large %d\n", sgtc);
587 sgtc = 0xfffff;
588 }
589 return sgtc;
590}
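Working through fixup_sgtc() for a concrete FSB helps: with fsb = 100000 kHz (100 MHz), m = 100000 / 3333 = 30, which rounds to 3 multiples of 100000/3 kHz, so sgtc = 100 * 3 * latency / 3 = 100 * latency bus clocks, clamped to the 20-bit SGTC field. The same arithmetic as a standalone sketch (illustrative only):

static unsigned int sgtc_for(unsigned int fsb_khz, unsigned int latency)
{
	unsigned int m = fsb_khz / 3333;
	unsigned int sgtc;

	if ((m % 10) >= 5)
		m += 5;
	m /= 10;

	sgtc = (100 * m * latency) / 3;
	return (sgtc > 0xfffff) ? 0xfffff : sgtc;
}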
591
592static unsigned int powernow_get(unsigned int cpu)
593{
594 union msr_fidvidstatus fidvidstatus;
595 unsigned int cfid;
596
597 if (cpu)
598 return 0;
599 rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
600 cfid = fidvidstatus.bits.CFID;
601
602 return fsb * fid_codes[cfid] / 10;
603}
604
605
606static int __cpuinit acer_cpufreq_pst(const struct dmi_system_id *d)
607{
608 printk(KERN_WARNING PFX
609 "%s laptop with broken PST tables in BIOS detected.\n",
610 d->ident);
611 printk(KERN_WARNING PFX
612 "You need to downgrade to 3A21 (09/09/2002), or try a newer "
613 "BIOS than 3A71 (01/20/2003)\n");
614 printk(KERN_WARNING PFX
615 "cpufreq scaling has been disabled as a result of this.\n");
616 return 0;
617}
618
619/*
 620 * Some Athlon laptops have severely broken PST tables.
621 * A BIOS update is all that can save them.
622 * Mention this, and disable cpufreq.
623 */
624static struct dmi_system_id __cpuinitdata powernow_dmi_table[] = {
625 {
626 .callback = acer_cpufreq_pst,
627 .ident = "Acer Aspire",
628 .matches = {
629 DMI_MATCH(DMI_SYS_VENDOR, "Insyde Software"),
630 DMI_MATCH(DMI_BIOS_VERSION, "3A71"),
631 },
632 },
633 { }
634};
635
636static int __cpuinit powernow_cpu_init(struct cpufreq_policy *policy)
637{
638 union msr_fidvidstatus fidvidstatus;
639 int result;
640
641 if (policy->cpu != 0)
642 return -ENODEV;
643
644 rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
645
646 recalibrate_cpu_khz();
647
648 fsb = (10 * cpu_khz) / fid_codes[fidvidstatus.bits.CFID];
649 if (!fsb) {
650 printk(KERN_WARNING PFX "can not determine bus frequency\n");
651 return -EINVAL;
652 }
653 dprintk("FSB: %3dMHz\n", fsb/1000);
654
655 if (dmi_check_system(powernow_dmi_table) || acpi_force) {
656 printk(KERN_INFO PFX "PSB/PST known to be broken. "
657 "Trying ACPI instead\n");
658 result = powernow_acpi_init();
659 } else {
660 result = powernow_decode_bios(fidvidstatus.bits.MFID,
661 fidvidstatus.bits.SVID);
662 if (result) {
663 printk(KERN_INFO PFX "Trying ACPI perflib\n");
664 maximum_speed = 0;
665 minimum_speed = -1;
666 latency = 0;
667 result = powernow_acpi_init();
668 if (result) {
669 printk(KERN_INFO PFX
670 "ACPI and legacy methods failed\n");
671 }
672 } else {
673 /* SGTC use the bus clock as timer */
674 latency = fixup_sgtc();
675 printk(KERN_INFO PFX "SGTC: %d\n", latency);
676 }
677 }
678
679 if (result)
680 return result;
681
682 printk(KERN_INFO PFX "Minimum speed %d MHz. Maximum speed %d MHz.\n",
683 minimum_speed/1000, maximum_speed/1000);
684
685 policy->cpuinfo.transition_latency =
686 cpufreq_scale(2000000UL, fsb, latency);
687
688 policy->cur = powernow_get(0);
689
690 cpufreq_frequency_table_get_attr(powernow_table, policy->cpu);
691
692 return cpufreq_frequency_table_cpuinfo(policy, powernow_table);
693}
694
695static int powernow_cpu_exit(struct cpufreq_policy *policy)
696{
697 cpufreq_frequency_table_put_attr(policy->cpu);
698
699#ifdef CONFIG_X86_POWERNOW_K7_ACPI
700 if (acpi_processor_perf) {
701 acpi_processor_unregister_performance(acpi_processor_perf, 0);
702 free_cpumask_var(acpi_processor_perf->shared_cpu_map);
703 kfree(acpi_processor_perf);
704 }
705#endif
706
707 kfree(powernow_table);
708 return 0;
709}
710
711static struct freq_attr *powernow_table_attr[] = {
712 &cpufreq_freq_attr_scaling_available_freqs,
713 NULL,
714};
715
716static struct cpufreq_driver powernow_driver = {
717 .verify = powernow_verify,
718 .target = powernow_target,
719 .get = powernow_get,
720#ifdef CONFIG_X86_POWERNOW_K7_ACPI
721 .bios_limit = acpi_processor_get_bios_limit,
722#endif
723 .init = powernow_cpu_init,
724 .exit = powernow_cpu_exit,
725 .name = "powernow-k7",
726 .owner = THIS_MODULE,
727 .attr = powernow_table_attr,
728};
729
730static int __init powernow_init(void)
731{
732 if (check_powernow() == 0)
733 return -ENODEV;
734 return cpufreq_register_driver(&powernow_driver);
735}
736
737
738static void __exit powernow_exit(void)
739{
740 cpufreq_unregister_driver(&powernow_driver);
741}
742
743module_param(acpi_force, int, 0444);
744MODULE_PARM_DESC(acpi_force, "Force ACPI to be used.");
745
746MODULE_AUTHOR("Dave Jones <davej@redhat.com>");
747MODULE_DESCRIPTION("Powernow driver for AMD K7 processors.");
748MODULE_LICENSE("GPL");
749
750late_initcall(powernow_init);
751module_exit(powernow_exit);
752
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
deleted file mode 100644
index 35fb4eaf6e1c..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
+++ /dev/null
@@ -1,43 +0,0 @@
1/*
2 * (C) 2003 Dave Jones.
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 *
6 * AMD-specific information
7 *
8 */
9
10union msr_fidvidctl {
11 struct {
12 unsigned FID:5, // 4:0
13 reserved1:3, // 7:5
14 VID:5, // 12:8
15 reserved2:3, // 15:13
16 FIDC:1, // 16
17 VIDC:1, // 17
18 reserved3:2, // 19:18
19 FIDCHGRATIO:1, // 20
20		 reserved4:11,	// 31:21
21		 SGTC:20,	// 51:32
22 reserved5:12; // 63:52
23 } bits;
24 unsigned long long val;
25};
26
27union msr_fidvidstatus {
28 struct {
29 unsigned CFID:5, // 4:0
30 reserved1:3, // 7:5
31 SFID:5, // 12:8
32 reserved2:3, // 15:13
33 MFID:5, // 20:16
34 reserved3:11, // 31:21
35 CVID:5, // 36:32
36 reserved4:3, // 39:37
37 SVID:5, // 44:40
38 reserved5:3, // 47:45
39 MVID:5, // 52:48
40 reserved6:11; // 63:53
41 } bits;
42 unsigned long long val;
43};
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
deleted file mode 100644
index 491977baf6c0..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ /dev/null
@@ -1,1601 +0,0 @@
1/*
2 * (c) 2003-2010 Advanced Micro Devices, Inc.
3 * Your use of this code is subject to the terms and conditions of the
4 * GNU general public license version 2. See "COPYING" or
5 * http://www.gnu.org/licenses/gpl.html
6 *
7 * Support : mark.langsdorf@amd.com
8 *
9 * Based on the powernow-k7.c module written by Dave Jones.
10 * (C) 2003 Dave Jones on behalf of SuSE Labs
11 * (C) 2004 Dominik Brodowski <linux@brodo.de>
12 * (C) 2004 Pavel Machek <pavel@ucw.cz>
13 * Licensed under the terms of the GNU GPL License version 2.
14 * Based upon datasheets & sample CPUs kindly provided by AMD.
15 *
16 * Valuable input gratefully received from Dave Jones, Pavel Machek,
17 * Dominik Brodowski, Jacob Shin, and others.
18 * Originally developed by Paul Devriendt.
19 * Processor information obtained from Chapter 9 (Power and Thermal Management)
20 * of the "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD
21 * Opteron Processors" available for download from www.amd.com
22 *
23 * Tables for specific CPUs can be inferred from
24 * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/30430.pdf
25 */
26
27#include <linux/kernel.h>
28#include <linux/smp.h>
29#include <linux/module.h>
30#include <linux/init.h>
31#include <linux/cpufreq.h>
32#include <linux/slab.h>
33#include <linux/string.h>
34#include <linux/cpumask.h>
35#include <linux/sched.h> /* for current / set_cpus_allowed() */
36#include <linux/io.h>
37#include <linux/delay.h>
38
39#include <asm/msr.h>
40
41#include <linux/acpi.h>
42#include <linux/mutex.h>
43#include <acpi/processor.h>
44
45#define PFX "powernow-k8: "
46#define VERSION "version 2.20.00"
47#include "powernow-k8.h"
48#include "mperf.h"
49
50/* serialize freq changes */
51static DEFINE_MUTEX(fidvid_mutex);
52
53static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data);
54
55static int cpu_family = CPU_OPTERON;
56
57/* core performance boost */
58static bool cpb_capable, cpb_enabled;
59static struct msr __percpu *msrs;
60
61static struct cpufreq_driver cpufreq_amd64_driver;
62
63#ifndef CONFIG_SMP
64static inline const struct cpumask *cpu_core_mask(int cpu)
65{
66 return cpumask_of(0);
67}
68#endif
69
70/* Return a frequency in MHz, given an input fid */
71static u32 find_freq_from_fid(u32 fid)
72{
73 return 800 + (fid * 100);
74}
75
76/* Return a frequency in KHz, given an input fid */
77static u32 find_khz_freq_from_fid(u32 fid)
78{
79 return 1000 * find_freq_from_fid(fid);
80}
81
82static u32 find_khz_freq_from_pstate(struct cpufreq_frequency_table *data,
83 u32 pstate)
84{
85 return data[pstate].frequency;
86}
87
88/* Return the vco fid for an input fid
89 *
90 * Each "low" fid has a corresponding "high" fid, and "low" fids can be reached
91 * only from their corresponding high fids. This returns the "high" fid
92 * corresponding to a "low" one.
93 */
94static u32 convert_fid_to_vco_fid(u32 fid)
95{
96 if (fid < HI_FID_TABLE_BOTTOM)
97 return 8 + (2 * fid);
98 else
99 return fid;
100}
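
(Editorial illustration: the helpers above encode "100 MHz steps starting at 800 MHz" and the low-to-high VCO fid mapping. A standalone sketch with a hypothetical low-table fid:)

	/* Sketch only: same formulas as find_freq_from_fid()/convert_fid_to_vco_fid(). */
	#include <stdio.h>

	#define HI_FID_TABLE_BOTTOM 8			/* boundary copied from powernow-k8.h */

	int main(void)
	{
		unsigned int fid = 2;				/* hypothetical low-table fid */
		unsigned int mhz = 800 + (fid * 100);		/* 1000 MHz */
		unsigned int vco = (fid < HI_FID_TABLE_BOTTOM) ? 8 + (2 * fid) : fid;

		printf("fid %u -> %u MHz, vco fid %u (%u MHz)\n",
		       fid, mhz, vco, 800 + (vco * 100));	/* vco fid 12 -> 2000 MHz */
		return 0;
	}
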
101
102/*
103 * Return 1 if the pending bit is set. Unless we just instructed the processor
104 * to transition to a new state, seeing this bit set is really bad news.
105 */
106static int pending_bit_stuck(void)
107{
108 u32 lo, hi;
109
110 if (cpu_family == CPU_HW_PSTATE)
111 return 0;
112
113 rdmsr(MSR_FIDVID_STATUS, lo, hi);
114 return lo & MSR_S_LO_CHANGE_PENDING ? 1 : 0;
115}
116
117/*
118 * Update the global current fid / vid values from the status msr.
119 * Returns 1 on error.
120 */
121static int query_current_values_with_pending_wait(struct powernow_k8_data *data)
122{
123 u32 lo, hi;
124 u32 i = 0;
125
126 if (cpu_family == CPU_HW_PSTATE) {
127 rdmsr(MSR_PSTATE_STATUS, lo, hi);
128 i = lo & HW_PSTATE_MASK;
129 data->currpstate = i;
130
131 /*
132 * a workaround for family 11h erratum 311 might cause
133		 * an "out-of-range" Pstate if the core is in Pstate-0
134 */
135 if ((boot_cpu_data.x86 == 0x11) && (i >= data->numps))
136 data->currpstate = HW_PSTATE_0;
137
138 return 0;
139 }
140 do {
141 if (i++ > 10000) {
142 dprintk("detected change pending stuck\n");
143 return 1;
144 }
145 rdmsr(MSR_FIDVID_STATUS, lo, hi);
146 } while (lo & MSR_S_LO_CHANGE_PENDING);
147
148 data->currvid = hi & MSR_S_HI_CURRENT_VID;
149 data->currfid = lo & MSR_S_LO_CURRENT_FID;
150
151 return 0;
152}
153
154/* the isochronous relief time */
155static void count_off_irt(struct powernow_k8_data *data)
156{
157 udelay((1 << data->irt) * 10);
158 return;
159}
160
161/* the voltage stabilization time */
162static void count_off_vst(struct powernow_k8_data *data)
163{
164 udelay(data->vstable * VST_UNITS_20US);
165 return;
166}
167
168/* need to init the control msr to a safe value (for each cpu) */
169static void fidvid_msr_init(void)
170{
171 u32 lo, hi;
172 u8 fid, vid;
173
174 rdmsr(MSR_FIDVID_STATUS, lo, hi);
175 vid = hi & MSR_S_HI_CURRENT_VID;
176 fid = lo & MSR_S_LO_CURRENT_FID;
177 lo = fid | (vid << MSR_C_LO_VID_SHIFT);
178 hi = MSR_C_HI_STP_GNT_BENIGN;
179 dprintk("cpu%d, init lo 0x%x, hi 0x%x\n", smp_processor_id(), lo, hi);
180 wrmsr(MSR_FIDVID_CTL, lo, hi);
181}
182
183/* write the new fid value along with the other control fields to the msr */
184static int write_new_fid(struct powernow_k8_data *data, u32 fid)
185{
186 u32 lo;
187 u32 savevid = data->currvid;
188 u32 i = 0;
189
190 if ((fid & INVALID_FID_MASK) || (data->currvid & INVALID_VID_MASK)) {
191 printk(KERN_ERR PFX "internal error - overflow on fid write\n");
192 return 1;
193 }
194
195 lo = fid;
196 lo |= (data->currvid << MSR_C_LO_VID_SHIFT);
197 lo |= MSR_C_LO_INIT_FID_VID;
198
199 dprintk("writing fid 0x%x, lo 0x%x, hi 0x%x\n",
200 fid, lo, data->plllock * PLL_LOCK_CONVERSION);
201
202 do {
203 wrmsr(MSR_FIDVID_CTL, lo, data->plllock * PLL_LOCK_CONVERSION);
204 if (i++ > 100) {
205 printk(KERN_ERR PFX
206 "Hardware error - pending bit very stuck - "
207 "no further pstate changes possible\n");
208 return 1;
209 }
210 } while (query_current_values_with_pending_wait(data));
211
212 count_off_irt(data);
213
214 if (savevid != data->currvid) {
215 printk(KERN_ERR PFX
216 "vid change on fid trans, old 0x%x, new 0x%x\n",
217 savevid, data->currvid);
218 return 1;
219 }
220
221 if (fid != data->currfid) {
222 printk(KERN_ERR PFX
223 "fid trans failed, fid 0x%x, curr 0x%x\n", fid,
224 data->currfid);
225 return 1;
226 }
227
228 return 0;
229}
230
231/* Write a new vid to the hardware */
232static int write_new_vid(struct powernow_k8_data *data, u32 vid)
233{
234 u32 lo;
235 u32 savefid = data->currfid;
236 int i = 0;
237
238 if ((data->currfid & INVALID_FID_MASK) || (vid & INVALID_VID_MASK)) {
239 printk(KERN_ERR PFX "internal error - overflow on vid write\n");
240 return 1;
241 }
242
243 lo = data->currfid;
244 lo |= (vid << MSR_C_LO_VID_SHIFT);
245 lo |= MSR_C_LO_INIT_FID_VID;
246
247 dprintk("writing vid 0x%x, lo 0x%x, hi 0x%x\n",
248 vid, lo, STOP_GRANT_5NS);
249
250 do {
251 wrmsr(MSR_FIDVID_CTL, lo, STOP_GRANT_5NS);
252 if (i++ > 100) {
253 printk(KERN_ERR PFX "internal error - pending bit "
254 "very stuck - no further pstate "
255 "changes possible\n");
256 return 1;
257 }
258 } while (query_current_values_with_pending_wait(data));
259
260 if (savefid != data->currfid) {
261 printk(KERN_ERR PFX "fid changed on vid trans, old "
262 "0x%x new 0x%x\n",
263 savefid, data->currfid);
264 return 1;
265 }
266
267 if (vid != data->currvid) {
268 printk(KERN_ERR PFX "vid trans failed, vid 0x%x, "
269 "curr 0x%x\n",
270 vid, data->currvid);
271 return 1;
272 }
273
274 return 0;
275}
276
277/*
278 * Reduce the vid code toward reqvid, stepping by at most step codes at a time.
279 * Decreasing vid codes represent increasing voltages:
280 * vid of 0 is 1.550V, vid of 0x1e is 0.800V, vid of VID_OFF is off.
281 */
282static int decrease_vid_code_by_step(struct powernow_k8_data *data,
283 u32 reqvid, u32 step)
284{
285 if ((data->currvid - reqvid) > step)
286 reqvid = data->currvid - step;
287
288 if (write_new_vid(data, reqvid))
289 return 1;
290
291 count_off_vst(data);
292
293 return 0;
294}
295
296/* Change hardware pstate by single MSR write */
297static int transition_pstate(struct powernow_k8_data *data, u32 pstate)
298{
299 wrmsr(MSR_PSTATE_CTRL, pstate, 0);
300 data->currpstate = pstate;
301 return 0;
302}
303
304/* Change Opteron/Athlon64 fid and vid, by the 3 phases. */
305static int transition_fid_vid(struct powernow_k8_data *data,
306 u32 reqfid, u32 reqvid)
307{
308 if (core_voltage_pre_transition(data, reqvid, reqfid))
309 return 1;
310
311 if (core_frequency_transition(data, reqfid))
312 return 1;
313
314 if (core_voltage_post_transition(data, reqvid))
315 return 1;
316
317 if (query_current_values_with_pending_wait(data))
318 return 1;
319
320 if ((reqfid != data->currfid) || (reqvid != data->currvid)) {
321 printk(KERN_ERR PFX "failed (cpu%d): req 0x%x 0x%x, "
322 "curr 0x%x 0x%x\n",
323 smp_processor_id(),
324 reqfid, reqvid, data->currfid, data->currvid);
325 return 1;
326 }
327
328 dprintk("transitioned (cpu%d): new fid 0x%x, vid 0x%x\n",
329 smp_processor_id(), data->currfid, data->currvid);
330
331 return 0;
332}
333
334/* Phase 1 - core voltage transition ... setup voltage */
335static int core_voltage_pre_transition(struct powernow_k8_data *data,
336 u32 reqvid, u32 reqfid)
337{
338 u32 rvosteps = data->rvo;
339 u32 savefid = data->currfid;
340 u32 maxvid, lo, rvomult = 1;
341
342 dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, "
343 "reqvid 0x%x, rvo 0x%x\n",
344 smp_processor_id(),
345 data->currfid, data->currvid, reqvid, data->rvo);
346
347 if ((savefid < LO_FID_TABLE_TOP) && (reqfid < LO_FID_TABLE_TOP))
348 rvomult = 2;
349 rvosteps *= rvomult;
350 rdmsr(MSR_FIDVID_STATUS, lo, maxvid);
351 maxvid = 0x1f & (maxvid >> 16);
352 dprintk("ph1 maxvid=0x%x\n", maxvid);
353 if (reqvid < maxvid) /* lower numbers are higher voltages */
354 reqvid = maxvid;
355
356 while (data->currvid > reqvid) {
357 dprintk("ph1: curr 0x%x, req vid 0x%x\n",
358 data->currvid, reqvid);
359 if (decrease_vid_code_by_step(data, reqvid, data->vidmvs))
360 return 1;
361 }
362
363 while ((rvosteps > 0) &&
364 ((rvomult * data->rvo + data->currvid) > reqvid)) {
365 if (data->currvid == maxvid) {
366 rvosteps = 0;
367 } else {
368 dprintk("ph1: changing vid for rvo, req 0x%x\n",
369 data->currvid - 1);
370 if (decrease_vid_code_by_step(data, data->currvid-1, 1))
371 return 1;
372 rvosteps--;
373 }
374 }
375
376 if (query_current_values_with_pending_wait(data))
377 return 1;
378
379 if (savefid != data->currfid) {
380 printk(KERN_ERR PFX "ph1 err, currfid changed 0x%x\n",
381 data->currfid);
382 return 1;
383 }
384
385 dprintk("ph1 complete, currfid 0x%x, currvid 0x%x\n",
386 data->currfid, data->currvid);
387
388 return 0;
389}
390
391/* Phase 2 - core frequency transition */
392static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid)
393{
394 u32 vcoreqfid, vcocurrfid, vcofiddiff;
395 u32 fid_interval, savevid = data->currvid;
396
397 if (data->currfid == reqfid) {
398 printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n",
399 data->currfid);
400 return 0;
401 }
402
403 dprintk("ph2 (cpu%d): starting, currfid 0x%x, currvid 0x%x, "
404 "reqfid 0x%x\n",
405 smp_processor_id(),
406 data->currfid, data->currvid, reqfid);
407
408 vcoreqfid = convert_fid_to_vco_fid(reqfid);
409 vcocurrfid = convert_fid_to_vco_fid(data->currfid);
410 vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid
411 : vcoreqfid - vcocurrfid;
412
413 if ((reqfid <= LO_FID_TABLE_TOP) && (data->currfid <= LO_FID_TABLE_TOP))
414 vcofiddiff = 0;
415
416 while (vcofiddiff > 2) {
417 (data->currfid & 1) ? (fid_interval = 1) : (fid_interval = 2);
418
419 if (reqfid > data->currfid) {
420 if (data->currfid > LO_FID_TABLE_TOP) {
421 if (write_new_fid(data,
422 data->currfid + fid_interval))
423 return 1;
424 } else {
425 if (write_new_fid
426 (data,
427 2 + convert_fid_to_vco_fid(data->currfid)))
428 return 1;
429 }
430 } else {
431 if (write_new_fid(data, data->currfid - fid_interval))
432 return 1;
433 }
434
435 vcocurrfid = convert_fid_to_vco_fid(data->currfid);
436 vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid
437 : vcoreqfid - vcocurrfid;
438 }
439
440 if (write_new_fid(data, reqfid))
441 return 1;
442
443 if (query_current_values_with_pending_wait(data))
444 return 1;
445
446 if (data->currfid != reqfid) {
447 printk(KERN_ERR PFX
448 "ph2: mismatch, failed fid transition, "
449 "curr 0x%x, req 0x%x\n",
450 data->currfid, reqfid);
451 return 1;
452 }
453
454 if (savevid != data->currvid) {
455 printk(KERN_ERR PFX "ph2: vid changed, save 0x%x, curr 0x%x\n",
456 savevid, data->currvid);
457 return 1;
458 }
459
460 dprintk("ph2 complete, currfid 0x%x, currvid 0x%x\n",
461 data->currfid, data->currvid);
462
463 return 0;
464}
465
466/* Phase 3 - core voltage transition flow ... jump to the final vid. */
467static int core_voltage_post_transition(struct powernow_k8_data *data,
468 u32 reqvid)
469{
470 u32 savefid = data->currfid;
471 u32 savereqvid = reqvid;
472
473 dprintk("ph3 (cpu%d): starting, currfid 0x%x, currvid 0x%x\n",
474 smp_processor_id(),
475 data->currfid, data->currvid);
476
477 if (reqvid != data->currvid) {
478 if (write_new_vid(data, reqvid))
479 return 1;
480
481 if (savefid != data->currfid) {
482 printk(KERN_ERR PFX
483 "ph3: bad fid change, save 0x%x, curr 0x%x\n",
484 savefid, data->currfid);
485 return 1;
486 }
487
488 if (data->currvid != reqvid) {
489 printk(KERN_ERR PFX
490			       "ph3: failed vid transition, "
491			       "req 0x%x, curr 0x%x\n",
492 reqvid, data->currvid);
493 return 1;
494 }
495 }
496
497 if (query_current_values_with_pending_wait(data))
498 return 1;
499
500 if (savereqvid != data->currvid) {
501 dprintk("ph3 failed, currvid 0x%x\n", data->currvid);
502 return 1;
503 }
504
505 if (savefid != data->currfid) {
506 dprintk("ph3 failed, currfid changed 0x%x\n",
507 data->currfid);
508 return 1;
509 }
510
511 dprintk("ph3 complete, currfid 0x%x, currvid 0x%x\n",
512 data->currfid, data->currvid);
513
514 return 0;
515}
516
517static void check_supported_cpu(void *_rc)
518{
519 u32 eax, ebx, ecx, edx;
520 int *rc = _rc;
521
522 *rc = -ENODEV;
523
524 if (current_cpu_data.x86_vendor != X86_VENDOR_AMD)
525 return;
526
527 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
528 if (((eax & CPUID_XFAM) != CPUID_XFAM_K8) &&
529 ((eax & CPUID_XFAM) < CPUID_XFAM_10H))
530 return;
531
532 if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) {
533 if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) ||
534 ((eax & CPUID_XMOD) > CPUID_XMOD_REV_MASK)) {
535 printk(KERN_INFO PFX
536 "Processor cpuid %x not supported\n", eax);
537 return;
538 }
539
540 eax = cpuid_eax(CPUID_GET_MAX_CAPABILITIES);
541 if (eax < CPUID_FREQ_VOLT_CAPABILITIES) {
542 printk(KERN_INFO PFX
543 "No frequency change capabilities detected\n");
544 return;
545 }
546
547 cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
548 if ((edx & P_STATE_TRANSITION_CAPABLE)
549 != P_STATE_TRANSITION_CAPABLE) {
550 printk(KERN_INFO PFX
551 "Power state transitions not supported\n");
552 return;
553 }
554 } else { /* must be a HW Pstate capable processor */
555 cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
556 if ((edx & USE_HW_PSTATE) == USE_HW_PSTATE)
557 cpu_family = CPU_HW_PSTATE;
558 else
559 return;
560 }
561
562 *rc = 0;
563}
564
565static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst,
566 u8 maxvid)
567{
568 unsigned int j;
569 u8 lastfid = 0xff;
570
571 for (j = 0; j < data->numps; j++) {
572 if (pst[j].vid > LEAST_VID) {
573 printk(KERN_ERR FW_BUG PFX "vid %d invalid : 0x%x\n",
574 j, pst[j].vid);
575 return -EINVAL;
576 }
577 if (pst[j].vid < data->rvo) {
578 /* vid + rvo >= 0 */
579 printk(KERN_ERR FW_BUG PFX "0 vid exceeded with pstate"
580 " %d\n", j);
581 return -ENODEV;
582 }
583 if (pst[j].vid < maxvid + data->rvo) {
584 /* vid + rvo >= maxvid */
585 printk(KERN_ERR FW_BUG PFX "maxvid exceeded with pstate"
586 " %d\n", j);
587 return -ENODEV;
588 }
589 if (pst[j].fid > MAX_FID) {
590 printk(KERN_ERR FW_BUG PFX "maxfid exceeded with pstate"
591 " %d\n", j);
592 return -ENODEV;
593 }
594 if (j && (pst[j].fid < HI_FID_TABLE_BOTTOM)) {
595 /* Only first fid is allowed to be in "low" range */
596 printk(KERN_ERR FW_BUG PFX "two low fids - %d : "
597 "0x%x\n", j, pst[j].fid);
598 return -EINVAL;
599 }
600 if (pst[j].fid < lastfid)
601 lastfid = pst[j].fid;
602 }
603 if (lastfid & 1) {
604 printk(KERN_ERR FW_BUG PFX "lastfid invalid\n");
605 return -EINVAL;
606 }
607 if (lastfid > LO_FID_TABLE_TOP)
608 printk(KERN_INFO FW_BUG PFX
609 "first fid not from lo freq table\n");
610
611 return 0;
612}
613
614static void invalidate_entry(struct cpufreq_frequency_table *powernow_table,
615 unsigned int entry)
616{
617 powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID;
618}
619
620static void print_basics(struct powernow_k8_data *data)
621{
622 int j;
623 for (j = 0; j < data->numps; j++) {
624 if (data->powernow_table[j].frequency !=
625 CPUFREQ_ENTRY_INVALID) {
626 if (cpu_family == CPU_HW_PSTATE) {
627 printk(KERN_INFO PFX
628 " %d : pstate %d (%d MHz)\n", j,
629 data->powernow_table[j].index,
630 data->powernow_table[j].frequency/1000);
631 } else {
632 printk(KERN_INFO PFX
633 " %d : fid 0x%x (%d MHz), vid 0x%x\n",
634 j,
635 data->powernow_table[j].index & 0xff,
636 data->powernow_table[j].frequency/1000,
637 data->powernow_table[j].index >> 8);
638 }
639 }
640 }
641 if (data->batps)
642 printk(KERN_INFO PFX "Only %d pstates on battery\n",
643 data->batps);
644}
645
646static u32 freq_from_fid_did(u32 fid, u32 did)
647{
648 u32 mhz = 0;
649
650 if (boot_cpu_data.x86 == 0x10)
651 mhz = (100 * (fid + 0x10)) >> did;
652 else if (boot_cpu_data.x86 == 0x11)
653 mhz = (100 * (fid + 8)) >> did;
654 else
655 BUG();
656
657 return mhz * 1000;
658}
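
(Editorial note: a worked reading of the family-0x10 formula above, with made-up fid/did values.)

	/*
	 * Family 0x10: mhz = (100 * (fid + 0x10)) >> did
	 *   fid = 0x10, did = 0  ->  100 * 0x20       = 3200 MHz
	 *   fid = 0x10, did = 1  -> (100 * 0x20) >> 1 = 1600 MHz
	 * freq_from_fid_did() then returns that value in kHz (mhz * 1000).
	 */
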
659
660static int fill_powernow_table(struct powernow_k8_data *data,
661 struct pst_s *pst, u8 maxvid)
662{
663 struct cpufreq_frequency_table *powernow_table;
664 unsigned int j;
665
666 if (data->batps) {
667 /* use ACPI support to get full speed on mains power */
668 printk(KERN_WARNING PFX
669 "Only %d pstates usable (use ACPI driver for full "
670			"range)\n", data->batps);
671 data->numps = data->batps;
672 }
673
674 for (j = 1; j < data->numps; j++) {
675 if (pst[j-1].fid >= pst[j].fid) {
676 printk(KERN_ERR PFX "PST out of sequence\n");
677 return -EINVAL;
678 }
679 }
680
681 if (data->numps < 2) {
682 printk(KERN_ERR PFX "no p states to transition\n");
683 return -ENODEV;
684 }
685
686 if (check_pst_table(data, pst, maxvid))
687 return -EINVAL;
688
689 powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table)
690 * (data->numps + 1)), GFP_KERNEL);
691 if (!powernow_table) {
692 printk(KERN_ERR PFX "powernow_table memory alloc failure\n");
693 return -ENOMEM;
694 }
695
696 for (j = 0; j < data->numps; j++) {
697 int freq;
698 powernow_table[j].index = pst[j].fid; /* lower 8 bits */
699 powernow_table[j].index |= (pst[j].vid << 8); /* upper 8 bits */
700 freq = find_khz_freq_from_fid(pst[j].fid);
701 powernow_table[j].frequency = freq;
702 }
703 powernow_table[data->numps].frequency = CPUFREQ_TABLE_END;
704 powernow_table[data->numps].index = 0;
705
706 if (query_current_values_with_pending_wait(data)) {
707 kfree(powernow_table);
708 return -EIO;
709 }
710
711 dprintk("cfid 0x%x, cvid 0x%x\n", data->currfid, data->currvid);
712 data->powernow_table = powernow_table;
713 if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu)
714 print_basics(data);
715
716 for (j = 0; j < data->numps; j++)
717 if ((pst[j].fid == data->currfid) &&
718 (pst[j].vid == data->currvid))
719 return 0;
720
721 dprintk("currfid/vid do not match PST, ignoring\n");
722 return 0;
723}
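
(Editorial illustration: fill_powernow_table() packs the fid into the low byte of the cpufreq table index and the vid into the next byte; the fid/vid pair below is hypothetical.)

	/* Sketch only: pack and unpack a cpufreq table index as the driver does. */
	#include <stdio.h>

	int main(void)
	{
		unsigned int fid = 0x0e, vid = 0x12;		/* hypothetical fid/vid pair */
		unsigned int index = fid | (vid << 8);		/* 0x120e */

		printf("index 0x%x -> fid 0x%x, vid 0x%x, %u kHz\n",
		       index, index & 0xff, (index >> 8) & 0xff,
		       1000 * (800 + ((index & 0xff) * 100)));	/* 2200000 kHz */
		return 0;
	}
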
724
725/* Find and validate the PSB/PST table in BIOS. */
726static int find_psb_table(struct powernow_k8_data *data)
727{
728 struct psb_s *psb;
729 unsigned int i;
730 u32 mvs;
731 u8 maxvid;
732 u32 cpst = 0;
733 u32 thiscpuid;
734
735 for (i = 0xc0000; i < 0xffff0; i += 0x10) {
736 /* Scan BIOS looking for the signature. */
737 /* It can not be at ffff0 - it is too big. */
738
739 psb = phys_to_virt(i);
740 if (memcmp(psb, PSB_ID_STRING, PSB_ID_STRING_LEN) != 0)
741 continue;
742
743 dprintk("found PSB header at 0x%p\n", psb);
744
745 dprintk("table vers: 0x%x\n", psb->tableversion);
746 if (psb->tableversion != PSB_VERSION_1_4) {
747 printk(KERN_ERR FW_BUG PFX "PSB table is not v1.4\n");
748 return -ENODEV;
749 }
750
751 dprintk("flags: 0x%x\n", psb->flags1);
752 if (psb->flags1) {
753 printk(KERN_ERR FW_BUG PFX "unknown flags\n");
754 return -ENODEV;
755 }
756
757 data->vstable = psb->vstable;
758 dprintk("voltage stabilization time: %d(*20us)\n",
759 data->vstable);
760
761 dprintk("flags2: 0x%x\n", psb->flags2);
762 data->rvo = psb->flags2 & 3;
763 data->irt = ((psb->flags2) >> 2) & 3;
764 mvs = ((psb->flags2) >> 4) & 3;
765 data->vidmvs = 1 << mvs;
766 data->batps = ((psb->flags2) >> 6) & 3;
767
768 dprintk("ramp voltage offset: %d\n", data->rvo);
769 dprintk("isochronous relief time: %d\n", data->irt);
770 dprintk("maximum voltage step: %d - 0x%x\n", mvs, data->vidmvs);
771
772 dprintk("numpst: 0x%x\n", psb->num_tables);
773 cpst = psb->num_tables;
774 if ((psb->cpuid == 0x00000fc0) ||
775 (psb->cpuid == 0x00000fe0)) {
776 thiscpuid = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
777 if ((thiscpuid == 0x00000fc0) ||
778 (thiscpuid == 0x00000fe0))
779 cpst = 1;
780 }
781 if (cpst != 1) {
782 printk(KERN_ERR FW_BUG PFX "numpst must be 1\n");
783 return -ENODEV;
784 }
785
786 data->plllock = psb->plllocktime;
787 dprintk("plllocktime: 0x%x (units 1us)\n", psb->plllocktime);
788 dprintk("maxfid: 0x%x\n", psb->maxfid);
789 dprintk("maxvid: 0x%x\n", psb->maxvid);
790 maxvid = psb->maxvid;
791
792 data->numps = psb->numps;
793 dprintk("numpstates: 0x%x\n", data->numps);
794 return fill_powernow_table(data,
795 (struct pst_s *)(psb+1), maxvid);
796 }
797 /*
798 * If you see this message, complain to BIOS manufacturer. If
799 * he tells you "we do not support Linux" or some similar
800 * nonsense, remember that Windows 2000 uses the same legacy
801 * mechanism that the old Linux PSB driver uses. Tell them it
802 * is broken with Windows 2000.
803 *
804 * The reference to the AMD documentation is chapter 9 in the
805 * BIOS and Kernel Developer's Guide, which is available on
806 * www.amd.com
807 */
808 printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n");
809 printk(KERN_ERR PFX "Make sure that your BIOS is up to date"
810 " and Cool'N'Quiet support is enabled in BIOS setup\n");
811 return -ENODEV;
812}
813
814static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data,
815 unsigned int index)
816{
817 u64 control;
818
819 if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE))
820 return;
821
822 control = data->acpi_data.states[index].control;
823 data->irt = (control >> IRT_SHIFT) & IRT_MASK;
824 data->rvo = (control >> RVO_SHIFT) & RVO_MASK;
825 data->exttype = (control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK;
826 data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK;
827 data->vidmvs = 1 << ((control >> MVS_SHIFT) & MVS_MASK);
828 data->vstable = (control >> VST_SHIFT) & VST_MASK;
829}
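
(Editorial illustration: powernow_k8_acpi_pst_values() above slices one ACPI _PSS "control" dword using the shift/mask constants from powernow-k8.h; a standalone decode of a made-up control value:)

	/* Sketch only: constants copied from powernow-k8.h, control value is invented. */
	#include <stdio.h>

	#define IRT_SHIFT 30
	#define RVO_SHIFT 28
	#define VST_SHIFT 11
	#define IRT_MASK  3
	#define RVO_MASK  3
	#define VST_MASK  0x7f

	int main(void)
	{
		unsigned long long control = 0x80002800ULL;	/* hypothetical _PSS control */

		printf("irt %llu, rvo %llu, vstable %llu (x20us)\n",
		       (control >> IRT_SHIFT) & IRT_MASK,
		       (control >> RVO_SHIFT) & RVO_MASK,
		       (control >> VST_SHIFT) & VST_MASK);
		return 0;
	}
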
830
831static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
832{
833 struct cpufreq_frequency_table *powernow_table;
834 int ret_val = -ENODEV;
835 u64 control, status;
836
837 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {
838 dprintk("register performance failed: bad ACPI data\n");
839 return -EIO;
840 }
841
842 /* verify the data contained in the ACPI structures */
843 if (data->acpi_data.state_count <= 1) {
844 dprintk("No ACPI P-States\n");
845 goto err_out;
846 }
847
848 control = data->acpi_data.control_register.space_id;
849 status = data->acpi_data.status_register.space_id;
850
851 if ((control != ACPI_ADR_SPACE_FIXED_HARDWARE) ||
852 (status != ACPI_ADR_SPACE_FIXED_HARDWARE)) {
853 dprintk("Invalid control/status registers (%x - %x)\n",
854 control, status);
855 goto err_out;
856 }
857
858 /* fill in data->powernow_table */
859 powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table)
860 * (data->acpi_data.state_count + 1)), GFP_KERNEL);
861 if (!powernow_table) {
862 dprintk("powernow_table memory alloc failure\n");
863 goto err_out;
864 }
865
866 /* fill in data */
867 data->numps = data->acpi_data.state_count;
868 powernow_k8_acpi_pst_values(data, 0);
869
870 if (cpu_family == CPU_HW_PSTATE)
871 ret_val = fill_powernow_table_pstate(data, powernow_table);
872 else
873 ret_val = fill_powernow_table_fidvid(data, powernow_table);
874 if (ret_val)
875 goto err_out_mem;
876
877 powernow_table[data->acpi_data.state_count].frequency =
878 CPUFREQ_TABLE_END;
879 powernow_table[data->acpi_data.state_count].index = 0;
880 data->powernow_table = powernow_table;
881
882 if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu)
883 print_basics(data);
884
885 /* notify BIOS that we exist */
886 acpi_processor_notify_smm(THIS_MODULE);
887
888 if (!zalloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) {
889 printk(KERN_ERR PFX
890 "unable to alloc powernow_k8_data cpumask\n");
891 ret_val = -ENOMEM;
892 goto err_out_mem;
893 }
894
895 return 0;
896
897err_out_mem:
898 kfree(powernow_table);
899
900err_out:
901 acpi_processor_unregister_performance(&data->acpi_data, data->cpu);
902
903 /* data->acpi_data.state_count informs us at ->exit()
904 * whether ACPI was used */
905 data->acpi_data.state_count = 0;
906
907 return ret_val;
908}
909
910static int fill_powernow_table_pstate(struct powernow_k8_data *data,
911 struct cpufreq_frequency_table *powernow_table)
912{
913 int i;
914 u32 hi = 0, lo = 0;
915 rdmsr(MSR_PSTATE_CUR_LIMIT, lo, hi);
916 data->max_hw_pstate = (lo & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT;
917
918 for (i = 0; i < data->acpi_data.state_count; i++) {
919 u32 index;
920
921 index = data->acpi_data.states[i].control & HW_PSTATE_MASK;
922 if (index > data->max_hw_pstate) {
923 printk(KERN_ERR PFX "invalid pstate %d - "
924 "bad value %d.\n", i, index);
925 printk(KERN_ERR PFX "Please report to BIOS "
926 "manufacturer\n");
927 invalidate_entry(powernow_table, i);
928 continue;
929 }
930 rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi);
931 if (!(hi & HW_PSTATE_VALID_MASK)) {
932 dprintk("invalid pstate %d, ignoring\n", index);
933 invalidate_entry(powernow_table, i);
934 continue;
935 }
936
937 powernow_table[i].index = index;
938
939 /* Frequency may be rounded for these */
940 if ((boot_cpu_data.x86 == 0x10 && boot_cpu_data.x86_model < 10)
941 || boot_cpu_data.x86 == 0x11) {
942 powernow_table[i].frequency =
943 freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7);
944 } else
945 powernow_table[i].frequency =
946 data->acpi_data.states[i].core_frequency * 1000;
947 }
948 return 0;
949}
950
951static int fill_powernow_table_fidvid(struct powernow_k8_data *data,
952 struct cpufreq_frequency_table *powernow_table)
953{
954 int i;
955
956 for (i = 0; i < data->acpi_data.state_count; i++) {
957 u32 fid;
958 u32 vid;
959 u32 freq, index;
960 u64 status, control;
961
962 if (data->exttype) {
963 status = data->acpi_data.states[i].status;
964 fid = status & EXT_FID_MASK;
965 vid = (status >> VID_SHIFT) & EXT_VID_MASK;
966 } else {
967 control = data->acpi_data.states[i].control;
968 fid = control & FID_MASK;
969 vid = (control >> VID_SHIFT) & VID_MASK;
970 }
971
972 dprintk(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid);
973
974 index = fid | (vid<<8);
975 powernow_table[i].index = index;
976
977 freq = find_khz_freq_from_fid(fid);
978 powernow_table[i].frequency = freq;
979
980 /* verify frequency is OK */
981 if ((freq > (MAX_FREQ * 1000)) || (freq < (MIN_FREQ * 1000))) {
982 dprintk("invalid freq %u kHz, ignoring\n", freq);
983 invalidate_entry(powernow_table, i);
984 continue;
985 }
986
987 /* verify voltage is OK -
988 * BIOSs are using "off" to indicate invalid */
989 if (vid == VID_OFF) {
990 dprintk("invalid vid %u, ignoring\n", vid);
991 invalidate_entry(powernow_table, i);
992 continue;
993 }
994
995 if (freq != (data->acpi_data.states[i].core_frequency * 1000)) {
996 printk(KERN_INFO PFX "invalid freq entries "
997 "%u kHz vs. %u kHz\n", freq,
998 (unsigned int)
999 (data->acpi_data.states[i].core_frequency
1000 * 1000));
1001 invalidate_entry(powernow_table, i);
1002 continue;
1003 }
1004 }
1005 return 0;
1006}
1007
1008static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data)
1009{
1010 if (data->acpi_data.state_count)
1011 acpi_processor_unregister_performance(&data->acpi_data,
1012 data->cpu);
1013 free_cpumask_var(data->acpi_data.shared_cpu_map);
1014}
1015
1016static int get_transition_latency(struct powernow_k8_data *data)
1017{
1018 int max_latency = 0;
1019 int i;
1020 for (i = 0; i < data->acpi_data.state_count; i++) {
1021 int cur_latency = data->acpi_data.states[i].transition_latency
1022 + data->acpi_data.states[i].bus_master_latency;
1023 if (cur_latency > max_latency)
1024 max_latency = cur_latency;
1025 }
1026 if (max_latency == 0) {
1027 /*
1028 * Fam 11h and later may return 0 as transition latency. This
1029 * is intended and means "very fast". While cpufreq core and
1030 * governors currently can handle that gracefully, better set it
1031 * to 1 to avoid problems in the future.
1032 */
1033 if (boot_cpu_data.x86 < 0x11)
1034 printk(KERN_ERR FW_WARN PFX "Invalid zero transition "
1035 "latency\n");
1036 max_latency = 1;
1037 }
1038 /* value in usecs, needs to be in nanoseconds */
1039 return 1000 * max_latency;
1040}
1041
1042/* Take a frequency, and issue the fid/vid transition command */
1043static int transition_frequency_fidvid(struct powernow_k8_data *data,
1044 unsigned int index)
1045{
1046 u32 fid = 0;
1047 u32 vid = 0;
1048 int res, i;
1049 struct cpufreq_freqs freqs;
1050
1051 dprintk("cpu %d transition to index %u\n", smp_processor_id(), index);
1052
1053 /* fid/vid correctness check for k8 */
1054 /* fid are the lower 8 bits of the index we stored into
1055 * the cpufreq frequency table in find_psb_table, vid
1056 * are the upper 8 bits.
1057 */
1058 fid = data->powernow_table[index].index & 0xFF;
1059 vid = (data->powernow_table[index].index & 0xFF00) >> 8;
1060
1061 dprintk("table matched fid 0x%x, giving vid 0x%x\n", fid, vid);
1062
1063 if (query_current_values_with_pending_wait(data))
1064 return 1;
1065
1066 if ((data->currvid == vid) && (data->currfid == fid)) {
1067 dprintk("target matches current values (fid 0x%x, vid 0x%x)\n",
1068 fid, vid);
1069 return 0;
1070 }
1071
1072 dprintk("cpu %d, changing to fid 0x%x, vid 0x%x\n",
1073 smp_processor_id(), fid, vid);
1074 freqs.old = find_khz_freq_from_fid(data->currfid);
1075 freqs.new = find_khz_freq_from_fid(fid);
1076
1077 for_each_cpu(i, data->available_cores) {
1078 freqs.cpu = i;
1079 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
1080 }
1081
1082 res = transition_fid_vid(data, fid, vid);
1083 freqs.new = find_khz_freq_from_fid(data->currfid);
1084
1085 for_each_cpu(i, data->available_cores) {
1086 freqs.cpu = i;
1087 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
1088 }
1089 return res;
1090}
1091
1092/* Take a frequency, and issue the hardware pstate transition command */
1093static int transition_frequency_pstate(struct powernow_k8_data *data,
1094 unsigned int index)
1095{
1096 u32 pstate = 0;
1097 int res, i;
1098 struct cpufreq_freqs freqs;
1099
1100 dprintk("cpu %d transition to index %u\n", smp_processor_id(), index);
1101
1102 /* get MSR index for hardware pstate transition */
1103 pstate = index & HW_PSTATE_MASK;
1104 if (pstate > data->max_hw_pstate)
1105 return 0;
1106 freqs.old = find_khz_freq_from_pstate(data->powernow_table,
1107 data->currpstate);
1108 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
1109
1110 for_each_cpu(i, data->available_cores) {
1111 freqs.cpu = i;
1112 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
1113 }
1114
1115 res = transition_pstate(data, pstate);
1116 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
1117
1118 for_each_cpu(i, data->available_cores) {
1119 freqs.cpu = i;
1120 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
1121 }
1122 return res;
1123}
1124
1125/* Driver entry point to switch to the target frequency */
1126static int powernowk8_target(struct cpufreq_policy *pol,
1127 unsigned targfreq, unsigned relation)
1128{
1129 cpumask_var_t oldmask;
1130 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
1131 u32 checkfid;
1132 u32 checkvid;
1133 unsigned int newstate;
1134 int ret = -EIO;
1135
1136 if (!data)
1137 return -EINVAL;
1138
1139 checkfid = data->currfid;
1140 checkvid = data->currvid;
1141
1142 /* only run on specific CPU from here on. */
1143 /* This is poor form: use a workqueue or smp_call_function_single */
1144 if (!alloc_cpumask_var(&oldmask, GFP_KERNEL))
1145 return -ENOMEM;
1146
1147 cpumask_copy(oldmask, tsk_cpus_allowed(current));
1148 set_cpus_allowed_ptr(current, cpumask_of(pol->cpu));
1149
1150 if (smp_processor_id() != pol->cpu) {
1151 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
1152 goto err_out;
1153 }
1154
1155 if (pending_bit_stuck()) {
1156 printk(KERN_ERR PFX "failing targ, change pending bit set\n");
1157 goto err_out;
1158 }
1159
1160 dprintk("targ: cpu %d, %d kHz, min %d, max %d, relation %d\n",
1161 pol->cpu, targfreq, pol->min, pol->max, relation);
1162
1163 if (query_current_values_with_pending_wait(data))
1164 goto err_out;
1165
1166 if (cpu_family != CPU_HW_PSTATE) {
1167 dprintk("targ: curr fid 0x%x, vid 0x%x\n",
1168 data->currfid, data->currvid);
1169
1170 if ((checkvid != data->currvid) ||
1171 (checkfid != data->currfid)) {
1172 printk(KERN_INFO PFX
1173				"error - out of sync, fid 0x%x 0x%x, "
1174 "vid 0x%x 0x%x\n",
1175 checkfid, data->currfid,
1176 checkvid, data->currvid);
1177 }
1178 }
1179
1180 if (cpufreq_frequency_table_target(pol, data->powernow_table,
1181 targfreq, relation, &newstate))
1182 goto err_out;
1183
1184 mutex_lock(&fidvid_mutex);
1185
1186 powernow_k8_acpi_pst_values(data, newstate);
1187
1188 if (cpu_family == CPU_HW_PSTATE)
1189 ret = transition_frequency_pstate(data, newstate);
1190 else
1191 ret = transition_frequency_fidvid(data, newstate);
1192 if (ret) {
1193 printk(KERN_ERR PFX "transition frequency failed\n");
1194 ret = 1;
1195 mutex_unlock(&fidvid_mutex);
1196 goto err_out;
1197 }
1198 mutex_unlock(&fidvid_mutex);
1199
1200 if (cpu_family == CPU_HW_PSTATE)
1201 pol->cur = find_khz_freq_from_pstate(data->powernow_table,
1202 newstate);
1203 else
1204 pol->cur = find_khz_freq_from_fid(data->currfid);
1205 ret = 0;
1206
1207err_out:
1208 set_cpus_allowed_ptr(current, oldmask);
1209 free_cpumask_var(oldmask);
1210 return ret;
1211}
1212
1213/* Driver entry point to verify the policy and range of frequencies */
1214static int powernowk8_verify(struct cpufreq_policy *pol)
1215{
1216 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
1217
1218 if (!data)
1219 return -EINVAL;
1220
1221 return cpufreq_frequency_table_verify(pol, data->powernow_table);
1222}
1223
1224struct init_on_cpu {
1225 struct powernow_k8_data *data;
1226 int rc;
1227};
1228
1229static void __cpuinit powernowk8_cpu_init_on_cpu(void *_init_on_cpu)
1230{
1231 struct init_on_cpu *init_on_cpu = _init_on_cpu;
1232
1233 if (pending_bit_stuck()) {
1234 printk(KERN_ERR PFX "failing init, change pending bit set\n");
1235 init_on_cpu->rc = -ENODEV;
1236 return;
1237 }
1238
1239 if (query_current_values_with_pending_wait(init_on_cpu->data)) {
1240 init_on_cpu->rc = -ENODEV;
1241 return;
1242 }
1243
1244 if (cpu_family == CPU_OPTERON)
1245 fidvid_msr_init();
1246
1247 init_on_cpu->rc = 0;
1248}
1249
1250/* per CPU init entry point to the driver */
1251static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1252{
1253 static const char ACPI_PSS_BIOS_BUG_MSG[] =
1254 KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n"
1255 FW_BUG PFX "Try again with latest BIOS.\n";
1256 struct powernow_k8_data *data;
1257 struct init_on_cpu init_on_cpu;
1258 int rc;
1259 struct cpuinfo_x86 *c = &cpu_data(pol->cpu);
1260
1261 if (!cpu_online(pol->cpu))
1262 return -ENODEV;
1263
1264 smp_call_function_single(pol->cpu, check_supported_cpu, &rc, 1);
1265 if (rc)
1266 return -ENODEV;
1267
1268 data = kzalloc(sizeof(struct powernow_k8_data), GFP_KERNEL);
1269 if (!data) {
1270 printk(KERN_ERR PFX "unable to alloc powernow_k8_data");
1271 return -ENOMEM;
1272 }
1273
1274 data->cpu = pol->cpu;
1275 data->currpstate = HW_PSTATE_INVALID;
1276
1277 if (powernow_k8_cpu_init_acpi(data)) {
1278 /*
1279		 * Use the PSB BIOS structure. This is only available on
1280		 * a uniprocessor (UP) version, and is deprecated by AMD.
1281 */
1282 if (num_online_cpus() != 1) {
1283 printk_once(ACPI_PSS_BIOS_BUG_MSG);
1284 goto err_out;
1285 }
1286 if (pol->cpu != 0) {
1287 printk(KERN_ERR FW_BUG PFX "No ACPI _PSS objects for "
1288 "CPU other than CPU0. Complain to your BIOS "
1289 "vendor.\n");
1290 goto err_out;
1291 }
1292 rc = find_psb_table(data);
1293 if (rc)
1294 goto err_out;
1295
1296		/* Take a crude guess here.
1297		 * The guess is in microseconds, so multiply by 1000 for nanoseconds */
1298 pol->cpuinfo.transition_latency = (
1299 ((data->rvo + 8) * data->vstable * VST_UNITS_20US) +
1300 ((1 << data->irt) * 30)) * 1000;
1301 } else /* ACPI _PSS objects available */
1302 pol->cpuinfo.transition_latency = get_transition_latency(data);
1303
1304 /* only run on specific CPU from here on */
1305 init_on_cpu.data = data;
1306 smp_call_function_single(data->cpu, powernowk8_cpu_init_on_cpu,
1307 &init_on_cpu, 1);
1308 rc = init_on_cpu.rc;
1309 if (rc != 0)
1310 goto err_out_exit_acpi;
1311
1312 if (cpu_family == CPU_HW_PSTATE)
1313 cpumask_copy(pol->cpus, cpumask_of(pol->cpu));
1314 else
1315 cpumask_copy(pol->cpus, cpu_core_mask(pol->cpu));
1316 data->available_cores = pol->cpus;
1317
1318 if (cpu_family == CPU_HW_PSTATE)
1319 pol->cur = find_khz_freq_from_pstate(data->powernow_table,
1320 data->currpstate);
1321 else
1322 pol->cur = find_khz_freq_from_fid(data->currfid);
1323 dprintk("policy current frequency %d kHz\n", pol->cur);
1324
1325 /* min/max the cpu is capable of */
1326 if (cpufreq_frequency_table_cpuinfo(pol, data->powernow_table)) {
1327 printk(KERN_ERR FW_BUG PFX "invalid powernow_table\n");
1328 powernow_k8_cpu_exit_acpi(data);
1329 kfree(data->powernow_table);
1330 kfree(data);
1331 return -EINVAL;
1332 }
1333
1334 /* Check for APERF/MPERF support in hardware */
1335 if (cpu_has(c, X86_FEATURE_APERFMPERF))
1336 cpufreq_amd64_driver.getavg = cpufreq_get_measured_perf;
1337
1338 cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu);
1339
1340 if (cpu_family == CPU_HW_PSTATE)
1341 dprintk("cpu_init done, current pstate 0x%x\n",
1342 data->currpstate);
1343 else
1344 dprintk("cpu_init done, current fid 0x%x, vid 0x%x\n",
1345 data->currfid, data->currvid);
1346
1347 per_cpu(powernow_data, pol->cpu) = data;
1348
1349 return 0;
1350
1351err_out_exit_acpi:
1352 powernow_k8_cpu_exit_acpi(data);
1353
1354err_out:
1355 kfree(data);
1356 return -ENODEV;
1357}
1358
1359static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol)
1360{
1361 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
1362
1363 if (!data)
1364 return -EINVAL;
1365
1366 powernow_k8_cpu_exit_acpi(data);
1367
1368 cpufreq_frequency_table_put_attr(pol->cpu);
1369
1370 kfree(data->powernow_table);
1371 kfree(data);
1372 per_cpu(powernow_data, pol->cpu) = NULL;
1373
1374 return 0;
1375}
1376
1377static void query_values_on_cpu(void *_err)
1378{
1379 int *err = _err;
1380 struct powernow_k8_data *data = __get_cpu_var(powernow_data);
1381
1382 *err = query_current_values_with_pending_wait(data);
1383}
1384
1385static unsigned int powernowk8_get(unsigned int cpu)
1386{
1387 struct powernow_k8_data *data = per_cpu(powernow_data, cpu);
1388 unsigned int khz = 0;
1389 int err;
1390
1391 if (!data)
1392 return 0;
1393
1394 smp_call_function_single(cpu, query_values_on_cpu, &err, true);
1395 if (err)
1396 goto out;
1397
1398 if (cpu_family == CPU_HW_PSTATE)
1399 khz = find_khz_freq_from_pstate(data->powernow_table,
1400 data->currpstate);
1401 else
1402 khz = find_khz_freq_from_fid(data->currfid);
1403
1404
1405out:
1406 return khz;
1407}
1408
1409static void _cpb_toggle_msrs(bool t)
1410{
1411 int cpu;
1412
1413 get_online_cpus();
1414
1415 rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
1416
1417 for_each_cpu(cpu, cpu_online_mask) {
1418 struct msr *reg = per_cpu_ptr(msrs, cpu);
1419 if (t)
1420 reg->l &= ~BIT(25);
1421 else
1422 reg->l |= BIT(25);
1423 }
1424 wrmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
1425
1426 put_online_cpus();
1427}
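
	/*
	 * Editorial note (hedged): on these parts BIT(25) of MSR_K7_HWCR is
	 * understood to be the boost-disable bit, so clearing it above
	 * (t == true) enables core performance boost on every online CPU and
	 * setting it disables boost; get/put_online_cpus() keeps the set of
	 * online CPUs stable while the rdmsr/wrmsr pair runs.
	 */
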
1428
1429/*
1430 * Switch on/off core performance boosting.
1431 *
1432 * 0=disable
1433 * 1=enable.
1434 */
1435static void cpb_toggle(bool t)
1436{
1437 if (!cpb_capable)
1438 return;
1439
1440 if (t && !cpb_enabled) {
1441 cpb_enabled = true;
1442 _cpb_toggle_msrs(t);
1443 printk(KERN_INFO PFX "Core Boosting enabled.\n");
1444 } else if (!t && cpb_enabled) {
1445 cpb_enabled = false;
1446 _cpb_toggle_msrs(t);
1447 printk(KERN_INFO PFX "Core Boosting disabled.\n");
1448 }
1449}
1450
1451static ssize_t store_cpb(struct cpufreq_policy *policy, const char *buf,
1452 size_t count)
1453{
1454 int ret = -EINVAL;
1455 unsigned long val = 0;
1456
1457 ret = strict_strtoul(buf, 10, &val);
1458 if (!ret && (val == 0 || val == 1) && cpb_capable)
1459 cpb_toggle(val);
1460 else
1461 return -EINVAL;
1462
1463 return count;
1464}
1465
1466static ssize_t show_cpb(struct cpufreq_policy *policy, char *buf)
1467{
1468 return sprintf(buf, "%u\n", cpb_enabled);
1469}
1470
1471#define define_one_rw(_name) \
1472static struct freq_attr _name = \
1473__ATTR(_name, 0644, show_##_name, store_##_name)
1474
1475define_one_rw(cpb);
1476
1477static struct freq_attr *powernow_k8_attr[] = {
1478 &cpufreq_freq_attr_scaling_available_freqs,
1479 &cpb,
1480 NULL,
1481};
1482
1483static struct cpufreq_driver cpufreq_amd64_driver = {
1484 .verify = powernowk8_verify,
1485 .target = powernowk8_target,
1486 .bios_limit = acpi_processor_get_bios_limit,
1487 .init = powernowk8_cpu_init,
1488 .exit = __devexit_p(powernowk8_cpu_exit),
1489 .get = powernowk8_get,
1490 .name = "powernow-k8",
1491 .owner = THIS_MODULE,
1492 .attr = powernow_k8_attr,
1493};
1494
1495/*
1496 * Clear the boost-disable flag on the CPU_DOWN path so that this cpu
1497 * cannot block the remaining ones from boosting. On the CPU_UP path we
1498 * simply keep the boost-disable flag in sync with the current global
1499 * state.
1500 */
1501static int cpb_notify(struct notifier_block *nb, unsigned long action,
1502 void *hcpu)
1503{
1504 unsigned cpu = (long)hcpu;
1505 u32 lo, hi;
1506
1507 switch (action) {
1508 case CPU_UP_PREPARE:
1509 case CPU_UP_PREPARE_FROZEN:
1510
1511 if (!cpb_enabled) {
1512 rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
1513 lo |= BIT(25);
1514 wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
1515 }
1516 break;
1517
1518 case CPU_DOWN_PREPARE:
1519 case CPU_DOWN_PREPARE_FROZEN:
1520 rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
1521 lo &= ~BIT(25);
1522 wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
1523 break;
1524
1525 default:
1526 break;
1527 }
1528
1529 return NOTIFY_OK;
1530}
1531
1532static struct notifier_block cpb_nb = {
1533 .notifier_call = cpb_notify,
1534};
1535
1536/* driver entry point for init */
1537static int __cpuinit powernowk8_init(void)
1538{
1539 unsigned int i, supported_cpus = 0, cpu;
1540
1541 for_each_online_cpu(i) {
1542 int rc;
1543 smp_call_function_single(i, check_supported_cpu, &rc, 1);
1544 if (rc == 0)
1545 supported_cpus++;
1546 }
1547
1548 if (supported_cpus != num_online_cpus())
1549 return -ENODEV;
1550
1551 printk(KERN_INFO PFX "Found %d %s (%d cpu cores) (" VERSION ")\n",
1552 num_online_nodes(), boot_cpu_data.x86_model_id, supported_cpus);
1553
1554 if (boot_cpu_has(X86_FEATURE_CPB)) {
1555
1556 cpb_capable = true;
1557
1558 register_cpu_notifier(&cpb_nb);
1559
1560 msrs = msrs_alloc();
1561 if (!msrs) {
1562 printk(KERN_ERR "%s: Error allocating msrs!\n", __func__);
1563 return -ENOMEM;
1564 }
1565
1566 rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
1567
1568 for_each_cpu(cpu, cpu_online_mask) {
1569 struct msr *reg = per_cpu_ptr(msrs, cpu);
1570			cpb_enabled |= !(reg->l & BIT(25));
1571 }
1572
1573 printk(KERN_INFO PFX "Core Performance Boosting: %s.\n",
1574 (cpb_enabled ? "on" : "off"));
1575 }
1576
1577 return cpufreq_register_driver(&cpufreq_amd64_driver);
1578}
1579
1580/* driver entry point for term */
1581static void __exit powernowk8_exit(void)
1582{
1583 dprintk("exit\n");
1584
1585 if (boot_cpu_has(X86_FEATURE_CPB)) {
1586 msrs_free(msrs);
1587 msrs = NULL;
1588
1589 unregister_cpu_notifier(&cpb_nb);
1590 }
1591
1592 cpufreq_unregister_driver(&cpufreq_amd64_driver);
1593}
1594
1595MODULE_AUTHOR("Paul Devriendt <paul.devriendt@amd.com> and "
1596 "Mark Langsdorf <mark.langsdorf@amd.com>");
1597MODULE_DESCRIPTION("AMD Athlon 64 and Opteron processor frequency driver.");
1598MODULE_LICENSE("GPL");
1599
1600late_initcall(powernowk8_init);
1601module_exit(powernowk8_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
deleted file mode 100644
index df3529b1c02d..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
+++ /dev/null
@@ -1,224 +0,0 @@
1/*
2 * (c) 2003-2006 Advanced Micro Devices, Inc.
3 * Your use of this code is subject to the terms and conditions of the
4 * GNU general public license version 2. See "COPYING" or
5 * http://www.gnu.org/licenses/gpl.html
6 */
7
8enum pstate {
9 HW_PSTATE_INVALID = 0xff,
10 HW_PSTATE_0 = 0,
11 HW_PSTATE_1 = 1,
12 HW_PSTATE_2 = 2,
13 HW_PSTATE_3 = 3,
14 HW_PSTATE_4 = 4,
15 HW_PSTATE_5 = 5,
16 HW_PSTATE_6 = 6,
17 HW_PSTATE_7 = 7,
18};
19
20struct powernow_k8_data {
21 unsigned int cpu;
22
23 u32 numps; /* number of p-states */
24 u32 batps; /* number of p-states supported on battery */
25 u32 max_hw_pstate; /* maximum legal hardware pstate */
26
27 /* these values are constant when the PSB is used to determine
28 * vid/fid pairings, but are modified during the ->target() call
29 * when ACPI is used */
30 u32 rvo; /* ramp voltage offset */
31 u32 irt; /* isochronous relief time */
32 u32 vidmvs; /* usable value calculated from mvs */
33 u32 vstable; /* voltage stabilization time, units 20 us */
34 u32 plllock; /* pll lock time, units 1 us */
35 u32 exttype; /* extended interface = 1 */
36
37 /* keep track of the current fid / vid or pstate */
38 u32 currvid;
39 u32 currfid;
40 enum pstate currpstate;
41
42 /* the powernow_table includes all frequency and vid/fid pairings:
43 * fid are the lower 8 bits of the index, vid are the upper 8 bits.
44 * frequency is in kHz */
45 struct cpufreq_frequency_table *powernow_table;
46
47 /* the acpi table needs to be kept. it's only available if ACPI was
48 * used to determine valid frequency/vid/fid states */
49 struct acpi_processor_performance acpi_data;
50
51 /* we need to keep track of associated cores, but let cpufreq
52 * handle hotplug events - so just point at cpufreq pol->cpus
53 * structure */
54 struct cpumask *available_cores;
55};
56
57/* processor's cpuid instruction support */
58#define CPUID_PROCESSOR_SIGNATURE 1 /* function 1 */
59#define CPUID_XFAM 0x0ff00000 /* extended family */
60#define CPUID_XFAM_K8 0
61#define CPUID_XMOD 0x000f0000 /* extended model */
62#define CPUID_XMOD_REV_MASK 0x000c0000
63#define CPUID_XFAM_10H 0x00100000 /* family 0x10 */
64#define CPUID_USE_XFAM_XMOD 0x00000f00
65#define CPUID_GET_MAX_CAPABILITIES 0x80000000
66#define CPUID_FREQ_VOLT_CAPABILITIES 0x80000007
67#define P_STATE_TRANSITION_CAPABLE 6
68
69/* Model Specific Registers for p-state transitions. MSRs are 64-bit. For */
70/* writes (wrmsr - opcode 0f 30), the register number is placed in ecx, and */
71/* the value to write is placed in edx:eax. For reads (rdmsr - opcode 0f 32), */
72/* the register number is placed in ecx, and the data is returned in edx:eax. */
73
74#define MSR_FIDVID_CTL 0xc0010041
75#define MSR_FIDVID_STATUS 0xc0010042
76
77/* Field definitions within the FID VID Low Control MSR : */
78#define MSR_C_LO_INIT_FID_VID 0x00010000
79#define MSR_C_LO_NEW_VID 0x00003f00
80#define MSR_C_LO_NEW_FID 0x0000003f
81#define MSR_C_LO_VID_SHIFT 8
82
83/* Field definitions within the FID VID High Control MSR : */
84#define MSR_C_HI_STP_GNT_TO 0x000fffff
85
86/* Field definitions within the FID VID Low Status MSR : */
87#define MSR_S_LO_CHANGE_PENDING 0x80000000 /* cleared when completed */
88#define MSR_S_LO_MAX_RAMP_VID 0x3f000000
89#define MSR_S_LO_MAX_FID 0x003f0000
90#define MSR_S_LO_START_FID 0x00003f00
91#define MSR_S_LO_CURRENT_FID 0x0000003f
92
93/* Field definitions within the FID VID High Status MSR : */
94#define MSR_S_HI_MIN_WORKING_VID 0x3f000000
95#define MSR_S_HI_MAX_WORKING_VID 0x003f0000
96#define MSR_S_HI_START_VID 0x00003f00
97#define MSR_S_HI_CURRENT_VID 0x0000003f
98#define MSR_C_HI_STP_GNT_BENIGN 0x00000001
99
100
101/* Hardware Pstate _PSS and MSR definitions */
102#define USE_HW_PSTATE 0x00000080
103#define HW_PSTATE_MASK 0x00000007
104#define HW_PSTATE_VALID_MASK 0x80000000
105#define HW_PSTATE_MAX_MASK 0x000000f0
106#define HW_PSTATE_MAX_SHIFT 4
107#define MSR_PSTATE_DEF_BASE 0xc0010064 /* base of Pstate MSRs */
108#define MSR_PSTATE_STATUS 0xc0010063 /* Pstate Status MSR */
109#define MSR_PSTATE_CTRL 0xc0010062 /* Pstate control MSR */
110#define MSR_PSTATE_CUR_LIMIT 0xc0010061 /* pstate current limit MSR */
111
112/* define the two driver architectures */
113#define CPU_OPTERON 0
114#define CPU_HW_PSTATE 1
115
116
117/*
118 * There are restrictions frequencies have to follow:
119 * - only 1 entry in the low fid table ( <=1.4GHz )
120 * - lowest entry in the high fid table must be >= 2 * the entry in the
121 * low fid table
122 * - lowest entry in the high fid table must be a <= 200MHz + 2 * the entry
123 * in the low fid table
124 * - the parts can only step at <= 200 MHz intervals, odd fid values are
125 * supported in revision G and later revisions.
126 * - lowest frequency must be >= interprocessor hypertransport link speed
127 * (only applies to MP systems obviously)
128 */
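
/*
 * Editorial example (not from AMD documentation): with a low-table fid of 2,
 * i.e. 800 + 2*100 = 1000 MHz, the first two rules above put the lowest
 * high-table entry between 2*1000 = 2000 MHz and 200 + 2*1000 = 2200 MHz,
 * i.e. fid 12 or fid 14 (fid 13 only on revision G and later parts).
 */
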
129
130/* fids (frequency identifiers) are arranged in 2 tables - lo and hi */
131#define LO_FID_TABLE_TOP 7 /* fid values marking the boundary */
132#define HI_FID_TABLE_BOTTOM 8 /* between the low and high tables */
133
134#define LO_VCOFREQ_TABLE_TOP 1400 /* corresponding vco frequency values */
135#define HI_VCOFREQ_TABLE_BOTTOM 1600
136
137#define MIN_FREQ_RESOLUTION 200 /* fids jump by 2 matching freq jumps by 200 */
138
139#define MAX_FID 0x2a /* Spec only gives FID values as far as 5 GHz */
140#define LEAST_VID 0x3e /* Lowest (numerically highest) useful vid value */
141
142#define MIN_FREQ 800 /* Min and max freqs, per spec */
143#define MAX_FREQ 5000
144
145#define INVALID_FID_MASK 0xffffffc0 /* not a valid fid if these bits are set */
146#define INVALID_VID_MASK 0xffffffc0 /* not a valid vid if these bits are set */
147
148#define VID_OFF 0x3f
149
150#define STOP_GRANT_5NS 1 /* min poss memory access latency for voltage change */
151
152#define PLL_LOCK_CONVERSION (1000/5) /* ms to ns, then divide by clock period */
153
154#define MAXIMUM_VID_STEPS 1 /* Current cpus only allow a single step of 25mV */
155#define VST_UNITS_20US 20 /* Voltage Stabilization Time is in units of 20us */
156
157/*
158 * Most values of interest are encoded in a single field of the _PSS
159 * entries: the "control" value.
160 */
161
162#define IRT_SHIFT 30
163#define RVO_SHIFT 28
164#define EXT_TYPE_SHIFT 27
165#define PLL_L_SHIFT 20
166#define MVS_SHIFT 18
167#define VST_SHIFT 11
168#define VID_SHIFT 6
169#define IRT_MASK 3
170#define RVO_MASK 3
171#define EXT_TYPE_MASK 1
172#define PLL_L_MASK 0x7f
173#define MVS_MASK 3
174#define VST_MASK 0x7f
175#define VID_MASK 0x1f
176#define FID_MASK 0x1f
177#define EXT_VID_MASK 0x3f
178#define EXT_FID_MASK 0x3f
179
180
181/*
182 * Version 1.4 of the PSB table. This table is constructed by BIOS and is
183 * to tell the OS's power management driver which VIDs and FIDs are
184 * supported by this particular processor.
185 * If the data in the PSB / PST is wrong, then this driver will program the
186 * wrong values into hardware, which is very likely to lead to a crash.
187 */
188
189#define PSB_ID_STRING "AMDK7PNOW!"
190#define PSB_ID_STRING_LEN 10
191
192#define PSB_VERSION_1_4 0x14
193
194struct psb_s {
195 u8 signature[10];
196 u8 tableversion;
197 u8 flags1;
198 u16 vstable;
199 u8 flags2;
200 u8 num_tables;
201 u32 cpuid;
202 u8 plllocktime;
203 u8 maxfid;
204 u8 maxvid;
205 u8 numps;
206};
207
208/* Pairs of fid/vid values are appended to the version 1.4 PSB table. */
209struct pst_s {
210 u8 fid;
211 u8 vid;
212};
213
214#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k8", msg)
215
216static int core_voltage_pre_transition(struct powernow_k8_data *data,
217	u32 reqvid, u32 reqfid);
218static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid);
219static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid);
220
221static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index);
222
223static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
224static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
diff --git a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c b/arch/x86/kernel/cpu/cpufreq/sc520_freq.c
deleted file mode 100644
index 435a996a613a..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c
+++ /dev/null
@@ -1,194 +0,0 @@
1/*
2 * sc520_freq.c: cpufreq driver for the AMD Elan sc520
3 *
4 * Copyright (C) 2005 Sean Young <sean@mess.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * Based on elanfreq.c
12 *
13 * 2005-03-30: - initial revision
14 */
15
16#include <linux/kernel.h>
17#include <linux/module.h>
18#include <linux/init.h>
19
20#include <linux/delay.h>
21#include <linux/cpufreq.h>
22#include <linux/timex.h>
23#include <linux/io.h>
24
25#include <asm/msr.h>
26
27#define MMCR_BASE 0xfffef000 /* The default base address */
28#define OFFS_CPUCTL 0x2 /* CPU Control Register */
29
30static __u8 __iomem *cpuctl;
31
32#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
33 "sc520_freq", msg)
34#define PFX "sc520_freq: "
35
36static struct cpufreq_frequency_table sc520_freq_table[] = {
37 {0x01, 100000},
38 {0x02, 133000},
39 {0, CPUFREQ_TABLE_END},
40};
41
42static unsigned int sc520_freq_get_cpu_frequency(unsigned int cpu)
43{
44 u8 clockspeed_reg = *cpuctl;
45
46 switch (clockspeed_reg & 0x03) {
47 default:
48 printk(KERN_ERR PFX "error: cpuctl register has unexpected "
49 "value %02x\n", clockspeed_reg);
50 case 0x01:
51 return 100000;
52 case 0x02:
53 return 133000;
54 }
55}
56
57static void sc520_freq_set_cpu_state(unsigned int state)
58{
59
60 struct cpufreq_freqs freqs;
61 u8 clockspeed_reg;
62
63 freqs.old = sc520_freq_get_cpu_frequency(0);
64 freqs.new = sc520_freq_table[state].frequency;
65 freqs.cpu = 0; /* AMD Elan is UP */
66
67 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
68
69 dprintk("attempting to set frequency to %i kHz\n",
70 sc520_freq_table[state].frequency);
71
72 local_irq_disable();
73
74 clockspeed_reg = *cpuctl & ~0x03;
75 *cpuctl = clockspeed_reg | sc520_freq_table[state].index;
76
77 local_irq_enable();
78
79 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
80}
81
82static int sc520_freq_verify(struct cpufreq_policy *policy)
83{
84 return cpufreq_frequency_table_verify(policy, &sc520_freq_table[0]);
85}
86
87static int sc520_freq_target(struct cpufreq_policy *policy,
88 unsigned int target_freq,
89 unsigned int relation)
90{
91 unsigned int newstate = 0;
92
93 if (cpufreq_frequency_table_target(policy, sc520_freq_table,
94 target_freq, relation, &newstate))
95 return -EINVAL;
96
97 sc520_freq_set_cpu_state(newstate);
98
99 return 0;
100}
101
102
103/*
104 * Module init and exit code
105 */
106
107static int sc520_freq_cpu_init(struct cpufreq_policy *policy)
108{
109 struct cpuinfo_x86 *c = &cpu_data(0);
110 int result;
111
112 /* capability check */
113 if (c->x86_vendor != X86_VENDOR_AMD ||
114 c->x86 != 4 || c->x86_model != 9)
115 return -ENODEV;
116
117 /* cpuinfo and default policy values */
118 policy->cpuinfo.transition_latency = 1000000; /* 1ms */
119 policy->cur = sc520_freq_get_cpu_frequency(0);
120
121 result = cpufreq_frequency_table_cpuinfo(policy, sc520_freq_table);
122 if (result)
123 return result;
124
125 cpufreq_frequency_table_get_attr(sc520_freq_table, policy->cpu);
126
127 return 0;
128}
129
130
131static int sc520_freq_cpu_exit(struct cpufreq_policy *policy)
132{
133 cpufreq_frequency_table_put_attr(policy->cpu);
134 return 0;
135}
136
137
138static struct freq_attr *sc520_freq_attr[] = {
139 &cpufreq_freq_attr_scaling_available_freqs,
140 NULL,
141};
142
143
144static struct cpufreq_driver sc520_freq_driver = {
145 .get = sc520_freq_get_cpu_frequency,
146 .verify = sc520_freq_verify,
147 .target = sc520_freq_target,
148 .init = sc520_freq_cpu_init,
149 .exit = sc520_freq_cpu_exit,
150 .name = "sc520_freq",
151 .owner = THIS_MODULE,
152 .attr = sc520_freq_attr,
153};
154
155
156static int __init sc520_freq_init(void)
157{
158 struct cpuinfo_x86 *c = &cpu_data(0);
159 int err;
160
161 /* Test if we have the right hardware */
162 if (c->x86_vendor != X86_VENDOR_AMD ||
163 c->x86 != 4 || c->x86_model != 9) {
164 dprintk("no Elan SC520 processor found!\n");
165 return -ENODEV;
166 }
167 cpuctl = ioremap((unsigned long)(MMCR_BASE + OFFS_CPUCTL), 1);
168 if (!cpuctl) {
169 printk(KERN_ERR "sc520_freq: error: failed to remap memory\n");
170 return -ENOMEM;
171 }
172
173 err = cpufreq_register_driver(&sc520_freq_driver);
174 if (err)
175 iounmap(cpuctl);
176
177 return err;
178}
179
180
181static void __exit sc520_freq_exit(void)
182{
183 cpufreq_unregister_driver(&sc520_freq_driver);
184 iounmap(cpuctl);
185}
186
187
188MODULE_LICENSE("GPL");
189MODULE_AUTHOR("Sean Young <sean@mess.org>");
190MODULE_DESCRIPTION("cpufreq driver for AMD's Elan sc520 CPU");
191
192module_init(sc520_freq_init);
193module_exit(sc520_freq_exit);
194
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
deleted file mode 100644
index 9b1ff37de46a..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ /dev/null
@@ -1,636 +0,0 @@
1/*
2 * cpufreq driver for Enhanced SpeedStep, as found in Intel's Pentium
3 * M (part of the Centrino chipset).
4 *
5 * Since the original Pentium M, most new Intel CPUs support Enhanced
6 * SpeedStep.
7 *
8 * Despite the "SpeedStep" in the name, this is almost entirely unlike
9 * traditional SpeedStep.
10 *
11 * Modelled on speedstep.c
12 *
13 * Copyright (C) 2003 Jeremy Fitzhardinge <jeremy@goop.org>
14 */
15
16#include <linux/kernel.h>
17#include <linux/module.h>
18#include <linux/init.h>
19#include <linux/cpufreq.h>
20#include <linux/sched.h> /* current */
21#include <linux/delay.h>
22#include <linux/compiler.h>
23#include <linux/gfp.h>
24
25#include <asm/msr.h>
26#include <asm/processor.h>
27#include <asm/cpufeature.h>
28
29#define PFX "speedstep-centrino: "
30#define MAINTAINER "cpufreq@vger.kernel.org"
31
32#define dprintk(msg...) \
33 cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg)
34
35#define INTEL_MSR_RANGE (0xffff)
36
37struct cpu_id
38{
39 __u8 x86; /* CPU family */
40 __u8 x86_model; /* model */
41 __u8 x86_mask; /* stepping */
42};
43
44enum {
45 CPU_BANIAS,
46 CPU_DOTHAN_A1,
47 CPU_DOTHAN_A2,
48 CPU_DOTHAN_B0,
49 CPU_MP4HT_D0,
50 CPU_MP4HT_E0,
51};
52
53static const struct cpu_id cpu_ids[] = {
54 [CPU_BANIAS] = { 6, 9, 5 },
55 [CPU_DOTHAN_A1] = { 6, 13, 1 },
56 [CPU_DOTHAN_A2] = { 6, 13, 2 },
57 [CPU_DOTHAN_B0] = { 6, 13, 6 },
58 [CPU_MP4HT_D0] = {15, 3, 4 },
59 [CPU_MP4HT_E0] = {15, 4, 1 },
60};
61#define N_IDS ARRAY_SIZE(cpu_ids)
62
63struct cpu_model
64{
65 const struct cpu_id *cpu_id;
66 const char *model_name;
67 unsigned max_freq; /* max clock in kHz */
68
69 struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */
70};
71static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
72 const struct cpu_id *x);
73
74/* Operating points for current CPU */
75static DEFINE_PER_CPU(struct cpu_model *, centrino_model);
76static DEFINE_PER_CPU(const struct cpu_id *, centrino_cpu);
77
78static struct cpufreq_driver centrino_driver;
79
80#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE
81
82/* Computes the correct form for IA32_PERF_CTL MSR for a particular
83 frequency/voltage operating point; frequency in MHz, volts in mV.
84 This is stored as "index" in the structure. */
85#define OP(mhz, mv) \
86 { \
87 .frequency = (mhz) * 1000, \
88 .index = (((mhz)/100) << 8) | ((mv - 700) / 16) \
89 }
90
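Editor's note: a worked example of the encoding above, using values from the banias_900 table that follows; the bit-field reading is derived directly from the macro and is meant only to make the packing concrete.

/*
 * OP(600, 844):
 *   .frequency = 600 * 1000                              = 600000 kHz
 *   .index     = ((600 / 100) << 8) | ((844 - 700) / 16)
 *              = (6 << 8) | 9                            = 0x0609
 * So bits 15:8 of the index carry the bus ratio (MHz / 100) and
 * bits 7:0 carry the voltage step above 700 mV in 16 mV units;
 * this is the value later written to IA32_PERF_CTL by centrino_target().
 */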
91/*
92 * These voltage tables were derived from the Intel Pentium M
93 * datasheet, document 25261202.pdf, Table 5. I have verified they
94 * are consistent with my IBM ThinkPad X31, which has a 1.3GHz Pentium
95 * M.
96 */
97
98/* Ultra Low Voltage Intel Pentium M processor 900MHz (Banias) */
99static struct cpufreq_frequency_table banias_900[] =
100{
101 OP(600, 844),
102 OP(800, 988),
103 OP(900, 1004),
104 { .frequency = CPUFREQ_TABLE_END }
105};
106
107/* Ultra Low Voltage Intel Pentium M processor 1000MHz (Banias) */
108static struct cpufreq_frequency_table banias_1000[] =
109{
110 OP(600, 844),
111 OP(800, 972),
112 OP(900, 988),
113 OP(1000, 1004),
114 { .frequency = CPUFREQ_TABLE_END }
115};
116
117/* Low Voltage Intel Pentium M processor 1.10GHz (Banias) */
118static struct cpufreq_frequency_table banias_1100[] =
119{
120 OP( 600, 956),
121 OP( 800, 1020),
122 OP( 900, 1100),
123 OP(1000, 1164),
124 OP(1100, 1180),
125 { .frequency = CPUFREQ_TABLE_END }
126};
127
128
129/* Low Voltage Intel Pentium M processor 1.20GHz (Banias) */
130static struct cpufreq_frequency_table banias_1200[] =
131{
132 OP( 600, 956),
133 OP( 800, 1004),
134 OP( 900, 1020),
135 OP(1000, 1100),
136 OP(1100, 1164),
137 OP(1200, 1180),
138 { .frequency = CPUFREQ_TABLE_END }
139};
140
141/* Intel Pentium M processor 1.30GHz (Banias) */
142static struct cpufreq_frequency_table banias_1300[] =
143{
144 OP( 600, 956),
145 OP( 800, 1260),
146 OP(1000, 1292),
147 OP(1200, 1356),
148 OP(1300, 1388),
149 { .frequency = CPUFREQ_TABLE_END }
150};
151
152/* Intel Pentium M processor 1.40GHz (Banias) */
153static struct cpufreq_frequency_table banias_1400[] =
154{
155 OP( 600, 956),
156 OP( 800, 1180),
157 OP(1000, 1308),
158 OP(1200, 1436),
159 OP(1400, 1484),
160 { .frequency = CPUFREQ_TABLE_END }
161};
162
163/* Intel Pentium M processor 1.50GHz (Banias) */
164static struct cpufreq_frequency_table banias_1500[] =
165{
166 OP( 600, 956),
167 OP( 800, 1116),
168 OP(1000, 1228),
169 OP(1200, 1356),
170 OP(1400, 1452),
171 OP(1500, 1484),
172 { .frequency = CPUFREQ_TABLE_END }
173};
174
175/* Intel Pentium M processor 1.60GHz (Banias) */
176static struct cpufreq_frequency_table banias_1600[] =
177{
178 OP( 600, 956),
179 OP( 800, 1036),
180 OP(1000, 1164),
181 OP(1200, 1276),
182 OP(1400, 1420),
183 OP(1600, 1484),
184 { .frequency = CPUFREQ_TABLE_END }
185};
186
187/* Intel Pentium M processor 1.70GHz (Banias) */
188static struct cpufreq_frequency_table banias_1700[] =
189{
190 OP( 600, 956),
191 OP( 800, 1004),
192 OP(1000, 1116),
193 OP(1200, 1228),
194 OP(1400, 1308),
195 OP(1700, 1484),
196 { .frequency = CPUFREQ_TABLE_END }
197};
198#undef OP
199
200#define _BANIAS(cpuid, max, name) \
201{ .cpu_id = cpuid, \
202 .model_name = "Intel(R) Pentium(R) M processor " name "MHz", \
203 .max_freq = (max)*1000, \
204 .op_points = banias_##max, \
205}
206#define BANIAS(max) _BANIAS(&cpu_ids[CPU_BANIAS], max, #max)
207
208/* CPU models, their operating frequency range, and freq/voltage
209 operating points */
210static struct cpu_model models[] =
211{
212 _BANIAS(&cpu_ids[CPU_BANIAS], 900, " 900"),
213 BANIAS(1000),
214 BANIAS(1100),
215 BANIAS(1200),
216 BANIAS(1300),
217 BANIAS(1400),
218 BANIAS(1500),
219 BANIAS(1600),
220 BANIAS(1700),
221
222 /* NULL model_name is a wildcard */
223 { &cpu_ids[CPU_DOTHAN_A1], NULL, 0, NULL },
224 { &cpu_ids[CPU_DOTHAN_A2], NULL, 0, NULL },
225 { &cpu_ids[CPU_DOTHAN_B0], NULL, 0, NULL },
226 { &cpu_ids[CPU_MP4HT_D0], NULL, 0, NULL },
227 { &cpu_ids[CPU_MP4HT_E0], NULL, 0, NULL },
228
229 { NULL, }
230};
231#undef _BANIAS
232#undef BANIAS
233
234static int centrino_cpu_init_table(struct cpufreq_policy *policy)
235{
236 struct cpuinfo_x86 *cpu = &cpu_data(policy->cpu);
237 struct cpu_model *model;
238
239 for(model = models; model->cpu_id != NULL; model++)
240 if (centrino_verify_cpu_id(cpu, model->cpu_id) &&
241 (model->model_name == NULL ||
242 strcmp(cpu->x86_model_id, model->model_name) == 0))
243 break;
244
245 if (model->cpu_id == NULL) {
246 /* No match at all */
247 dprintk("no support for CPU model \"%s\": "
248 "send /proc/cpuinfo to " MAINTAINER "\n",
249 cpu->x86_model_id);
250 return -ENOENT;
251 }
252
253 if (model->op_points == NULL) {
254 /* Matched a non-match */
255 dprintk("no table support for CPU model \"%s\"\n",
256 cpu->x86_model_id);
257 dprintk("try using the acpi-cpufreq driver\n");
258 return -ENOENT;
259 }
260
261 per_cpu(centrino_model, policy->cpu) = model;
262
263 dprintk("found \"%s\": max frequency: %dkHz\n",
264 model->model_name, model->max_freq);
265
266 return 0;
267}
268
269#else
270static inline int centrino_cpu_init_table(struct cpufreq_policy *policy)
271{
272 return -ENODEV;
273}
274#endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */
275
276static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
277 const struct cpu_id *x)
278{
279 if ((c->x86 == x->x86) &&
280 (c->x86_model == x->x86_model) &&
281 (c->x86_mask == x->x86_mask))
282 return 1;
283 return 0;
284}
285
286/* To be called only after centrino_model is initialized */
287static unsigned extract_clock(unsigned msr, unsigned int cpu, int failsafe)
288{
289 int i;
290
291 /*
292 * Extract clock in kHz from PERF_CTL value
293 * for centrino, as some DSDTs are buggy.
294 * Ideally, this can be done using the acpi_data structure.
295 */
296 if ((per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_BANIAS]) ||
297 (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_A1]) ||
298 (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_B0])) {
299 msr = (msr >> 8) & 0xff;
300 return msr * 100000;
301 }
302
303 if ((!per_cpu(centrino_model, cpu)) ||
304 (!per_cpu(centrino_model, cpu)->op_points))
305 return 0;
306
307 msr &= 0xffff;
308 for (i = 0;
309 per_cpu(centrino_model, cpu)->op_points[i].frequency
310 != CPUFREQ_TABLE_END;
311 i++) {
312 if (msr == per_cpu(centrino_model, cpu)->op_points[i].index)
313 return per_cpu(centrino_model, cpu)->
314 op_points[i].frequency;
315 }
316 if (failsafe)
317 return per_cpu(centrino_model, cpu)->op_points[i-1].frequency;
318 else
319 return 0;
320}
321
322/* Return the current CPU frequency in kHz */
323static unsigned int get_cur_freq(unsigned int cpu)
324{
325 unsigned l, h;
326 unsigned clock_freq;
327
328 rdmsr_on_cpu(cpu, MSR_IA32_PERF_STATUS, &l, &h);
329 clock_freq = extract_clock(l, cpu, 0);
330
331 if (unlikely(clock_freq == 0)) {
332 /*
333 * On some CPUs, we can see transient MSR values (which are
334 * not present in _PSS), while CPU is doing some automatic
335 * P-state transition (like TM2). Get the last freq set
336 * in PERF_CTL.
337 */
338 rdmsr_on_cpu(cpu, MSR_IA32_PERF_CTL, &l, &h);
339 clock_freq = extract_clock(l, cpu, 1);
340 }
341 return clock_freq;
342}
343
344
345static int centrino_cpu_init(struct cpufreq_policy *policy)
346{
347 struct cpuinfo_x86 *cpu = &cpu_data(policy->cpu);
348 unsigned freq;
349 unsigned l, h;
350 int ret;
351 int i;
352
353 /* Only Intel makes Enhanced Speedstep-capable CPUs */
354 if (cpu->x86_vendor != X86_VENDOR_INTEL ||
355 !cpu_has(cpu, X86_FEATURE_EST))
356 return -ENODEV;
357
358 if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC))
359 centrino_driver.flags |= CPUFREQ_CONST_LOOPS;
360
361 if (policy->cpu != 0)
362 return -ENODEV;
363
364 for (i = 0; i < N_IDS; i++)
365 if (centrino_verify_cpu_id(cpu, &cpu_ids[i]))
366 break;
367
368 if (i != N_IDS)
369 per_cpu(centrino_cpu, policy->cpu) = &cpu_ids[i];
370
371 if (!per_cpu(centrino_cpu, policy->cpu)) {
372 dprintk("found unsupported CPU with "
373 "Enhanced SpeedStep: send /proc/cpuinfo to "
374 MAINTAINER "\n");
375 return -ENODEV;
376 }
377
378 if (centrino_cpu_init_table(policy)) {
379 return -ENODEV;
380 }
381
382 /* Check to see if Enhanced SpeedStep is enabled, and try to
383 enable it if not. */
384 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
385
386 if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
387 l |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP;
388 dprintk("trying to enable Enhanced SpeedStep (%x)\n", l);
389 wrmsr(MSR_IA32_MISC_ENABLE, l, h);
390
391 /* check to see if it stuck */
392 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
393 if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
394 printk(KERN_INFO PFX
395 "couldn't enable Enhanced SpeedStep\n");
396 return -ENODEV;
397 }
398 }
399
400 freq = get_cur_freq(policy->cpu);
401 policy->cpuinfo.transition_latency = 10000;
402 /* 10uS transition latency */
403 policy->cur = freq;
404
405 dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur);
406
407 ret = cpufreq_frequency_table_cpuinfo(policy,
408 per_cpu(centrino_model, policy->cpu)->op_points);
409 if (ret)
410 return (ret);
411
412 cpufreq_frequency_table_get_attr(
413 per_cpu(centrino_model, policy->cpu)->op_points, policy->cpu);
414
415 return 0;
416}
417
418static int centrino_cpu_exit(struct cpufreq_policy *policy)
419{
420 unsigned int cpu = policy->cpu;
421
422 if (!per_cpu(centrino_model, cpu))
423 return -ENODEV;
424
425 cpufreq_frequency_table_put_attr(cpu);
426
427 per_cpu(centrino_model, cpu) = NULL;
428
429 return 0;
430}
431
432/**
433 * centrino_verify - verifies a new CPUFreq policy
434 * @policy: new policy
435 *
436 * Limit must be within this model's frequency range, with at least
437 * one border included.
438 */
439static int centrino_verify (struct cpufreq_policy *policy)
440{
441 return cpufreq_frequency_table_verify(policy,
442 per_cpu(centrino_model, policy->cpu)->op_points);
443}
444
445/**
446 * centrino_target - set a new CPUFreq policy
447 * @policy: new policy
448 * @target_freq: the target frequency
449 * @relation: how that frequency relates to achieved frequency
450 * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
451 *
452 * Sets a new CPUFreq policy.
453 */
454static int centrino_target (struct cpufreq_policy *policy,
455 unsigned int target_freq,
456 unsigned int relation)
457{
458 unsigned int newstate = 0;
459 unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu;
460 struct cpufreq_freqs freqs;
461 int retval = 0;
462 unsigned int j, k, first_cpu, tmp;
463 cpumask_var_t covered_cpus;
464
465 if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL)))
466 return -ENOMEM;
467
468 if (unlikely(per_cpu(centrino_model, cpu) == NULL)) {
469 retval = -ENODEV;
470 goto out;
471 }
472
473 if (unlikely(cpufreq_frequency_table_target(policy,
474 per_cpu(centrino_model, cpu)->op_points,
475 target_freq,
476 relation,
477 &newstate))) {
478 retval = -EINVAL;
479 goto out;
480 }
481
482 first_cpu = 1;
483 for_each_cpu(j, policy->cpus) {
484 int good_cpu;
485
486 /* cpufreq holds the hotplug lock, so we are safe here */
487 if (!cpu_online(j))
488 continue;
489
490 /*
491 * Support for SMP systems.
492 * Make sure we are running on CPU that wants to change freq
493 */
494 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
495 good_cpu = cpumask_any_and(policy->cpus,
496 cpu_online_mask);
497 else
498 good_cpu = j;
499
500 if (good_cpu >= nr_cpu_ids) {
501 dprintk("couldn't limit to CPUs in this domain\n");
502 retval = -EAGAIN;
503 if (first_cpu) {
504 /* We haven't started the transition yet. */
505 goto out;
506 }
507 break;
508 }
509
510 msr = per_cpu(centrino_model, cpu)->op_points[newstate].index;
511
512 if (first_cpu) {
513 rdmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, &oldmsr, &h);
514 if (msr == (oldmsr & 0xffff)) {
515 dprintk("no change needed - msr was and needs "
516 "to be %x\n", oldmsr);
517 retval = 0;
518 goto out;
519 }
520
521 freqs.old = extract_clock(oldmsr, cpu, 0);
522 freqs.new = extract_clock(msr, cpu, 0);
523
524 dprintk("target=%dkHz old=%d new=%d msr=%04x\n",
525 target_freq, freqs.old, freqs.new, msr);
526
527 for_each_cpu(k, policy->cpus) {
528 if (!cpu_online(k))
529 continue;
530 freqs.cpu = k;
531 cpufreq_notify_transition(&freqs,
532 CPUFREQ_PRECHANGE);
533 }
534
535 first_cpu = 0;
536 /* all but 16 LSB are reserved, treat them with care */
537 oldmsr &= ~0xffff;
538 msr &= 0xffff;
539 oldmsr |= msr;
540 }
541
542 wrmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, oldmsr, h);
543 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
544 break;
545
546 cpumask_set_cpu(j, covered_cpus);
547 }
548
549 for_each_cpu(k, policy->cpus) {
550 if (!cpu_online(k))
551 continue;
552 freqs.cpu = k;
553 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
554 }
555
556 if (unlikely(retval)) {
557 /*
558 * We have failed halfway through the frequency change.
559 * We have sent callbacks to policy->cpus and
560	 * MSRs have already been written on covered_cpus.
561	 * Best-effort undo.
562 */
563
564 for_each_cpu(j, covered_cpus)
565 wrmsr_on_cpu(j, MSR_IA32_PERF_CTL, oldmsr, h);
566
567 tmp = freqs.new;
568 freqs.new = freqs.old;
569 freqs.old = tmp;
570 for_each_cpu(j, policy->cpus) {
571 if (!cpu_online(j))
572 continue;
573 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
574 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
575 }
576 }
577 retval = 0;
578
579out:
580 free_cpumask_var(covered_cpus);
581 return retval;
582}
583
584static struct freq_attr* centrino_attr[] = {
585 &cpufreq_freq_attr_scaling_available_freqs,
586 NULL,
587};
588
589static struct cpufreq_driver centrino_driver = {
590 .name = "centrino", /* should be speedstep-centrino,
591 but there's a 16 char limit */
592 .init = centrino_cpu_init,
593 .exit = centrino_cpu_exit,
594 .verify = centrino_verify,
595 .target = centrino_target,
596 .get = get_cur_freq,
597 .attr = centrino_attr,
598 .owner = THIS_MODULE,
599};
600
601
602/**
603 * centrino_init - initializes the Enhanced SpeedStep CPUFreq driver
604 *
605 * Initializes the Enhanced SpeedStep support. Returns -ENODEV on
606 * unsupported devices, -ENOENT if there's no voltage table for this
607 * particular CPU model, -EINVAL on problems during initialization,
608 * and zero on success.
609 *
610 * This is quite picky. Not only does the CPU have to advertise the
611 * "est" flag in the cpuid capability flags, we look for a specific
612 * CPU model and stepping, and we need to have the exact model name in
613 * our voltage tables. That is, be paranoid about not releasing
614 * someone's valuable magic smoke.
615 */
616static int __init centrino_init(void)
617{
618 struct cpuinfo_x86 *cpu = &cpu_data(0);
619
620 if (!cpu_has(cpu, X86_FEATURE_EST))
621 return -ENODEV;
622
623 return cpufreq_register_driver(&centrino_driver);
624}
625
626static void __exit centrino_exit(void)
627{
628 cpufreq_unregister_driver(&centrino_driver);
629}
630
631MODULE_AUTHOR ("Jeremy Fitzhardinge <jeremy@goop.org>");
632MODULE_DESCRIPTION ("Enhanced SpeedStep driver for Intel Pentium M processors.");
633MODULE_LICENSE ("GPL");
634
635late_initcall(centrino_init);
636module_exit(centrino_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
deleted file mode 100644
index 561758e95180..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ /dev/null
@@ -1,452 +0,0 @@
1/*
2 * (C) 2001 Dave Jones, Arjan van de ven.
3 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
4 *
5 * Licensed under the terms of the GNU GPL License version 2.
6 * Based upon reverse engineered information, and on Intel documentation
7 * for chipsets ICH2-M and ICH3-M.
8 *
9 * Many thanks to Ducrot Bruno for finding and fixing the last
10 * "missing link" for ICH2-M/ICH3-M support, and to Thomas Winkler
11 * for extensive testing.
12 *
13 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
14 */
15
16
17/*********************************************************************
18 * SPEEDSTEP - DEFINITIONS *
19 *********************************************************************/
20
21#include <linux/kernel.h>
22#include <linux/module.h>
23#include <linux/init.h>
24#include <linux/cpufreq.h>
25#include <linux/pci.h>
26#include <linux/sched.h>
27
28#include "speedstep-lib.h"
29
30
31/* speedstep_chipset:
32 * It is necessary to know which chipset is used. As accesses to
33 * this device occur at various places in this module, we need a
34 * static struct pci_dev * pointing to that device.
35 */
36static struct pci_dev *speedstep_chipset_dev;
37
38
39/* speedstep_processor
40 */
41static enum speedstep_processor speedstep_processor;
42
43static u32 pmbase;
44
45/*
46 * There are only two frequency states for each processor. Values
47 * are in kHz for the time being.
48 */
49static struct cpufreq_frequency_table speedstep_freqs[] = {
50 {SPEEDSTEP_HIGH, 0},
51 {SPEEDSTEP_LOW, 0},
52 {0, CPUFREQ_TABLE_END},
53};
54
55
56#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
57 "speedstep-ich", msg)
58
59
60/**
61 * speedstep_find_register - read the PMBASE address
62 *
63 * Returns: -ENODEV if no register could be found
64 */
65static int speedstep_find_register(void)
66{
67 if (!speedstep_chipset_dev)
68 return -ENODEV;
69
70 /* get PMBASE */
71 pci_read_config_dword(speedstep_chipset_dev, 0x40, &pmbase);
72 if (!(pmbase & 0x01)) {
73 printk(KERN_ERR "speedstep-ich: could not find speedstep register\n");
74 return -ENODEV;
75 }
76
77 pmbase &= 0xFFFFFFFE;
78 if (!pmbase) {
79 printk(KERN_ERR "speedstep-ich: could not find speedstep register\n");
80 return -ENODEV;
81 }
82
83 dprintk("pmbase is 0x%x\n", pmbase);
84 return 0;
85}
86
87/**
88 * speedstep_set_state - set the SpeedStep state
89 * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
90 *
91 * Tries to change the SpeedStep state. Can be called from
92 * smp_call_function_single.
93 */
94static void speedstep_set_state(unsigned int state)
95{
96 u8 pm2_blk;
97 u8 value;
98 unsigned long flags;
99
100 if (state > 0x1)
101 return;
102
103 /* Disable IRQs */
104 local_irq_save(flags);
105
106 /* read state */
107 value = inb(pmbase + 0x50);
108
109 dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value);
110
111 /* write new state */
112 value &= 0xFE;
113 value |= state;
114
115 dprintk("writing 0x%x to pmbase 0x%x + 0x50\n", value, pmbase);
116
117 /* Disable bus master arbitration */
118 pm2_blk = inb(pmbase + 0x20);
119 pm2_blk |= 0x01;
120 outb(pm2_blk, (pmbase + 0x20));
121
122 /* Actual transition */
123 outb(value, (pmbase + 0x50));
124
125 /* Restore bus master arbitration */
126 pm2_blk &= 0xfe;
127 outb(pm2_blk, (pmbase + 0x20));
128
129 /* check if transition was successful */
130 value = inb(pmbase + 0x50);
131
132 /* Enable IRQs */
133 local_irq_restore(flags);
134
135 dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value);
136
137 if (state == (value & 0x1))
138 dprintk("change to %u MHz succeeded\n",
139 speedstep_get_frequency(speedstep_processor) / 1000);
140 else
141 printk(KERN_ERR "cpufreq: change failed - I/O error\n");
142
143 return;
144}
145
146/* Wrapper for smp_call_function_single. */
147static void _speedstep_set_state(void *_state)
148{
149 speedstep_set_state(*(unsigned int *)_state);
150}
151
152/**
153 * speedstep_activate - activate SpeedStep control in the chipset
154 *
155 * Tries to activate the SpeedStep status and control registers.
156 * Returns -EINVAL on an unsupported chipset, and zero on success.
157 */
158static int speedstep_activate(void)
159{
160 u16 value = 0;
161
162 if (!speedstep_chipset_dev)
163 return -EINVAL;
164
165 pci_read_config_word(speedstep_chipset_dev, 0x00A0, &value);
166 if (!(value & 0x08)) {
167 value |= 0x08;
168 dprintk("activating SpeedStep (TM) registers\n");
169 pci_write_config_word(speedstep_chipset_dev, 0x00A0, value);
170 }
171
172 return 0;
173}
174
175
176/**
177 * speedstep_detect_chipset - detect the Southbridge which contains SpeedStep logic
178 *
179 * Detects ICH2-M, ICH3-M and ICH4-M so far. The pci_dev points to
180 * the LPC bridge / PM module which contains all power-management
181 * functions. Returns the SPEEDSTEP_CHIPSET_-number for the detected
182 * chipset, or zero on failure.
183 */
184static unsigned int speedstep_detect_chipset(void)
185{
186 speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
187 PCI_DEVICE_ID_INTEL_82801DB_12,
188 PCI_ANY_ID, PCI_ANY_ID,
189 NULL);
190 if (speedstep_chipset_dev)
191 return 4; /* 4-M */
192
193 speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
194 PCI_DEVICE_ID_INTEL_82801CA_12,
195 PCI_ANY_ID, PCI_ANY_ID,
196 NULL);
197 if (speedstep_chipset_dev)
198 return 3; /* 3-M */
199
200
201 speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
202 PCI_DEVICE_ID_INTEL_82801BA_10,
203 PCI_ANY_ID, PCI_ANY_ID,
204 NULL);
205 if (speedstep_chipset_dev) {
206 /* speedstep.c causes lockups on Dell Inspirons 8000 and
207 * 8100 which use a pretty old revision of the 82815
208		 * host bridge. Abort on these systems.
209 */
210 static struct pci_dev *hostbridge;
211
212 hostbridge = pci_get_subsys(PCI_VENDOR_ID_INTEL,
213 PCI_DEVICE_ID_INTEL_82815_MC,
214 PCI_ANY_ID, PCI_ANY_ID,
215 NULL);
216
217 if (!hostbridge)
218 return 2; /* 2-M */
219
220 if (hostbridge->revision < 5) {
221 dprintk("hostbridge does not support speedstep\n");
222 speedstep_chipset_dev = NULL;
223 pci_dev_put(hostbridge);
224 return 0;
225 }
226
227 pci_dev_put(hostbridge);
228 return 2; /* 2-M */
229 }
230
231 return 0;
232}
233
234static void get_freq_data(void *_speed)
235{
236 unsigned int *speed = _speed;
237
238 *speed = speedstep_get_frequency(speedstep_processor);
239}
240
241static unsigned int speedstep_get(unsigned int cpu)
242{
243 unsigned int speed;
244
245 /* You're supposed to ensure CPU is online. */
246 if (smp_call_function_single(cpu, get_freq_data, &speed, 1) != 0)
247 BUG();
248
249 dprintk("detected %u kHz as current frequency\n", speed);
250 return speed;
251}
252
253/**
254 * speedstep_target - set a new CPUFreq policy
255 * @policy: new policy
256 * @target_freq: the target frequency
257 * @relation: how that frequency relates to achieved frequency
258 * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
259 *
260 * Sets a new CPUFreq policy.
261 */
262static int speedstep_target(struct cpufreq_policy *policy,
263 unsigned int target_freq,
264 unsigned int relation)
265{
266 unsigned int newstate = 0, policy_cpu;
267 struct cpufreq_freqs freqs;
268 int i;
269
270 if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0],
271 target_freq, relation, &newstate))
272 return -EINVAL;
273
274 policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask);
275 freqs.old = speedstep_get(policy_cpu);
276 freqs.new = speedstep_freqs[newstate].frequency;
277 freqs.cpu = policy->cpu;
278
279	dprintk("transitioning from %u to %u kHz\n", freqs.old, freqs.new);
280
281 /* no transition necessary */
282 if (freqs.old == freqs.new)
283 return 0;
284
285 for_each_cpu(i, policy->cpus) {
286 freqs.cpu = i;
287 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
288 }
289
290 smp_call_function_single(policy_cpu, _speedstep_set_state, &newstate,
291 true);
292
293 for_each_cpu(i, policy->cpus) {
294 freqs.cpu = i;
295 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
296 }
297
298 return 0;
299}
300
301
302/**
303 * speedstep_verify - verifies a new CPUFreq policy
304 * @policy: new policy
305 *
306 * Limit must be within speedstep_low_freq and speedstep_high_freq, with
307 * at least one border included.
308 */
309static int speedstep_verify(struct cpufreq_policy *policy)
310{
311 return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]);
312}
313
314struct get_freqs {
315 struct cpufreq_policy *policy;
316 int ret;
317};
318
319static void get_freqs_on_cpu(void *_get_freqs)
320{
321 struct get_freqs *get_freqs = _get_freqs;
322
323 get_freqs->ret =
324 speedstep_get_freqs(speedstep_processor,
325 &speedstep_freqs[SPEEDSTEP_LOW].frequency,
326 &speedstep_freqs[SPEEDSTEP_HIGH].frequency,
327 &get_freqs->policy->cpuinfo.transition_latency,
328 &speedstep_set_state);
329}
330
331static int speedstep_cpu_init(struct cpufreq_policy *policy)
332{
333 int result;
334 unsigned int policy_cpu, speed;
335 struct get_freqs gf;
336
337 /* only run on CPU to be set, or on its sibling */
338#ifdef CONFIG_SMP
339 cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu));
340#endif
341 policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask);
342
343 /* detect low and high frequency and transition latency */
344 gf.policy = policy;
345 smp_call_function_single(policy_cpu, get_freqs_on_cpu, &gf, 1);
346 if (gf.ret)
347 return gf.ret;
348
349 /* get current speed setting */
350 speed = speedstep_get(policy_cpu);
351 if (!speed)
352 return -EIO;
353
354 dprintk("currently at %s speed setting - %i MHz\n",
355 (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency)
356 ? "low" : "high",
357 (speed / 1000));
358
359 /* cpuinfo and default policy values */
360 policy->cur = speed;
361
362 result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs);
363 if (result)
364 return result;
365
366 cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu);
367
368 return 0;
369}
370
371
372static int speedstep_cpu_exit(struct cpufreq_policy *policy)
373{
374 cpufreq_frequency_table_put_attr(policy->cpu);
375 return 0;
376}
377
378static struct freq_attr *speedstep_attr[] = {
379 &cpufreq_freq_attr_scaling_available_freqs,
380 NULL,
381};
382
383
384static struct cpufreq_driver speedstep_driver = {
385 .name = "speedstep-ich",
386 .verify = speedstep_verify,
387 .target = speedstep_target,
388 .init = speedstep_cpu_init,
389 .exit = speedstep_cpu_exit,
390 .get = speedstep_get,
391 .owner = THIS_MODULE,
392 .attr = speedstep_attr,
393};
394
395
396/**
397 * speedstep_init - initializes the SpeedStep CPUFreq driver
398 *
399 * Initializes the SpeedStep support. Returns -ENODEV on unsupported
400 * devices, -EINVAL on problems during initialization, and zero on
401 * success.
402 */
403static int __init speedstep_init(void)
404{
405 /* detect processor */
406 speedstep_processor = speedstep_detect_processor();
407 if (!speedstep_processor) {
408 dprintk("Intel(R) SpeedStep(TM) capable processor "
409 "not found\n");
410 return -ENODEV;
411 }
412
413 /* detect chipset */
414 if (!speedstep_detect_chipset()) {
415 dprintk("Intel(R) SpeedStep(TM) for this chipset not "
416 "(yet) available.\n");
417 return -ENODEV;
418 }
419
420 /* activate speedstep support */
421 if (speedstep_activate()) {
422 pci_dev_put(speedstep_chipset_dev);
423 return -EINVAL;
424 }
425
426 if (speedstep_find_register())
427 return -ENODEV;
428
429 return cpufreq_register_driver(&speedstep_driver);
430}
431
432
433/**
434 * speedstep_exit - unregisters SpeedStep support
435 *
436 * Unregisters SpeedStep support.
437 */
438static void __exit speedstep_exit(void)
439{
440 pci_dev_put(speedstep_chipset_dev);
441 cpufreq_unregister_driver(&speedstep_driver);
442}
443
444
445MODULE_AUTHOR("Dave Jones <davej@redhat.com>, "
446 "Dominik Brodowski <linux@brodo.de>");
447MODULE_DESCRIPTION("Speedstep driver for Intel mobile processors on chipsets "
448 "with ICH-M southbridges.");
449MODULE_LICENSE("GPL");
450
451module_init(speedstep_init);
452module_exit(speedstep_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
deleted file mode 100644
index a94ec6be69fa..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
+++ /dev/null
@@ -1,481 +0,0 @@
1/*
2 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 *
6 * Library for common functions for Intel SpeedStep v.1 and v.2 support
7 *
8 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
9 */
10
11#include <linux/kernel.h>
12#include <linux/module.h>
13#include <linux/moduleparam.h>
14#include <linux/init.h>
15#include <linux/cpufreq.h>
16
17#include <asm/msr.h>
18#include <asm/tsc.h>
19#include "speedstep-lib.h"
20
21#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
22 "speedstep-lib", msg)
23
24#define PFX "speedstep-lib: "
25
26#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK
27static int relaxed_check;
28#else
29#define relaxed_check 0
30#endif
31
32/*********************************************************************
33 * GET PROCESSOR CORE SPEED IN KHZ *
34 *********************************************************************/
35
36static unsigned int pentium3_get_frequency(enum speedstep_processor processor)
37{
38 /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */
39 struct {
40 unsigned int ratio; /* Frequency Multiplier (x10) */
41 u8 bitmap; /* power on configuration bits
42 [27, 25:22] (in MSR 0x2a) */
43 } msr_decode_mult[] = {
44 { 30, 0x01 },
45 { 35, 0x05 },
46 { 40, 0x02 },
47 { 45, 0x06 },
48 { 50, 0x00 },
49 { 55, 0x04 },
50 { 60, 0x0b },
51 { 65, 0x0f },
52 { 70, 0x09 },
53 { 75, 0x0d },
54 { 80, 0x0a },
55 { 85, 0x26 },
56 { 90, 0x20 },
57 { 100, 0x2b },
58 { 0, 0xff } /* error or unknown value */
59 };
60
61 /* PIII(-M) FSB settings: see table b1-b of 24547206.pdf */
62 struct {
63 unsigned int value; /* Front Side Bus speed in MHz */
64 u8 bitmap; /* power on configuration bits [18: 19]
65 (in MSR 0x2a) */
66 } msr_decode_fsb[] = {
67 { 66, 0x0 },
68 { 100, 0x2 },
69 { 133, 0x1 },
70 { 0, 0xff}
71 };
72
73 u32 msr_lo, msr_tmp;
74 int i = 0, j = 0;
75
76 /* read MSR 0x2a - we only need the low 32 bits */
77 rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
78 dprintk("P3 - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp);
79 msr_tmp = msr_lo;
80
81 /* decode the FSB */
82 msr_tmp &= 0x00c0000;
83 msr_tmp >>= 18;
84 while (msr_tmp != msr_decode_fsb[i].bitmap) {
85 if (msr_decode_fsb[i].bitmap == 0xff)
86 return 0;
87 i++;
88 }
89
90 /* decode the multiplier */
91 if (processor == SPEEDSTEP_CPU_PIII_C_EARLY) {
92 dprintk("workaround for early PIIIs\n");
93 msr_lo &= 0x03c00000;
94 } else
95 msr_lo &= 0x0bc00000;
96 msr_lo >>= 22;
97 while (msr_lo != msr_decode_mult[j].bitmap) {
98 if (msr_decode_mult[j].bitmap == 0xff)
99 return 0;
100 j++;
101 }
102
103 dprintk("speed is %u\n",
104 (msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100));
105
106 return msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100;
107}
108
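Editor's note: a short worked example of the decode performed by pentium3_get_frequency() above, using the tables as given; the sample MSR value is hypothetical.

/*
 * Example: msr_lo = 0x00040000 read from MSR 0x2a (hypothetical value)
 *   FSB bits:  (0x00040000 & 0x00c0000)  >> 18 = 0x1 -> 133 MHz
 *   mult bits: (0x00040000 & 0x0bc00000) >> 22 = 0x0 -> ratio 50 (x10)
 *   speed = 50 * 133 * 100 = 665000 kHz, i.e. 5.0 x 133 MHz = 665 MHz
 */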
109
110static unsigned int pentiumM_get_frequency(void)
111{
112 u32 msr_lo, msr_tmp;
113
114 rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
115 dprintk("PM - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp);
116
117 /* see table B-2 of 24547212.pdf */
118 if (msr_lo & 0x00040000) {
119 printk(KERN_DEBUG PFX "PM - invalid FSB: 0x%x 0x%x\n",
120 msr_lo, msr_tmp);
121 return 0;
122 }
123
124 msr_tmp = (msr_lo >> 22) & 0x1f;
125 dprintk("bits 22-26 are 0x%x, speed is %u\n",
126 msr_tmp, (msr_tmp * 100 * 1000));
127
128 return msr_tmp * 100 * 1000;
129}
130
131static unsigned int pentium_core_get_frequency(void)
132{
133 u32 fsb = 0;
134 u32 msr_lo, msr_tmp;
135 int ret;
136
137 rdmsr(MSR_FSB_FREQ, msr_lo, msr_tmp);
138 /* see table B-2 of 25366920.pdf */
139 switch (msr_lo & 0x07) {
140 case 5:
141 fsb = 100000;
142 break;
143 case 1:
144 fsb = 133333;
145 break;
146 case 3:
147 fsb = 166667;
148 break;
149 case 2:
150 fsb = 200000;
151 break;
152 case 0:
153 fsb = 266667;
154 break;
155 case 4:
156 fsb = 333333;
157 break;
158 default:
159		printk(KERN_ERR "PCORE - MSR_FSB_FREQ undefined value\n");
160 }
161
162 rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
163 dprintk("PCORE - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n",
164 msr_lo, msr_tmp);
165
166 msr_tmp = (msr_lo >> 22) & 0x1f;
167 dprintk("bits 22-26 are 0x%x, speed is %u\n",
168 msr_tmp, (msr_tmp * fsb));
169
170 ret = (msr_tmp * fsb);
171 return ret;
172}
173
174
175static unsigned int pentium4_get_frequency(void)
176{
177 struct cpuinfo_x86 *c = &boot_cpu_data;
178 u32 msr_lo, msr_hi, mult;
179 unsigned int fsb = 0;
180 unsigned int ret;
181 u8 fsb_code;
182
183 /* Pentium 4 Model 0 and 1 do not have the Core Clock Frequency
184 * to System Bus Frequency Ratio Field in the Processor Frequency
185 * Configuration Register of the MSR. Therefore the current
186 * frequency cannot be calculated and has to be measured.
187 */
188 if (c->x86_model < 2)
189 return cpu_khz;
190
191 rdmsr(0x2c, msr_lo, msr_hi);
192
193 dprintk("P4 - MSR_EBC_FREQUENCY_ID: 0x%x 0x%x\n", msr_lo, msr_hi);
194
195 /* decode the FSB: see IA-32 Intel (C) Architecture Software
196	 * Developer's Manual, Volume 3: System Programming Guide,
197 * revision #12 in Table B-1: MSRs in the Pentium 4 and
198 * Intel Xeon Processors, on page B-4 and B-5.
199 */
200 fsb_code = (msr_lo >> 16) & 0x7;
201 switch (fsb_code) {
202 case 0:
203 fsb = 100 * 1000;
204 break;
205 case 1:
206 fsb = 13333 * 10;
207 break;
208 case 2:
209 fsb = 200 * 1000;
210 break;
211 }
212
213 if (!fsb)
214 printk(KERN_DEBUG PFX "couldn't detect FSB speed. "
215 "Please send an e-mail to <linux@brodo.de>\n");
216
217 /* Multiplier. */
218 mult = msr_lo >> 24;
219
220 dprintk("P4 - FSB %u kHz; Multiplier %u; Speed %u kHz\n",
221 fsb, mult, (fsb * mult));
222
223 ret = (fsb * mult);
224 return ret;
225}
226
227
228/* Warning: may get called from smp_call_function_single. */
229unsigned int speedstep_get_frequency(enum speedstep_processor processor)
230{
231 switch (processor) {
232 case SPEEDSTEP_CPU_PCORE:
233 return pentium_core_get_frequency();
234 case SPEEDSTEP_CPU_PM:
235 return pentiumM_get_frequency();
236 case SPEEDSTEP_CPU_P4D:
237 case SPEEDSTEP_CPU_P4M:
238 return pentium4_get_frequency();
239 case SPEEDSTEP_CPU_PIII_T:
240 case SPEEDSTEP_CPU_PIII_C:
241 case SPEEDSTEP_CPU_PIII_C_EARLY:
242 return pentium3_get_frequency(processor);
243 default:
244 return 0;
245	}
246 return 0;
247}
248EXPORT_SYMBOL_GPL(speedstep_get_frequency);
249
250
251/*********************************************************************
252 * DETECT SPEEDSTEP-CAPABLE PROCESSOR *
253 *********************************************************************/
254
255unsigned int speedstep_detect_processor(void)
256{
257 struct cpuinfo_x86 *c = &cpu_data(0);
258 u32 ebx, msr_lo, msr_hi;
259
260 dprintk("x86: %x, model: %x\n", c->x86, c->x86_model);
261
262 if ((c->x86_vendor != X86_VENDOR_INTEL) ||
263 ((c->x86 != 6) && (c->x86 != 0xF)))
264 return 0;
265
266 if (c->x86 == 0xF) {
267 /* Intel Mobile Pentium 4-M
268 * or Intel Mobile Pentium 4 with 533 MHz FSB */
269 if (c->x86_model != 2)
270 return 0;
271
272 ebx = cpuid_ebx(0x00000001);
273 ebx &= 0x000000FF;
274
275 dprintk("ebx value is %x, x86_mask is %x\n", ebx, c->x86_mask);
276
277 switch (c->x86_mask) {
278 case 4:
279 /*
280 * B-stepping [M-P4-M]
281 * sample has ebx = 0x0f, production has 0x0e.
282 */
283 if ((ebx == 0x0e) || (ebx == 0x0f))
284 return SPEEDSTEP_CPU_P4M;
285 break;
286 case 7:
287 /*
288 * C-stepping [M-P4-M]
289 * needs to have ebx=0x0e, else it's a celeron:
290 * cf. 25130917.pdf / page 7, footnote 5 even
291 * though 25072120.pdf / page 7 doesn't say
292 * samples are only of B-stepping...
293 */
294 if (ebx == 0x0e)
295 return SPEEDSTEP_CPU_P4M;
296 break;
297 case 9:
298 /*
299 * D-stepping [M-P4-M or M-P4/533]
300 *
301 * this is totally strange: CPUID 0x0F29 is
302 * used by M-P4-M, M-P4/533 and(!) Celeron CPUs.
303 * The latter need to be sorted out as they don't
304 * support speedstep.
305 * Celerons with CPUID 0x0F29 may have either
306 * ebx=0x8 or 0xf -- 25130917.pdf doesn't say anything
307 * specific.
308 * M-P4-Ms may have either ebx=0xe or 0xf [see above]
309 * M-P4/533 have either ebx=0xe or 0xf. [25317607.pdf]
310 * also, M-P4M HTs have ebx=0x8, too
311 * For now, they are distinguished by the model_id
312 * string
313 */
314 if ((ebx == 0x0e) ||
315 (strstr(c->x86_model_id,
316 "Mobile Intel(R) Pentium(R) 4") != NULL))
317 return SPEEDSTEP_CPU_P4M;
318 break;
319 default:
320 break;
321 }
322 return 0;
323 }
324
325 switch (c->x86_model) {
326 case 0x0B: /* Intel PIII [Tualatin] */
327 /* cpuid_ebx(1) is 0x04 for desktop PIII,
328 * 0x06 for mobile PIII-M */
329 ebx = cpuid_ebx(0x00000001);
330 dprintk("ebx is %x\n", ebx);
331
332 ebx &= 0x000000FF;
333
334 if (ebx != 0x06)
335 return 0;
336
337 /* So far all PIII-M processors support SpeedStep. See
338 * Intel's 24540640.pdf of June 2003
339 */
340 return SPEEDSTEP_CPU_PIII_T;
341
342 case 0x08: /* Intel PIII [Coppermine] */
343
344 /* all mobile PIII Coppermines have FSB 100 MHz
345 * ==> sort out a few desktop PIIIs. */
346 rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_hi);
347 dprintk("Coppermine: MSR_IA32_EBL_CR_POWERON is 0x%x, 0x%x\n",
348 msr_lo, msr_hi);
349 msr_lo &= 0x00c0000;
350 if (msr_lo != 0x0080000)
351 return 0;
352
353 /*
354 * If the processor is a mobile version,
355 * platform ID has bit 50 set
356 * it has SpeedStep technology if either
357 * bit 56 or 57 is set
358 */
359 rdmsr(MSR_IA32_PLATFORM_ID, msr_lo, msr_hi);
360 dprintk("Coppermine: MSR_IA32_PLATFORM ID is 0x%x, 0x%x\n",
361 msr_lo, msr_hi);
362 if ((msr_hi & (1<<18)) &&
363 (relaxed_check ? 1 : (msr_hi & (3<<24)))) {
364 if (c->x86_mask == 0x01) {
365 dprintk("early PIII version\n");
366 return SPEEDSTEP_CPU_PIII_C_EARLY;
367 } else
368 return SPEEDSTEP_CPU_PIII_C;
369 }
370
371 default:
372 return 0;
373 }
374}
375EXPORT_SYMBOL_GPL(speedstep_detect_processor);
376
377
378/*********************************************************************
379 * DETECT SPEEDSTEP SPEEDS *
380 *********************************************************************/
381
382unsigned int speedstep_get_freqs(enum speedstep_processor processor,
383 unsigned int *low_speed,
384 unsigned int *high_speed,
385 unsigned int *transition_latency,
386 void (*set_state) (unsigned int state))
387{
388 unsigned int prev_speed;
389 unsigned int ret = 0;
390 unsigned long flags;
391 struct timeval tv1, tv2;
392
393 if ((!processor) || (!low_speed) || (!high_speed) || (!set_state))
394 return -EINVAL;
395
396 dprintk("trying to determine both speeds\n");
397
398 /* get current speed */
399 prev_speed = speedstep_get_frequency(processor);
400 if (!prev_speed)
401 return -EIO;
402
403 dprintk("previous speed is %u\n", prev_speed);
404
405 local_irq_save(flags);
406
407 /* switch to low state */
408 set_state(SPEEDSTEP_LOW);
409 *low_speed = speedstep_get_frequency(processor);
410 if (!*low_speed) {
411 ret = -EIO;
412 goto out;
413 }
414
415 dprintk("low speed is %u\n", *low_speed);
416
417 /* start latency measurement */
418 if (transition_latency)
419 do_gettimeofday(&tv1);
420
421 /* switch to high state */
422 set_state(SPEEDSTEP_HIGH);
423
424 /* end latency measurement */
425 if (transition_latency)
426 do_gettimeofday(&tv2);
427
428 *high_speed = speedstep_get_frequency(processor);
429 if (!*high_speed) {
430 ret = -EIO;
431 goto out;
432 }
433
434 dprintk("high speed is %u\n", *high_speed);
435
436 if (*low_speed == *high_speed) {
437 ret = -ENODEV;
438 goto out;
439 }
440
441 /* switch to previous state, if necessary */
442 if (*high_speed != prev_speed)
443 set_state(SPEEDSTEP_LOW);
444
445 if (transition_latency) {
446 *transition_latency = (tv2.tv_sec - tv1.tv_sec) * USEC_PER_SEC +
447 tv2.tv_usec - tv1.tv_usec;
448 dprintk("transition latency is %u uSec\n", *transition_latency);
449
450 /* convert uSec to nSec and add 20% for safety reasons */
451 *transition_latency *= 1200;
452
453 /* check if the latency measurement is too high or too low
454 * and set it to a safe value (500uSec) in that case
455 */
456 if (*transition_latency > 10000000 ||
457 *transition_latency < 50000) {
458 printk(KERN_WARNING PFX "frequency transition "
459				"measurement seems out of range (%u "
460				"nSec), falling back to a safe one of "
461 "%u nSec.\n",
462 *transition_latency, 500000);
463 *transition_latency = 500000;
464 }
465 }
466
467out:
468 local_irq_restore(flags);
469 return ret;
470}
471EXPORT_SYMBOL_GPL(speedstep_get_freqs);
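Editor's note: a brief worked example of the transition-latency fixup inside speedstep_get_freqs() above; the measured value is hypothetical.

/*
 * Example: if the low->high switch is measured at 400 usec,
 *   *transition_latency = 400 * 1200 = 480000 nsec
 * (the factor 1200 converts usec to nsec and adds the 20% margin).
 * A result outside the 50000..10000000 nsec window is replaced by
 * the 500000 nsec fallback.
 */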
472
473#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK
474module_param(relaxed_check, int, 0444);
475MODULE_PARM_DESC(relaxed_check,
476 "Don't do all checks for speedstep capability.");
477#endif
478
479MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>");
480MODULE_DESCRIPTION("Library for Intel SpeedStep 1 or 2 cpufreq drivers.");
481MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
deleted file mode 100644
index 70d9cea1219d..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
+++ /dev/null
@@ -1,49 +0,0 @@
1/*
2 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 *
6 * Library for common functions for Intel SpeedStep v.1 and v.2 support
7 *
8 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
9 */
10
11
12
13/* processors */
14enum speedstep_processor {
15 SPEEDSTEP_CPU_PIII_C_EARLY = 0x00000001, /* Coppermine core */
16 SPEEDSTEP_CPU_PIII_C = 0x00000002, /* Coppermine core */
17 SPEEDSTEP_CPU_PIII_T = 0x00000003, /* Tualatin core */
18 SPEEDSTEP_CPU_P4M = 0x00000004, /* P4-M */
19/* the following processors are not speedstep-capable and are not auto-detected
20 * in speedstep_detect_processor(). However, their speed can be detected using
21 * the speedstep_get_frequency() call. */
22 SPEEDSTEP_CPU_PM = 0xFFFFFF03, /* Pentium M */
23 SPEEDSTEP_CPU_P4D = 0xFFFFFF04, /* desktop P4 */
24 SPEEDSTEP_CPU_PCORE = 0xFFFFFF05, /* Core */
25};
26
27/* speedstep states -- only two of them */
28
29#define SPEEDSTEP_HIGH 0x00000000
30#define SPEEDSTEP_LOW 0x00000001
31
32
33/* detect a speedstep-capable processor */
34extern enum speedstep_processor speedstep_detect_processor(void);
35
36/* detect the current speed (in khz) of the processor */
37extern unsigned int speedstep_get_frequency(enum speedstep_processor processor);
38
39
40/* detect the low and high speeds of the processor. The callback
41 * set_state's argument is either SPEEDSTEP_HIGH or
42 * SPEEDSTEP_LOW; it switches the state directly, without issuing
43 * any cpufreq_notify_transition calls.
44 */
45extern unsigned int speedstep_get_freqs(enum speedstep_processor processor,
46 unsigned int *low_speed,
47 unsigned int *high_speed,
48 unsigned int *transition_latency,
49 void (*set_state) (unsigned int state));
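Editor's note: a minimal usage sketch for the interface declared above, modelled on the speedstep-ich caller shown earlier; the set_state callback and function names here are placeholders, and speedstep-lib.h plus <linux/errno.h> are assumed to be included.

static void my_set_state(unsigned int state)
{
	/* placeholder: a real driver switches the hardware to
	 * SPEEDSTEP_LOW or SPEEDSTEP_HIGH here */
}

static int probe_speeds(unsigned int *latency)
{
	unsigned int low, high;
	enum speedstep_processor proc = speedstep_detect_processor();

	if (!proc)
		return -ENODEV;

	/* measures both speeds by switching states via my_set_state() */
	return speedstep_get_freqs(proc, &low, &high, latency,
				   &my_set_state);
}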
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
deleted file mode 100644
index 8abd869baabf..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
+++ /dev/null
@@ -1,467 +0,0 @@
1/*
2 * Intel SpeedStep SMI driver.
3 *
4 * (C) 2003 Hiroshi Miura <miura@da-cha.org>
5 *
6 * Licensed under the terms of the GNU GPL License version 2.
7 *
8 */
9
10
11/*********************************************************************
12 * SPEEDSTEP - DEFINITIONS *
13 *********************************************************************/
14
15#include <linux/kernel.h>
16#include <linux/module.h>
17#include <linux/moduleparam.h>
18#include <linux/init.h>
19#include <linux/cpufreq.h>
20#include <linux/delay.h>
21#include <linux/io.h>
22#include <asm/ist.h>
23
24#include "speedstep-lib.h"
25
26/* speedstep system management interface port/command.
27 *
28 * These parameters are obtained from the IST-SMI BIOS call.
29 * If the user supplies them, those values are used instead.
30 *
31 */
32static int smi_port;
33static int smi_cmd;
34static unsigned int smi_sig;
35
36/* info about the processor */
37static enum speedstep_processor speedstep_processor;
38
39/*
40 * There are only two frequency states for each processor. Values
41 * are in kHz for the time being.
42 */
43static struct cpufreq_frequency_table speedstep_freqs[] = {
44 {SPEEDSTEP_HIGH, 0},
45 {SPEEDSTEP_LOW, 0},
46 {0, CPUFREQ_TABLE_END},
47};
48
49#define GET_SPEEDSTEP_OWNER 0
50#define GET_SPEEDSTEP_STATE 1
51#define SET_SPEEDSTEP_STATE 2
52#define GET_SPEEDSTEP_FREQS 4
53
54/* how many times the SMI call should be retried if it fails, e.g.
55 * because of DMA activity going on */
56#define SMI_TRIES 5
57
58#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
59 "speedstep-smi", msg)
60
61/**
62 * speedstep_smi_ownership
63 */
64static int speedstep_smi_ownership(void)
65{
66 u32 command, result, magic, dummy;
67 u32 function = GET_SPEEDSTEP_OWNER;
68 unsigned char magic_data[] = "Copyright (c) 1999 Intel Corporation";
69
70 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
71 magic = virt_to_phys(magic_data);
72
73 dprintk("trying to obtain ownership with command %x at port %x\n",
74 command, smi_port);
75
76 __asm__ __volatile__(
77 "push %%ebp\n"
78 "out %%al, (%%dx)\n"
79 "pop %%ebp\n"
80 : "=D" (result),
81 "=a" (dummy), "=b" (dummy), "=c" (dummy), "=d" (dummy),
82 "=S" (dummy)
83 : "a" (command), "b" (function), "c" (0), "d" (smi_port),
84 "D" (0), "S" (magic)
85 : "memory"
86 );
87
88 dprintk("result is %x\n", result);
89
90 return result;
91}
92
93/**
94 * speedstep_smi_get_freqs - get SpeedStep low & high frequencies
95 * @low: the low frequency value is placed here
96 * @high: the high frequency value is placed here
97 *
98 * Only available on later SpeedStep-enabled systems, returns false results or
99 * even hangs [cf. bugme.osdl.org # 1422] on earlier systems. Empirical testing
100 * shows that the latter occurs if !(ist_info.event & 0xFFFF).
101 */
102static int speedstep_smi_get_freqs(unsigned int *low, unsigned int *high)
103{
104 u32 command, result = 0, edi, high_mhz, low_mhz, dummy;
105 u32 state = 0;
106 u32 function = GET_SPEEDSTEP_FREQS;
107
108 if (!(ist_info.event & 0xFFFF)) {
109 dprintk("bug #1422 -- can't read freqs from BIOS\n");
110 return -ENODEV;
111 }
112
113 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
114
115 dprintk("trying to determine frequencies with command %x at port %x\n",
116 command, smi_port);
117
118 __asm__ __volatile__(
119 "push %%ebp\n"
120 "out %%al, (%%dx)\n"
121 "pop %%ebp"
122 : "=a" (result),
123 "=b" (high_mhz),
124 "=c" (low_mhz),
125 "=d" (state), "=D" (edi), "=S" (dummy)
126 : "a" (command),
127 "b" (function),
128 "c" (state),
129 "d" (smi_port), "S" (0), "D" (0)
130 );
131
132 dprintk("result %x, low_freq %u, high_freq %u\n",
133 result, low_mhz, high_mhz);
134
135 /* abort if results are obviously incorrect... */
136 if ((high_mhz + low_mhz) < 600)
137 return -EINVAL;
138
139 *high = high_mhz * 1000;
140 *low = low_mhz * 1000;
141
142 return result;
143}
144
145/**
146 * speedstep_get_state - read the current SpeedStep state
147 * Returns the current frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH).
148 *
149 */
150static int speedstep_get_state(void)
151{
152 u32 function = GET_SPEEDSTEP_STATE;
153 u32 result, state, edi, command, dummy;
154
155 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
156
157 dprintk("trying to determine current setting with command %x "
158 "at port %x\n", command, smi_port);
159
160 __asm__ __volatile__(
161 "push %%ebp\n"
162 "out %%al, (%%dx)\n"
163 "pop %%ebp\n"
164 : "=a" (result),
165 "=b" (state), "=D" (edi),
166 "=c" (dummy), "=d" (dummy), "=S" (dummy)
167 : "a" (command), "b" (function), "c" (0),
168 "d" (smi_port), "S" (0), "D" (0)
169 );
170
171 dprintk("state is %x, result is %x\n", state, result);
172
173 return state & 1;
174}
175
176
177/**
178 * speedstep_set_state - set the SpeedStep state
179 * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
180 *
181 */
182static void speedstep_set_state(unsigned int state)
183{
184 unsigned int result = 0, command, new_state, dummy;
185 unsigned long flags;
186 unsigned int function = SET_SPEEDSTEP_STATE;
187 unsigned int retry = 0;
188
189 if (state > 0x1)
190 return;
191
192 /* Disable IRQs */
193 local_irq_save(flags);
194
195 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
196
197 dprintk("trying to set frequency to state %u "
198 "with command %x at port %x\n",
199 state, command, smi_port);
200
201 do {
202 if (retry) {
203 dprintk("retry %u, previous result %u, waiting...\n",
204 retry, result);
205 mdelay(retry * 50);
206 }
207 retry++;
208 __asm__ __volatile__(
209 "push %%ebp\n"
210 "out %%al, (%%dx)\n"
211 "pop %%ebp"
212 : "=b" (new_state), "=D" (result),
213 "=c" (dummy), "=a" (dummy),
214 "=d" (dummy), "=S" (dummy)
215 : "a" (command), "b" (function), "c" (state),
216 "d" (smi_port), "S" (0), "D" (0)
217 );
218 } while ((new_state != state) && (retry <= SMI_TRIES));
219
220 /* enable IRQs */
221 local_irq_restore(flags);
222
223 if (new_state == state)
224 dprintk("change to %u MHz succeeded after %u tries "
225 "with result %u\n",
226 (speedstep_freqs[new_state].frequency / 1000),
227 retry, result);
228 else
229 printk(KERN_ERR "cpufreq: change to state %u "
230 "failed with new_state %u and result %u\n",
231 state, new_state, result);
232
233 return;
234}
235
236
237/**
238 * speedstep_target - set a new CPUFreq policy
239 * @policy: new policy
240 * @target_freq: new freq
241 * @relation:
242 *
243 * Sets a new CPUFreq policy/freq.
244 */
245static int speedstep_target(struct cpufreq_policy *policy,
246 unsigned int target_freq, unsigned int relation)
247{
248 unsigned int newstate = 0;
249 struct cpufreq_freqs freqs;
250
251 if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0],
252 target_freq, relation, &newstate))
253 return -EINVAL;
254
255 freqs.old = speedstep_freqs[speedstep_get_state()].frequency;
256 freqs.new = speedstep_freqs[newstate].frequency;
257 freqs.cpu = 0; /* speedstep.c is UP only driver */
258
259 if (freqs.old == freqs.new)
260 return 0;
261
262 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
263 speedstep_set_state(newstate);
264 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
265
266 return 0;
267}
268
269
270/**
271 * speedstep_verify - verifies a new CPUFreq policy
272 * @policy: new policy
273 *
274 * Limit must be within speedstep_low_freq and speedstep_high_freq, with
275 * at least one border included.
276 */
277static int speedstep_verify(struct cpufreq_policy *policy)
278{
279 return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]);
280}
281
282
283static int speedstep_cpu_init(struct cpufreq_policy *policy)
284{
285 int result;
286 unsigned int speed, state;
287 unsigned int *low, *high;
288
289 /* capability check */
290 if (policy->cpu != 0)
291 return -ENODEV;
292
293 result = speedstep_smi_ownership();
294 if (result) {
295 dprintk("fails in aquiring ownership of a SMI interface.\n");
296 return -EINVAL;
297 }
298
299 /* detect low and high frequency */
300 low = &speedstep_freqs[SPEEDSTEP_LOW].frequency;
301 high = &speedstep_freqs[SPEEDSTEP_HIGH].frequency;
302
303 result = speedstep_smi_get_freqs(low, high);
304 if (result) {
 305 		/* fall back to speedstep_lib.c detection mechanism:
306 * try both states out */
307 dprintk("could not detect low and high frequencies "
308 "by SMI call.\n");
309 result = speedstep_get_freqs(speedstep_processor,
310 low, high,
311 NULL,
312 &speedstep_set_state);
313
314 if (result) {
315 dprintk("could not detect two different speeds"
316 " -- aborting.\n");
317 return result;
318 } else
319 dprintk("workaround worked.\n");
320 }
321
322 /* get current speed setting */
323 state = speedstep_get_state();
324 speed = speedstep_freqs[state].frequency;
325
326 dprintk("currently at %s speed setting - %i MHz\n",
327 (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency)
328 ? "low" : "high",
329 (speed / 1000));
330
331 /* cpuinfo and default policy values */
332 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
333 policy->cur = speed;
334
335 result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs);
336 if (result)
337 return result;
338
339 cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu);
340
341 return 0;
342}
343
344static int speedstep_cpu_exit(struct cpufreq_policy *policy)
345{
346 cpufreq_frequency_table_put_attr(policy->cpu);
347 return 0;
348}
349
350static unsigned int speedstep_get(unsigned int cpu)
351{
352 if (cpu)
353 return -ENODEV;
354 return speedstep_get_frequency(speedstep_processor);
355}
356
357
358static int speedstep_resume(struct cpufreq_policy *policy)
359{
360 int result = speedstep_smi_ownership();
361
362 if (result)
363 dprintk("fails in re-aquiring ownership of a SMI interface.\n");
364
365 return result;
366}
367
368static struct freq_attr *speedstep_attr[] = {
369 &cpufreq_freq_attr_scaling_available_freqs,
370 NULL,
371};
372
373static struct cpufreq_driver speedstep_driver = {
374 .name = "speedstep-smi",
375 .verify = speedstep_verify,
376 .target = speedstep_target,
377 .init = speedstep_cpu_init,
378 .exit = speedstep_cpu_exit,
379 .get = speedstep_get,
380 .resume = speedstep_resume,
381 .owner = THIS_MODULE,
382 .attr = speedstep_attr,
383};
384
385/**
386 * speedstep_init - initializes the SpeedStep CPUFreq driver
387 *
388 * Initializes the SpeedStep support. Returns -ENODEV on unsupported
 389 * BIOS, -EINVAL on problems during initialization, and zero on
390 * success.
391 */
392static int __init speedstep_init(void)
393{
394 speedstep_processor = speedstep_detect_processor();
395
396 switch (speedstep_processor) {
397 case SPEEDSTEP_CPU_PIII_T:
398 case SPEEDSTEP_CPU_PIII_C:
399 case SPEEDSTEP_CPU_PIII_C_EARLY:
400 break;
401 default:
402 speedstep_processor = 0;
403 }
404
405 if (!speedstep_processor) {
406 dprintk("No supported Intel CPU detected.\n");
407 return -ENODEV;
408 }
409
410 dprintk("signature:0x%.8lx, command:0x%.8lx, "
411 "event:0x%.8lx, perf_level:0x%.8lx.\n",
412 ist_info.signature, ist_info.command,
413 ist_info.event, ist_info.perf_level);
414
415 /* Error if no IST-SMI BIOS or no PARM
416 sig= 'ISGE' aka 'Intel Speedstep Gate E' */
417 if ((ist_info.signature != 0x47534943) && (
418 (smi_port == 0) || (smi_cmd == 0)))
419 return -ENODEV;
420
421 if (smi_sig == 1)
422 smi_sig = 0x47534943;
423 else
424 smi_sig = ist_info.signature;
425
 426	/* setup smi_port from the module parameter or BIOS */
427 if ((smi_port > 0xff) || (smi_port < 0))
428 return -EINVAL;
429 else if (smi_port == 0)
430 smi_port = ist_info.command & 0xff;
431
432 if ((smi_cmd > 0xff) || (smi_cmd < 0))
433 return -EINVAL;
434 else if (smi_cmd == 0)
435 smi_cmd = (ist_info.command >> 16) & 0xff;
436
437 return cpufreq_register_driver(&speedstep_driver);
438}
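
When the smi_port and smi_cmd module parameters are left at 0, speedstep_init() derives them from the BIOS-provided ist_info.command word: the low byte becomes the SMI port and bits 16-23 the SMI command. A small sketch of that unpacking, with a made-up command word chosen so it yields the Intel defaults quoted in the MODULE_PARM_DESC strings below:

	#include <stdio.h>

	int main(void)
	{
		/* Hypothetical ist_info.command value; the real one comes from the BIOS. */
		unsigned int command = 0x00820cb2;

		unsigned int port = command & 0xff;         /* 0xb2 */
		unsigned int cmd  = (command >> 16) & 0xff; /* 0x82 */

		printf("smi_port=0x%02x smi_cmd=0x%02x\n", port, cmd);
		return 0;
	}
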
439
440
441/**
442 * speedstep_exit - unregisters SpeedStep support
443 *
444 * Unregisters SpeedStep support.
445 */
446static void __exit speedstep_exit(void)
447{
448 cpufreq_unregister_driver(&speedstep_driver);
449}
450
451module_param(smi_port, int, 0444);
452module_param(smi_cmd, int, 0444);
453module_param(smi_sig, uint, 0444);
454
455MODULE_PARM_DESC(smi_port, "Override the BIOS-given IST port with this value "
456 "-- Intel's default setting is 0xb2");
457MODULE_PARM_DESC(smi_cmd, "Override the BIOS-given IST command with this value "
458 "-- Intel's default setting is 0x82");
459MODULE_PARM_DESC(smi_sig, "Set to 1 to fake the IST signature when using the "
460 "SMI interface.");
461
462MODULE_AUTHOR("Hiroshi Miura");
463MODULE_DESCRIPTION("Speedstep driver for IST applet SMI interface.");
464MODULE_LICENSE("GPL");
465
466module_init(speedstep_init);
467module_exit(speedstep_exit);
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index b4389441efbb..1edf5ba4fb2b 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -29,10 +29,10 @@
29 29
30static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) 30static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
31{ 31{
32 u64 misc_enable;
33
32 /* Unmask CPUID levels if masked: */ 34 /* Unmask CPUID levels if masked: */
33 if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { 35 if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
34 u64 misc_enable;
35
36 rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); 36 rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
37 37
38 if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) { 38 if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) {
@@ -118,8 +118,6 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
118 * (model 2) with the same problem. 118 * (model 2) with the same problem.
119 */ 119 */
120 if (c->x86 == 15) { 120 if (c->x86 == 15) {
121 u64 misc_enable;
122
123 rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); 121 rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
124 122
125 if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) { 123 if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) {
@@ -130,6 +128,19 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
130 } 128 }
131 } 129 }
132#endif 130#endif
131
132 /*
133 * If fast string is not enabled in IA32_MISC_ENABLE for any reason,
134 * clear the fast string and enhanced fast string CPU capabilities.
135 */
136 if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
137 rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
138 if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
139 printk(KERN_INFO "Disabled fast string operations\n");
140 setup_clear_cpu_cap(X86_FEATURE_REP_GOOD);
141 setup_clear_cpu_cap(X86_FEATURE_ERMS);
142 }
143 }
133} 144}
134 145
135#ifdef CONFIG_X86_32 146#ifdef CONFIG_X86_32
@@ -170,7 +181,7 @@ static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
170{ 181{
171#ifdef CONFIG_SMP 182#ifdef CONFIG_SMP
172 /* calling is from identify_secondary_cpu() ? */ 183 /* calling is from identify_secondary_cpu() ? */
173 if (c->cpu_index == boot_cpu_id) 184 if (!c->cpu_index)
174 return; 185 return;
175 186
176 /* 187 /*
@@ -276,17 +287,14 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
276 287
277static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) 288static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
278{ 289{
279#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 290#ifdef CONFIG_NUMA
280 unsigned node; 291 unsigned node;
281 int cpu = smp_processor_id(); 292 int cpu = smp_processor_id();
282 int apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid;
283 293
284 /* Don't do the funky fallback heuristics the AMD version employs 294 /* Don't do the funky fallback heuristics the AMD version employs
285 for now. */ 295 for now. */
286 node = apicid_to_node[apicid]; 296 node = numa_cpu_node(cpu);
287 if (node == NUMA_NO_NODE) 297 if (node == NUMA_NO_NODE || !node_online(node)) {
288 node = first_node(node_online_map);
289 else if (!node_online(node)) {
290 /* reuse the value from init_cpu_to_node() */ 298 /* reuse the value from init_cpu_to_node() */
291 node = cpu_to_node(cpu); 299 node = cpu_to_node(cpu);
292 } 300 }
@@ -403,12 +411,10 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
403 411
404 switch (c->x86_model) { 412 switch (c->x86_model) {
405 case 5: 413 case 5:
406 if (c->x86_mask == 0) { 414 if (l2 == 0)
407 if (l2 == 0) 415 p = "Celeron (Covington)";
408 p = "Celeron (Covington)"; 416 else if (l2 == 256)
409 else if (l2 == 256) 417 p = "Mobile Pentium II (Dixon)";
410 p = "Mobile Pentium II (Dixon)";
411 }
412 break; 418 break;
413 419
414 case 6: 420 case 6:
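
The intel.c hunk above adds a second MSR_IA32_MISC_ENABLE check that clears X86_FEATURE_REP_GOOD and X86_FEATURE_ERMS when the fast-string bit is not set, gated on the same family/model test already used for the CPUID-unmask path. A tiny sketch of that gate, with a couple of worked inputs (the function name is mine, not the kernel's):

	#include <stdio.h>
	#include <stdbool.h>

	/* Same predicate as in the patch: family > 6, or family 6 with model >= 0xd. */
	static bool has_misc_enable_check(unsigned int family, unsigned int model)
	{
		return family > 6 || (family == 6 && model >= 0xd);
	}

	int main(void)
	{
		printf("family 6, model 0x0d: %d\n", has_misc_enable_check(6, 0x0d)); /* 1 */
		printf("family 6, model 0x0a: %d\n", has_misc_enable_check(6, 0x0a)); /* 0 */
		return 0;
	}
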
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 3fec7d9bfd62..0bf12644aa73 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -17,7 +17,7 @@
17 17
18#include <asm/processor.h> 18#include <asm/processor.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <asm/k8.h> 20#include <asm/amd_nb.h>
21#include <asm/smp.h> 21#include <asm/smp.h>
22 22
23#define LVL_1_INST 1 23#define LVL_1_INST 1
@@ -45,6 +45,7 @@ static const struct _cache_table __cpuinitconst cache_table[] =
45 { 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */ 45 { 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */
46 { 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */ 46 { 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */
47 { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */ 47 { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */
48 { 0x0e, LVL_1_DATA, 24 }, /* 6-way set assoc, 64 byte line size */
48 { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */ 49 { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */
49 { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ 50 { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */
50 { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ 51 { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */
@@ -66,6 +67,7 @@ static const struct _cache_table __cpuinitconst cache_table[] =
66 { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */ 67 { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */
67 { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */ 68 { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */
68 { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */ 69 { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */
70 { 0x48, LVL_2, MB(3) }, /* 12-way set assoc, 64 byte line size */
69 { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */ 71 { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */
70 { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */ 72 { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */
71 { 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */ 73 { 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */
@@ -87,6 +89,7 @@ static const struct _cache_table __cpuinitconst cache_table[] =
87 { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ 89 { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */
88 { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */ 90 { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */
89 { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */ 91 { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */
92 { 0x80, LVL_2, 512 }, /* 8-way set assoc, 64 byte line size */
90 { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */ 93 { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */
91 { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */ 94 { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */
92 { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */ 95 { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */
@@ -149,8 +152,7 @@ union _cpuid4_leaf_ecx {
149}; 152};
150 153
151struct amd_l3_cache { 154struct amd_l3_cache {
152 struct pci_dev *dev; 155 struct amd_northbridge *nb;
153 bool can_disable;
154 unsigned indices; 156 unsigned indices;
155 u8 subcaches[4]; 157 u8 subcaches[4];
156}; 158};
@@ -266,7 +268,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
266 line_size = l2.line_size; 268 line_size = l2.line_size;
267 lines_per_tag = l2.lines_per_tag; 269 lines_per_tag = l2.lines_per_tag;
268 /* cpu_data has errata corrections for K7 applied */ 270 /* cpu_data has errata corrections for K7 applied */
269 size_in_kb = current_cpu_data.x86_cache_size; 271 size_in_kb = __this_cpu_read(cpu_info.x86_cache_size);
270 break; 272 break;
271 case 3: 273 case 3:
272 if (!l3.val) 274 if (!l3.val)
@@ -288,7 +290,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
288 eax->split.type = types[leaf]; 290 eax->split.type = types[leaf];
289 eax->split.level = levels[leaf]; 291 eax->split.level = levels[leaf];
290 eax->split.num_threads_sharing = 0; 292 eax->split.num_threads_sharing = 0;
291 eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; 293 eax->split.num_cores_on_die = __this_cpu_read(cpu_info.x86_max_cores) - 1;
292 294
293 295
294 if (assoc == 0xffff) 296 if (assoc == 0xffff)
@@ -302,23 +304,22 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
302 304
303struct _cache_attr { 305struct _cache_attr {
304 struct attribute attr; 306 struct attribute attr;
305 ssize_t (*show)(struct _cpuid4_info *, char *); 307 ssize_t (*show)(struct _cpuid4_info *, char *, unsigned int);
306 ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count); 308 ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count,
309 unsigned int);
307}; 310};
308 311
309#ifdef CONFIG_CPU_SUP_AMD 312#ifdef CONFIG_AMD_NB
310 313
311/* 314/*
312 * L3 cache descriptors 315 * L3 cache descriptors
313 */ 316 */
314static struct amd_l3_cache **__cpuinitdata l3_caches;
315
316static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3) 317static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
317{ 318{
318 unsigned int sc0, sc1, sc2, sc3; 319 unsigned int sc0, sc1, sc2, sc3;
319 u32 val = 0; 320 u32 val = 0;
320 321
321 pci_read_config_dword(l3->dev, 0x1C4, &val); 322 pci_read_config_dword(l3->nb->misc, 0x1C4, &val);
322 323
323 /* calculate subcache sizes */ 324 /* calculate subcache sizes */
324 l3->subcaches[0] = sc0 = !(val & BIT(0)); 325 l3->subcaches[0] = sc0 = !(val & BIT(0));
@@ -326,50 +327,17 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
326 l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9)); 327 l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9));
327 l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); 328 l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13));
328 329
329 l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1; 330 l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
330}
331
332static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node)
333{
334 struct amd_l3_cache *l3;
335 struct pci_dev *dev = node_to_k8_nb_misc(node);
336
337 l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC);
338 if (!l3) {
339 printk(KERN_WARNING "Error allocating L3 struct\n");
340 return NULL;
341 }
342
343 l3->dev = dev;
344
345 amd_calc_l3_indices(l3);
346
347 return l3;
348} 331}
349 332
350static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, 333static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf,
351 int index) 334 int index)
352{ 335{
336 static struct amd_l3_cache *__cpuinitdata l3_caches;
353 int node; 337 int node;
354 338
355 if (boot_cpu_data.x86 != 0x10) 339 /* only for L3, and not in virtualized environments */
356 return; 340 if (index < 3 || amd_nb_num() == 0)
357
358 if (index < 3)
359 return;
360
361 /* see errata #382 and #388 */
362 if (boot_cpu_data.x86_model < 0x8)
363 return;
364
365 if ((boot_cpu_data.x86_model == 0x8 ||
366 boot_cpu_data.x86_model == 0x9)
367 &&
368 boot_cpu_data.x86_mask < 0x1)
369 return;
370
371 /* not in virtualized environments */
372 if (num_k8_northbridges == 0)
373 return; 341 return;
374 342
375 /* 343 /*
@@ -377,7 +345,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
377 * never freed but this is done only on shutdown so it doesn't matter. 345 * never freed but this is done only on shutdown so it doesn't matter.
378 */ 346 */
379 if (!l3_caches) { 347 if (!l3_caches) {
380 int size = num_k8_northbridges * sizeof(struct amd_l3_cache *); 348 int size = amd_nb_num() * sizeof(struct amd_l3_cache);
381 349
382 l3_caches = kzalloc(size, GFP_ATOMIC); 350 l3_caches = kzalloc(size, GFP_ATOMIC);
383 if (!l3_caches) 351 if (!l3_caches)
@@ -386,14 +354,12 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
386 354
387 node = amd_get_nb_id(smp_processor_id()); 355 node = amd_get_nb_id(smp_processor_id());
388 356
389 if (!l3_caches[node]) { 357 if (!l3_caches[node].nb) {
390 l3_caches[node] = amd_init_l3_cache(node); 358 l3_caches[node].nb = node_to_amd_nb(node);
391 l3_caches[node]->can_disable = true; 359 amd_calc_l3_indices(&l3_caches[node]);
392 } 360 }
393 361
394 WARN_ON(!l3_caches[node]); 362 this_leaf->l3 = &l3_caches[node];
395
396 this_leaf->l3 = l3_caches[node];
397} 363}
398 364
399/* 365/*
@@ -407,7 +373,7 @@ int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot)
407{ 373{
408 unsigned int reg = 0; 374 unsigned int reg = 0;
409 375
410 pci_read_config_dword(l3->dev, 0x1BC + slot * 4, &reg); 376 pci_read_config_dword(l3->nb->misc, 0x1BC + slot * 4, &reg);
411 377
412 /* check whether this slot is activated already */ 378 /* check whether this slot is activated already */
413 if (reg & (3UL << 30)) 379 if (reg & (3UL << 30))
@@ -421,7 +387,8 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
421{ 387{
422 int index; 388 int index;
423 389
424 if (!this_leaf->l3 || !this_leaf->l3->can_disable) 390 if (!this_leaf->l3 ||
391 !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
425 return -EINVAL; 392 return -EINVAL;
426 393
427 index = amd_get_l3_disable_slot(this_leaf->l3, slot); 394 index = amd_get_l3_disable_slot(this_leaf->l3, slot);
@@ -433,7 +400,8 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
433 400
434#define SHOW_CACHE_DISABLE(slot) \ 401#define SHOW_CACHE_DISABLE(slot) \
435static ssize_t \ 402static ssize_t \
436show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf) \ 403show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf, \
404 unsigned int cpu) \
437{ \ 405{ \
438 return show_cache_disable(this_leaf, buf, slot); \ 406 return show_cache_disable(this_leaf, buf, slot); \
439} 407}
@@ -456,7 +424,7 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
456 if (!l3->subcaches[i]) 424 if (!l3->subcaches[i])
457 continue; 425 continue;
458 426
459 pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg); 427 pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg);
460 428
461 /* 429 /*
462 * We need to WBINVD on a core on the node containing the L3 430 * We need to WBINVD on a core on the node containing the L3
@@ -466,7 +434,7 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
466 wbinvd_on_cpu(cpu); 434 wbinvd_on_cpu(cpu);
467 435
468 reg |= BIT(31); 436 reg |= BIT(31);
469 pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg); 437 pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg);
470 } 438 }
471} 439}
472 440
@@ -485,27 +453,16 @@ int amd_set_l3_disable_slot(struct amd_l3_cache *l3, int cpu, unsigned slot,
485{ 453{
486 int ret = 0; 454 int ret = 0;
487 455
488#define SUBCACHE_MASK (3UL << 20) 456 /* check if @slot is already used or the index is already disabled */
489#define SUBCACHE_INDEX 0xfff
490
491 /*
492 * check whether this slot is already used or
493 * the index is already disabled
494 */
495 ret = amd_get_l3_disable_slot(l3, slot); 457 ret = amd_get_l3_disable_slot(l3, slot);
496 if (ret >= 0) 458 if (ret >= 0)
497 return -EINVAL; 459 return -EINVAL;
498 460
499 /* 461 if (index > l3->indices)
500 * check whether the other slot has disabled the
501 * same index already
502 */
503 if (index == amd_get_l3_disable_slot(l3, !slot))
504 return -EINVAL; 462 return -EINVAL;
505 463
506 /* do not allow writes outside of allowed bits */ 464 /* check whether the other slot has disabled the same index already */
507 if ((index & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) || 465 if (index == amd_get_l3_disable_slot(l3, !slot))
508 ((index & SUBCACHE_INDEX) > l3->indices))
509 return -EINVAL; 466 return -EINVAL;
510 467
511 amd_l3_disable_index(l3, cpu, slot, index); 468 amd_l3_disable_index(l3, cpu, slot, index);
@@ -523,7 +480,8 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
523 if (!capable(CAP_SYS_ADMIN)) 480 if (!capable(CAP_SYS_ADMIN))
524 return -EPERM; 481 return -EPERM;
525 482
526 if (!this_leaf->l3 || !this_leaf->l3->can_disable) 483 if (!this_leaf->l3 ||
484 !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
527 return -EINVAL; 485 return -EINVAL;
528 486
529 cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); 487 cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
@@ -544,7 +502,8 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
544#define STORE_CACHE_DISABLE(slot) \ 502#define STORE_CACHE_DISABLE(slot) \
545static ssize_t \ 503static ssize_t \
546store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \ 504store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \
547 const char *buf, size_t count) \ 505 const char *buf, size_t count, \
506 unsigned int cpu) \
548{ \ 507{ \
549 return store_cache_disable(this_leaf, buf, count, slot); \ 508 return store_cache_disable(this_leaf, buf, count, slot); \
550} 509}
@@ -556,25 +515,55 @@ static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
556static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, 515static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
557 show_cache_disable_1, store_cache_disable_1); 516 show_cache_disable_1, store_cache_disable_1);
558 517
559#else /* CONFIG_CPU_SUP_AMD */ 518static ssize_t
560static void __cpuinit 519show_subcaches(struct _cpuid4_info *this_leaf, char *buf, unsigned int cpu)
561amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index)
562{ 520{
563}; 521 if (!this_leaf->l3 || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
564#endif /* CONFIG_CPU_SUP_AMD */ 522 return -EINVAL;
523
524 return sprintf(buf, "%x\n", amd_get_subcaches(cpu));
525}
526
527static ssize_t
528store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count,
529 unsigned int cpu)
530{
531 unsigned long val;
532
533 if (!capable(CAP_SYS_ADMIN))
534 return -EPERM;
535
536 if (!this_leaf->l3 || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
537 return -EINVAL;
538
539 if (strict_strtoul(buf, 16, &val) < 0)
540 return -EINVAL;
541
542 if (amd_set_subcaches(cpu, val))
543 return -EINVAL;
544
545 return count;
546}
547
548static struct _cache_attr subcaches =
549 __ATTR(subcaches, 0644, show_subcaches, store_subcaches);
550
551#else /* CONFIG_AMD_NB */
552#define amd_init_l3_cache(x, y)
553#endif /* CONFIG_AMD_NB */
565 554
566static int 555static int
567__cpuinit cpuid4_cache_lookup_regs(int index, 556__cpuinit cpuid4_cache_lookup_regs(int index,
568 struct _cpuid4_info_regs *this_leaf) 557 struct _cpuid4_info_regs *this_leaf)
569{ 558{
570 union _cpuid4_leaf_eax eax; 559 union _cpuid4_leaf_eax eax;
571 union _cpuid4_leaf_ebx ebx; 560 union _cpuid4_leaf_ebx ebx;
572 union _cpuid4_leaf_ecx ecx; 561 union _cpuid4_leaf_ecx ecx;
573 unsigned edx; 562 unsigned edx;
574 563
575 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { 564 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
576 amd_cpuid4(index, &eax, &ebx, &ecx); 565 amd_cpuid4(index, &eax, &ebx, &ecx);
577 amd_check_l3_disable(this_leaf, index); 566 amd_init_l3_cache(this_leaf, index);
578 } else { 567 } else {
579 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); 568 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
580 } 569 }
@@ -784,11 +773,11 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
784 struct cpuinfo_x86 *c = &cpu_data(cpu); 773 struct cpuinfo_x86 *c = &cpu_data(cpu);
785 774
786 if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) { 775 if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) {
787 for_each_cpu(i, c->llc_shared_map) { 776 for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
788 if (!per_cpu(ici_cpuid4_info, i)) 777 if (!per_cpu(ici_cpuid4_info, i))
789 continue; 778 continue;
790 this_leaf = CPUID4_INFO_IDX(i, index); 779 this_leaf = CPUID4_INFO_IDX(i, index);
791 for_each_cpu(sibling, c->llc_shared_map) { 780 for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) {
792 if (!cpu_online(sibling)) 781 if (!cpu_online(sibling))
793 continue; 782 continue;
794 set_bit(sibling, this_leaf->shared_cpu_map); 783 set_bit(sibling, this_leaf->shared_cpu_map);
@@ -922,8 +911,8 @@ static DEFINE_PER_CPU(struct _index_kobject *, ici_index_kobject);
922#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(ici_index_kobject, x))[y])) 911#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(ici_index_kobject, x))[y]))
923 912
924#define show_one_plus(file_name, object, val) \ 913#define show_one_plus(file_name, object, val) \
925static ssize_t show_##file_name \ 914static ssize_t show_##file_name(struct _cpuid4_info *this_leaf, char *buf, \
926 (struct _cpuid4_info *this_leaf, char *buf) \ 915 unsigned int cpu) \
927{ \ 916{ \
928 return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \ 917 return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \
929} 918}
@@ -934,7 +923,8 @@ show_one_plus(physical_line_partition, ebx.split.physical_line_partition, 1);
934show_one_plus(ways_of_associativity, ebx.split.ways_of_associativity, 1); 923show_one_plus(ways_of_associativity, ebx.split.ways_of_associativity, 1);
935show_one_plus(number_of_sets, ecx.split.number_of_sets, 1); 924show_one_plus(number_of_sets, ecx.split.number_of_sets, 1);
936 925
937static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf) 926static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf,
927 unsigned int cpu)
938{ 928{
939 return sprintf(buf, "%luK\n", this_leaf->size / 1024); 929 return sprintf(buf, "%luK\n", this_leaf->size / 1024);
940} 930}
@@ -958,17 +948,20 @@ static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
958 return n; 948 return n;
959} 949}
960 950
961static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf) 951static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf,
952 unsigned int cpu)
962{ 953{
963 return show_shared_cpu_map_func(leaf, 0, buf); 954 return show_shared_cpu_map_func(leaf, 0, buf);
964} 955}
965 956
966static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf) 957static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf,
958 unsigned int cpu)
967{ 959{
968 return show_shared_cpu_map_func(leaf, 1, buf); 960 return show_shared_cpu_map_func(leaf, 1, buf);
969} 961}
970 962
971static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) 963static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf,
964 unsigned int cpu)
972{ 965{
973 switch (this_leaf->eax.split.type) { 966 switch (this_leaf->eax.split.type) {
974 case CACHE_TYPE_DATA: 967 case CACHE_TYPE_DATA:
@@ -999,30 +992,54 @@ define_one_ro(size);
999define_one_ro(shared_cpu_map); 992define_one_ro(shared_cpu_map);
1000define_one_ro(shared_cpu_list); 993define_one_ro(shared_cpu_list);
1001 994
1002#define DEFAULT_SYSFS_CACHE_ATTRS \
1003 &type.attr, \
1004 &level.attr, \
1005 &coherency_line_size.attr, \
1006 &physical_line_partition.attr, \
1007 &ways_of_associativity.attr, \
1008 &number_of_sets.attr, \
1009 &size.attr, \
1010 &shared_cpu_map.attr, \
1011 &shared_cpu_list.attr
1012
1013static struct attribute *default_attrs[] = { 995static struct attribute *default_attrs[] = {
1014 DEFAULT_SYSFS_CACHE_ATTRS, 996 &type.attr,
997 &level.attr,
998 &coherency_line_size.attr,
999 &physical_line_partition.attr,
1000 &ways_of_associativity.attr,
1001 &number_of_sets.attr,
1002 &size.attr,
1003 &shared_cpu_map.attr,
1004 &shared_cpu_list.attr,
1015 NULL 1005 NULL
1016}; 1006};
1017 1007
1018static struct attribute *default_l3_attrs[] = { 1008#ifdef CONFIG_AMD_NB
1019 DEFAULT_SYSFS_CACHE_ATTRS, 1009static struct attribute ** __cpuinit amd_l3_attrs(void)
1020#ifdef CONFIG_CPU_SUP_AMD 1010{
1021 &cache_disable_0.attr, 1011 static struct attribute **attrs;
1022 &cache_disable_1.attr, 1012 int n;
1013
1014 if (attrs)
1015 return attrs;
1016
1017 n = sizeof (default_attrs) / sizeof (struct attribute *);
1018
1019 if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
1020 n += 2;
1021
1022 if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
1023 n += 1;
1024
1025 attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL);
1026 if (attrs == NULL)
1027 return attrs = default_attrs;
1028
1029 for (n = 0; default_attrs[n]; n++)
1030 attrs[n] = default_attrs[n];
1031
1032 if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) {
1033 attrs[n++] = &cache_disable_0.attr;
1034 attrs[n++] = &cache_disable_1.attr;
1035 }
1036
1037 if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
1038 attrs[n++] = &subcaches.attr;
1039
1040 return attrs;
1041}
1023#endif 1042#endif
1024 NULL
1025};
1026 1043
1027static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) 1044static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
1028{ 1045{
@@ -1032,7 +1049,7 @@ static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
1032 1049
1033 ret = fattr->show ? 1050 ret = fattr->show ?
1034 fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), 1051 fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
1035 buf) : 1052 buf, this_leaf->cpu) :
1036 0; 1053 0;
1037 return ret; 1054 return ret;
1038} 1055}
@@ -1046,7 +1063,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,
1046 1063
1047 ret = fattr->store ? 1064 ret = fattr->store ?
1048 fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), 1065 fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
1049 buf, count) : 1066 buf, count, this_leaf->cpu) :
1050 0; 1067 0;
1051 return ret; 1068 return ret;
1052} 1069}
@@ -1133,11 +1150,11 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
1133 1150
1134 this_leaf = CPUID4_INFO_IDX(cpu, i); 1151 this_leaf = CPUID4_INFO_IDX(cpu, i);
1135 1152
1136 if (this_leaf->l3 && this_leaf->l3->can_disable) 1153 ktype_cache.default_attrs = default_attrs;
1137 ktype_cache.default_attrs = default_l3_attrs; 1154#ifdef CONFIG_AMD_NB
1138 else 1155 if (this_leaf->l3)
1139 ktype_cache.default_attrs = default_attrs; 1156 ktype_cache.default_attrs = amd_l3_attrs();
1140 1157#endif
1141 retval = kobject_init_and_add(&(this_object->kobj), 1158 retval = kobject_init_and_add(&(this_object->kobj),
1142 &ktype_cache, 1159 &ktype_cache,
1143 per_cpu(ici_cache_kobject, cpu), 1160 per_cpu(ici_cache_kobject, cpu),
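
For reference, amd_calc_l3_indices() in the hunk above derives the number of usable L3 cache indices from the subcache-disable bits in config register 0x1C4: each subcache contributes a count of enabled halves, and the largest count, shifted left by 10 and decremented, gives the index limit. A user-space sketch of that arithmetic with a hypothetical register value (the max2 helper is mine; the kernel uses max()/max3()):

	#include <stdio.h>

	#define BIT(n) (1u << (n))

	static unsigned int max2(unsigned int a, unsigned int b) { return a > b ? a : b; }

	int main(void)
	{
		unsigned int val = 0;	/* hypothetical contents of register 0x1C4 */
		unsigned int sc0, sc1, sc2, sc3, indices;

		/* Same bit tests as amd_calc_l3_indices(). */
		sc0 = !(val & BIT(0));
		sc1 = !(val & BIT(4));
		sc2 = !(val & BIT(8))  + !(val & BIT(9));
		sc3 = !(val & BIT(12)) + !(val & BIT(13));

		indices = (max2(max2(max2(sc0, sc1), sc2), sc3) << 10) - 1;
		printf("indices = %u\n", indices);	/* 2047 when no subcache is disabled */
		return 0;
	}
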
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
index 8209472b27a5..83930deec3c6 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -106,24 +106,34 @@ int apei_write_mce(struct mce *m)
106ssize_t apei_read_mce(struct mce *m, u64 *record_id) 106ssize_t apei_read_mce(struct mce *m, u64 *record_id)
107{ 107{
108 struct cper_mce_record rcd; 108 struct cper_mce_record rcd;
109 ssize_t len; 109 int rc, pos;
110 110
111 len = erst_read_next(&rcd.hdr, sizeof(rcd)); 111 rc = erst_get_record_id_begin(&pos);
112 if (len <= 0) 112 if (rc)
113 return len; 113 return rc;
114 /* Can not skip other records in storage via ERST unless clear them */ 114retry:
115 else if (len != sizeof(rcd) || 115 rc = erst_get_record_id_next(&pos, record_id);
116 uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) { 116 if (rc)
117 if (printk_ratelimit()) 117 goto out;
118 pr_warning( 118 /* no more record */
119 "MCE-APEI: Can not skip the unknown record in ERST"); 119 if (*record_id == APEI_ERST_INVALID_RECORD_ID)
120 return -EIO; 120 goto out;
121 } 121 rc = erst_read(*record_id, &rcd.hdr, sizeof(rcd));
122 122 /* someone else has cleared the record, try next one */
123 if (rc == -ENOENT)
124 goto retry;
125 else if (rc < 0)
126 goto out;
127 /* try to skip other type records in storage */
128 else if (rc != sizeof(rcd) ||
129 uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE))
130 goto retry;
123 memcpy(m, &rcd.mce, sizeof(*m)); 131 memcpy(m, &rcd.mce, sizeof(*m));
124 *record_id = rcd.hdr.record_id; 132 rc = sizeof(*m);
133out:
134 erst_get_record_id_end();
125 135
126 return sizeof(*m); 136 return rc;
127} 137}
128 138
129/* Check whether there is record in ERST */ 139/* Check whether there is record in ERST */
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index e7dbde7bfedb..0ed633c5048b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -25,13 +25,14 @@
25#include <linux/gfp.h> 25#include <linux/gfp.h>
26#include <asm/mce.h> 26#include <asm/mce.h>
27#include <asm/apic.h> 27#include <asm/apic.h>
28#include <asm/nmi.h>
28 29
29/* Update fake mce registers on current CPU. */ 30/* Update fake mce registers on current CPU. */
30static void inject_mce(struct mce *m) 31static void inject_mce(struct mce *m)
31{ 32{
32 struct mce *i = &per_cpu(injectm, m->extcpu); 33 struct mce *i = &per_cpu(injectm, m->extcpu);
33 34
34 /* Make sure noone reads partially written injectm */ 35 /* Make sure no one reads partially written injectm */
35 i->finished = 0; 36 i->finished = 0;
36 mb(); 37 mb();
37 m->finished = 0; 38 m->finished = 0;
@@ -83,7 +84,7 @@ static int mce_raise_notify(struct notifier_block *self,
83 struct die_args *args = (struct die_args *)data; 84 struct die_args *args = (struct die_args *)data;
84 int cpu = smp_processor_id(); 85 int cpu = smp_processor_id();
85 struct mce *m = &__get_cpu_var(injectm); 86 struct mce *m = &__get_cpu_var(injectm);
86 if (val != DIE_NMI_IPI || !cpumask_test_cpu(cpu, mce_inject_cpumask)) 87 if (val != DIE_NMI || !cpumask_test_cpu(cpu, mce_inject_cpumask))
87 return NOTIFY_DONE; 88 return NOTIFY_DONE;
88 cpumask_clear_cpu(cpu, mce_inject_cpumask); 89 cpumask_clear_cpu(cpu, mce_inject_cpumask);
89 if (m->inject_flags & MCJ_EXCEPTION) 90 if (m->inject_flags & MCJ_EXCEPTION)
@@ -95,7 +96,7 @@ static int mce_raise_notify(struct notifier_block *self,
95 96
96static struct notifier_block mce_raise_nb = { 97static struct notifier_block mce_raise_nb = {
97 .notifier_call = mce_raise_notify, 98 .notifier_call = mce_raise_notify,
98 .priority = 1000, 99 .priority = NMI_LOCAL_NORMAL_PRIOR,
99}; 100};
100 101
101/* Inject mce on current CPU */ 102/* Inject mce on current CPU */
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 8a85dd1b1aa1..1e8d66c1336a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -192,6 +192,7 @@ static const struct file_operations severities_coverage_fops = {
192 .release = seq_release, 192 .release = seq_release,
193 .read = seq_read, 193 .read = seq_read,
194 .write = severities_coverage_write, 194 .write = severities_coverage_write,
195 .llseek = seq_lseek,
195}; 196};
196 197
197static int __init severities_debugfs_init(void) 198static int __init severities_debugfs_init(void)
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index ed41562909fe..ff1ae9b6464d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -21,6 +21,7 @@
21#include <linux/percpu.h> 21#include <linux/percpu.h>
22#include <linux/string.h> 22#include <linux/string.h>
23#include <linux/sysdev.h> 23#include <linux/sysdev.h>
24#include <linux/syscore_ops.h>
24#include <linux/delay.h> 25#include <linux/delay.h>
25#include <linux/ctype.h> 26#include <linux/ctype.h>
26#include <linux/sched.h> 27#include <linux/sched.h>
@@ -104,20 +105,6 @@ static int cpu_missing;
104ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); 105ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
105EXPORT_SYMBOL_GPL(x86_mce_decoder_chain); 106EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
106 107
107static int default_decode_mce(struct notifier_block *nb, unsigned long val,
108 void *data)
109{
110 pr_emerg(HW_ERR "No human readable MCE decoding support on this CPU type.\n");
111 pr_emerg(HW_ERR "Run the message through 'mcelog --ascii' to decode.\n");
112
113 return NOTIFY_STOP;
114}
115
116static struct notifier_block mce_dec_nb = {
117 .notifier_call = default_decode_mce,
118 .priority = -1,
119};
120
121/* MCA banks polled by the period polling timer for corrected events */ 108/* MCA banks polled by the period polling timer for corrected events */
122DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { 109DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
123 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL 110 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
@@ -211,6 +198,8 @@ void mce_log(struct mce *mce)
211 198
212static void print_mce(struct mce *m) 199static void print_mce(struct mce *m)
213{ 200{
201 int ret = 0;
202
214 pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n", 203 pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
215 m->extcpu, m->mcgstatus, m->bank, m->status); 204 m->extcpu, m->mcgstatus, m->bank, m->status);
216 205
@@ -238,7 +227,11 @@ static void print_mce(struct mce *m)
238 * Print out human-readable details about the MCE error, 227 * Print out human-readable details about the MCE error,
239 * (if the CPU has an implementation for that) 228 * (if the CPU has an implementation for that)
240 */ 229 */
241 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); 230 ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
231 if (ret == NOTIFY_STOP)
232 return;
233
234 pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
242} 235}
243 236
244#define PANIC_TIMEOUT 5 /* 5 seconds */ 237#define PANIC_TIMEOUT 5 /* 5 seconds */
@@ -326,7 +319,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
326 319
327static int msr_to_offset(u32 msr) 320static int msr_to_offset(u32 msr)
328{ 321{
329 unsigned bank = __get_cpu_var(injectm.bank); 322 unsigned bank = __this_cpu_read(injectm.bank);
330 323
331 if (msr == rip_msr) 324 if (msr == rip_msr)
332 return offsetof(struct mce, ip); 325 return offsetof(struct mce, ip);
@@ -346,7 +339,7 @@ static u64 mce_rdmsrl(u32 msr)
346{ 339{
347 u64 v; 340 u64 v;
348 341
349 if (__get_cpu_var(injectm).finished) { 342 if (__this_cpu_read(injectm.finished)) {
350 int offset = msr_to_offset(msr); 343 int offset = msr_to_offset(msr);
351 344
352 if (offset < 0) 345 if (offset < 0)
@@ -369,7 +362,7 @@ static u64 mce_rdmsrl(u32 msr)
369 362
370static void mce_wrmsrl(u32 msr, u64 v) 363static void mce_wrmsrl(u32 msr, u64 v)
371{ 364{
372 if (__get_cpu_var(injectm).finished) { 365 if (__this_cpu_read(injectm.finished)) {
373 int offset = msr_to_offset(msr); 366 int offset = msr_to_offset(msr);
374 367
375 if (offset >= 0) 368 if (offset >= 0)
@@ -589,7 +582,6 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
589 if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { 582 if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
590 mce_log(&m); 583 mce_log(&m);
591 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m); 584 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m);
592 add_taint(TAINT_MACHINE_CHECK);
593 } 585 }
594 586
595 /* 587 /*
@@ -881,7 +873,7 @@ reset:
881 * Check if the address reported by the CPU is in a format we can parse. 873 * Check if the address reported by the CPU is in a format we can parse.
882 * It would be possible to add code for most other cases, but all would 874 * It would be possible to add code for most other cases, but all would
883 * be somewhat complicated (e.g. segment offset would require an instruction 875 * be somewhat complicated (e.g. segment offset would require an instruction
884 * parser). So only support physical addresses upto page granuality for now. 876 * parser). So only support physical addresses up to page granuality for now.
885 */ 877 */
886static int mce_usable_address(struct mce *m) 878static int mce_usable_address(struct mce *m)
887{ 879{
@@ -1159,7 +1151,7 @@ static void mce_start_timer(unsigned long data)
1159 1151
1160 WARN_ON(smp_processor_id() != data); 1152 WARN_ON(smp_processor_id() != data);
1161 1153
1162 if (mce_available(&current_cpu_data)) { 1154 if (mce_available(__this_cpu_ptr(&cpu_info))) {
1163 machine_check_poll(MCP_TIMESTAMP, 1155 machine_check_poll(MCP_TIMESTAMP,
1164 &__get_cpu_var(mce_poll_banks)); 1156 &__get_cpu_var(mce_poll_banks));
1165 } 1157 }
@@ -1625,7 +1617,7 @@ out:
1625static unsigned int mce_poll(struct file *file, poll_table *wait) 1617static unsigned int mce_poll(struct file *file, poll_table *wait)
1626{ 1618{
1627 poll_wait(file, &mce_wait, wait); 1619 poll_wait(file, &mce_wait, wait);
1628 if (rcu_dereference_check_mce(mcelog.next)) 1620 if (rcu_access_index(mcelog.next))
1629 return POLLIN | POLLRDNORM; 1621 return POLLIN | POLLRDNORM;
1630 if (!mce_apei_read_done && apei_check_mce()) 1622 if (!mce_apei_read_done && apei_check_mce())
1631 return POLLIN | POLLRDNORM; 1623 return POLLIN | POLLRDNORM;
@@ -1665,6 +1657,7 @@ struct file_operations mce_chrdev_ops = {
1665 .read = mce_read, 1657 .read = mce_read,
1666 .poll = mce_poll, 1658 .poll = mce_poll,
1667 .unlocked_ioctl = mce_ioctl, 1659 .unlocked_ioctl = mce_ioctl,
1660 .llseek = no_llseek,
1668}; 1661};
1669EXPORT_SYMBOL_GPL(mce_chrdev_ops); 1662EXPORT_SYMBOL_GPL(mce_chrdev_ops);
1670 1663
@@ -1720,8 +1713,6 @@ __setup("mce", mcheck_enable);
1720 1713
1721int __init mcheck_init(void) 1714int __init mcheck_init(void)
1722{ 1715{
1723 atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb);
1724
1725 mcheck_intel_therm_init(); 1716 mcheck_intel_therm_init();
1726 1717
1727 return 0; 1718 return 0;
@@ -1748,14 +1739,14 @@ static int mce_disable_error_reporting(void)
1748 return 0; 1739 return 0;
1749} 1740}
1750 1741
1751static int mce_suspend(struct sys_device *dev, pm_message_t state) 1742static int mce_suspend(void)
1752{ 1743{
1753 return mce_disable_error_reporting(); 1744 return mce_disable_error_reporting();
1754} 1745}
1755 1746
1756static int mce_shutdown(struct sys_device *dev) 1747static void mce_shutdown(void)
1757{ 1748{
1758 return mce_disable_error_reporting(); 1749 mce_disable_error_reporting();
1759} 1750}
1760 1751
1761/* 1752/*
@@ -1763,18 +1754,22 @@ static int mce_shutdown(struct sys_device *dev)
1763 * Only one CPU is active at this time, the others get re-added later using 1754 * Only one CPU is active at this time, the others get re-added later using
1764 * CPU hotplug: 1755 * CPU hotplug:
1765 */ 1756 */
1766static int mce_resume(struct sys_device *dev) 1757static void mce_resume(void)
1767{ 1758{
1768 __mcheck_cpu_init_generic(); 1759 __mcheck_cpu_init_generic();
1769 __mcheck_cpu_init_vendor(&current_cpu_data); 1760 __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info));
1770
1771 return 0;
1772} 1761}
1773 1762
1763static struct syscore_ops mce_syscore_ops = {
1764 .suspend = mce_suspend,
1765 .shutdown = mce_shutdown,
1766 .resume = mce_resume,
1767};
1768
1774static void mce_cpu_restart(void *data) 1769static void mce_cpu_restart(void *data)
1775{ 1770{
1776 del_timer_sync(&__get_cpu_var(mce_timer)); 1771 del_timer_sync(&__get_cpu_var(mce_timer));
1777 if (!mce_available(&current_cpu_data)) 1772 if (!mce_available(__this_cpu_ptr(&cpu_info)))
1778 return; 1773 return;
1779 __mcheck_cpu_init_generic(); 1774 __mcheck_cpu_init_generic();
1780 __mcheck_cpu_init_timer(); 1775 __mcheck_cpu_init_timer();
@@ -1789,7 +1784,7 @@ static void mce_restart(void)
1789/* Toggle features for corrected errors */ 1784/* Toggle features for corrected errors */
1790static void mce_disable_ce(void *all) 1785static void mce_disable_ce(void *all)
1791{ 1786{
1792 if (!mce_available(&current_cpu_data)) 1787 if (!mce_available(__this_cpu_ptr(&cpu_info)))
1793 return; 1788 return;
1794 if (all) 1789 if (all)
1795 del_timer_sync(&__get_cpu_var(mce_timer)); 1790 del_timer_sync(&__get_cpu_var(mce_timer));
@@ -1798,7 +1793,7 @@ static void mce_disable_ce(void *all)
1798 1793
1799static void mce_enable_ce(void *all) 1794static void mce_enable_ce(void *all)
1800{ 1795{
1801 if (!mce_available(&current_cpu_data)) 1796 if (!mce_available(__this_cpu_ptr(&cpu_info)))
1802 return; 1797 return;
1803 cmci_reenable(); 1798 cmci_reenable();
1804 cmci_recheck(); 1799 cmci_recheck();
@@ -1807,9 +1802,6 @@ static void mce_enable_ce(void *all)
1807} 1802}
1808 1803
1809static struct sysdev_class mce_sysclass = { 1804static struct sysdev_class mce_sysclass = {
1810 .suspend = mce_suspend,
1811 .shutdown = mce_shutdown,
1812 .resume = mce_resume,
1813 .name = "machinecheck", 1805 .name = "machinecheck",
1814}; 1806};
1815 1807
@@ -2021,7 +2013,7 @@ static void __cpuinit mce_disable_cpu(void *h)
2021 unsigned long action = *(unsigned long *)h; 2013 unsigned long action = *(unsigned long *)h;
2022 int i; 2014 int i;
2023 2015
2024 if (!mce_available(&current_cpu_data)) 2016 if (!mce_available(__this_cpu_ptr(&cpu_info)))
2025 return; 2017 return;
2026 2018
2027 if (!(action & CPU_TASKS_FROZEN)) 2019 if (!(action & CPU_TASKS_FROZEN))
@@ -2039,7 +2031,7 @@ static void __cpuinit mce_reenable_cpu(void *h)
2039 unsigned long action = *(unsigned long *)h; 2031 unsigned long action = *(unsigned long *)h;
2040 int i; 2032 int i;
2041 2033
2042 if (!mce_available(&current_cpu_data)) 2034 if (!mce_available(__this_cpu_ptr(&cpu_info)))
2043 return; 2035 return;
2044 2036
2045 if (!(action & CPU_TASKS_FROZEN)) 2037 if (!(action & CPU_TASKS_FROZEN))
@@ -2138,6 +2130,7 @@ static __init int mcheck_init_device(void)
2138 return err; 2130 return err;
2139 } 2131 }
2140 2132
2133 register_syscore_ops(&mce_syscore_ops);
2141 register_hotcpu_notifier(&mce_cpu_notifier); 2134 register_hotcpu_notifier(&mce_cpu_notifier);
2142 misc_register(&mce_log_device); 2135 misc_register(&mce_log_device);
2143 2136
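
The mce.c hunks above move the suspend/shutdown/resume callbacks off the sysdev class and register them as syscore operations instead; note the changed prototypes (shutdown and resume now return void and take no device argument). A minimal sketch of that pattern for a hypothetical driver -- not the mce code itself -- using only the API visible in the diff plus the usual init headers:

	#include <linux/init.h>
	#include <linux/syscore_ops.h>

	static int example_suspend(void)
	{
		/* quiesce the hardware; a non-zero return aborts system suspend */
		return 0;
	}

	static void example_resume(void)
	{
		/* reprogram the hardware; only one CPU is active at this point */
	}

	static void example_shutdown(void)
	{
		/* best-effort quiesce on shutdown/reboot */
	}

	static struct syscore_ops example_syscore_ops = {
		.suspend  = example_suspend,
		.resume   = example_resume,
		.shutdown = example_shutdown,
	};

	static int __init example_init(void)
	{
		register_syscore_ops(&example_syscore_ops);
		return 0;
	}
	device_initcall(example_init);
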
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 39aaee5c1ab2..bb0adad35143 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -31,8 +31,6 @@
31#include <asm/mce.h> 31#include <asm/mce.h>
32#include <asm/msr.h> 32#include <asm/msr.h>
33 33
34#define PFX "mce_threshold: "
35#define VERSION "version 1.1.1"
36#define NR_BANKS 6 34#define NR_BANKS 6
37#define NR_BLOCKS 9 35#define NR_BLOCKS 9
38#define THRESHOLD_MAX 0xFFF 36#define THRESHOLD_MAX 0xFFF
@@ -59,12 +57,6 @@ struct threshold_block {
59 struct list_head miscj; 57 struct list_head miscj;
60}; 58};
61 59
62/* defaults used early on boot */
63static struct threshold_block threshold_defaults = {
64 .interrupt_enable = 0,
65 .threshold_limit = THRESHOLD_MAX,
66};
67
68struct threshold_bank { 60struct threshold_bank {
69 struct kobject *kobj; 61 struct kobject *kobj;
70 struct threshold_block *blocks; 62 struct threshold_block *blocks;
@@ -89,49 +81,101 @@ static void amd_threshold_interrupt(void);
89struct thresh_restart { 81struct thresh_restart {
90 struct threshold_block *b; 82 struct threshold_block *b;
91 int reset; 83 int reset;
84 int set_lvt_off;
85 int lvt_off;
92 u16 old_limit; 86 u16 old_limit;
93}; 87};
94 88
89static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
90{
91 int msr = (hi & MASK_LVTOFF_HI) >> 20;
92
93 if (apic < 0) {
94 pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt "
95 "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu,
96 b->bank, b->block, b->address, hi, lo);
97 return 0;
98 }
99
100 if (apic != msr) {
101 pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d "
102 "for bank %d, block %d (MSR%08X=0x%x%08x)\n",
103 b->cpu, apic, b->bank, b->block, b->address, hi, lo);
104 return 0;
105 }
106
107 return 1;
108};
109
95/* must be called with correct cpu affinity */ 110/* must be called with correct cpu affinity */
96/* Called via smp_call_function_single() */ 111/* Called via smp_call_function_single() */
97static void threshold_restart_bank(void *_tr) 112static void threshold_restart_bank(void *_tr)
98{ 113{
99 struct thresh_restart *tr = _tr; 114 struct thresh_restart *tr = _tr;
100 u32 mci_misc_hi, mci_misc_lo; 115 u32 hi, lo;
101 116
102 rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi); 117 rdmsr(tr->b->address, lo, hi);
103 118
104 if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX)) 119 if (tr->b->threshold_limit < (hi & THRESHOLD_MAX))
105 tr->reset = 1; /* limit cannot be lower than err count */ 120 tr->reset = 1; /* limit cannot be lower than err count */
106 121
107 if (tr->reset) { /* reset err count and overflow bit */ 122 if (tr->reset) { /* reset err count and overflow bit */
108 mci_misc_hi = 123 hi =
109 (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | 124 (hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
110 (THRESHOLD_MAX - tr->b->threshold_limit); 125 (THRESHOLD_MAX - tr->b->threshold_limit);
111 } else if (tr->old_limit) { /* change limit w/o reset */ 126 } else if (tr->old_limit) { /* change limit w/o reset */
112 int new_count = (mci_misc_hi & THRESHOLD_MAX) + 127 int new_count = (hi & THRESHOLD_MAX) +
113 (tr->old_limit - tr->b->threshold_limit); 128 (tr->old_limit - tr->b->threshold_limit);
114 129
115 mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | 130 hi = (hi & ~MASK_ERR_COUNT_HI) |
116 (new_count & THRESHOLD_MAX); 131 (new_count & THRESHOLD_MAX);
117 } 132 }
118 133
134 if (tr->set_lvt_off) {
135 if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) {
136 /* set new lvt offset */
137 hi &= ~MASK_LVTOFF_HI;
138 hi |= tr->lvt_off << 20;
139 }
140 }
141
119 tr->b->interrupt_enable ? 142 tr->b->interrupt_enable ?
120 (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : 143 (hi = (hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
121 (mci_misc_hi &= ~MASK_INT_TYPE_HI); 144 (hi &= ~MASK_INT_TYPE_HI);
122 145
123 mci_misc_hi |= MASK_COUNT_EN_HI; 146 hi |= MASK_COUNT_EN_HI;
124 wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi); 147 wrmsr(tr->b->address, lo, hi);
148}
149
150static void mce_threshold_block_init(struct threshold_block *b, int offset)
151{
152 struct thresh_restart tr = {
153 .b = b,
154 .set_lvt_off = 1,
155 .lvt_off = offset,
156 };
157
158 b->threshold_limit = THRESHOLD_MAX;
159 threshold_restart_bank(&tr);
160};
161
162static int setup_APIC_mce(int reserved, int new)
163{
164 if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR,
165 APIC_EILVT_MSG_FIX, 0))
166 return new;
167
168 return reserved;
125} 169}
126 170
127/* cpu init entry point, called from mce.c with preempt off */ 171/* cpu init entry point, called from mce.c with preempt off */
128void mce_amd_feature_init(struct cpuinfo_x86 *c) 172void mce_amd_feature_init(struct cpuinfo_x86 *c)
129{ 173{
174 struct threshold_block b;
130 unsigned int cpu = smp_processor_id(); 175 unsigned int cpu = smp_processor_id();
131 u32 low = 0, high = 0, address = 0; 176 u32 low = 0, high = 0, address = 0;
132 unsigned int bank, block; 177 unsigned int bank, block;
133 struct thresh_restart tr; 178 int offset = -1;
134 u8 lvt_off;
135 179
136 for (bank = 0; bank < NR_BANKS; ++bank) { 180 for (bank = 0; bank < NR_BANKS; ++bank) {
137 for (block = 0; block < NR_BLOCKS; ++block) { 181 for (block = 0; block < NR_BLOCKS; ++block) {
@@ -162,19 +206,16 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
162 if (shared_bank[bank] && c->cpu_core_id) 206 if (shared_bank[bank] && c->cpu_core_id)
163 break; 207 break;
164#endif 208#endif
165 lvt_off = setup_APIC_eilvt_mce(THRESHOLD_APIC_VECTOR, 209 offset = setup_APIC_mce(offset,
166 APIC_EILVT_MSG_FIX, 0); 210 (high & MASK_LVTOFF_HI) >> 20);
167 211
168 high &= ~MASK_LVTOFF_HI; 212 memset(&b, 0, sizeof(b));
169 high |= lvt_off << 20; 213 b.cpu = cpu;
170 wrmsr(address, low, high); 214 b.bank = bank;
171 215 b.block = block;
172 threshold_defaults.address = address; 216 b.address = address;
173 tr.b = &threshold_defaults;
174 tr.reset = 0;
175 tr.old_limit = 0;
176 threshold_restart_bank(&tr);
177 217
218 mce_threshold_block_init(&b, offset);
178 mce_threshold_vector = amd_threshold_interrupt; 219 mce_threshold_vector = amd_threshold_interrupt;
179 } 220 }
180 } 221 }
@@ -277,9 +318,8 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
277 318
278 b->interrupt_enable = !!new; 319 b->interrupt_enable = !!new;
279 320
321 memset(&tr, 0, sizeof(tr));
280 tr.b = b; 322 tr.b = b;
281 tr.reset = 0;
282 tr.old_limit = 0;
283 323
284 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); 324 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
285 325
@@ -300,10 +340,10 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
300 if (new < 1) 340 if (new < 1)
301 new = 1; 341 new = 1;
302 342
343 memset(&tr, 0, sizeof(tr));
303 tr.old_limit = b->threshold_limit; 344 tr.old_limit = b->threshold_limit;
304 b->threshold_limit = new; 345 b->threshold_limit = new;
305 tr.b = b; 346 tr.b = b;
306 tr.reset = 0;
307 347
308 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); 348 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
309 349
@@ -469,6 +509,7 @@ recurse:
469out_free: 509out_free:
470 if (b) { 510 if (b) {
471 kobject_put(&b->kobj); 511 kobject_put(&b->kobj);
512 list_del(&b->miscj);
472 kfree(b); 513 kfree(b);
473 } 514 }
474 return err; 515 return err;
@@ -487,15 +528,12 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
487 int i, err = 0; 528 int i, err = 0;
488 struct threshold_bank *b = NULL; 529 struct threshold_bank *b = NULL;
489 char name[32]; 530 char name[32];
490#ifdef CONFIG_SMP
491 struct cpuinfo_x86 *c = &cpu_data(cpu);
492#endif
493 531
494 sprintf(name, "threshold_bank%i", bank); 532 sprintf(name, "threshold_bank%i", bank);
495 533
496#ifdef CONFIG_SMP 534#ifdef CONFIG_SMP
497 if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ 535 if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */
498 i = cpumask_first(c->llc_shared_map); 536 i = cpumask_first(cpu_llc_shared_mask(cpu));
499 537
500 /* first core not up yet */ 538 /* first core not up yet */
501 if (cpu_data(i).cpu_core_id) 539 if (cpu_data(i).cpu_core_id)
@@ -515,7 +553,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
515 if (err) 553 if (err)
516 goto out; 554 goto out;
517 555
518 cpumask_copy(b->cpus, c->llc_shared_map); 556 cpumask_copy(b->cpus, cpu_llc_shared_mask(cpu));
519 per_cpu(threshold_banks, cpu)[bank] = b; 557 per_cpu(threshold_banks, cpu)[bank] = b;
520 558
521 goto out; 559 goto out;
@@ -582,9 +620,9 @@ static __cpuinit int threshold_create_device(unsigned int cpu)
582 continue; 620 continue;
583 err = threshold_create_bank(cpu, bank); 621 err = threshold_create_bank(cpu, bank);
584 if (err) 622 if (err)
585 goto out; 623 return err;
586 } 624 }
587out: 625
588 return err; 626 return err;
589} 627}
590 628
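
In the threshold_restart_bank() hunk above, changing the limit without a reset re-bases the hardware error count by adding (old_limit - new_limit), so the errors already counted are preserved against the new limit. Worked through with hypothetical values (THRESHOLD_MAX is 0xFFF, as defined at the top of mce_amd.c):

	#include <stdio.h>

	#define THRESHOLD_MAX 0xFFF

	int main(void)
	{
		/* Hypothetical snapshot: the limit was 10 and 3 errors have been counted,
		 * so the hardware count sits at THRESHOLD_MAX - 10 + 3. */
		int old_limit = 10, new_limit = 25;
		int count = THRESHOLD_MAX - old_limit + 3;

		/* Same re-basing as the "change limit w/o reset" branch. */
		int new_count = ((count & THRESHOLD_MAX) + (old_limit - new_limit))
				& THRESHOLD_MAX;

		printf("errors already counted: %d\n",
		       new_count - (THRESHOLD_MAX - new_limit));	/* prints 3 */
		return 0;
	}
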
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 6fcd0936194f..8694ef56459d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -130,7 +130,7 @@ void cmci_recheck(void)
130 unsigned long flags; 130 unsigned long flags;
131 int banks; 131 int banks;
132 132
133 if (!mce_available(&current_cpu_data) || !cmci_supported(&banks)) 133 if (!mce_available(__this_cpu_ptr(&cpu_info)) || !cmci_supported(&banks))
134 return; 134 return;
135 local_irq_save(flags); 135 local_irq_save(flags);
136 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); 136 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 169d8804a9f8..27c625178bf1 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -53,8 +53,14 @@ struct thermal_state {
53 struct _thermal_state core_power_limit; 53 struct _thermal_state core_power_limit;
54 struct _thermal_state package_throttle; 54 struct _thermal_state package_throttle;
55 struct _thermal_state package_power_limit; 55 struct _thermal_state package_power_limit;
56 struct _thermal_state core_thresh0;
57 struct _thermal_state core_thresh1;
56}; 58};
57 59
60/* Callback to handle core threshold interrupts */
61int (*platform_thermal_notify)(__u64 msr_val);
62EXPORT_SYMBOL(platform_thermal_notify);
63
58static DEFINE_PER_CPU(struct thermal_state, thermal_state); 64static DEFINE_PER_CPU(struct thermal_state, thermal_state);
59 65
60static atomic_t therm_throt_en = ATOMIC_INIT(0); 66static atomic_t therm_throt_en = ATOMIC_INIT(0);
@@ -181,8 +187,6 @@ static int therm_throt_process(bool new_event, int event, int level)
181 this_cpu, 187 this_cpu,
182 level == CORE_LEVEL ? "Core" : "Package", 188 level == CORE_LEVEL ? "Core" : "Package",
183 state->count); 189 state->count);
184
185 add_taint(TAINT_MACHINE_CHECK);
186 return 1; 190 return 1;
187 } 191 }
188 if (old_event) { 192 if (old_event) {
@@ -200,6 +204,22 @@ static int therm_throt_process(bool new_event, int event, int level)
200 return 0; 204 return 0;
201} 205}
202 206
207static int thresh_event_valid(int event)
208{
209 struct _thermal_state *state;
210 unsigned int this_cpu = smp_processor_id();
211 struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
212 u64 now = get_jiffies_64();
213
214 state = (event == 0) ? &pstate->core_thresh0 : &pstate->core_thresh1;
215
216 if (time_before64(now, state->next_check))
217 return 0;
218
219 state->next_check = now + CHECK_INTERVAL;
220 return 1;
221}
222
203#ifdef CONFIG_SYSFS 223#ifdef CONFIG_SYSFS
204/* Add/Remove thermal_throttle interface for CPU device: */ 224/* Add/Remove thermal_throttle interface for CPU device: */
205static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev, 225static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev,
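thresh_event_valid() is a straightforward jiffies-based throttle: each of the two core thresholds keeps a next_check timestamp, and an event is reported at most once per CHECK_INTERVAL (defined earlier in this file). The idiom on its own, with an assumed five-minute window:

#include <linux/jiffies.h>
#include <linux/types.h>

/* Assumed window; the real CHECK_INTERVAL is defined elsewhere in the file. */
#define DEMO_CHECK_INTERVAL	(300 * HZ)

struct demo_rate_state {
	u64 next_check;
};

static bool demo_event_allowed(struct demo_rate_state *st)
{
	u64 now = get_jiffies_64();

	if (time_before64(now, st->next_check))
		return false;		/* still inside the quiet period */

	st->next_check = now + DEMO_CHECK_INTERVAL;
	return true;
}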
@@ -313,32 +333,50 @@ device_initcall(thermal_throttle_init_device);
313#define PACKAGE_THROTTLED ((__u64)2 << 62) 333#define PACKAGE_THROTTLED ((__u64)2 << 62)
314#define PACKAGE_POWER_LIMIT ((__u64)3 << 62) 334#define PACKAGE_POWER_LIMIT ((__u64)3 << 62)
315 335
336static void notify_thresholds(__u64 msr_val)
337{
338 /* check whether the interrupt handler is defined;
339 * otherwise simply return
340 */
341 if (!platform_thermal_notify)
342 return;
343
344 /* lower threshold reached */
345 if ((msr_val & THERM_LOG_THRESHOLD0) && thresh_event_valid(0))
346 platform_thermal_notify(msr_val);
347 /* higher threshold reached */
348 if ((msr_val & THERM_LOG_THRESHOLD1) && thresh_event_valid(1))
349 platform_thermal_notify(msr_val);
350}
351
316/* Thermal transition interrupt handler */ 352/* Thermal transition interrupt handler */
317static void intel_thermal_interrupt(void) 353static void intel_thermal_interrupt(void)
318{ 354{
319 __u64 msr_val; 355 __u64 msr_val;
320 struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
321 356
322 rdmsrl(MSR_IA32_THERM_STATUS, msr_val); 357 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
323 358
359 /* Check for violation of core thermal thresholds*/
360 notify_thresholds(msr_val);
361
324 if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, 362 if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
325 THERMAL_THROTTLING_EVENT, 363 THERMAL_THROTTLING_EVENT,
326 CORE_LEVEL) != 0) 364 CORE_LEVEL) != 0)
327 mce_log_therm_throt_event(CORE_THROTTLED | msr_val); 365 mce_log_therm_throt_event(CORE_THROTTLED | msr_val);
328 366
329 if (cpu_has(c, X86_FEATURE_PLN)) 367 if (this_cpu_has(X86_FEATURE_PLN))
330 if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, 368 if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
331 POWER_LIMIT_EVENT, 369 POWER_LIMIT_EVENT,
332 CORE_LEVEL) != 0) 370 CORE_LEVEL) != 0)
333 mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val); 371 mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val);
334 372
335 if (cpu_has(c, X86_FEATURE_PTS)) { 373 if (this_cpu_has(X86_FEATURE_PTS)) {
336 rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); 374 rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
337 if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, 375 if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
338 THERMAL_THROTTLING_EVENT, 376 THERMAL_THROTTLING_EVENT,
339 PACKAGE_LEVEL) != 0) 377 PACKAGE_LEVEL) != 0)
340 mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val); 378 mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val);
341 if (cpu_has(c, X86_FEATURE_PLN)) 379 if (this_cpu_has(X86_FEATURE_PLN))
342 if (therm_throt_process(msr_val & 380 if (therm_throt_process(msr_val &
343 PACKAGE_THERM_STATUS_POWER_LIMIT, 381 PACKAGE_THERM_STATUS_POWER_LIMIT,
344 POWER_LIMIT_EVENT, 382 POWER_LIMIT_EVENT,
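The cpu_has(c, ...) calls become this_cpu_has(...): instead of fetching struct cpuinfo_x86 for smp_processor_id() and testing a bit in it, the feature bit of the executing CPU is read through the per-CPU accessors. A purely illustrative sketch showing the two forms side by side (the WARN only documents that they agree):

#include <linux/bug.h>
#include <linux/smp.h>
#include <asm/cpufeature.h>
#include <asm/processor.h>

static void demo_check_pln(void)
{
	/* Old style: materialise the per-CPU cpuinfo pointer first. */
	struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
	bool old_way = cpu_has(c, X86_FEATURE_PLN);

	/* New style: test the current CPU's feature bit directly. */
	bool new_way = this_cpu_has(X86_FEATURE_PLN);

	WARN_ON_ONCE(old_way != new_way);
}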
@@ -350,9 +388,8 @@ static void intel_thermal_interrupt(void)
350 388
351static void unexpected_thermal_interrupt(void) 389static void unexpected_thermal_interrupt(void)
352{ 390{
353 printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", 391 printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n",
354 smp_processor_id()); 392 smp_processor_id());
355 add_taint(TAINT_MACHINE_CHECK);
356} 393}
357 394
358static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; 395static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
@@ -405,18 +442,20 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
405 */ 442 */
406 rdmsr(MSR_IA32_MISC_ENABLE, l, h); 443 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
407 444
445 h = lvtthmr_init;
408 /* 446 /*
409 * The initial value of thermal LVT entries on all APs always reads 447 * The initial value of thermal LVT entries on all APs always reads
410 * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI 448 * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI
411 * sequence to them and LVT registers are reset to 0s except for 449 * sequence to them and LVT registers are reset to 0s except for
412 * the mask bits which are set to 1s when APs receive INIT IPI. 450 * the mask bits which are set to 1s when APs receive INIT IPI.
413 * Always restore the value that BIOS has programmed on AP based on 451 * If BIOS takes over the thermal interrupt and sets its interrupt
414 * BSP's info we saved since BIOS is always setting the same value 452 * delivery mode to SMI (not fixed), it restores the value that the
415 * for all threads/cores 453 * BIOS has programmed on AP based on BSP's info we saved since BIOS
454 * is always setting the same value for all threads/cores.
416 */ 455 */
417 apic_write(APIC_LVTTHMR, lvtthmr_init); 456 if ((h & APIC_DM_FIXED_MASK) != APIC_DM_FIXED)
457 apic_write(APIC_LVTTHMR, lvtthmr_init);
418 458
419 h = lvtthmr_init;
420 459
421 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { 460 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
422 printk(KERN_DEBUG 461 printk(KERN_DEBUG
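The new platform_thermal_notify hook is a plain exported function pointer: a platform driver that cares about the programmable core-threshold interrupts assigns it and receives the raw IA32_THERM_STATUS value, rate-limited by thresh_event_valid() above. A hedged sketch of a hypothetical consumer module (the real declaration lives in a header; the extern here is only for illustration):

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>

extern int (*platform_thermal_notify)(__u64 msr_val);	/* provided by therm_throt.c */

static int demo_thermal_notify(__u64 msr_val)
{
	pr_info("core thermal threshold crossed, IA32_THERM_STATUS=%#llx\n",
		(unsigned long long)msr_val);
	return 0;
}

static int __init demo_thermal_init(void)
{
	platform_thermal_notify = demo_thermal_notify;
	return 0;
}

static void __exit demo_thermal_exit(void)
{
	platform_thermal_notify = NULL;
}

module_init(demo_thermal_init);
module_exit(demo_thermal_exit);
MODULE_LICENSE("GPL");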
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index c5f59d071425..ac140c7be396 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -827,7 +827,7 @@ int __init amd_special_default_mtrr(void)
827 827
828 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) 828 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
829 return 0; 829 return 0;
830 if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) 830 if (boot_cpu_data.x86 < 0xf)
831 return 0; 831 return 0;
832 /* In case some hypervisor doesn't pass SYSCFG through: */ 832 /* In case some hypervisor doesn't pass SYSCFG through: */
833 if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) 833 if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 7d28d7d03885..a71efcdbb092 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * This only handles 32bit MTRR on 32bit hosts. This is strictly wrong 2 * This only handles 32bit MTRR on 32bit hosts. This is strictly wrong
3 * because MTRRs can span upto 40 bits (36bits on most modern x86) 3 * because MTRRs can span up to 40 bits (36bits on most modern x86)
4 */ 4 */
5#define DEBUG 5#define DEBUG
6 6
@@ -64,18 +64,59 @@ static inline void k8_check_syscfg_dram_mod_en(void)
64 } 64 }
65} 65}
66 66
67/* Get the size of contiguous MTRR range */
68static u64 get_mtrr_size(u64 mask)
69{
70 u64 size;
71
72 mask >>= PAGE_SHIFT;
73 mask |= size_or_mask;
74 size = -mask;
75 size <<= PAGE_SHIFT;
76 return size;
77}
78
67/* 79/*
68 * Returns the effective MTRR type for the region 80 * Check and return the effective type for MTRR-MTRR type overlap.
69 * Error returns: 81 * Returns 1 if the effective type is UNCACHEABLE, else returns 0
70 * - 0xFE - when the range is "not entirely covered" by _any_ var range MTRR
71 * - 0xFF - when MTRR is not enabled
72 */ 82 */
73u8 mtrr_type_lookup(u64 start, u64 end) 83static int check_type_overlap(u8 *prev, u8 *curr)
84{
85 if (*prev == MTRR_TYPE_UNCACHABLE || *curr == MTRR_TYPE_UNCACHABLE) {
86 *prev = MTRR_TYPE_UNCACHABLE;
87 *curr = MTRR_TYPE_UNCACHABLE;
88 return 1;
89 }
90
91 if ((*prev == MTRR_TYPE_WRBACK && *curr == MTRR_TYPE_WRTHROUGH) ||
92 (*prev == MTRR_TYPE_WRTHROUGH && *curr == MTRR_TYPE_WRBACK)) {
93 *prev = MTRR_TYPE_WRTHROUGH;
94 *curr = MTRR_TYPE_WRTHROUGH;
95 }
96
97 if (*prev != *curr) {
98 *prev = MTRR_TYPE_UNCACHABLE;
99 *curr = MTRR_TYPE_UNCACHABLE;
100 return 1;
101 }
102
103 return 0;
104}
105
106/*
107 * Error/Semi-error returns:
108 * 0xFF - when MTRR is not enabled
109 * *repeat == 1 implies [start:end] spanned across MTRR range and type returned
110 * corresponds only to [start:*partial_end].
111 * Caller has to lookup again for [*partial_end:end].
112 */
113static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat)
74{ 114{
75 int i; 115 int i;
76 u64 base, mask; 116 u64 base, mask;
77 u8 prev_match, curr_match; 117 u8 prev_match, curr_match;
78 118
119 *repeat = 0;
79 if (!mtrr_state_set) 120 if (!mtrr_state_set)
80 return 0xFF; 121 return 0xFF;
81 122
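Two helpers get factored out here. get_mtrr_size() turns a variable-range PHYSMASK into a length: after shifting to page units and OR-ing in size_or_mask (which fills the bits above the CPU's physical address width), the mask has the form ~(size_in_pages - 1), so negating it yields the size. check_type_overlap() encodes the combination rules used further down: UC wins over anything, WB combined with WT degrades to WT, and any other disagreement degrades to UC. A worked example of the size arithmetic as a stand-alone program, assuming a 36-bit physical address space and a 256 MB range:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	/* size_or_mask for 36 physical address bits */
	uint64_t size_or_mask = ~((1ULL << (36 - PAGE_SHIFT)) - 1);
	uint64_t mask = 0xFF0000000ULL;	/* PHYSMASK of a 256 MB variable range */
	uint64_t size;

	mask >>= PAGE_SHIFT;		/* 0x0000000000FF0000 */
	mask |= size_or_mask;		/* 0xFFFFFFFFFFFF0000 == ~(0x10000 - 1) */
	size = -mask;			/* two's complement: 0x10000 pages */
	size <<= PAGE_SHIFT;		/* 0x10000000 == 256 MB */

	printf("size = %#llx\n", (unsigned long long)size);
	return 0;
}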
@@ -126,8 +167,34 @@ u8 mtrr_type_lookup(u64 start, u64 end)
126 167
127 start_state = ((start & mask) == (base & mask)); 168 start_state = ((start & mask) == (base & mask));
128 end_state = ((end & mask) == (base & mask)); 169 end_state = ((end & mask) == (base & mask));
129 if (start_state != end_state) 170
130 return 0xFE; 171 if (start_state != end_state) {
172 /*
173 * We have start:end spanning across an MTRR.
174 * We split the region into
175 * either
176 * (start:mtrr_end) (mtrr_end:end)
177 * or
178 * (start:mtrr_start) (mtrr_start:end)
179 * depending on kind of overlap.
180 * Return the type for first region and a pointer to
181 * the start of second region so that caller will
182 * lookup again on the second region.
183 * Note: This way we handle multiple overlaps as well.
184 */
185 if (start_state)
186 *partial_end = base + get_mtrr_size(mask);
187 else
188 *partial_end = base;
189
190 if (unlikely(*partial_end <= start)) {
191 WARN_ON(1);
192 *partial_end = start + PAGE_SIZE;
193 }
194
195 end = *partial_end - 1; /* end is inclusive */
196 *repeat = 1;
197 }
131 198
132 if ((start & mask) != (base & mask)) 199 if ((start & mask) != (base & mask))
133 continue; 200 continue;
@@ -138,21 +205,8 @@ u8 mtrr_type_lookup(u64 start, u64 end)
138 continue; 205 continue;
139 } 206 }
140 207
141 if (prev_match == MTRR_TYPE_UNCACHABLE || 208 if (check_type_overlap(&prev_match, &curr_match))
142 curr_match == MTRR_TYPE_UNCACHABLE) { 209 return curr_match;
143 return MTRR_TYPE_UNCACHABLE;
144 }
145
146 if ((prev_match == MTRR_TYPE_WRBACK &&
147 curr_match == MTRR_TYPE_WRTHROUGH) ||
148 (prev_match == MTRR_TYPE_WRTHROUGH &&
149 curr_match == MTRR_TYPE_WRBACK)) {
150 prev_match = MTRR_TYPE_WRTHROUGH;
151 curr_match = MTRR_TYPE_WRTHROUGH;
152 }
153
154 if (prev_match != curr_match)
155 return MTRR_TYPE_UNCACHABLE;
156 } 210 }
157 211
158 if (mtrr_tom2) { 212 if (mtrr_tom2) {
@@ -166,6 +220,36 @@ u8 mtrr_type_lookup(u64 start, u64 end)
166 return mtrr_state.def_type; 220 return mtrr_state.def_type;
167} 221}
168 222
223/*
224 * Returns the effective MTRR type for the region
225 * Error return:
226 * 0xFF - when MTRR is not enabled
227 */
228u8 mtrr_type_lookup(u64 start, u64 end)
229{
230 u8 type, prev_type;
231 int repeat;
232 u64 partial_end;
233
234 type = __mtrr_type_lookup(start, end, &partial_end, &repeat);
235
236 /*
237 * Common path is with repeat = 0.
238 * However, we can have cases where [start:end] spans across some
239 * MTRR range. Do repeated lookups for that case here.
240 */
241 while (repeat) {
242 prev_type = type;
243 start = partial_end;
244 type = __mtrr_type_lookup(start, end, &partial_end, &repeat);
245
246 if (check_type_overlap(&prev_type, &type))
247 return type;
248 }
249
250 return type;
251}
252
169/* Get the MSR pair relating to a var range */ 253/* Get the MSR pair relating to a var range */
170static void 254static void
171get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) 255get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
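The net effect for callers is that the 0xFE "range not entirely covered" error disappears: mtrr_type_lookup() now splits a request that straddles an MTRR boundary internally, repeating __mtrr_type_lookup() until the whole [start:end] range is resolved, and combines the pieces with check_type_overlap(). A hedged sketch of a caller (pat.c does something along these lines), with the only remaining special value spelled out:

#include <linux/types.h>
#include <asm/mtrr.h>

/* Purely illustrative caller. */
static u8 demo_effective_type(u64 start, u64 end)
{
	u8 type = mtrr_type_lookup(start, end);

	/*
	 * 0xFF: MTRRs are not enabled at all, so only PAT governs the
	 * mapping.  A range spanning several MTRRs no longer produces a
	 * special error; it comes back as one already-combined type.
	 */
	return type;
}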
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 01c0f3ee6cc3..929739a653d1 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -45,6 +45,7 @@
45#include <linux/cpu.h> 45#include <linux/cpu.h>
46#include <linux/pci.h> 46#include <linux/pci.h>
47#include <linux/smp.h> 47#include <linux/smp.h>
48#include <linux/syscore_ops.h>
48 49
49#include <asm/processor.h> 50#include <asm/processor.h>
50#include <asm/e820.h> 51#include <asm/e820.h>
@@ -292,14 +293,24 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
292 293
293 /* 294 /*
294 * HACK! 295 * HACK!
295 * We use this same function to initialize the mtrrs on boot. 296 *
296 * The state of the boot cpu's mtrrs has been saved, and we want 297 * We use this same function to initialize the mtrrs during boot,
297 * to replicate across all the APs. 298 * resume, runtime cpu online and on an explicit request to set a
298 * If we're doing that @reg is set to something special... 299 * specific MTRR.
300 *
301 * During boot or suspend, the state of the boot cpu's mtrrs has been
302 * saved, and we want to replicate that across all the cpus that come
303 * online (either at the end of boot or resume or during a runtime cpu
304 * online). If we're doing that, @reg is set to something special and on
305 * this cpu we still do mtrr_if->set_all(). During boot/resume, this
306 * is unnecessary if at this point we are still on the cpu that started
307 * the boot/resume sequence. But there is no guarantee that we are still
308 * on the same cpu. So we do mtrr_if->set_all() on this cpu aswell to be
309 * sure that we are in sync with everyone else.
299 */ 310 */
300 if (reg != ~0U) 311 if (reg != ~0U)
301 mtrr_if->set(reg, base, size, type); 312 mtrr_if->set(reg, base, size, type);
302 else if (!mtrr_aps_delayed_init) 313 else
303 mtrr_if->set_all(); 314 mtrr_if->set_all();
304 315
305 /* Wait for the others */ 316 /* Wait for the others */
@@ -630,7 +641,7 @@ struct mtrr_value {
630 641
631static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES]; 642static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES];
632 643
633static int mtrr_save(struct sys_device *sysdev, pm_message_t state) 644static int mtrr_save(void)
634{ 645{
635 int i; 646 int i;
636 647
@@ -642,7 +653,7 @@ static int mtrr_save(struct sys_device *sysdev, pm_message_t state)
642 return 0; 653 return 0;
643} 654}
644 655
645static int mtrr_restore(struct sys_device *sysdev) 656static void mtrr_restore(void)
646{ 657{
647 int i; 658 int i;
648 659
@@ -653,12 +664,11 @@ static int mtrr_restore(struct sys_device *sysdev)
653 mtrr_value[i].ltype); 664 mtrr_value[i].ltype);
654 } 665 }
655 } 666 }
656 return 0;
657} 667}
658 668
659 669
660 670
661static struct sysdev_driver mtrr_sysdev_driver = { 671static struct syscore_ops mtrr_syscore_ops = {
662 .suspend = mtrr_save, 672 .suspend = mtrr_save,
663 .resume = mtrr_restore, 673 .resume = mtrr_restore,
664}; 674};
@@ -793,13 +803,21 @@ void set_mtrr_aps_delayed_init(void)
793} 803}
794 804
795/* 805/*
796 * MTRR initialization for all AP's 806 * Delayed MTRR initialization for all AP's
797 */ 807 */
798void mtrr_aps_init(void) 808void mtrr_aps_init(void)
799{ 809{
800 if (!use_intel()) 810 if (!use_intel())
801 return; 811 return;
802 812
813 /*
814 * Check if someone has requested the delay of AP MTRR initialization,
815 * by doing set_mtrr_aps_delayed_init(), prior to this point. If not,
816 * then we are done.
817 */
818 if (!mtrr_aps_delayed_init)
819 return;
820
803 set_mtrr(~0U, 0, 0, 0); 821 set_mtrr(~0U, 0, 0, 0);
804 mtrr_aps_delayed_init = false; 822 mtrr_aps_delayed_init = false;
805} 823}
@@ -831,7 +849,7 @@ static int __init mtrr_init_finialize(void)
831 * TBD: is there any system with such CPU which supports 849 * TBD: is there any system with such CPU which supports
832 * suspend/resume? If no, we should remove the code. 850 * suspend/resume? If no, we should remove the code.
833 */ 851 */
834 sysdev_driver_register(&cpu_sysdev_class, &mtrr_sysdev_driver); 852 register_syscore_ops(&mtrr_syscore_ops);
835 853
836 return 0; 854 return 0;
837} 855}
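The sysdev suspend/resume pair becomes syscore_ops: no struct sys_device or pm_message_t argument, suspend still returns an int, resume returns void, and the callbacks run late in suspend / early in resume with interrupts disabled on a single CPU. The registration pattern in isolation:

#include <linux/init.h>
#include <linux/syscore_ops.h>

static int demo_state_save(void)
{
	/* Called late in suspend, IRQs off, on one CPU. */
	return 0;
}

static void demo_state_restore(void)
{
	/* Called early in resume, before ordinary devices. */
}

static struct syscore_ops demo_syscore_ops = {
	.suspend = demo_state_save,
	.resume  = demo_state_restore,
};

static int __init demo_syscore_init(void)
{
	register_syscore_ops(&demo_syscore_ops);
	return 0;
}
device_initcall(demo_syscore_init);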
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 03a5b0385ad6..3a0338b4b179 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -30,6 +30,8 @@
30#include <asm/stacktrace.h> 30#include <asm/stacktrace.h>
31#include <asm/nmi.h> 31#include <asm/nmi.h>
32#include <asm/compat.h> 32#include <asm/compat.h>
33#include <asm/smp.h>
34#include <asm/alternative.h>
33 35
34#if 0 36#if 0
35#undef wrmsrl 37#undef wrmsrl
@@ -49,7 +51,6 @@ static unsigned long
49copy_from_user_nmi(void *to, const void __user *from, unsigned long n) 51copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
50{ 52{
51 unsigned long offset, addr = (unsigned long)from; 53 unsigned long offset, addr = (unsigned long)from;
52 int type = in_nmi() ? KM_NMI : KM_IRQ0;
53 unsigned long size, len = 0; 54 unsigned long size, len = 0;
54 struct page *page; 55 struct page *page;
55 void *map; 56 void *map;
@@ -63,9 +64,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
63 offset = addr & (PAGE_SIZE - 1); 64 offset = addr & (PAGE_SIZE - 1);
64 size = min(PAGE_SIZE - offset, n - len); 65 size = min(PAGE_SIZE - offset, n - len);
65 66
66 map = kmap_atomic(page, type); 67 map = kmap_atomic(page);
67 memcpy(to, map+offset, size); 68 memcpy(to, map+offset, size);
68 kunmap_atomic(map, type); 69 kunmap_atomic(map);
69 put_page(page); 70 put_page(page);
70 71
71 len += size; 72 len += size;
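The kmap_atomic() change is purely an API update: the KM_NMI/KM_IRQ0 slot argument is gone, atomic mappings now stack implicitly, and kunmap_atomic() takes just the mapped address. The copy step reduced to its essentials:

#include <linux/highmem.h>
#include <linux/kernel.h>
#include <linux/string.h>

/* Copy up to one page of data out of a (possibly highmem) page. */
static void demo_copy_out(struct page *page, void *dst, size_t len)
{
	void *map = kmap_atomic(page);		/* no KM_* slot argument any more */

	memcpy(dst, map, min_t(size_t, len, PAGE_SIZE));
	kunmap_atomic(map);			/* unmap by address */
}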
@@ -94,6 +95,8 @@ struct amd_nb {
94 struct event_constraint event_constraints[X86_PMC_IDX_MAX]; 95 struct event_constraint event_constraints[X86_PMC_IDX_MAX];
95}; 96};
96 97
98struct intel_percore;
99
97#define MAX_LBR_ENTRIES 16 100#define MAX_LBR_ENTRIES 16
98 101
99struct cpu_hw_events { 102struct cpu_hw_events {
@@ -129,6 +132,13 @@ struct cpu_hw_events {
129 struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; 132 struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
130 133
131 /* 134 /*
135 * Intel percore register state.
136 * Coordinate shared resources between HT threads.
137 */
138 int percore_used; /* Used by this CPU? */
139 struct intel_percore *per_core;
140
141 /*
132 * AMD specific bits 142 * AMD specific bits
133 */ 143 */
134 struct amd_nb *amd_nb; 144 struct amd_nb *amd_nb;
@@ -167,7 +177,7 @@ struct cpu_hw_events {
167/* 177/*
168 * Constraint on the Event code + UMask 178 * Constraint on the Event code + UMask
169 */ 179 */
170#define PEBS_EVENT_CONSTRAINT(c, n) \ 180#define INTEL_UEVENT_CONSTRAINT(c, n) \
171 EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) 181 EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
172 182
173#define EVENT_CONSTRAINT_END \ 183#define EVENT_CONSTRAINT_END \
@@ -176,6 +186,28 @@ struct cpu_hw_events {
176#define for_each_event_constraint(e, c) \ 186#define for_each_event_constraint(e, c) \
177 for ((e) = (c); (e)->weight; (e)++) 187 for ((e) = (c); (e)->weight; (e)++)
178 188
189/*
190 * Extra registers for specific events.
191 * Some events need large masks and require external MSRs.
192 * Define a mapping to these extra registers.
193 */
194struct extra_reg {
195 unsigned int event;
196 unsigned int msr;
197 u64 config_mask;
198 u64 valid_mask;
199};
200
201#define EVENT_EXTRA_REG(e, ms, m, vm) { \
202 .event = (e), \
203 .msr = (ms), \
204 .config_mask = (m), \
205 .valid_mask = (vm), \
206 }
207#define INTEL_EVENT_EXTRA_REG(event, msr, vm) \
208 EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm)
209#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0)
210
179union perf_capabilities { 211union perf_capabilities {
180 struct { 212 struct {
181 u64 lbr_format : 6; 213 u64 lbr_format : 6;
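The extra_reg machinery lets an event description name an auxiliary MSR whose contents come from the new attr.config1 field, with valid_mask bounding what user space may program (OFFCORE_RESPONSE-style events are the intended users). A sketch of how a PMU description would populate such a table using the macros above; the event code, MSR number and mask below are placeholders, not asserted hardware values:

/* Lives next to the definitions above, inside perf_event.c. */
static struct extra_reg demo_extra_regs[] = {
	INTEL_EVENT_EXTRA_REG(0xb7, 0x1a6, 0xffff),	/* event, extra MSR, valid bits */
	EVENT_EXTRA_END
};

/* Selected when the matching PMU is detected, e.g.: */
/*	x86_pmu.extra_regs = demo_extra_regs;         */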
@@ -220,6 +252,7 @@ struct x86_pmu {
220 void (*put_event_constraints)(struct cpu_hw_events *cpuc, 252 void (*put_event_constraints)(struct cpu_hw_events *cpuc,
221 struct perf_event *event); 253 struct perf_event *event);
222 struct event_constraint *event_constraints; 254 struct event_constraint *event_constraints;
255 struct event_constraint *percore_constraints;
223 void (*quirks)(void); 256 void (*quirks)(void);
224 int perfctr_second_write; 257 int perfctr_second_write;
225 258
@@ -238,6 +271,7 @@ struct x86_pmu {
238 * Intel DebugStore bits 271 * Intel DebugStore bits
239 */ 272 */
240 int bts, pebs; 273 int bts, pebs;
274 int bts_active, pebs_active;
241 int pebs_record_size; 275 int pebs_record_size;
242 void (*drain_pebs)(struct pt_regs *regs); 276 void (*drain_pebs)(struct pt_regs *regs);
243 struct event_constraint *pebs_constraints; 277 struct event_constraint *pebs_constraints;
@@ -247,6 +281,11 @@ struct x86_pmu {
247 */ 281 */
248 unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */ 282 unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */
249 int lbr_nr; /* hardware stack size */ 283 int lbr_nr; /* hardware stack size */
284
285 /*
286 * Extra registers for events
287 */
288 struct extra_reg *extra_regs;
250}; 289};
251 290
252static struct x86_pmu x86_pmu __read_mostly; 291static struct x86_pmu x86_pmu __read_mostly;
@@ -271,6 +310,10 @@ static u64 __read_mostly hw_cache_event_ids
271 [PERF_COUNT_HW_CACHE_MAX] 310 [PERF_COUNT_HW_CACHE_MAX]
272 [PERF_COUNT_HW_CACHE_OP_MAX] 311 [PERF_COUNT_HW_CACHE_OP_MAX]
273 [PERF_COUNT_HW_CACHE_RESULT_MAX]; 312 [PERF_COUNT_HW_CACHE_RESULT_MAX];
313static u64 __read_mostly hw_cache_extra_regs
314 [PERF_COUNT_HW_CACHE_MAX]
315 [PERF_COUNT_HW_CACHE_OP_MAX]
316 [PERF_COUNT_HW_CACHE_RESULT_MAX];
274 317
275/* 318/*
276 * Propagate event elapsed time into the generic event. 319 * Propagate event elapsed time into the generic event.
@@ -298,7 +341,7 @@ x86_perf_event_update(struct perf_event *event)
298 */ 341 */
299again: 342again:
300 prev_raw_count = local64_read(&hwc->prev_count); 343 prev_raw_count = local64_read(&hwc->prev_count);
301 rdmsrl(hwc->event_base + idx, new_raw_count); 344 rdmsrl(hwc->event_base, new_raw_count);
302 345
303 if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, 346 if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
304 new_raw_count) != prev_raw_count) 347 new_raw_count) != prev_raw_count)
@@ -321,6 +364,55 @@ again:
321 return new_raw_count; 364 return new_raw_count;
322} 365}
323 366
367static inline int x86_pmu_addr_offset(int index)
368{
369 int offset;
370
371 /* offset = X86_FEATURE_PERFCTR_CORE ? index << 1 : index */
372 alternative_io(ASM_NOP2,
373 "shll $1, %%eax",
374 X86_FEATURE_PERFCTR_CORE,
375 "=a" (offset),
376 "a" (index));
377
378 return offset;
379}
380
381static inline unsigned int x86_pmu_config_addr(int index)
382{
383 return x86_pmu.eventsel + x86_pmu_addr_offset(index);
384}
385
386static inline unsigned int x86_pmu_event_addr(int index)
387{
388 return x86_pmu.perfctr + x86_pmu_addr_offset(index);
389}
390
391/*
392 * Find and validate any extra registers to set up.
393 */
394static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
395{
396 struct extra_reg *er;
397
398 event->hw.extra_reg = 0;
399 event->hw.extra_config = 0;
400
401 if (!x86_pmu.extra_regs)
402 return 0;
403
404 for (er = x86_pmu.extra_regs; er->msr; er++) {
405 if (er->event != (config & er->config_mask))
406 continue;
407 if (event->attr.config1 & ~er->valid_mask)
408 return -EINVAL;
409 event->hw.extra_reg = er->msr;
410 event->hw.extra_config = event->attr.config1;
411 break;
412 }
413 return 0;
414}
415
324static atomic_t active_events; 416static atomic_t active_events;
325static DEFINE_MUTEX(pmc_reserve_mutex); 417static DEFINE_MUTEX(pmc_reserve_mutex);
326 418
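x86_pmu_addr_offset() exists because AMD parts with PERFCTR_CORE interleave their event-select and counter MSRs with a stride of two; alternative_io() patches the "shll $1, %eax" in at boot only on such CPUs, so the common path pays no runtime branch. A plain-C equivalent of what the patched code computes, using an ordinary feature test instead of runtime patching:

#include <asm/cpufeature.h>

static inline int demo_pmu_addr_offset(int index)
{
	if (boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
		return index << 1;	/* MSRs are spaced two apart */

	return index;			/* legacy layout: contiguous MSRs */
}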
@@ -330,16 +422,13 @@ static bool reserve_pmc_hardware(void)
330{ 422{
331 int i; 423 int i;
332 424
333 if (nmi_watchdog == NMI_LOCAL_APIC)
334 disable_lapic_nmi_watchdog();
335
336 for (i = 0; i < x86_pmu.num_counters; i++) { 425 for (i = 0; i < x86_pmu.num_counters; i++) {
337 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) 426 if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
338 goto perfctr_fail; 427 goto perfctr_fail;
339 } 428 }
340 429
341 for (i = 0; i < x86_pmu.num_counters; i++) { 430 for (i = 0; i < x86_pmu.num_counters; i++) {
342 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) 431 if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
343 goto eventsel_fail; 432 goto eventsel_fail;
344 } 433 }
345 434
@@ -347,16 +436,13 @@ static bool reserve_pmc_hardware(void)
347 436
348eventsel_fail: 437eventsel_fail:
349 for (i--; i >= 0; i--) 438 for (i--; i >= 0; i--)
350 release_evntsel_nmi(x86_pmu.eventsel + i); 439 release_evntsel_nmi(x86_pmu_config_addr(i));
351 440
352 i = x86_pmu.num_counters; 441 i = x86_pmu.num_counters;
353 442
354perfctr_fail: 443perfctr_fail:
355 for (i--; i >= 0; i--) 444 for (i--; i >= 0; i--)
356 release_perfctr_nmi(x86_pmu.perfctr + i); 445 release_perfctr_nmi(x86_pmu_event_addr(i));
357
358 if (nmi_watchdog == NMI_LOCAL_APIC)
359 enable_lapic_nmi_watchdog();
360 446
361 return false; 447 return false;
362} 448}
@@ -366,12 +452,9 @@ static void release_pmc_hardware(void)
366 int i; 452 int i;
367 453
368 for (i = 0; i < x86_pmu.num_counters; i++) { 454 for (i = 0; i < x86_pmu.num_counters; i++) {
369 release_perfctr_nmi(x86_pmu.perfctr + i); 455 release_perfctr_nmi(x86_pmu_event_addr(i));
370 release_evntsel_nmi(x86_pmu.eventsel + i); 456 release_evntsel_nmi(x86_pmu_config_addr(i));
371 } 457 }
372
373 if (nmi_watchdog == NMI_LOCAL_APIC)
374 enable_lapic_nmi_watchdog();
375} 458}
376 459
377#else 460#else
@@ -381,7 +464,64 @@ static void release_pmc_hardware(void) {}
381 464
382#endif 465#endif
383 466
384static int reserve_ds_buffers(void); 467static bool check_hw_exists(void)
468{
469 u64 val, val_new = 0;
470 int i, reg, ret = 0;
471
472 /*
473 * Check to see if the BIOS enabled any of the counters, if so
474 * complain and bail.
475 */
476 for (i = 0; i < x86_pmu.num_counters; i++) {
477 reg = x86_pmu_config_addr(i);
478 ret = rdmsrl_safe(reg, &val);
479 if (ret)
480 goto msr_fail;
481 if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
482 goto bios_fail;
483 }
484
485 if (x86_pmu.num_counters_fixed) {
486 reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
487 ret = rdmsrl_safe(reg, &val);
488 if (ret)
489 goto msr_fail;
490 for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
491 if (val & (0x03 << i*4))
492 goto bios_fail;
493 }
494 }
495
496 /*
497 * Now write a value and read it back to see if it matches,
498 * this is needed to detect certain hardware emulators (qemu/kvm)
499 * that don't trap on the MSR access and always return 0s.
500 */
501 val = 0xabcdUL;
502 ret = checking_wrmsrl(x86_pmu_event_addr(0), val);
503 ret |= rdmsrl_safe(x86_pmu_event_addr(0), &val_new);
504 if (ret || val != val_new)
505 goto msr_fail;
506
507 return true;
508
509bios_fail:
510 /*
511 * We still allow the PMU driver to operate:
512 */
513 printk(KERN_CONT "Broken BIOS detected, complain to your hardware vendor.\n");
514 printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg, val);
515
516 return true;
517
518msr_fail:
519 printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n");
520
521 return false;
522}
523
524static void reserve_ds_buffers(void);
385static void release_ds_buffers(void); 525static void release_ds_buffers(void);
386 526
387static void hw_perf_event_destroy(struct perf_event *event) 527static void hw_perf_event_destroy(struct perf_event *event)
@@ -399,8 +539,9 @@ static inline int x86_pmu_initialized(void)
399} 539}
400 540
401static inline int 541static inline int
402set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr) 542set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
403{ 543{
544 struct perf_event_attr *attr = &event->attr;
404 unsigned int cache_type, cache_op, cache_result; 545 unsigned int cache_type, cache_op, cache_result;
405 u64 config, val; 546 u64 config, val;
406 547
@@ -427,8 +568,8 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
427 return -EINVAL; 568 return -EINVAL;
428 569
429 hwc->config |= val; 570 hwc->config |= val;
430 571 attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
431 return 0; 572 return x86_pmu_extra_regs(val, event);
432} 573}
433 574
434static int x86_setup_perfctr(struct perf_event *event) 575static int x86_setup_perfctr(struct perf_event *event)
@@ -437,7 +578,7 @@ static int x86_setup_perfctr(struct perf_event *event)
437 struct hw_perf_event *hwc = &event->hw; 578 struct hw_perf_event *hwc = &event->hw;
438 u64 config; 579 u64 config;
439 580
440 if (!hwc->sample_period) { 581 if (!is_sampling_event(event)) {
441 hwc->sample_period = x86_pmu.max_period; 582 hwc->sample_period = x86_pmu.max_period;
442 hwc->last_period = hwc->sample_period; 583 hwc->last_period = hwc->sample_period;
443 local64_set(&hwc->period_left, hwc->sample_period); 584 local64_set(&hwc->period_left, hwc->sample_period);
@@ -452,11 +593,15 @@ static int x86_setup_perfctr(struct perf_event *event)
452 return -EOPNOTSUPP; 593 return -EOPNOTSUPP;
453 } 594 }
454 595
596 /*
597 * Do not allow config1 (extended registers) to propagate,
598 * there's no sane user-space generalization yet:
599 */
455 if (attr->type == PERF_TYPE_RAW) 600 if (attr->type == PERF_TYPE_RAW)
456 return 0; 601 return 0;
457 602
458 if (attr->type == PERF_TYPE_HW_CACHE) 603 if (attr->type == PERF_TYPE_HW_CACHE)
459 return set_ext_hw_attr(hwc, attr); 604 return set_ext_hw_attr(hwc, event);
460 605
461 if (attr->config >= x86_pmu.max_events) 606 if (attr->config >= x86_pmu.max_events)
462 return -EINVAL; 607 return -EINVAL;
@@ -475,10 +620,10 @@ static int x86_setup_perfctr(struct perf_event *event)
475 /* 620 /*
476 * Branch tracing: 621 * Branch tracing:
477 */ 622 */
478 if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) && 623 if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
479 (hwc->sample_period == 1)) { 624 !attr->freq && hwc->sample_period == 1) {
480 /* BTS is not supported by this architecture. */ 625 /* BTS is not supported by this architecture. */
481 if (!x86_pmu.bts) 626 if (!x86_pmu.bts_active)
482 return -EOPNOTSUPP; 627 return -EOPNOTSUPP;
483 628
484 /* BTS is currently only allowed for user-mode. */ 629 /* BTS is currently only allowed for user-mode. */
@@ -497,12 +642,13 @@ static int x86_pmu_hw_config(struct perf_event *event)
497 int precise = 0; 642 int precise = 0;
498 643
499 /* Support for constant skid */ 644 /* Support for constant skid */
500 if (x86_pmu.pebs) 645 if (x86_pmu.pebs_active) {
501 precise++; 646 precise++;
502 647
503 /* Support for IP fixup */ 648 /* Support for IP fixup */
504 if (x86_pmu.lbr_nr) 649 if (x86_pmu.lbr_nr)
505 precise++; 650 precise++;
651 }
506 652
507 if (event->attr.precise_ip > precise) 653 if (event->attr.precise_ip > precise)
508 return -EOPNOTSUPP; 654 return -EOPNOTSUPP;
@@ -531,7 +677,7 @@ static int x86_pmu_hw_config(struct perf_event *event)
531/* 677/*
532 * Setup the hardware configuration for a given attr_type 678 * Setup the hardware configuration for a given attr_type
533 */ 679 */
534static int __hw_perf_event_init(struct perf_event *event) 680static int __x86_pmu_event_init(struct perf_event *event)
535{ 681{
536 int err; 682 int err;
537 683
@@ -544,11 +690,8 @@ static int __hw_perf_event_init(struct perf_event *event)
544 if (atomic_read(&active_events) == 0) { 690 if (atomic_read(&active_events) == 0) {
545 if (!reserve_pmc_hardware()) 691 if (!reserve_pmc_hardware())
546 err = -EBUSY; 692 err = -EBUSY;
547 else { 693 else
548 err = reserve_ds_buffers(); 694 reserve_ds_buffers();
549 if (err)
550 release_pmc_hardware();
551 }
552 } 695 }
553 if (!err) 696 if (!err)
554 atomic_inc(&active_events); 697 atomic_inc(&active_events);
@@ -576,15 +719,15 @@ static void x86_pmu_disable_all(void)
576 719
577 if (!test_bit(idx, cpuc->active_mask)) 720 if (!test_bit(idx, cpuc->active_mask))
578 continue; 721 continue;
579 rdmsrl(x86_pmu.eventsel + idx, val); 722 rdmsrl(x86_pmu_config_addr(idx), val);
580 if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE)) 723 if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
581 continue; 724 continue;
582 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; 725 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
583 wrmsrl(x86_pmu.eventsel + idx, val); 726 wrmsrl(x86_pmu_config_addr(idx), val);
584 } 727 }
585} 728}
586 729
587void hw_perf_disable(void) 730static void x86_pmu_disable(struct pmu *pmu)
588{ 731{
589 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 732 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
590 733
@@ -601,25 +744,30 @@ void hw_perf_disable(void)
601 x86_pmu.disable_all(); 744 x86_pmu.disable_all();
602} 745}
603 746
747static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
748 u64 enable_mask)
749{
750 if (hwc->extra_reg)
751 wrmsrl(hwc->extra_reg, hwc->extra_config);
752 wrmsrl(hwc->config_base, hwc->config | enable_mask);
753}
754
604static void x86_pmu_enable_all(int added) 755static void x86_pmu_enable_all(int added)
605{ 756{
606 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 757 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
607 int idx; 758 int idx;
608 759
609 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 760 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
610 struct perf_event *event = cpuc->events[idx]; 761 struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
611 u64 val;
612 762
613 if (!test_bit(idx, cpuc->active_mask)) 763 if (!test_bit(idx, cpuc->active_mask))
614 continue; 764 continue;
615 765
616 val = event->hw.config; 766 __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
617 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
618 wrmsrl(x86_pmu.eventsel + idx, val);
619 } 767 }
620} 768}
621 769
622static const struct pmu pmu; 770static struct pmu pmu;
623 771
624static inline int is_x86_event(struct perf_event *event) 772static inline int is_x86_event(struct perf_event *event)
625{ 773{
@@ -780,15 +928,10 @@ static inline void x86_assign_hw_event(struct perf_event *event,
780 hwc->event_base = 0; 928 hwc->event_base = 0;
781 } else if (hwc->idx >= X86_PMC_IDX_FIXED) { 929 } else if (hwc->idx >= X86_PMC_IDX_FIXED) {
782 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; 930 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
783 /* 931 hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - X86_PMC_IDX_FIXED);
784 * We set it so that event_base + idx in wrmsr/rdmsr maps to
785 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
786 */
787 hwc->event_base =
788 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
789 } else { 932 } else {
790 hwc->config_base = x86_pmu.eventsel; 933 hwc->config_base = x86_pmu_config_addr(hwc->idx);
791 hwc->event_base = x86_pmu.perfctr; 934 hwc->event_base = x86_pmu_event_addr(hwc->idx);
792 } 935 }
793} 936}
794 937
@@ -801,10 +944,10 @@ static inline int match_prev_assignment(struct hw_perf_event *hwc,
801 hwc->last_tag == cpuc->tags[i]; 944 hwc->last_tag == cpuc->tags[i];
802} 945}
803 946
804static int x86_pmu_start(struct perf_event *event); 947static void x86_pmu_start(struct perf_event *event, int flags);
805static void x86_pmu_stop(struct perf_event *event); 948static void x86_pmu_stop(struct perf_event *event, int flags);
806 949
807void hw_perf_enable(void) 950static void x86_pmu_enable(struct pmu *pmu)
808{ 951{
809 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 952 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
810 struct perf_event *event; 953 struct perf_event *event;
@@ -840,7 +983,14 @@ void hw_perf_enable(void)
840 match_prev_assignment(hwc, cpuc, i)) 983 match_prev_assignment(hwc, cpuc, i))
841 continue; 984 continue;
842 985
843 x86_pmu_stop(event); 986 /*
987 * Ensure we don't accidentally enable a stopped
988 * counter simply because we rescheduled.
989 */
990 if (hwc->state & PERF_HES_STOPPED)
991 hwc->state |= PERF_HES_ARCH;
992
993 x86_pmu_stop(event, PERF_EF_UPDATE);
844 } 994 }
845 995
846 for (i = 0; i < cpuc->n_events; i++) { 996 for (i = 0; i < cpuc->n_events; i++) {
@@ -852,7 +1002,10 @@ void hw_perf_enable(void)
852 else if (i < n_running) 1002 else if (i < n_running)
853 continue; 1003 continue;
854 1004
855 x86_pmu_start(event); 1005 if (hwc->state & PERF_HES_ARCH)
1006 continue;
1007
1008 x86_pmu_start(event, PERF_EF_RELOAD);
856 } 1009 }
857 cpuc->n_added = 0; 1010 cpuc->n_added = 0;
858 perf_events_lapic_init(); 1011 perf_events_lapic_init();
@@ -864,17 +1017,11 @@ void hw_perf_enable(void)
864 x86_pmu.enable_all(added); 1017 x86_pmu.enable_all(added);
865} 1018}
866 1019
867static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
868 u64 enable_mask)
869{
870 wrmsrl(hwc->config_base + hwc->idx, hwc->config | enable_mask);
871}
872
873static inline void x86_pmu_disable_event(struct perf_event *event) 1020static inline void x86_pmu_disable_event(struct perf_event *event)
874{ 1021{
875 struct hw_perf_event *hwc = &event->hw; 1022 struct hw_perf_event *hwc = &event->hw;
876 1023
877 wrmsrl(hwc->config_base + hwc->idx, hwc->config); 1024 wrmsrl(hwc->config_base, hwc->config);
878} 1025}
879 1026
880static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); 1027static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
@@ -927,7 +1074,7 @@ x86_perf_event_set_period(struct perf_event *event)
927 */ 1074 */
928 local64_set(&hwc->prev_count, (u64)-left); 1075 local64_set(&hwc->prev_count, (u64)-left);
929 1076
930 wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask); 1077 wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
931 1078
932 /* 1079 /*
933 * Due to erratum on certain cpu we need 1080
@@ -935,7 +1082,7 @@ x86_perf_event_set_period(struct perf_event *event)
935 * is updated properly 1082 * is updated properly
936 */ 1083 */
937 if (x86_pmu.perfctr_second_write) { 1084 if (x86_pmu.perfctr_second_write) {
938 wrmsrl(hwc->event_base + idx, 1085 wrmsrl(hwc->event_base,
939 (u64)(-left) & x86_pmu.cntval_mask); 1086 (u64)(-left) & x86_pmu.cntval_mask);
940 } 1087 }
941 1088
@@ -946,22 +1093,18 @@ x86_perf_event_set_period(struct perf_event *event)
946 1093
947static void x86_pmu_enable_event(struct perf_event *event) 1094static void x86_pmu_enable_event(struct perf_event *event)
948{ 1095{
949 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1096 if (__this_cpu_read(cpu_hw_events.enabled))
950 if (cpuc->enabled)
951 __x86_pmu_enable_event(&event->hw, 1097 __x86_pmu_enable_event(&event->hw,
952 ARCH_PERFMON_EVENTSEL_ENABLE); 1098 ARCH_PERFMON_EVENTSEL_ENABLE);
953} 1099}
954 1100
955/* 1101/*
956 * activate a single event 1102 * Add a single event to the PMU.
957 * 1103 *
958 * The event is added to the group of enabled events 1104 * The event is added to the group of enabled events
959 * but only if it can be scheduled with existing events. 1105 * but only if it can be scheduled with existing events.
960 *
961 * Called with PMU disabled. If successful and return value 1,
962 * then guaranteed to call perf_enable() and hw_perf_enable()
963 */ 1106 */
964static int x86_pmu_enable(struct perf_event *event) 1107static int x86_pmu_add(struct perf_event *event, int flags)
965{ 1108{
966 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1109 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
967 struct hw_perf_event *hwc; 1110 struct hw_perf_event *hwc;
@@ -970,58 +1113,67 @@ static int x86_pmu_enable(struct perf_event *event)
970 1113
971 hwc = &event->hw; 1114 hwc = &event->hw;
972 1115
1116 perf_pmu_disable(event->pmu);
973 n0 = cpuc->n_events; 1117 n0 = cpuc->n_events;
974 n = collect_events(cpuc, event, false); 1118 ret = n = collect_events(cpuc, event, false);
975 if (n < 0) 1119 if (ret < 0)
976 return n; 1120 goto out;
1121
1122 hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
1123 if (!(flags & PERF_EF_START))
1124 hwc->state |= PERF_HES_ARCH;
977 1125
978 /* 1126 /*
979 * If group events scheduling transaction was started, 1127 * If group events scheduling transaction was started,
980 * skip the schedulability test here, it will be peformed 1128 * skip the schedulability test here, it will be performed
981 * at commit time(->commit_txn) as a whole 1129 * at commit time (->commit_txn) as a whole
982 */ 1130 */
983 if (cpuc->group_flag & PERF_EVENT_TXN) 1131 if (cpuc->group_flag & PERF_EVENT_TXN)
984 goto out; 1132 goto done_collect;
985 1133
986 ret = x86_pmu.schedule_events(cpuc, n, assign); 1134 ret = x86_pmu.schedule_events(cpuc, n, assign);
987 if (ret) 1135 if (ret)
988 return ret; 1136 goto out;
989 /* 1137 /*
990 * copy new assignment, now we know it is possible 1138 * copy new assignment, now we know it is possible
991 * will be used by hw_perf_enable() 1139 * will be used by hw_perf_enable()
992 */ 1140 */
993 memcpy(cpuc->assign, assign, n*sizeof(int)); 1141 memcpy(cpuc->assign, assign, n*sizeof(int));
994 1142
995out: 1143done_collect:
996 cpuc->n_events = n; 1144 cpuc->n_events = n;
997 cpuc->n_added += n - n0; 1145 cpuc->n_added += n - n0;
998 cpuc->n_txn += n - n0; 1146 cpuc->n_txn += n - n0;
999 1147
1000 return 0; 1148 ret = 0;
1149out:
1150 perf_pmu_enable(event->pmu);
1151 return ret;
1001} 1152}
1002 1153
1003static int x86_pmu_start(struct perf_event *event) 1154static void x86_pmu_start(struct perf_event *event, int flags)
1004{ 1155{
1005 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1156 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1006 int idx = event->hw.idx; 1157 int idx = event->hw.idx;
1007 1158
1008 if (idx == -1) 1159 if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
1009 return -EAGAIN; 1160 return;
1161
1162 if (WARN_ON_ONCE(idx == -1))
1163 return;
1164
1165 if (flags & PERF_EF_RELOAD) {
1166 WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
1167 x86_perf_event_set_period(event);
1168 }
1169
1170 event->hw.state = 0;
1010 1171
1011 x86_perf_event_set_period(event);
1012 cpuc->events[idx] = event; 1172 cpuc->events[idx] = event;
1013 __set_bit(idx, cpuc->active_mask); 1173 __set_bit(idx, cpuc->active_mask);
1014 __set_bit(idx, cpuc->running); 1174 __set_bit(idx, cpuc->running);
1015 x86_pmu.enable(event); 1175 x86_pmu.enable(event);
1016 perf_event_update_userpage(event); 1176 perf_event_update_userpage(event);
1017
1018 return 0;
1019}
1020
1021static void x86_pmu_unthrottle(struct perf_event *event)
1022{
1023 int ret = x86_pmu_start(event);
1024 WARN_ON_ONCE(ret);
1025} 1177}
1026 1178
1027void perf_event_print_debug(void) 1179void perf_event_print_debug(void)
@@ -1057,8 +1209,8 @@ void perf_event_print_debug(void)
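Most of the churn from here on is the conversion to the reworked core PMU interface: enable/disable become add/del (put the event on or take it off the PMU) and start/stop (actually count), with per-event state tracked in hw.state. PERF_HES_STOPPED means the counter is not counting, PERF_HES_UPTODATE means its value has already been folded into the event, and PERF_HES_ARCH is the arch's note to keep an event stopped across a reschedule. The shape of a stop() implementation under this discipline, reduced to a sketch:

#include <linux/perf_event.h>

/* Sketch only: the state handling a pmu::stop() follows under the new API. */
static void demo_pmu_stop(struct perf_event *event, int flags)
{
	struct hw_perf_event *hwc = &event->hw;

	if (!(hwc->state & PERF_HES_STOPPED)) {
		/* arch hook: quiesce the hardware counter here */
		hwc->state |= PERF_HES_STOPPED;
	}

	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
		/* arch hook: fold the residual count into event->count here */
		hwc->state |= PERF_HES_UPTODATE;
	}
}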
1057 pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); 1209 pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
1058 1210
1059 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1211 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1060 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); 1212 rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
1061 rdmsrl(x86_pmu.perfctr + idx, pmc_count); 1213 rdmsrl(x86_pmu_event_addr(idx), pmc_count);
1062 1214
1063 prev_left = per_cpu(pmc_prev_left[idx], cpu); 1215 prev_left = per_cpu(pmc_prev_left[idx], cpu);
1064 1216
@@ -1078,27 +1230,29 @@ void perf_event_print_debug(void)
1078 local_irq_restore(flags); 1230 local_irq_restore(flags);
1079} 1231}
1080 1232
1081static void x86_pmu_stop(struct perf_event *event) 1233static void x86_pmu_stop(struct perf_event *event, int flags)
1082{ 1234{
1083 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1235 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1084 struct hw_perf_event *hwc = &event->hw; 1236 struct hw_perf_event *hwc = &event->hw;
1085 int idx = hwc->idx;
1086
1087 if (!__test_and_clear_bit(idx, cpuc->active_mask))
1088 return;
1089 1237
1090 x86_pmu.disable(event); 1238 if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) {
1091 1239 x86_pmu.disable(event);
1092 /* 1240 cpuc->events[hwc->idx] = NULL;
1093 * Drain the remaining delta count out of an event 1241 WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
1094 * that we are disabling: 1242 hwc->state |= PERF_HES_STOPPED;
1095 */ 1243 }
1096 x86_perf_event_update(event);
1097 1244
1098 cpuc->events[idx] = NULL; 1245 if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
1246 /*
1247 * Drain the remaining delta count out of an event
1248 * that we are disabling:
1249 */
1250 x86_perf_event_update(event);
1251 hwc->state |= PERF_HES_UPTODATE;
1252 }
1099} 1253}
1100 1254
1101static void x86_pmu_disable(struct perf_event *event) 1255static void x86_pmu_del(struct perf_event *event, int flags)
1102{ 1256{
1103 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1257 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1104 int i; 1258 int i;
@@ -1111,7 +1265,7 @@ static void x86_pmu_disable(struct perf_event *event)
1111 if (cpuc->group_flag & PERF_EVENT_TXN) 1265 if (cpuc->group_flag & PERF_EVENT_TXN)
1112 return; 1266 return;
1113 1267
1114 x86_pmu_stop(event); 1268 x86_pmu_stop(event, PERF_EF_UPDATE);
1115 1269
1116 for (i = 0; i < cpuc->n_events; i++) { 1270 for (i = 0; i < cpuc->n_events; i++) {
1117 if (event == cpuc->event_list[i]) { 1271 if (event == cpuc->event_list[i]) {
@@ -1134,7 +1288,6 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1134 struct perf_sample_data data; 1288 struct perf_sample_data data;
1135 struct cpu_hw_events *cpuc; 1289 struct cpu_hw_events *cpuc;
1136 struct perf_event *event; 1290 struct perf_event *event;
1137 struct hw_perf_event *hwc;
1138 int idx, handled = 0; 1291 int idx, handled = 0;
1139 u64 val; 1292 u64 val;
1140 1293
@@ -1142,6 +1295,16 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1142 1295
1143 cpuc = &__get_cpu_var(cpu_hw_events); 1296 cpuc = &__get_cpu_var(cpu_hw_events);
1144 1297
1298 /*
1299 * Some chipsets need to unmask the LVTPC in a particular spot
1300 * inside the nmi handler. As a result, the unmasking was pushed
1301 * into all the nmi handlers.
1302 *
1303 * This generic handler doesn't seem to have any issues where the
1304 * unmasking occurs so it was left at the top.
1305 */
1306 apic_write(APIC_LVTPC, APIC_DM_NMI);
1307
1145 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1308 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1146 if (!test_bit(idx, cpuc->active_mask)) { 1309 if (!test_bit(idx, cpuc->active_mask)) {
1147 /* 1310 /*
@@ -1155,7 +1318,6 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1155 } 1318 }
1156 1319
1157 event = cpuc->events[idx]; 1320 event = cpuc->events[idx];
1158 hwc = &event->hw;
1159 1321
1160 val = x86_perf_event_update(event); 1322 val = x86_perf_event_update(event);
1161 if (val & (1ULL << (x86_pmu.cntval_bits - 1))) 1323 if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
@@ -1171,7 +1333,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1171 continue; 1333 continue;
1172 1334
1173 if (perf_event_overflow(event, 1, &data, regs)) 1335 if (perf_event_overflow(event, 1, &data, regs))
1174 x86_pmu_stop(event); 1336 x86_pmu_stop(event, 0);
1175 } 1337 }
1176 1338
1177 if (handled) 1339 if (handled)
@@ -1180,25 +1342,6 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1180 return handled; 1342 return handled;
1181} 1343}
1182 1344
1183void smp_perf_pending_interrupt(struct pt_regs *regs)
1184{
1185 irq_enter();
1186 ack_APIC_irq();
1187 inc_irq_stat(apic_pending_irqs);
1188 perf_event_do_pending();
1189 irq_exit();
1190}
1191
1192void set_perf_event_pending(void)
1193{
1194#ifdef CONFIG_X86_LOCAL_APIC
1195 if (!x86_pmu.apic || !x86_pmu_initialized())
1196 return;
1197
1198 apic->send_IPI_self(LOCAL_PENDING_VECTOR);
1199#endif
1200}
1201
1202void perf_events_lapic_init(void) 1345void perf_events_lapic_init(void)
1203{ 1346{
1204 if (!x86_pmu.apic || !x86_pmu_initialized()) 1347 if (!x86_pmu.apic || !x86_pmu_initialized())
@@ -1230,11 +1373,10 @@ perf_event_nmi_handler(struct notifier_block *self,
1230 1373
1231 switch (cmd) { 1374 switch (cmd) {
1232 case DIE_NMI: 1375 case DIE_NMI:
1233 case DIE_NMI_IPI:
1234 break; 1376 break;
1235 case DIE_NMIUNKNOWN: 1377 case DIE_NMIUNKNOWN:
1236 this_nmi = percpu_read(irq_stat.__nmi_count); 1378 this_nmi = percpu_read(irq_stat.__nmi_count);
1237 if (this_nmi != __get_cpu_var(pmu_nmi).marked) 1379 if (this_nmi != __this_cpu_read(pmu_nmi.marked))
1238 /* let the kernel handle the unknown nmi */ 1380 /* let the kernel handle the unknown nmi */
1239 return NOTIFY_DONE; 1381 return NOTIFY_DONE;
1240 /* 1382 /*
@@ -1249,8 +1391,6 @@ perf_event_nmi_handler(struct notifier_block *self,
1249 return NOTIFY_DONE; 1391 return NOTIFY_DONE;
1250 } 1392 }
1251 1393
1252 apic_write(APIC_LVTPC, APIC_DM_NMI);
1253
1254 handled = x86_pmu.handle_irq(args->regs); 1394 handled = x86_pmu.handle_irq(args->regs);
1255 if (!handled) 1395 if (!handled)
1256 return NOTIFY_DONE; 1396 return NOTIFY_DONE;
@@ -1258,8 +1398,8 @@ perf_event_nmi_handler(struct notifier_block *self,
1258 this_nmi = percpu_read(irq_stat.__nmi_count); 1398 this_nmi = percpu_read(irq_stat.__nmi_count);
1259 if ((handled > 1) || 1399 if ((handled > 1) ||
1260 /* the next nmi could be a back-to-back nmi */ 1400 /* the next nmi could be a back-to-back nmi */
1261 ((__get_cpu_var(pmu_nmi).marked == this_nmi) && 1401 ((__this_cpu_read(pmu_nmi.marked) == this_nmi) &&
1262 (__get_cpu_var(pmu_nmi).handled > 1))) { 1402 (__this_cpu_read(pmu_nmi.handled) > 1))) {
1263 /* 1403 /*
1264 * We could have two subsequent back-to-back nmis: The 1404 * We could have two subsequent back-to-back nmis: The
1265 * first handles more than one counter, the 2nd 1405 * first handles more than one counter, the 2nd
@@ -1270,8 +1410,8 @@ perf_event_nmi_handler(struct notifier_block *self,
1270 * handling more than one counter. We will mark the 1410 * handling more than one counter. We will mark the
1271 * next (3rd) and then drop it if unhandled. 1411 * next (3rd) and then drop it if unhandled.
1272 */ 1412 */
1273 __get_cpu_var(pmu_nmi).marked = this_nmi + 1; 1413 __this_cpu_write(pmu_nmi.marked, this_nmi + 1);
1274 __get_cpu_var(pmu_nmi).handled = handled; 1414 __this_cpu_write(pmu_nmi.handled, handled);
1275 } 1415 }
1276 1416
1277 return NOTIFY_STOP; 1417 return NOTIFY_STOP;
@@ -1280,7 +1420,7 @@ perf_event_nmi_handler(struct notifier_block *self,
1280static __read_mostly struct notifier_block perf_event_nmi_notifier = { 1420static __read_mostly struct notifier_block perf_event_nmi_notifier = {
1281 .notifier_call = perf_event_nmi_handler, 1421 .notifier_call = perf_event_nmi_handler,
1282 .next = NULL, 1422 .next = NULL,
1283 .priority = 1 1423 .priority = NMI_LOCAL_LOW_PRIOR,
1284}; 1424};
1285 1425
1286static struct event_constraint unconstrained; 1426static struct event_constraint unconstrained;
@@ -1353,7 +1493,7 @@ static void __init pmu_check_apic(void)
1353 pr_info("no hardware sampling interrupt available.\n"); 1493 pr_info("no hardware sampling interrupt available.\n");
1354} 1494}
1355 1495
1356void __init init_hw_perf_events(void) 1496static int __init init_hw_perf_events(void)
1357{ 1497{
1358 struct event_constraint *c; 1498 struct event_constraint *c;
1359 int err; 1499 int err;
@@ -1368,15 +1508,19 @@ void __init init_hw_perf_events(void)
1368 err = amd_pmu_init(); 1508 err = amd_pmu_init();
1369 break; 1509 break;
1370 default: 1510 default:
1371 return; 1511 return 0;
1372 } 1512 }
1373 if (err != 0) { 1513 if (err != 0) {
1374 pr_cont("no PMU driver, software events only.\n"); 1514 pr_cont("no PMU driver, software events only.\n");
1375 return; 1515 return 0;
1376 } 1516 }
1377 1517
1378 pmu_check_apic(); 1518 pmu_check_apic();
1379 1519
1520 /* sanity check that the hardware exists or is emulated */
1521 if (!check_hw_exists())
1522 return 0;
1523
1380 pr_cont("%s PMU driver.\n", x86_pmu.name); 1524 pr_cont("%s PMU driver.\n", x86_pmu.name);
1381 1525
1382 if (x86_pmu.quirks) 1526 if (x86_pmu.quirks)
@@ -1388,7 +1532,6 @@ void __init init_hw_perf_events(void)
1388 x86_pmu.num_counters = X86_PMC_MAX_GENERIC; 1532 x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
1389 } 1533 }
1390 x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; 1534 x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
1391 perf_max_events = x86_pmu.num_counters;
1392 1535
1393 if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { 1536 if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
1394 WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", 1537 WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
@@ -1424,8 +1567,12 @@ void __init init_hw_perf_events(void)
1424 pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed); 1567 pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed);
1425 pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); 1568 pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl);
1426 1569
1570 perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
1427 perf_cpu_notifier(x86_pmu_notifier); 1571 perf_cpu_notifier(x86_pmu_notifier);
1572
1573 return 0;
1428} 1574}
1575early_initcall(init_hw_perf_events);
1429 1576
1430static inline void x86_pmu_read(struct perf_event *event) 1577static inline void x86_pmu_read(struct perf_event *event)
1431{ 1578{
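init_hw_perf_events() is no longer called by hand from the boot code; it registers itself via early_initcall() and, once the hardware checks pass, publishes the PMU with perf_pmu_register() under the "cpu" name and the PERF_TYPE_RAW type. The registration skeleton, with the actual callbacks elided:

#include <linux/init.h>
#include <linux/perf_event.h>

static struct pmu demo_pmu;	/* .event_init, .add, .del, ... omitted in this sketch */

static int __init demo_pmu_setup(void)
{
	/* Returning 0 even when no usable PMU is found keeps the boot going. */
	perf_pmu_register(&demo_pmu, "cpu", PERF_TYPE_RAW);
	return 0;
}
early_initcall(demo_pmu_setup);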
@@ -1437,12 +1584,11 @@ static inline void x86_pmu_read(struct perf_event *event)
1437 * Set the flag to make pmu::enable() not perform the 1584 * Set the flag to make pmu::enable() not perform the
1438 * schedulability test, it will be performed at commit time 1585 * schedulability test, it will be performed at commit time
1439 */ 1586 */
1440static void x86_pmu_start_txn(const struct pmu *pmu) 1587static void x86_pmu_start_txn(struct pmu *pmu)
1441{ 1588{
1442 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1589 perf_pmu_disable(pmu);
1443 1590 __this_cpu_or(cpu_hw_events.group_flag, PERF_EVENT_TXN);
1444 cpuc->group_flag |= PERF_EVENT_TXN; 1591 __this_cpu_write(cpu_hw_events.n_txn, 0);
1445 cpuc->n_txn = 0;
1446} 1592}
1447 1593
1448/* 1594/*
@@ -1450,16 +1596,15 @@ static void x86_pmu_start_txn(const struct pmu *pmu)
1450 * Clear the flag and pmu::enable() will perform the 1596 * Clear the flag and pmu::enable() will perform the
1451 * schedulability test. 1597 * schedulability test.
1452 */ 1598 */
1453static void x86_pmu_cancel_txn(const struct pmu *pmu) 1599static void x86_pmu_cancel_txn(struct pmu *pmu)
1454{ 1600{
1455 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1601 __this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN);
1456
1457 cpuc->group_flag &= ~PERF_EVENT_TXN;
1458 /* 1602 /*
1459 * Truncate the collected events. 1603 * Truncate the collected events.
1460 */ 1604 */
1461 cpuc->n_added -= cpuc->n_txn; 1605 __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
1462 cpuc->n_events -= cpuc->n_txn; 1606 __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
1607 perf_pmu_enable(pmu);
1463} 1608}
1464 1609
1465/* 1610/*
@@ -1467,7 +1612,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu)
1467 * Perform the group schedulability test as a whole 1612 * Perform the group schedulability test as a whole
1468 * Return 0 if success 1613 * Return 0 if success
1469 */ 1614 */
1470static int x86_pmu_commit_txn(const struct pmu *pmu) 1615static int x86_pmu_commit_txn(struct pmu *pmu)
1471{ 1616{
1472 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1617 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1473 int assign[X86_PMC_IDX_MAX]; 1618 int assign[X86_PMC_IDX_MAX];
@@ -1489,22 +1634,10 @@ static int x86_pmu_commit_txn(const struct pmu *pmu)
1489 memcpy(cpuc->assign, assign, n*sizeof(int)); 1634 memcpy(cpuc->assign, assign, n*sizeof(int));
1490 1635
1491 cpuc->group_flag &= ~PERF_EVENT_TXN; 1636 cpuc->group_flag &= ~PERF_EVENT_TXN;
1492 1637 perf_pmu_enable(pmu);
1493 return 0; 1638 return 0;
1494} 1639}
1495 1640
1496static const struct pmu pmu = {
1497 .enable = x86_pmu_enable,
1498 .disable = x86_pmu_disable,
1499 .start = x86_pmu_start,
1500 .stop = x86_pmu_stop,
1501 .read = x86_pmu_read,
1502 .unthrottle = x86_pmu_unthrottle,
1503 .start_txn = x86_pmu_start_txn,
1504 .cancel_txn = x86_pmu_cancel_txn,
1505 .commit_txn = x86_pmu_commit_txn,
1506};
1507
1508/* 1641/*
1509 * validate that we can schedule this event 1642 * validate that we can schedule this event
1510 */ 1643 */
@@ -1579,12 +1712,22 @@ out:
1579 return ret; 1712 return ret;
1580} 1713}
1581 1714
1582const struct pmu *hw_perf_event_init(struct perf_event *event) 1715static int x86_pmu_event_init(struct perf_event *event)
1583{ 1716{
1584 const struct pmu *tmp; 1717 struct pmu *tmp;
1585 int err; 1718 int err;
1586 1719
1587 err = __hw_perf_event_init(event); 1720 switch (event->attr.type) {
1721 case PERF_TYPE_RAW:
1722 case PERF_TYPE_HARDWARE:
1723 case PERF_TYPE_HW_CACHE:
1724 break;
1725
1726 default:
1727 return -ENOENT;
1728 }
1729
1730 err = __x86_pmu_event_init(event);
1588 if (!err) { 1731 if (!err) {
1589 /* 1732 /*
1590 * we temporarily connect event to its pmu 1733 * we temporarily connect event to its pmu
@@ -1604,37 +1747,31 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
1604 if (err) { 1747 if (err) {
1605 if (event->destroy) 1748 if (event->destroy)
1606 event->destroy(event); 1749 event->destroy(event);
1607 return ERR_PTR(err);
1608 } 1750 }
1609 1751
1610 return &pmu; 1752 return err;
1611} 1753}
1612 1754
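x86_pmu_event_init() above returns -ENOENT for attribute types it does not handle, which is the convention that lets the core walk the registered pmus until one claims the event. A minimal userspace sketch of that dispatch convention follows; the TYPE_* values and the pmus[] table are illustrative stand-ins, not the perf API.

#include <errno.h>
#include <stdio.h>

enum { TYPE_HARDWARE, TYPE_SOFTWARE };	/* illustrative event types */

struct event { int type; };

static int cpu_pmu_event_init(struct event *e)
{
	switch (e->type) {
	case TYPE_HARDWARE:
		return 0;		/* this pmu claims the event */
	default:
		return -ENOENT;		/* not ours: let the core keep looking */
	}
}

static int sw_pmu_event_init(struct event *e)
{
	return e->type == TYPE_SOFTWARE ? 0 : -ENOENT;
}

int main(void)
{
	struct event e = { TYPE_SOFTWARE };
	int (*pmus[])(struct event *) = { cpu_pmu_event_init, sw_pmu_event_init };
	unsigned int i;

	for (i = 0; i < sizeof(pmus) / sizeof(pmus[0]); i++) {
		int err = pmus[i](&e);

		if (err != -ENOENT) {
			printf("pmu %u takes the event (err=%d)\n", i, err);
			break;
		}
	}
	return 0;
}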
1613/* 1755static struct pmu pmu = {
1614 * callchain support 1756 .pmu_enable = x86_pmu_enable,
1615 */ 1757 .pmu_disable = x86_pmu_disable,
1616
1617static inline
1618void callchain_store(struct perf_callchain_entry *entry, u64 ip)
1619{
1620 if (entry->nr < PERF_MAX_STACK_DEPTH)
1621 entry->ip[entry->nr++] = ip;
1622}
1623 1758
1624static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); 1759 .event_init = x86_pmu_event_init,
1625static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
1626 1760
1761 .add = x86_pmu_add,
1762 .del = x86_pmu_del,
1763 .start = x86_pmu_start,
1764 .stop = x86_pmu_stop,
1765 .read = x86_pmu_read,
1627 1766
1628static void 1767 .start_txn = x86_pmu_start_txn,
1629backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) 1768 .cancel_txn = x86_pmu_cancel_txn,
1630{ 1769 .commit_txn = x86_pmu_commit_txn,
1631 /* Ignore warnings */ 1770};
1632}
1633 1771
1634static void backtrace_warning(void *data, char *msg) 1772/*
1635{ 1773 * callchain support
1636 /* Ignore warnings */ 1774 */
1637}
1638 1775
1639static int backtrace_stack(void *data, char *name) 1776static int backtrace_stack(void *data, char *name)
1640{ 1777{
@@ -1645,24 +1782,26 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
1645{ 1782{
1646 struct perf_callchain_entry *entry = data; 1783 struct perf_callchain_entry *entry = data;
1647 1784
1648 callchain_store(entry, addr); 1785 perf_callchain_store(entry, addr);
1649} 1786}
1650 1787
1651static const struct stacktrace_ops backtrace_ops = { 1788static const struct stacktrace_ops backtrace_ops = {
1652 .warning = backtrace_warning,
1653 .warning_symbol = backtrace_warning_symbol,
1654 .stack = backtrace_stack, 1789 .stack = backtrace_stack,
1655 .address = backtrace_address, 1790 .address = backtrace_address,
1656 .walk_stack = print_context_stack_bp, 1791 .walk_stack = print_context_stack_bp,
1657}; 1792};
1658 1793
1659static void 1794void
1660perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) 1795perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
1661{ 1796{
1662 callchain_store(entry, PERF_CONTEXT_KERNEL); 1797 if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1663 callchain_store(entry, regs->ip); 1798 /* TODO: We don't support guest os callchain now */
1799 return;
1800 }
1801
1802 perf_callchain_store(entry, regs->ip);
1664 1803
1665 dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry); 1804 dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
1666} 1805}
1667 1806
1668#ifdef CONFIG_COMPAT 1807#ifdef CONFIG_COMPAT
@@ -1689,7 +1828,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
1689 if (fp < compat_ptr(regs->sp)) 1828 if (fp < compat_ptr(regs->sp))
1690 break; 1829 break;
1691 1830
1692 callchain_store(entry, frame.return_address); 1831 perf_callchain_store(entry, frame.return_address);
1693 fp = compat_ptr(frame.next_frame); 1832 fp = compat_ptr(frame.next_frame);
1694 } 1833 }
1695 return 1; 1834 return 1;
@@ -1702,19 +1841,20 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
1702} 1841}
1703#endif 1842#endif
1704 1843
1705static void 1844void
1706perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) 1845perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
1707{ 1846{
1708 struct stack_frame frame; 1847 struct stack_frame frame;
1709 const void __user *fp; 1848 const void __user *fp;
1710 1849
1711 if (!user_mode(regs)) 1850 if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1712 regs = task_pt_regs(current); 1851 /* TODO: We don't support guest os callchain now */
1852 return;
1853 }
1713 1854
1714 fp = (void __user *)regs->bp; 1855 fp = (void __user *)regs->bp;
1715 1856
1716 callchain_store(entry, PERF_CONTEXT_USER); 1857 perf_callchain_store(entry, regs->ip);
1717 callchain_store(entry, regs->ip);
1718 1858
1719 if (perf_callchain_user32(regs, entry)) 1859 if (perf_callchain_user32(regs, entry))
1720 return; 1860 return;
@@ -1731,52 +1871,11 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
1731 if ((unsigned long)fp < regs->sp) 1871 if ((unsigned long)fp < regs->sp)
1732 break; 1872 break;
1733 1873
1734 callchain_store(entry, frame.return_address); 1874 perf_callchain_store(entry, frame.return_address);
1735 fp = frame.next_frame; 1875 fp = frame.next_frame;
1736 } 1876 }
1737} 1877}
1738 1878
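perf_callchain_user() above walks the user stack by following saved frame pointers: each frame is read as a (next_frame, return_address) pair starting from regs->bp, stopping once the chain no longer moves toward older frames. Here is a userspace sketch of the same walk, assuming the binary keeps frame pointers (compile with -O0 or -fno-omit-frame-pointer); behaviour near the outermost frames varies by toolchain and libc, so the loop is bounded.

#include <stdio.h>

struct stack_frame {
	struct stack_frame *next_frame;
	unsigned long return_address;
};

static void __attribute__((noinline)) show_callchain(void)
{
	struct stack_frame *fp = __builtin_frame_address(0);
	int depth = 0;

	while (fp && depth++ < 16) {
		printf("ip = %#lx\n", fp->return_address);
		if (fp->next_frame <= fp)	/* stack grows down: stop if not older */
			break;
		fp = fp->next_frame;
	}
}

static void __attribute__((noinline)) leaf(void)   { show_callchain(); }
static void __attribute__((noinline)) middle(void) { leaf(); }

int main(void)
{
	middle();
	return 0;
}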
1739static void
1740perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
1741{
1742 int is_user;
1743
1744 if (!regs)
1745 return;
1746
1747 is_user = user_mode(regs);
1748
1749 if (is_user && current->state != TASK_RUNNING)
1750 return;
1751
1752 if (!is_user)
1753 perf_callchain_kernel(regs, entry);
1754
1755 if (current->mm)
1756 perf_callchain_user(regs, entry);
1757}
1758
1759struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1760{
1761 struct perf_callchain_entry *entry;
1762
1763 if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1764 /* TODO: We don't support guest os callchain now */
1765 return NULL;
1766 }
1767
1768 if (in_nmi())
1769 entry = &__get_cpu_var(pmc_nmi_entry);
1770 else
1771 entry = &__get_cpu_var(pmc_irq_entry);
1772
1773 entry->nr = 0;
1774
1775 perf_do_callchain(regs, entry);
1776
1777 return entry;
1778}
1779
1780unsigned long perf_instruction_pointer(struct pt_regs *regs) 1879unsigned long perf_instruction_pointer(struct pt_regs *regs)
1781{ 1880{
1782 unsigned long ip; 1881 unsigned long ip;
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index c2897b7b4a3b..fe29c1d2219e 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -1,7 +1,5 @@
1#ifdef CONFIG_CPU_SUP_AMD 1#ifdef CONFIG_CPU_SUP_AMD
2 2
3static DEFINE_RAW_SPINLOCK(amd_nb_lock);
4
5static __initconst const u64 amd_hw_cache_event_ids 3static __initconst const u64 amd_hw_cache_event_ids
6 [PERF_COUNT_HW_CACHE_MAX] 4 [PERF_COUNT_HW_CACHE_MAX]
7 [PERF_COUNT_HW_CACHE_OP_MAX] 5 [PERF_COUNT_HW_CACHE_OP_MAX]
@@ -10,7 +8,7 @@ static __initconst const u64 amd_hw_cache_event_ids
10 [ C(L1D) ] = { 8 [ C(L1D) ] = {
11 [ C(OP_READ) ] = { 9 [ C(OP_READ) ] = {
12 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ 10 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
13 [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */ 11 [ C(RESULT_MISS) ] = 0x0141, /* Data Cache Misses */
14 }, 12 },
15 [ C(OP_WRITE) ] = { 13 [ C(OP_WRITE) ] = {
16 [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */ 14 [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
@@ -52,7 +50,7 @@ static __initconst const u64 amd_hw_cache_event_ids
52 [ C(DTLB) ] = { 50 [ C(DTLB) ] = {
53 [ C(OP_READ) ] = { 51 [ C(OP_READ) ] = {
54 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ 52 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
 55 [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DTLB Miss */ 53 [ C(RESULT_MISS) ] = 0x0746, /* L1_DTLB_AND_L2_DTLB_MISS.ALL */
56 }, 54 },
57 [ C(OP_WRITE) ] = { 55 [ C(OP_WRITE) ] = {
58 [ C(RESULT_ACCESS) ] = 0, 56 [ C(RESULT_ACCESS) ] = 0,
@@ -66,7 +64,7 @@ static __initconst const u64 amd_hw_cache_event_ids
66 [ C(ITLB) ] = { 64 [ C(ITLB) ] = {
67 [ C(OP_READ) ] = { 65 [ C(OP_READ) ] = {
 68 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fetches */ 66 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fetches */
69 [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */ 67 [ C(RESULT_MISS) ] = 0x0385, /* L1_ITLB_AND_L2_ITLB_MISS.ALL */
70 }, 68 },
71 [ C(OP_WRITE) ] = { 69 [ C(OP_WRITE) ] = {
72 [ C(RESULT_ACCESS) ] = -1, 70 [ C(RESULT_ACCESS) ] = -1,
@@ -98,12 +96,14 @@ static __initconst const u64 amd_hw_cache_event_ids
98 */ 96 */
99static const u64 amd_perfmon_event_map[] = 97static const u64 amd_perfmon_event_map[] =
100{ 98{
101 [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, 99 [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
102 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, 100 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
103 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, 101 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
104 [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, 102 [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
105 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, 103 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2,
106 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, 104 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3,
105 [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */
106 [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */
107}; 107};
108 108
109static u64 amd_pmu_event_map(int hw_event) 109static u64 amd_pmu_event_map(int hw_event)
@@ -129,6 +129,11 @@ static int amd_pmu_hw_config(struct perf_event *event)
129/* 129/*
130 * AMD64 events are detected based on their event codes. 130 * AMD64 events are detected based on their event codes.
131 */ 131 */
132static inline unsigned int amd_get_event_code(struct hw_perf_event *hwc)
133{
134 return ((hwc->config >> 24) & 0x0f00) | (hwc->config & 0x00ff);
135}
136
132static inline int amd_is_nb_event(struct hw_perf_event *hwc) 137static inline int amd_is_nb_event(struct hw_perf_event *hwc)
133{ 138{
134 return (hwc->config & 0xe0) == 0xe0; 139 return (hwc->config & 0xe0) == 0xe0;
@@ -275,17 +280,17 @@ done:
275 return &emptyconstraint; 280 return &emptyconstraint;
276} 281}
277 282
278static struct amd_nb *amd_alloc_nb(int cpu, int nb_id) 283static struct amd_nb *amd_alloc_nb(int cpu)
279{ 284{
280 struct amd_nb *nb; 285 struct amd_nb *nb;
281 int i; 286 int i;
282 287
283 nb = kmalloc(sizeof(struct amd_nb), GFP_KERNEL); 288 nb = kmalloc_node(sizeof(struct amd_nb), GFP_KERNEL | __GFP_ZERO,
289 cpu_to_node(cpu));
284 if (!nb) 290 if (!nb)
285 return NULL; 291 return NULL;
286 292
287 memset(nb, 0, sizeof(*nb)); 293 nb->nb_id = -1;
288 nb->nb_id = nb_id;
289 294
290 /* 295 /*
291 * initialize all possible NB constraints 296 * initialize all possible NB constraints
@@ -306,7 +311,7 @@ static int amd_pmu_cpu_prepare(int cpu)
306 if (boot_cpu_data.x86_max_cores < 2) 311 if (boot_cpu_data.x86_max_cores < 2)
307 return NOTIFY_OK; 312 return NOTIFY_OK;
308 313
309 cpuc->amd_nb = amd_alloc_nb(cpu, -1); 314 cpuc->amd_nb = amd_alloc_nb(cpu);
310 if (!cpuc->amd_nb) 315 if (!cpuc->amd_nb)
311 return NOTIFY_BAD; 316 return NOTIFY_BAD;
312 317
@@ -325,8 +330,6 @@ static void amd_pmu_cpu_starting(int cpu)
325 nb_id = amd_get_nb_id(cpu); 330 nb_id = amd_get_nb_id(cpu);
326 WARN_ON_ONCE(nb_id == BAD_APICID); 331 WARN_ON_ONCE(nb_id == BAD_APICID);
327 332
328 raw_spin_lock(&amd_nb_lock);
329
330 for_each_online_cpu(i) { 333 for_each_online_cpu(i) {
331 nb = per_cpu(cpu_hw_events, i).amd_nb; 334 nb = per_cpu(cpu_hw_events, i).amd_nb;
332 if (WARN_ON_ONCE(!nb)) 335 if (WARN_ON_ONCE(!nb))
@@ -341,8 +344,6 @@ static void amd_pmu_cpu_starting(int cpu)
341 344
342 cpuc->amd_nb->nb_id = nb_id; 345 cpuc->amd_nb->nb_id = nb_id;
343 cpuc->amd_nb->refcnt++; 346 cpuc->amd_nb->refcnt++;
344
345 raw_spin_unlock(&amd_nb_lock);
346} 347}
347 348
348static void amd_pmu_cpu_dead(int cpu) 349static void amd_pmu_cpu_dead(int cpu)
@@ -354,8 +355,6 @@ static void amd_pmu_cpu_dead(int cpu)
354 355
355 cpuhw = &per_cpu(cpu_hw_events, cpu); 356 cpuhw = &per_cpu(cpu_hw_events, cpu);
356 357
357 raw_spin_lock(&amd_nb_lock);
358
359 if (cpuhw->amd_nb) { 358 if (cpuhw->amd_nb) {
360 struct amd_nb *nb = cpuhw->amd_nb; 359 struct amd_nb *nb = cpuhw->amd_nb;
361 360
@@ -364,8 +363,6 @@ static void amd_pmu_cpu_dead(int cpu)
364 363
365 cpuhw->amd_nb = NULL; 364 cpuhw->amd_nb = NULL;
366 } 365 }
367
368 raw_spin_unlock(&amd_nb_lock);
369} 366}
370 367
371static __initconst const struct x86_pmu amd_pmu = { 368static __initconst const struct x86_pmu amd_pmu = {
@@ -395,13 +392,195 @@ static __initconst const struct x86_pmu amd_pmu = {
395 .cpu_dead = amd_pmu_cpu_dead, 392 .cpu_dead = amd_pmu_cpu_dead,
396}; 393};
397 394
395/* AMD Family 15h */
396
397#define AMD_EVENT_TYPE_MASK 0x000000F0ULL
398
399#define AMD_EVENT_FP 0x00000000ULL ... 0x00000010ULL
400#define AMD_EVENT_LS 0x00000020ULL ... 0x00000030ULL
401#define AMD_EVENT_DC 0x00000040ULL ... 0x00000050ULL
402#define AMD_EVENT_CU 0x00000060ULL ... 0x00000070ULL
403#define AMD_EVENT_IC_DE 0x00000080ULL ... 0x00000090ULL
404#define AMD_EVENT_EX_LS 0x000000C0ULL
405#define AMD_EVENT_DE 0x000000D0ULL
406#define AMD_EVENT_NB 0x000000E0ULL ... 0x000000F0ULL
407
408/*
409 * AMD family 15h event code/PMC mappings:
410 *
411 * type = event_code & 0x0F0:
412 *
413 * 0x000 FP PERF_CTL[5:3]
414 * 0x010 FP PERF_CTL[5:3]
415 * 0x020 LS PERF_CTL[5:0]
416 * 0x030 LS PERF_CTL[5:0]
417 * 0x040 DC PERF_CTL[5:0]
418 * 0x050 DC PERF_CTL[5:0]
419 * 0x060 CU PERF_CTL[2:0]
420 * 0x070 CU PERF_CTL[2:0]
421 * 0x080 IC/DE PERF_CTL[2:0]
422 * 0x090 IC/DE PERF_CTL[2:0]
423 * 0x0A0 ---
424 * 0x0B0 ---
425 * 0x0C0 EX/LS PERF_CTL[5:0]
426 * 0x0D0 DE PERF_CTL[2:0]
427 * 0x0E0 NB NB_PERF_CTL[3:0]
428 * 0x0F0 NB NB_PERF_CTL[3:0]
429 *
430 * Exceptions:
431 *
432 * 0x000 FP PERF_CTL[3], PERF_CTL[5:3] (*)
433 * 0x003 FP PERF_CTL[3]
434 * 0x004 FP PERF_CTL[3], PERF_CTL[5:3] (*)
435 * 0x00B FP PERF_CTL[3]
436 * 0x00D FP PERF_CTL[3]
437 * 0x023 DE PERF_CTL[2:0]
438 * 0x02D LS PERF_CTL[3]
439 * 0x02E LS PERF_CTL[3,0]
440 * 0x043 CU PERF_CTL[2:0]
441 * 0x045 CU PERF_CTL[2:0]
442 * 0x046 CU PERF_CTL[2:0]
443 * 0x054 CU PERF_CTL[2:0]
444 * 0x055 CU PERF_CTL[2:0]
445 * 0x08F IC PERF_CTL[0]
446 * 0x187 DE PERF_CTL[0]
447 * 0x188 DE PERF_CTL[0]
448 * 0x0DB EX PERF_CTL[5:0]
449 * 0x0DC LS PERF_CTL[5:0]
450 * 0x0DD LS PERF_CTL[5:0]
451 * 0x0DE LS PERF_CTL[5:0]
452 * 0x0DF LS PERF_CTL[5:0]
453 * 0x1D6 EX PERF_CTL[5:0]
454 * 0x1D8 EX PERF_CTL[5:0]
455 *
456 * (*) depending on the umask all FPU counters may be used
457 */
458
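For family 15h the 12-bit event code is split across the config word: bits [7:0] hold the low byte and bits [35:32] hold the high nibble, which is what amd_get_event_code() reassembles before the type nibble (bits 7:4 of the code) is matched against the table above. A small standalone illustration; the 0x1D8 example is one of the listed exceptions (EX, PERF_CTL[5:0]).

#include <stdio.h>
#include <stdint.h>

/* mirrors amd_get_event_code(): bits [35:32] of the config become
 * bits [11:8] of the event code, bits [7:0] pass through */
static unsigned int event_code(uint64_t config)
{
	return (unsigned int)(((config >> 24) & 0x0f00) | (config & 0x00ff));
}

int main(void)
{
	/* event 0x1D8: low byte 0xd8 in bits [7:0], high nibble 0x1 in [35:32] */
	uint64_t config = (1ULL << 32) | 0xd8;
	unsigned int code = event_code(config);

	printf("code = 0x%03x, type nibble = 0x%02x\n", code, code & 0xf0);
	/* 0x1D8 is one of the listed exceptions: EX, PERF_CTL[5:0] */
	return 0;
}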
459static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0);
460static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0);
461static struct event_constraint amd_f15_PMC3 = EVENT_CONSTRAINT(0, 0x08, 0);
462static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT(0, 0x09, 0);
463static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0);
464static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0);
465
466static struct event_constraint *
467amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *event)
468{
469 struct hw_perf_event *hwc = &event->hw;
470 unsigned int event_code = amd_get_event_code(hwc);
471
472 switch (event_code & AMD_EVENT_TYPE_MASK) {
473 case AMD_EVENT_FP:
474 switch (event_code) {
475 case 0x000:
476 if (!(hwc->config & 0x0000F000ULL))
477 break;
478 if (!(hwc->config & 0x00000F00ULL))
479 break;
480 return &amd_f15_PMC3;
481 case 0x004:
482 if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1)
483 break;
484 return &amd_f15_PMC3;
485 case 0x003:
486 case 0x00B:
487 case 0x00D:
488 return &amd_f15_PMC3;
489 }
490 return &amd_f15_PMC53;
491 case AMD_EVENT_LS:
492 case AMD_EVENT_DC:
493 case AMD_EVENT_EX_LS:
494 switch (event_code) {
495 case 0x023:
496 case 0x043:
497 case 0x045:
498 case 0x046:
499 case 0x054:
500 case 0x055:
501 return &amd_f15_PMC20;
502 case 0x02D:
503 return &amd_f15_PMC3;
504 case 0x02E:
505 return &amd_f15_PMC30;
506 default:
507 return &amd_f15_PMC50;
508 }
509 case AMD_EVENT_CU:
510 case AMD_EVENT_IC_DE:
511 case AMD_EVENT_DE:
512 switch (event_code) {
513 case 0x08F:
514 case 0x187:
515 case 0x188:
516 return &amd_f15_PMC0;
517 case 0x0DB ... 0x0DF:
518 case 0x1D6:
519 case 0x1D8:
520 return &amd_f15_PMC50;
521 default:
522 return &amd_f15_PMC20;
523 }
524 case AMD_EVENT_NB:
525 /* not yet implemented */
526 return &emptyconstraint;
527 default:
528 return &emptyconstraint;
529 }
530}
531
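The amd_f15_PMC* constraints above encode which counters an event may use as a plain bitmask of PERF_CTL indices, e.g. 0x38 is PERF_CTL[5:3] and 0x07 is PERF_CTL[2:0]. A tiny decoder for those masks:

#include <stdio.h>

static void show(const char *name, unsigned int mask)
{
	int i;

	printf("%s (0x%02x): PERF_CTL", name, mask);
	for (i = 0; i < 6; i++)
		if (mask & (1u << i))
			printf(" %d", i);
	printf("\n");
}

int main(void)
{
	show("amd_f15_PMC20", 0x07);	/* counters 0-2 */
	show("amd_f15_PMC53", 0x38);	/* counters 3-5 */
	show("amd_f15_PMC50", 0x3f);	/* counters 0-5 */
	return 0;
}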
532static __initconst const struct x86_pmu amd_pmu_f15h = {
533 .name = "AMD Family 15h",
534 .handle_irq = x86_pmu_handle_irq,
535 .disable_all = x86_pmu_disable_all,
536 .enable_all = x86_pmu_enable_all,
537 .enable = x86_pmu_enable_event,
538 .disable = x86_pmu_disable_event,
539 .hw_config = amd_pmu_hw_config,
540 .schedule_events = x86_schedule_events,
541 .eventsel = MSR_F15H_PERF_CTL,
542 .perfctr = MSR_F15H_PERF_CTR,
543 .event_map = amd_pmu_event_map,
544 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
545 .num_counters = 6,
546 .cntval_bits = 48,
547 .cntval_mask = (1ULL << 48) - 1,
548 .apic = 1,
549 /* use highest bit to detect overflow */
550 .max_period = (1ULL << 47) - 1,
551 .get_event_constraints = amd_get_event_constraints_f15h,
 552 /* northbridge counters not yet implemented: */
553#if 0
554 .put_event_constraints = amd_put_event_constraints,
555
556 .cpu_prepare = amd_pmu_cpu_prepare,
557 .cpu_starting = amd_pmu_cpu_starting,
558 .cpu_dead = amd_pmu_cpu_dead,
559#endif
560};
561
398static __init int amd_pmu_init(void) 562static __init int amd_pmu_init(void)
399{ 563{
400 /* Performance-monitoring supported from K7 and later: */ 564 /* Performance-monitoring supported from K7 and later: */
401 if (boot_cpu_data.x86 < 6) 565 if (boot_cpu_data.x86 < 6)
402 return -ENODEV; 566 return -ENODEV;
403 567
404 x86_pmu = amd_pmu; 568 /*
569 * If core performance counter extensions exists, it must be
570 * family 15h, otherwise fail. See x86_pmu_addr_offset().
571 */
572 switch (boot_cpu_data.x86) {
573 case 0x15:
574 if (!cpu_has_perfctr_core)
575 return -ENODEV;
576 x86_pmu = amd_pmu_f15h;
577 break;
578 default:
579 if (cpu_has_perfctr_core)
580 return -ENODEV;
581 x86_pmu = amd_pmu;
582 break;
583 }
405 584
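The init path above keys off the CPU family and the core performance counter extension. As a rough userspace illustration of that detection, here is a CPUID sketch; the family arithmetic is the standard base-plus-extended-family rule, while the feature-bit position used for perfctr_core (ECX bit 23 of leaf 0x80000001) is an assumption here and should be checked against the CPUID documentation.

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx, family;

	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return 1;

	family = (eax >> 8) & 0xf;		/* base family */
	if (family == 0xf)
		family += (eax >> 20) & 0xff;	/* plus extended family */
	printf("CPU family: 0x%x\n", family);

	if (__get_cpuid(0x80000001, &eax, &ebx, &ecx, &edx))
		/* assumed bit position for the core counter extension */
		printf("perfctr_core: %s\n", ((ecx >> 23) & 1) ? "yes" : "no");

	return 0;
}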
406 /* Events are common for all AMDs */ 585 /* Events are common for all AMDs */
407 memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, 586 memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index ee05c90012d2..41178c826c48 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1,9 +1,31 @@
1#ifdef CONFIG_CPU_SUP_INTEL 1#ifdef CONFIG_CPU_SUP_INTEL
2 2
3#define MAX_EXTRA_REGS 2
4
5/*
6 * Per register state.
7 */
8struct er_account {
9 int ref; /* reference count */
10 unsigned int extra_reg; /* extra MSR number */
11 u64 extra_config; /* extra MSR config */
12};
13
14/*
15 * Per core state
16 * This used to coordinate shared registers for HT threads.
17 */
18struct intel_percore {
19 raw_spinlock_t lock; /* protect structure */
20 struct er_account regs[MAX_EXTRA_REGS];
21 int refcnt; /* number of threads */
22 unsigned core_id;
23};
24
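The er_account/intel_percore pair coordinates an extra MSR (e.g. MSR_OFFCORE_RSP_0) shared by the hyperthreads of a core: a sibling may reuse a claimed register only with the identical config, otherwise it ends up with an empty constraint. Below is a standalone sketch of just that sharing rule; the MSR number is an arbitrary example value.

#include <stdio.h>
#include <stdint.h>

struct er_account {
	int ref;			/* reference count */
	unsigned int extra_reg;		/* extra MSR number */
	uint64_t extra_config;		/* extra MSR config */
};

/* returns 1 if this (reg, config) pair may be used, 0 on conflict */
static int try_share(struct er_account *era, unsigned int reg, uint64_t cfg)
{
	if (era->ref == 0) {		/* free slot: claim it */
		era->extra_reg = reg;
		era->extra_config = cfg;
		era->ref = 1;
		return 1;
	}
	if (era->extra_reg == reg && era->extra_config == cfg) {
		era->ref++;		/* identical config: share it */
		return 1;
	}
	return 0;			/* same MSR, different config */
}

int main(void)
{
	struct er_account era = { 0, 0, 0 };
	unsigned int reg = 0x1a6;	/* arbitrary MSR number for the example */

	printf("%d\n", try_share(&era, reg, 0x00ff));	/* 1: claimed  */
	printf("%d\n", try_share(&era, reg, 0x00ff));	/* 1: shared   */
	printf("%d\n", try_share(&era, reg, 0xff00));	/* 0: conflict */
	return 0;
}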
3/* 25/*
4 * Intel PerfMon, used on Core and later. 26 * Intel PerfMon, used on Core and later.
5 */ 27 */
6static const u64 intel_perfmon_event_map[] = 28static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
7{ 29{
8 [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, 30 [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
9 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, 31 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
@@ -14,7 +36,7 @@ static const u64 intel_perfmon_event_map[] =
14 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, 36 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
15}; 37};
16 38
17static struct event_constraint intel_core_event_constraints[] = 39static struct event_constraint intel_core_event_constraints[] __read_mostly =
18{ 40{
19 INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ 41 INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
20 INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ 42 INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
@@ -25,7 +47,7 @@ static struct event_constraint intel_core_event_constraints[] =
25 EVENT_CONSTRAINT_END 47 EVENT_CONSTRAINT_END
26}; 48};
27 49
28static struct event_constraint intel_core2_event_constraints[] = 50static struct event_constraint intel_core2_event_constraints[] __read_mostly =
29{ 51{
30 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 52 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
31 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 53 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
@@ -48,7 +70,7 @@ static struct event_constraint intel_core2_event_constraints[] =
48 EVENT_CONSTRAINT_END 70 EVENT_CONSTRAINT_END
49}; 71};
50 72
51static struct event_constraint intel_nehalem_event_constraints[] = 73static struct event_constraint intel_nehalem_event_constraints[] __read_mostly =
52{ 74{
53 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 75 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
54 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 76 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
@@ -64,7 +86,19 @@ static struct event_constraint intel_nehalem_event_constraints[] =
64 EVENT_CONSTRAINT_END 86 EVENT_CONSTRAINT_END
65}; 87};
66 88
67static struct event_constraint intel_westmere_event_constraints[] = 89static struct extra_reg intel_nehalem_extra_regs[] __read_mostly =
90{
91 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff),
92 EVENT_EXTRA_END
93};
94
95static struct event_constraint intel_nehalem_percore_constraints[] __read_mostly =
96{
97 INTEL_EVENT_CONSTRAINT(0xb7, 0),
98 EVENT_CONSTRAINT_END
99};
100
101static struct event_constraint intel_westmere_event_constraints[] __read_mostly =
68{ 102{
69 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 103 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
70 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 104 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
@@ -76,7 +110,34 @@ static struct event_constraint intel_westmere_event_constraints[] =
76 EVENT_CONSTRAINT_END 110 EVENT_CONSTRAINT_END
77}; 111};
78 112
79static struct event_constraint intel_gen_event_constraints[] = 113static struct event_constraint intel_snb_event_constraints[] __read_mostly =
114{
115 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
116 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
117 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
118 INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */
119 INTEL_EVENT_CONSTRAINT(0xb7, 0x1), /* OFF_CORE_RESPONSE_0 */
120 INTEL_EVENT_CONSTRAINT(0xbb, 0x8), /* OFF_CORE_RESPONSE_1 */
121 INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
122 INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
123 EVENT_CONSTRAINT_END
124};
125
126static struct extra_reg intel_westmere_extra_regs[] __read_mostly =
127{
128 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff),
129 INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff),
130 EVENT_EXTRA_END
131};
132
133static struct event_constraint intel_westmere_percore_constraints[] __read_mostly =
134{
135 INTEL_EVENT_CONSTRAINT(0xb7, 0),
136 INTEL_EVENT_CONSTRAINT(0xbb, 0),
137 EVENT_CONSTRAINT_END
138};
139
140static struct event_constraint intel_gen_event_constraints[] __read_mostly =
80{ 141{
81 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 142 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
82 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 143 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
@@ -89,6 +150,103 @@ static u64 intel_pmu_event_map(int hw_event)
89 return intel_perfmon_event_map[hw_event]; 150 return intel_perfmon_event_map[hw_event];
90} 151}
91 152
153static __initconst const u64 snb_hw_cache_event_ids
154 [PERF_COUNT_HW_CACHE_MAX]
155 [PERF_COUNT_HW_CACHE_OP_MAX]
156 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
157{
158 [ C(L1D) ] = {
159 [ C(OP_READ) ] = {
160 [ C(RESULT_ACCESS) ] = 0xf1d0, /* MEM_UOP_RETIRED.LOADS */
161 [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPLACEMENT */
162 },
163 [ C(OP_WRITE) ] = {
164 [ C(RESULT_ACCESS) ] = 0xf2d0, /* MEM_UOP_RETIRED.STORES */
165 [ C(RESULT_MISS) ] = 0x0851, /* L1D.ALL_M_REPLACEMENT */
166 },
167 [ C(OP_PREFETCH) ] = {
168 [ C(RESULT_ACCESS) ] = 0x0,
169 [ C(RESULT_MISS) ] = 0x024e, /* HW_PRE_REQ.DL1_MISS */
170 },
171 },
172 [ C(L1I ) ] = {
173 [ C(OP_READ) ] = {
174 [ C(RESULT_ACCESS) ] = 0x0,
175 [ C(RESULT_MISS) ] = 0x0280, /* ICACHE.MISSES */
176 },
177 [ C(OP_WRITE) ] = {
178 [ C(RESULT_ACCESS) ] = -1,
179 [ C(RESULT_MISS) ] = -1,
180 },
181 [ C(OP_PREFETCH) ] = {
182 [ C(RESULT_ACCESS) ] = 0x0,
183 [ C(RESULT_MISS) ] = 0x0,
184 },
185 },
186 [ C(LL ) ] = {
187 [ C(OP_READ) ] = {
188 /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
189 [ C(RESULT_ACCESS) ] = 0x01b7,
190 /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
191 [ C(RESULT_MISS) ] = 0x01b7,
192 },
193 [ C(OP_WRITE) ] = {
194 /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
195 [ C(RESULT_ACCESS) ] = 0x01b7,
196 /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
197 [ C(RESULT_MISS) ] = 0x01b7,
198 },
199 [ C(OP_PREFETCH) ] = {
200 /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
201 [ C(RESULT_ACCESS) ] = 0x01b7,
202 /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
203 [ C(RESULT_MISS) ] = 0x01b7,
204 },
205 },
206 [ C(DTLB) ] = {
207 [ C(OP_READ) ] = {
208 [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOP_RETIRED.ALL_LOADS */
209 [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.CAUSES_A_WALK */
210 },
211 [ C(OP_WRITE) ] = {
212 [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOP_RETIRED.ALL_STORES */
213 [ C(RESULT_MISS) ] = 0x0149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
214 },
215 [ C(OP_PREFETCH) ] = {
216 [ C(RESULT_ACCESS) ] = 0x0,
217 [ C(RESULT_MISS) ] = 0x0,
218 },
219 },
220 [ C(ITLB) ] = {
221 [ C(OP_READ) ] = {
222 [ C(RESULT_ACCESS) ] = 0x1085, /* ITLB_MISSES.STLB_HIT */
223 [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.CAUSES_A_WALK */
224 },
225 [ C(OP_WRITE) ] = {
226 [ C(RESULT_ACCESS) ] = -1,
227 [ C(RESULT_MISS) ] = -1,
228 },
229 [ C(OP_PREFETCH) ] = {
230 [ C(RESULT_ACCESS) ] = -1,
231 [ C(RESULT_MISS) ] = -1,
232 },
233 },
234 [ C(BPU ) ] = {
235 [ C(OP_READ) ] = {
236 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
237 [ C(RESULT_MISS) ] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */
238 },
239 [ C(OP_WRITE) ] = {
240 [ C(RESULT_ACCESS) ] = -1,
241 [ C(RESULT_MISS) ] = -1,
242 },
243 [ C(OP_PREFETCH) ] = {
244 [ C(RESULT_ACCESS) ] = -1,
245 [ C(RESULT_MISS) ] = -1,
246 },
247 },
248};
249
92static __initconst const u64 westmere_hw_cache_event_ids 250static __initconst const u64 westmere_hw_cache_event_ids
93 [PERF_COUNT_HW_CACHE_MAX] 251 [PERF_COUNT_HW_CACHE_MAX]
94 [PERF_COUNT_HW_CACHE_OP_MAX] 252 [PERF_COUNT_HW_CACHE_OP_MAX]
@@ -124,16 +282,26 @@ static __initconst const u64 westmere_hw_cache_event_ids
124 }, 282 },
125 [ C(LL ) ] = { 283 [ C(LL ) ] = {
126 [ C(OP_READ) ] = { 284 [ C(OP_READ) ] = {
127 [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ 285 /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
128 [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ 286 [ C(RESULT_ACCESS) ] = 0x01b7,
287 /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
288 [ C(RESULT_MISS) ] = 0x01b7,
129 }, 289 },
290 /*
291 * Use RFO, not WRITEBACK, because a write miss would typically occur
292 * on RFO.
293 */
130 [ C(OP_WRITE) ] = { 294 [ C(OP_WRITE) ] = {
131 [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ 295 /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
132 [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ 296 [ C(RESULT_ACCESS) ] = 0x01b7,
297 /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
298 [ C(RESULT_MISS) ] = 0x01b7,
133 }, 299 },
134 [ C(OP_PREFETCH) ] = { 300 [ C(OP_PREFETCH) ] = {
135 [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ 301 /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
136 [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ 302 [ C(RESULT_ACCESS) ] = 0x01b7,
303 /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
304 [ C(RESULT_MISS) ] = 0x01b7,
137 }, 305 },
138 }, 306 },
139 [ C(DTLB) ] = { 307 [ C(DTLB) ] = {
@@ -180,6 +348,59 @@ static __initconst const u64 westmere_hw_cache_event_ids
180 }, 348 },
181}; 349};
182 350
351/*
352 * Nehalem/Westmere MSR_OFFCORE_RESPONSE bits;
353 * See IA32 SDM Vol 3B 30.6.1.3
354 */
355
356#define NHM_DMND_DATA_RD (1 << 0)
357#define NHM_DMND_RFO (1 << 1)
358#define NHM_DMND_IFETCH (1 << 2)
359#define NHM_DMND_WB (1 << 3)
360#define NHM_PF_DATA_RD (1 << 4)
361#define NHM_PF_DATA_RFO (1 << 5)
362#define NHM_PF_IFETCH (1 << 6)
363#define NHM_OFFCORE_OTHER (1 << 7)
364#define NHM_UNCORE_HIT (1 << 8)
365#define NHM_OTHER_CORE_HIT_SNP (1 << 9)
366#define NHM_OTHER_CORE_HITM (1 << 10)
367 /* reserved */
368#define NHM_REMOTE_CACHE_FWD (1 << 12)
369#define NHM_REMOTE_DRAM (1 << 13)
370#define NHM_LOCAL_DRAM (1 << 14)
371#define NHM_NON_DRAM (1 << 15)
372
373#define NHM_ALL_DRAM (NHM_REMOTE_DRAM|NHM_LOCAL_DRAM)
374
375#define NHM_DMND_READ (NHM_DMND_DATA_RD)
376#define NHM_DMND_WRITE (NHM_DMND_RFO|NHM_DMND_WB)
377#define NHM_DMND_PREFETCH (NHM_PF_DATA_RD|NHM_PF_DATA_RFO)
378
379#define NHM_L3_HIT (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM)
380#define NHM_L3_MISS (NHM_NON_DRAM|NHM_ALL_DRAM|NHM_REMOTE_CACHE_FWD)
381#define NHM_L3_ACCESS (NHM_L3_HIT|NHM_L3_MISS)
382
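The request-type and response-type bits above are simply OR-ed together to form the MSR_OFFCORE_RSP_0 value programmed for each cache op/result combination in the table that follows. A quick standalone check of how the LL read access/miss masks are composed from those bits:

#include <stdio.h>

#define NHM_DMND_DATA_RD	(1 << 0)
#define NHM_UNCORE_HIT		(1 << 8)
#define NHM_OTHER_CORE_HIT_SNP	(1 << 9)
#define NHM_OTHER_CORE_HITM	(1 << 10)
#define NHM_REMOTE_CACHE_FWD	(1 << 12)
#define NHM_REMOTE_DRAM		(1 << 13)
#define NHM_LOCAL_DRAM		(1 << 14)
#define NHM_NON_DRAM		(1 << 15)

#define NHM_ALL_DRAM	(NHM_REMOTE_DRAM | NHM_LOCAL_DRAM)
#define NHM_L3_HIT	(NHM_UNCORE_HIT | NHM_OTHER_CORE_HIT_SNP | NHM_OTHER_CORE_HITM)
#define NHM_L3_MISS	(NHM_NON_DRAM | NHM_ALL_DRAM | NHM_REMOTE_CACHE_FWD)
#define NHM_L3_ACCESS	(NHM_L3_HIT | NHM_L3_MISS)

int main(void)
{
	/* the values written to MSR_OFFCORE_RSP_0 for the LL read events */
	printf("LL read access: 0x%04x\n", NHM_DMND_DATA_RD | NHM_L3_ACCESS);
	printf("LL read miss:   0x%04x\n", NHM_DMND_DATA_RD | NHM_L3_MISS);
	return 0;
}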
383static __initconst const u64 nehalem_hw_cache_extra_regs
384 [PERF_COUNT_HW_CACHE_MAX]
385 [PERF_COUNT_HW_CACHE_OP_MAX]
386 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
387{
388 [ C(LL ) ] = {
389 [ C(OP_READ) ] = {
390 [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_L3_ACCESS,
391 [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_L3_MISS,
392 },
393 [ C(OP_WRITE) ] = {
394 [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_L3_ACCESS,
395 [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_L3_MISS,
396 },
397 [ C(OP_PREFETCH) ] = {
398 [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS,
399 [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_L3_MISS,
400 },
401 }
402};
403
183static __initconst const u64 nehalem_hw_cache_event_ids 404static __initconst const u64 nehalem_hw_cache_event_ids
184 [PERF_COUNT_HW_CACHE_MAX] 405 [PERF_COUNT_HW_CACHE_MAX]
185 [PERF_COUNT_HW_CACHE_OP_MAX] 406 [PERF_COUNT_HW_CACHE_OP_MAX]
@@ -187,12 +408,12 @@ static __initconst const u64 nehalem_hw_cache_event_ids
187{ 408{
188 [ C(L1D) ] = { 409 [ C(L1D) ] = {
189 [ C(OP_READ) ] = { 410 [ C(OP_READ) ] = {
190 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ 411 [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
191 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ 412 [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */
192 }, 413 },
193 [ C(OP_WRITE) ] = { 414 [ C(OP_WRITE) ] = {
 194 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ 415 [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETIRED.STORES */
195 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ 416 [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */
196 }, 417 },
197 [ C(OP_PREFETCH) ] = { 418 [ C(OP_PREFETCH) ] = {
198 [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ 419 [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
@@ -215,16 +436,26 @@ static __initconst const u64 nehalem_hw_cache_event_ids
215 }, 436 },
216 [ C(LL ) ] = { 437 [ C(LL ) ] = {
217 [ C(OP_READ) ] = { 438 [ C(OP_READ) ] = {
218 [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ 439 /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
219 [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ 440 [ C(RESULT_ACCESS) ] = 0x01b7,
441 /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
442 [ C(RESULT_MISS) ] = 0x01b7,
220 }, 443 },
444 /*
445 * Use RFO, not WRITEBACK, because a write miss would typically occur
446 * on RFO.
447 */
221 [ C(OP_WRITE) ] = { 448 [ C(OP_WRITE) ] = {
222 [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ 449 /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
223 [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ 450 [ C(RESULT_ACCESS) ] = 0x01b7,
451 /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
452 [ C(RESULT_MISS) ] = 0x01b7,
224 }, 453 },
225 [ C(OP_PREFETCH) ] = { 454 [ C(OP_PREFETCH) ] = {
226 [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ 455 /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
227 [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ 456 [ C(RESULT_ACCESS) ] = 0x01b7,
457 /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
458 [ C(RESULT_MISS) ] = 0x01b7,
228 }, 459 },
229 }, 460 },
230 [ C(DTLB) ] = { 461 [ C(DTLB) ] = {
@@ -649,7 +880,7 @@ static void intel_pmu_enable_event(struct perf_event *event)
649 struct hw_perf_event *hwc = &event->hw; 880 struct hw_perf_event *hwc = &event->hw;
650 881
651 if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { 882 if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) {
652 if (!__get_cpu_var(cpu_hw_events).enabled) 883 if (!__this_cpu_read(cpu_hw_events.enabled))
653 return; 884 return;
654 885
655 intel_pmu_enable_bts(hwc->config); 886 intel_pmu_enable_bts(hwc->config);
@@ -679,7 +910,7 @@ static int intel_pmu_save_and_restart(struct perf_event *event)
679 910
680static void intel_pmu_reset(void) 911static void intel_pmu_reset(void)
681{ 912{
682 struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds; 913 struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds);
683 unsigned long flags; 914 unsigned long flags;
684 int idx; 915 int idx;
685 916
@@ -691,8 +922,8 @@ static void intel_pmu_reset(void)
691 printk("clearing PMU state on CPU#%d\n", smp_processor_id()); 922 printk("clearing PMU state on CPU#%d\n", smp_processor_id());
692 923
693 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 924 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
694 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); 925 checking_wrmsrl(x86_pmu_config_addr(idx), 0ull);
695 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); 926 checking_wrmsrl(x86_pmu_event_addr(idx), 0ull);
696 } 927 }
697 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) 928 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
698 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); 929 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
@@ -713,18 +944,28 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
713 struct cpu_hw_events *cpuc; 944 struct cpu_hw_events *cpuc;
714 int bit, loops; 945 int bit, loops;
715 u64 status; 946 u64 status;
716 int handled = 0; 947 int handled;
717 948
718 perf_sample_data_init(&data, 0); 949 perf_sample_data_init(&data, 0);
719 950
720 cpuc = &__get_cpu_var(cpu_hw_events); 951 cpuc = &__get_cpu_var(cpu_hw_events);
721 952
953 /*
954 * Some chipsets need to unmask the LVTPC in a particular spot
955 * inside the nmi handler. As a result, the unmasking was pushed
956 * into all the nmi handlers.
957 *
958 * This handler doesn't seem to have any issues with the unmasking
959 * so it was left at the top.
960 */
961 apic_write(APIC_LVTPC, APIC_DM_NMI);
962
722 intel_pmu_disable_all(); 963 intel_pmu_disable_all();
723 intel_pmu_drain_bts_buffer(); 964 handled = intel_pmu_drain_bts_buffer();
724 status = intel_pmu_get_status(); 965 status = intel_pmu_get_status();
725 if (!status) { 966 if (!status) {
726 intel_pmu_enable_all(0); 967 intel_pmu_enable_all(0);
727 return 0; 968 return handled;
728 } 969 }
729 970
730 loops = 0; 971 loops = 0;
@@ -763,7 +1004,7 @@ again:
763 data.period = event->hw.last_period; 1004 data.period = event->hw.last_period;
764 1005
765 if (perf_event_overflow(event, 1, &data, regs)) 1006 if (perf_event_overflow(event, 1, &data, regs))
766 x86_pmu_stop(event); 1007 x86_pmu_stop(event, 0);
767 } 1008 }
768 1009
769 /* 1010 /*
@@ -784,6 +1025,9 @@ intel_bts_constraints(struct perf_event *event)
784 struct hw_perf_event *hwc = &event->hw; 1025 struct hw_perf_event *hwc = &event->hw;
785 unsigned int hw_event, bts_event; 1026 unsigned int hw_event, bts_event;
786 1027
1028 if (event->attr.freq)
1029 return NULL;
1030
787 hw_event = hwc->config & INTEL_ARCH_EVENT_MASK; 1031 hw_event = hwc->config & INTEL_ARCH_EVENT_MASK;
788 bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); 1032 bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
789 1033
@@ -794,6 +1038,67 @@ intel_bts_constraints(struct perf_event *event)
794} 1038}
795 1039
796static struct event_constraint * 1040static struct event_constraint *
1041intel_percore_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
1042{
1043 struct hw_perf_event *hwc = &event->hw;
1044 unsigned int e = hwc->config & ARCH_PERFMON_EVENTSEL_EVENT;
1045 struct event_constraint *c;
1046 struct intel_percore *pc;
1047 struct er_account *era;
1048 int i;
1049 int free_slot;
1050 int found;
1051
1052 if (!x86_pmu.percore_constraints || hwc->extra_alloc)
1053 return NULL;
1054
1055 for (c = x86_pmu.percore_constraints; c->cmask; c++) {
1056 if (e != c->code)
1057 continue;
1058
1059 /*
1060 * Allocate resource per core.
1061 */
1062 pc = cpuc->per_core;
1063 if (!pc)
1064 break;
1065 c = &emptyconstraint;
1066 raw_spin_lock(&pc->lock);
1067 free_slot = -1;
1068 found = 0;
1069 for (i = 0; i < MAX_EXTRA_REGS; i++) {
1070 era = &pc->regs[i];
1071 if (era->ref > 0 && hwc->extra_reg == era->extra_reg) {
1072 /* Allow sharing same config */
1073 if (hwc->extra_config == era->extra_config) {
1074 era->ref++;
1075 cpuc->percore_used = 1;
1076 hwc->extra_alloc = 1;
1077 c = NULL;
1078 }
1079 /* else conflict */
1080 found = 1;
1081 break;
1082 } else if (era->ref == 0 && free_slot == -1)
1083 free_slot = i;
1084 }
1085 if (!found && free_slot != -1) {
1086 era = &pc->regs[free_slot];
1087 era->ref = 1;
1088 era->extra_reg = hwc->extra_reg;
1089 era->extra_config = hwc->extra_config;
1090 cpuc->percore_used = 1;
1091 hwc->extra_alloc = 1;
1092 c = NULL;
1093 }
1094 raw_spin_unlock(&pc->lock);
1095 return c;
1096 }
1097
1098 return NULL;
1099}
1100
1101static struct event_constraint *
797intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) 1102intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
798{ 1103{
799 struct event_constraint *c; 1104 struct event_constraint *c;
@@ -806,9 +1111,51 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
806 if (c) 1111 if (c)
807 return c; 1112 return c;
808 1113
1114 c = intel_percore_constraints(cpuc, event);
1115 if (c)
1116 return c;
1117
809 return x86_get_event_constraints(cpuc, event); 1118 return x86_get_event_constraints(cpuc, event);
810} 1119}
811 1120
1121static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
1122 struct perf_event *event)
1123{
1124 struct extra_reg *er;
1125 struct intel_percore *pc;
1126 struct er_account *era;
1127 struct hw_perf_event *hwc = &event->hw;
1128 int i, allref;
1129
1130 if (!cpuc->percore_used)
1131 return;
1132
1133 for (er = x86_pmu.extra_regs; er->msr; er++) {
1134 if (er->event != (hwc->config & er->config_mask))
1135 continue;
1136
1137 pc = cpuc->per_core;
1138 raw_spin_lock(&pc->lock);
1139 for (i = 0; i < MAX_EXTRA_REGS; i++) {
1140 era = &pc->regs[i];
1141 if (era->ref > 0 &&
1142 era->extra_config == hwc->extra_config &&
1143 era->extra_reg == er->msr) {
1144 era->ref--;
1145 hwc->extra_alloc = 0;
1146 break;
1147 }
1148 }
1149 allref = 0;
1150 for (i = 0; i < MAX_EXTRA_REGS; i++)
1151 allref += pc->regs[i].ref;
1152 if (allref == 0)
1153 cpuc->percore_used = 0;
1154 raw_spin_unlock(&pc->lock);
1155 break;
1156 }
1157}
1158
812static int intel_pmu_hw_config(struct perf_event *event) 1159static int intel_pmu_hw_config(struct perf_event *event)
813{ 1160{
814 int ret = x86_pmu_hw_config(event); 1161 int ret = x86_pmu_hw_config(event);
@@ -816,6 +1163,32 @@ static int intel_pmu_hw_config(struct perf_event *event)
816 if (ret) 1163 if (ret)
817 return ret; 1164 return ret;
818 1165
1166 if (event->attr.precise_ip &&
1167 (event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
1168 /*
1169 * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
1170 * (0x003c) so that we can use it with PEBS.
1171 *
1172 * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
1173 * PEBS capable. However we can use INST_RETIRED.ANY_P
1174 * (0x00c0), which is a PEBS capable event, to get the same
1175 * count.
1176 *
 1177 * INST_RETIRED.ANY_P counts the number of cycles that retire
1178 * CNTMASK instructions. By setting CNTMASK to a value (16)
1179 * larger than the maximum number of instructions that can be
1180 * retired per cycle (4) and then inverting the condition, we
1181 * count all cycles that retire 16 or less instructions, which
1182 * is every cycle.
1183 *
1184 * Thereby we gain a PEBS capable cycle counter.
1185 */
1186 u64 alt_config = 0x108000c0; /* INST_RETIRED.TOTAL_CYCLES */
1187
1188 alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
1189 event->hw.config = alt_config;
1190 }
1191
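The 0x108000c0 alternative above decomposes into event 0xc0 (INST_RETIRED.ANY_P) with CMASK=16 and the invert bit set, exactly as the comment describes. A small decoder, assuming the architectural EVENTSEL field layout (event[7:0], umask[15:8], inv bit 23, cmask[31:24]):

#include <stdio.h>

int main(void)
{
	unsigned long long cfg = 0x108000c0ULL;	/* INST_RETIRED.TOTAL_CYCLES */

	printf("event = 0x%02llx\n", cfg & 0xff);		/* 0xc0 */
	printf("umask = 0x%02llx\n", (cfg >> 8) & 0xff);	/* 0x00 */
	printf("inv   = %llu\n", (cfg >> 23) & 1);		/* 1    */
	printf("cmask = %llu\n", (cfg >> 24) & 0xff);		/* 16   */
	return 0;
}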
819 if (event->attr.type != PERF_TYPE_RAW) 1192 if (event->attr.type != PERF_TYPE_RAW)
820 return 0; 1193 return 0;
821 1194
@@ -854,20 +1227,67 @@ static __initconst const struct x86_pmu core_pmu = {
854 */ 1227 */
855 .max_period = (1ULL << 31) - 1, 1228 .max_period = (1ULL << 31) - 1,
856 .get_event_constraints = intel_get_event_constraints, 1229 .get_event_constraints = intel_get_event_constraints,
1230 .put_event_constraints = intel_put_event_constraints,
857 .event_constraints = intel_core_event_constraints, 1231 .event_constraints = intel_core_event_constraints,
858}; 1232};
859 1233
1234static int intel_pmu_cpu_prepare(int cpu)
1235{
1236 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1237
1238 if (!cpu_has_ht_siblings())
1239 return NOTIFY_OK;
1240
1241 cpuc->per_core = kzalloc_node(sizeof(struct intel_percore),
1242 GFP_KERNEL, cpu_to_node(cpu));
1243 if (!cpuc->per_core)
1244 return NOTIFY_BAD;
1245
1246 raw_spin_lock_init(&cpuc->per_core->lock);
1247 cpuc->per_core->core_id = -1;
1248 return NOTIFY_OK;
1249}
1250
860static void intel_pmu_cpu_starting(int cpu) 1251static void intel_pmu_cpu_starting(int cpu)
861{ 1252{
1253 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1254 int core_id = topology_core_id(cpu);
1255 int i;
1256
862 init_debug_store_on_cpu(cpu); 1257 init_debug_store_on_cpu(cpu);
863 /* 1258 /*
864 * Deal with CPUs that don't clear their LBRs on power-up. 1259 * Deal with CPUs that don't clear their LBRs on power-up.
865 */ 1260 */
866 intel_pmu_lbr_reset(); 1261 intel_pmu_lbr_reset();
1262
1263 if (!cpu_has_ht_siblings())
1264 return;
1265
1266 for_each_cpu(i, topology_thread_cpumask(cpu)) {
1267 struct intel_percore *pc = per_cpu(cpu_hw_events, i).per_core;
1268
1269 if (pc && pc->core_id == core_id) {
1270 kfree(cpuc->per_core);
1271 cpuc->per_core = pc;
1272 break;
1273 }
1274 }
1275
1276 cpuc->per_core->core_id = core_id;
1277 cpuc->per_core->refcnt++;
867} 1278}
868 1279
869static void intel_pmu_cpu_dying(int cpu) 1280static void intel_pmu_cpu_dying(int cpu)
870{ 1281{
1282 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1283 struct intel_percore *pc = cpuc->per_core;
1284
1285 if (pc) {
1286 if (pc->core_id == -1 || --pc->refcnt == 0)
1287 kfree(pc);
1288 cpuc->per_core = NULL;
1289 }
1290
871 fini_debug_store_on_cpu(cpu); 1291 fini_debug_store_on_cpu(cpu);
872} 1292}
873 1293
@@ -892,7 +1312,9 @@ static __initconst const struct x86_pmu intel_pmu = {
892 */ 1312 */
893 .max_period = (1ULL << 31) - 1, 1313 .max_period = (1ULL << 31) - 1,
894 .get_event_constraints = intel_get_event_constraints, 1314 .get_event_constraints = intel_get_event_constraints,
1315 .put_event_constraints = intel_put_event_constraints,
895 1316
1317 .cpu_prepare = intel_pmu_cpu_prepare,
896 .cpu_starting = intel_pmu_cpu_starting, 1318 .cpu_starting = intel_pmu_cpu_starting,
897 .cpu_dying = intel_pmu_cpu_dying, 1319 .cpu_dying = intel_pmu_cpu_dying,
898}; 1320};
@@ -913,7 +1335,7 @@ static void intel_clovertown_quirks(void)
913 * AJ106 could possibly be worked around by not allowing LBR 1335 * AJ106 could possibly be worked around by not allowing LBR
914 * usage from PEBS, including the fixup. 1336 * usage from PEBS, including the fixup.
915 * AJ68 could possibly be worked around by always programming 1337 * AJ68 could possibly be worked around by always programming
916 * a pebs_event_reset[0] value and coping with the lost events. 1338 * a pebs_event_reset[0] value and coping with the lost events.
917 * 1339 *
918 * But taken together it might just make sense to not enable PEBS on 1340 * But taken together it might just make sense to not enable PEBS on
919 * these chips. 1341 * these chips.
@@ -998,6 +1420,7 @@ static __init int intel_pmu_init(void)
998 intel_pmu_lbr_init_core(); 1420 intel_pmu_lbr_init_core();
999 1421
1000 x86_pmu.event_constraints = intel_core2_event_constraints; 1422 x86_pmu.event_constraints = intel_core2_event_constraints;
1423 x86_pmu.pebs_constraints = intel_core2_pebs_event_constraints;
1001 pr_cont("Core2 events, "); 1424 pr_cont("Core2 events, ");
1002 break; 1425 break;
1003 1426
@@ -1006,11 +1429,33 @@ static __init int intel_pmu_init(void)
1006 case 46: /* 45 nm nehalem-ex, "Beckton" */ 1429 case 46: /* 45 nm nehalem-ex, "Beckton" */
1007 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, 1430 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
1008 sizeof(hw_cache_event_ids)); 1431 sizeof(hw_cache_event_ids));
1432 memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
1433 sizeof(hw_cache_extra_regs));
1009 1434
1010 intel_pmu_lbr_init_nhm(); 1435 intel_pmu_lbr_init_nhm();
1011 1436
1012 x86_pmu.event_constraints = intel_nehalem_event_constraints; 1437 x86_pmu.event_constraints = intel_nehalem_event_constraints;
1438 x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints;
1439 x86_pmu.percore_constraints = intel_nehalem_percore_constraints;
1013 x86_pmu.enable_all = intel_pmu_nhm_enable_all; 1440 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
1441 x86_pmu.extra_regs = intel_nehalem_extra_regs;
1442
1443 /* UOPS_ISSUED.STALLED_CYCLES */
1444 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
1445 /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
1446 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1;
1447
1448 if (ebx & 0x40) {
1449 /*
1450 * Erratum AAJ80 detected, we work it around by using
1451 * the BR_MISP_EXEC.ANY event. This will over-count
1452 * branch-misses, but it's still much better than the
1453 * architectural event which is often completely bogus:
1454 */
1455 intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
1456
1457 pr_cont("erratum AAJ80 worked around, ");
1458 }
1014 pr_cont("Nehalem events, "); 1459 pr_cont("Nehalem events, ");
1015 break; 1460 break;
1016 1461
@@ -1021,21 +1466,51 @@ static __init int intel_pmu_init(void)
1021 intel_pmu_lbr_init_atom(); 1466 intel_pmu_lbr_init_atom();
1022 1467
1023 x86_pmu.event_constraints = intel_gen_event_constraints; 1468 x86_pmu.event_constraints = intel_gen_event_constraints;
1469 x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints;
1024 pr_cont("Atom events, "); 1470 pr_cont("Atom events, ");
1025 break; 1471 break;
1026 1472
1027 case 37: /* 32 nm nehalem, "Clarkdale" */ 1473 case 37: /* 32 nm nehalem, "Clarkdale" */
1028 case 44: /* 32 nm nehalem, "Gulftown" */ 1474 case 44: /* 32 nm nehalem, "Gulftown" */
1475 case 47: /* 32 nm Xeon E7 */
1029 memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, 1476 memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
1030 sizeof(hw_cache_event_ids)); 1477 sizeof(hw_cache_event_ids));
1478 memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
1479 sizeof(hw_cache_extra_regs));
1031 1480
1032 intel_pmu_lbr_init_nhm(); 1481 intel_pmu_lbr_init_nhm();
1033 1482
1034 x86_pmu.event_constraints = intel_westmere_event_constraints; 1483 x86_pmu.event_constraints = intel_westmere_event_constraints;
1484 x86_pmu.percore_constraints = intel_westmere_percore_constraints;
1035 x86_pmu.enable_all = intel_pmu_nhm_enable_all; 1485 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
1486 x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
1487 x86_pmu.extra_regs = intel_westmere_extra_regs;
1488
1489 /* UOPS_ISSUED.STALLED_CYCLES */
1490 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
1491 /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
1492 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1;
1493
1036 pr_cont("Westmere events, "); 1494 pr_cont("Westmere events, ");
1037 break; 1495 break;
1038 1496
1497 case 42: /* SandyBridge */
1498 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
1499 sizeof(hw_cache_event_ids));
1500
1501 intel_pmu_lbr_init_nhm();
1502
1503 x86_pmu.event_constraints = intel_snb_event_constraints;
1504 x86_pmu.pebs_constraints = intel_snb_pebs_events;
1505
1506 /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
1507 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
1508 /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/
1509 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x18001b1;
1510
1511 pr_cont("SandyBridge events, ");
1512 break;
1513
1039 default: 1514 default:
1040 /* 1515 /*
1041 * default constraints for v2 and up 1516 * default constraints for v2 and up
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 18018d1311cd..bab491b8ee25 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -74,6 +74,107 @@ static void fini_debug_store_on_cpu(int cpu)
74 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); 74 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
75} 75}
76 76
77static int alloc_pebs_buffer(int cpu)
78{
79 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
80 int node = cpu_to_node(cpu);
81 int max, thresh = 1; /* always use a single PEBS record */
82 void *buffer;
83
84 if (!x86_pmu.pebs)
85 return 0;
86
87 buffer = kmalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node);
88 if (unlikely(!buffer))
89 return -ENOMEM;
90
91 max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
92
93 ds->pebs_buffer_base = (u64)(unsigned long)buffer;
94 ds->pebs_index = ds->pebs_buffer_base;
95 ds->pebs_absolute_maximum = ds->pebs_buffer_base +
96 max * x86_pmu.pebs_record_size;
97
98 ds->pebs_interrupt_threshold = ds->pebs_buffer_base +
99 thresh * x86_pmu.pebs_record_size;
100
101 return 0;
102}
103
104static void release_pebs_buffer(int cpu)
105{
106 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
107
108 if (!ds || !x86_pmu.pebs)
109 return;
110
111 kfree((void *)(unsigned long)ds->pebs_buffer_base);
112 ds->pebs_buffer_base = 0;
113}
114
115static int alloc_bts_buffer(int cpu)
116{
117 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
118 int node = cpu_to_node(cpu);
119 int max, thresh;
120 void *buffer;
121
122 if (!x86_pmu.bts)
123 return 0;
124
125 buffer = kmalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node);
126 if (unlikely(!buffer))
127 return -ENOMEM;
128
129 max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
130 thresh = max / 16;
131
132 ds->bts_buffer_base = (u64)(unsigned long)buffer;
133 ds->bts_index = ds->bts_buffer_base;
134 ds->bts_absolute_maximum = ds->bts_buffer_base +
135 max * BTS_RECORD_SIZE;
136 ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
137 thresh * BTS_RECORD_SIZE;
138
139 return 0;
140}
141
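alloc_bts_buffer() above arms the interrupt threshold one sixteenth of the buffer below its end, so the drain happens before the hardware runs out of room. A quick sketch of that arithmetic with assumed example sizes (BTS_BUFFER_SIZE and BTS_RECORD_SIZE are defined elsewhere; the 64 KiB / 24-byte values below are illustrative only):

#include <stdio.h>

int main(void)
{
	unsigned long long buffer_size = 64 * 1024;	/* assumed example */
	unsigned long long record_size = 24;		/* assumed example */
	unsigned long long base = 0x100000;		/* pretend buffer address */

	unsigned long long max = buffer_size / record_size;
	unsigned long long thresh = max / 16;
	unsigned long long abs_max = base + max * record_size;
	unsigned long long irq_thresh = abs_max - thresh * record_size;

	printf("%llu records, absolute max %#llx, interrupt threshold %#llx\n",
	       max, abs_max, irq_thresh);
	return 0;
}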
142static void release_bts_buffer(int cpu)
143{
144 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
145
146 if (!ds || !x86_pmu.bts)
147 return;
148
149 kfree((void *)(unsigned long)ds->bts_buffer_base);
150 ds->bts_buffer_base = 0;
151}
152
153static int alloc_ds_buffer(int cpu)
154{
155 int node = cpu_to_node(cpu);
156 struct debug_store *ds;
157
158 ds = kmalloc_node(sizeof(*ds), GFP_KERNEL | __GFP_ZERO, node);
159 if (unlikely(!ds))
160 return -ENOMEM;
161
162 per_cpu(cpu_hw_events, cpu).ds = ds;
163
164 return 0;
165}
166
167static void release_ds_buffer(int cpu)
168{
169 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
170
171 if (!ds)
172 return;
173
174 per_cpu(cpu_hw_events, cpu).ds = NULL;
175 kfree(ds);
176}
177
77static void release_ds_buffers(void) 178static void release_ds_buffers(void)
78{ 179{
79 int cpu; 180 int cpu;
@@ -82,93 +183,77 @@ static void release_ds_buffers(void)
82 return; 183 return;
83 184
84 get_online_cpus(); 185 get_online_cpus();
85
86 for_each_online_cpu(cpu) 186 for_each_online_cpu(cpu)
87 fini_debug_store_on_cpu(cpu); 187 fini_debug_store_on_cpu(cpu);
88 188
89 for_each_possible_cpu(cpu) { 189 for_each_possible_cpu(cpu) {
90 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; 190 release_pebs_buffer(cpu);
91 191 release_bts_buffer(cpu);
92 if (!ds) 192 release_ds_buffer(cpu);
93 continue;
94
95 per_cpu(cpu_hw_events, cpu).ds = NULL;
96
97 kfree((void *)(unsigned long)ds->pebs_buffer_base);
98 kfree((void *)(unsigned long)ds->bts_buffer_base);
99 kfree(ds);
100 } 193 }
101
102 put_online_cpus(); 194 put_online_cpus();
103} 195}
104 196
105static int reserve_ds_buffers(void) 197static void reserve_ds_buffers(void)
106{ 198{
107 int cpu, err = 0; 199 int bts_err = 0, pebs_err = 0;
200 int cpu;
201
202 x86_pmu.bts_active = 0;
203 x86_pmu.pebs_active = 0;
108 204
109 if (!x86_pmu.bts && !x86_pmu.pebs) 205 if (!x86_pmu.bts && !x86_pmu.pebs)
110 return 0; 206 return;
207
208 if (!x86_pmu.bts)
209 bts_err = 1;
210
211 if (!x86_pmu.pebs)
212 pebs_err = 1;
111 213
112 get_online_cpus(); 214 get_online_cpus();
113 215
114 for_each_possible_cpu(cpu) { 216 for_each_possible_cpu(cpu) {
115 struct debug_store *ds; 217 if (alloc_ds_buffer(cpu)) {
116 void *buffer; 218 bts_err = 1;
117 int max, thresh; 219 pebs_err = 1;
220 }
118 221
119 err = -ENOMEM; 222 if (!bts_err && alloc_bts_buffer(cpu))
120 ds = kzalloc(sizeof(*ds), GFP_KERNEL); 223 bts_err = 1;
121 if (unlikely(!ds)) 224
225 if (!pebs_err && alloc_pebs_buffer(cpu))
226 pebs_err = 1;
227
228 if (bts_err && pebs_err)
122 break; 229 break;
123 per_cpu(cpu_hw_events, cpu).ds = ds; 230 }
124
125 if (x86_pmu.bts) {
126 buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
127 if (unlikely(!buffer))
128 break;
129
130 max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
131 thresh = max / 16;
132
133 ds->bts_buffer_base = (u64)(unsigned long)buffer;
134 ds->bts_index = ds->bts_buffer_base;
135 ds->bts_absolute_maximum = ds->bts_buffer_base +
136 max * BTS_RECORD_SIZE;
137 ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
138 thresh * BTS_RECORD_SIZE;
139 }
140 231
141 if (x86_pmu.pebs) { 232 if (bts_err) {
142 buffer = kzalloc(PEBS_BUFFER_SIZE, GFP_KERNEL); 233 for_each_possible_cpu(cpu)
143 if (unlikely(!buffer)) 234 release_bts_buffer(cpu);
144 break; 235 }
145
146 max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
147
148 ds->pebs_buffer_base = (u64)(unsigned long)buffer;
149 ds->pebs_index = ds->pebs_buffer_base;
150 ds->pebs_absolute_maximum = ds->pebs_buffer_base +
151 max * x86_pmu.pebs_record_size;
152 /*
153 * Always use single record PEBS
154 */
155 ds->pebs_interrupt_threshold = ds->pebs_buffer_base +
156 x86_pmu.pebs_record_size;
157 }
158 236
159 err = 0; 237 if (pebs_err) {
238 for_each_possible_cpu(cpu)
239 release_pebs_buffer(cpu);
160 } 240 }
161 241
162 if (err) 242 if (bts_err && pebs_err) {
163 release_ds_buffers(); 243 for_each_possible_cpu(cpu)
164 else { 244 release_ds_buffer(cpu);
245 } else {
246 if (x86_pmu.bts && !bts_err)
247 x86_pmu.bts_active = 1;
248
249 if (x86_pmu.pebs && !pebs_err)
250 x86_pmu.pebs_active = 1;
251
165 for_each_online_cpu(cpu) 252 for_each_online_cpu(cpu)
166 init_debug_store_on_cpu(cpu); 253 init_debug_store_on_cpu(cpu);
167 } 254 }
168 255
169 put_online_cpus(); 256 put_online_cpus();
170
171 return err;
172} 257}
173 258
174/* 259/*
@@ -214,7 +299,7 @@ static void intel_pmu_disable_bts(void)
214 update_debugctlmsr(debugctlmsr); 299 update_debugctlmsr(debugctlmsr);
215} 300}
216 301
217static void intel_pmu_drain_bts_buffer(void) 302static int intel_pmu_drain_bts_buffer(void)
218{ 303{
219 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 304 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
220 struct debug_store *ds = cpuc->ds; 305 struct debug_store *ds = cpuc->ds;
@@ -231,16 +316,16 @@ static void intel_pmu_drain_bts_buffer(void)
231 struct pt_regs regs; 316 struct pt_regs regs;
232 317
233 if (!event) 318 if (!event)
234 return; 319 return 0;
235 320
236 if (!ds) 321 if (!x86_pmu.bts_active)
237 return; 322 return 0;
238 323
239 at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; 324 at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
240 top = (struct bts_record *)(unsigned long)ds->bts_index; 325 top = (struct bts_record *)(unsigned long)ds->bts_index;
241 326
242 if (top <= at) 327 if (top <= at)
243 return; 328 return 0;
244 329
245 ds->bts_index = ds->bts_buffer_base; 330 ds->bts_index = ds->bts_buffer_base;
246 331
@@ -256,7 +341,7 @@ static void intel_pmu_drain_bts_buffer(void)
256 perf_prepare_sample(&header, &data, event, &regs); 341 perf_prepare_sample(&header, &data, event, &regs);
257 342
258 if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1)) 343 if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1))
259 return; 344 return 1;
260 345
261 for (; at < top; at++) { 346 for (; at < top; at++) {
262 data.ip = at->from; 347 data.ip = at->from;
@@ -270,35 +355,76 @@ static void intel_pmu_drain_bts_buffer(void)
270 /* There's new data available. */ 355 /* There's new data available. */
271 event->hw.interrupts++; 356 event->hw.interrupts++;
272 event->pending_kill = POLL_IN; 357 event->pending_kill = POLL_IN;
358 return 1;
273} 359}
274 360
275/* 361/*
276 * PEBS 362 * PEBS
277 */ 363 */
364static struct event_constraint intel_core2_pebs_event_constraints[] = {
365 INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
366 INTEL_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */
367 INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
 368	INTEL_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETIRED.ANY */
369 INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */
370 EVENT_CONSTRAINT_END
371};
372
373static struct event_constraint intel_atom_pebs_event_constraints[] = {
374 INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
375 INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */
376 INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */
377 EVENT_CONSTRAINT_END
378};
379
380static struct event_constraint intel_nehalem_pebs_event_constraints[] = {
381 INTEL_EVENT_CONSTRAINT(0x0b, 0xf), /* MEM_INST_RETIRED.* */
382 INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */
383 INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
384 INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INST_RETIRED.ANY */
385 INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */
386 INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
387 INTEL_UEVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */
388 INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */
389 INTEL_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
390 INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */
391 INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */
392 EVENT_CONSTRAINT_END
393};
278 394
279static struct event_constraint intel_core_pebs_events[] = { 395static struct event_constraint intel_westmere_pebs_event_constraints[] = {
280 PEBS_EVENT_CONSTRAINT(0x00c0, 0x1), /* INSTR_RETIRED.ANY */ 396 INTEL_EVENT_CONSTRAINT(0x0b, 0xf), /* MEM_INST_RETIRED.* */
281 PEBS_EVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */ 397 INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */
282 PEBS_EVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ 398 INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
283 PEBS_EVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */ 399 INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INSTR_RETIRED.* */
284 PEBS_EVENT_CONSTRAINT(0x01cb, 0x1), /* MEM_LOAD_RETIRED.L1D_MISS */ 400 INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */
285 PEBS_EVENT_CONSTRAINT(0x02cb, 0x1), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */ 401 INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
286 PEBS_EVENT_CONSTRAINT(0x04cb, 0x1), /* MEM_LOAD_RETIRED.L2_MISS */ 402 INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */
287 PEBS_EVENT_CONSTRAINT(0x08cb, 0x1), /* MEM_LOAD_RETIRED.L2_LINE_MISS */ 403 INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */
288 PEBS_EVENT_CONSTRAINT(0x10cb, 0x1), /* MEM_LOAD_RETIRED.DTLB_MISS */ 404 INTEL_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
405 INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */
406 INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */
289 EVENT_CONSTRAINT_END 407 EVENT_CONSTRAINT_END
290}; 408};
291 409
292static struct event_constraint intel_nehalem_pebs_events[] = { 410static struct event_constraint intel_snb_pebs_events[] = {
293 PEBS_EVENT_CONSTRAINT(0x00c0, 0xf), /* INSTR_RETIRED.ANY */ 411 INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
294 PEBS_EVENT_CONSTRAINT(0xfec1, 0xf), /* X87_OPS_RETIRED.ANY */ 412 INTEL_UEVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
295 PEBS_EVENT_CONSTRAINT(0x00c5, 0xf), /* BR_INST_RETIRED.MISPRED */ 413 INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */
296 PEBS_EVENT_CONSTRAINT(0x1fc7, 0xf), /* SIMD_INST_RETURED.ANY */ 414 INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
297 PEBS_EVENT_CONSTRAINT(0x01cb, 0xf), /* MEM_LOAD_RETIRED.L1D_MISS */ 415 INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */
298 PEBS_EVENT_CONSTRAINT(0x02cb, 0xf), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */ 416 INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.* */
299 PEBS_EVENT_CONSTRAINT(0x04cb, 0xf), /* MEM_LOAD_RETIRED.L2_MISS */ 417 INTEL_UEVENT_CONSTRAINT(0x11d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_LOADS */
300 PEBS_EVENT_CONSTRAINT(0x08cb, 0xf), /* MEM_LOAD_RETIRED.L2_LINE_MISS */ 418 INTEL_UEVENT_CONSTRAINT(0x12d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_STORES */
301 PEBS_EVENT_CONSTRAINT(0x10cb, 0xf), /* MEM_LOAD_RETIRED.DTLB_MISS */ 419 INTEL_UEVENT_CONSTRAINT(0x21d0, 0xf), /* MEM_UOP_RETIRED.LOCK_LOADS */
420 INTEL_UEVENT_CONSTRAINT(0x22d0, 0xf), /* MEM_UOP_RETIRED.LOCK_STORES */
421 INTEL_UEVENT_CONSTRAINT(0x41d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_LOADS */
422 INTEL_UEVENT_CONSTRAINT(0x42d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_STORES */
423 INTEL_UEVENT_CONSTRAINT(0x81d0, 0xf), /* MEM_UOP_RETIRED.ANY_LOADS */
424 INTEL_UEVENT_CONSTRAINT(0x82d0, 0xf), /* MEM_UOP_RETIRED.ANY_STORES */
425 INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
426 INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
427 INTEL_UEVENT_CONSTRAINT(0x02d4, 0xf), /* MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS */
302 EVENT_CONSTRAINT_END 428 EVENT_CONSTRAINT_END
303}; 429};
304 430
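The constraint tables above replace the old PEBS_EVENT_CONSTRAINT lists and, per model, bind each precise event to the counters it may run on. Assuming the conventional encoding of these macros (event select in the low byte, unit mask in bits 8-15 for the UEVENT variant, and the second argument as a bitmap of permitted counters), an entry such as INTEL_UEVENT_CONSTRAINT(0x010c, 0xf) reads as event 0x0c, umask 0x01, schedulable on counters 0-3. A small interpretive sketch, not part of the patch:

/*
 * Decode one constraint entry under the assumed layout: event select
 * in bits 0-7, unit mask in bits 8-15, plus a counter bitmap.
 */
#include <stdio.h>

struct constraint_sketch {
	unsigned int code;	/* event | (umask << 8) */
	unsigned int cmask;	/* bitmap of counters the event may use */
};

int main(void)
{
	struct constraint_sketch c = { 0x010c, 0xf };	/* MEM_STORE_RETIRED.DTLB_MISS */

	printf("event 0x%02x umask 0x%02x counters 0x%x\n",
	       c.code & 0xff, (c.code >> 8) & 0xff, c.cmask);
	return 0;
}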
@@ -491,7 +617,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
491 regs.flags &= ~PERF_EFLAGS_EXACT; 617 regs.flags &= ~PERF_EFLAGS_EXACT;
492 618
493 if (perf_event_overflow(event, 1, &data, &regs)) 619 if (perf_event_overflow(event, 1, &data, &regs))
494 x86_pmu_stop(event); 620 x86_pmu_stop(event, 0);
495} 621}
496 622
497static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) 623static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
@@ -502,7 +628,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
502 struct pebs_record_core *at, *top; 628 struct pebs_record_core *at, *top;
503 int n; 629 int n;
504 630
505 if (!ds || !x86_pmu.pebs) 631 if (!x86_pmu.pebs_active)
506 return; 632 return;
507 633
508 at = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base; 634 at = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base;
@@ -544,7 +670,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
544 u64 status = 0; 670 u64 status = 0;
545 int bit, n; 671 int bit, n;
546 672
547 if (!ds || !x86_pmu.pebs) 673 if (!x86_pmu.pebs_active)
548 return; 674 return;
549 675
550 at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base; 676 at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
@@ -609,29 +735,25 @@ static void intel_ds_init(void)
609 printk(KERN_CONT "PEBS fmt0%c, ", pebs_type); 735 printk(KERN_CONT "PEBS fmt0%c, ", pebs_type);
610 x86_pmu.pebs_record_size = sizeof(struct pebs_record_core); 736 x86_pmu.pebs_record_size = sizeof(struct pebs_record_core);
611 x86_pmu.drain_pebs = intel_pmu_drain_pebs_core; 737 x86_pmu.drain_pebs = intel_pmu_drain_pebs_core;
612 x86_pmu.pebs_constraints = intel_core_pebs_events;
613 break; 738 break;
614 739
615 case 1: 740 case 1:
616 printk(KERN_CONT "PEBS fmt1%c, ", pebs_type); 741 printk(KERN_CONT "PEBS fmt1%c, ", pebs_type);
617 x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm); 742 x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm);
618 x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; 743 x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
619 x86_pmu.pebs_constraints = intel_nehalem_pebs_events;
620 break; 744 break;
621 745
622 default: 746 default:
623 printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type); 747 printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type);
624 x86_pmu.pebs = 0; 748 x86_pmu.pebs = 0;
625 break;
626 } 749 }
627 } 750 }
628} 751}
629 752
630#else /* CONFIG_CPU_SUP_INTEL */ 753#else /* CONFIG_CPU_SUP_INTEL */
631 754
632static int reserve_ds_buffers(void) 755static void reserve_ds_buffers(void)
633{ 756{
634 return 0;
635} 757}
636 758
637static void release_ds_buffers(void) 759static void release_ds_buffers(void)
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 249015173992..ead584fb6a7d 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Netburst Perfomance Events (P4, old Xeon) 2 * Netburst Performance Events (P4, old Xeon)
3 * 3 *
4 * Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org> 4 * Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org>
5 * Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com> 5 * Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com>
@@ -18,6 +18,8 @@
18struct p4_event_bind { 18struct p4_event_bind {
19 unsigned int opcode; /* Event code and ESCR selector */ 19 unsigned int opcode; /* Event code and ESCR selector */
20 unsigned int escr_msr[2]; /* ESCR MSR for this event */ 20 unsigned int escr_msr[2]; /* ESCR MSR for this event */
21 unsigned int escr_emask; /* valid ESCR EventMask bits */
22 unsigned int shared; /* event is shared across threads */
21 char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on abscence */ 23 char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on abscence */
22}; 24};
23 25
@@ -66,231 +68,435 @@ static struct p4_event_bind p4_event_bind_map[] = {
66 [P4_EVENT_TC_DELIVER_MODE] = { 68 [P4_EVENT_TC_DELIVER_MODE] = {
67 .opcode = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE), 69 .opcode = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE),
68 .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 }, 70 .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
71 .escr_emask =
72 P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD) |
73 P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DB) |
74 P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DI) |
75 P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BD) |
76 P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BB) |
77 P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BI) |
78 P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, ID),
79 .shared = 1,
69 .cntr = { {4, 5, -1}, {6, 7, -1} }, 80 .cntr = { {4, 5, -1}, {6, 7, -1} },
70 }, 81 },
71 [P4_EVENT_BPU_FETCH_REQUEST] = { 82 [P4_EVENT_BPU_FETCH_REQUEST] = {
72 .opcode = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST), 83 .opcode = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST),
73 .escr_msr = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 }, 84 .escr_msr = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 },
85 .escr_emask =
86 P4_ESCR_EMASK_BIT(P4_EVENT_BPU_FETCH_REQUEST, TCMISS),
74 .cntr = { {0, -1, -1}, {2, -1, -1} }, 87 .cntr = { {0, -1, -1}, {2, -1, -1} },
75 }, 88 },
76 [P4_EVENT_ITLB_REFERENCE] = { 89 [P4_EVENT_ITLB_REFERENCE] = {
77 .opcode = P4_OPCODE(P4_EVENT_ITLB_REFERENCE), 90 .opcode = P4_OPCODE(P4_EVENT_ITLB_REFERENCE),
78 .escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 }, 91 .escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
92 .escr_emask =
93 P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT) |
94 P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, MISS) |
95 P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT_UK),
79 .cntr = { {0, -1, -1}, {2, -1, -1} }, 96 .cntr = { {0, -1, -1}, {2, -1, -1} },
80 }, 97 },
81 [P4_EVENT_MEMORY_CANCEL] = { 98 [P4_EVENT_MEMORY_CANCEL] = {
82 .opcode = P4_OPCODE(P4_EVENT_MEMORY_CANCEL), 99 .opcode = P4_OPCODE(P4_EVENT_MEMORY_CANCEL),
83 .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 }, 100 .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
101 .escr_emask =
102 P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, ST_RB_FULL) |
103 P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, 64K_CONF),
84 .cntr = { {8, 9, -1}, {10, 11, -1} }, 104 .cntr = { {8, 9, -1}, {10, 11, -1} },
85 }, 105 },
86 [P4_EVENT_MEMORY_COMPLETE] = { 106 [P4_EVENT_MEMORY_COMPLETE] = {
87 .opcode = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE), 107 .opcode = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE),
88 .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 }, 108 .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
109 .escr_emask =
110 P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, LSC) |
111 P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, SSC),
89 .cntr = { {8, 9, -1}, {10, 11, -1} }, 112 .cntr = { {8, 9, -1}, {10, 11, -1} },
90 }, 113 },
91 [P4_EVENT_LOAD_PORT_REPLAY] = { 114 [P4_EVENT_LOAD_PORT_REPLAY] = {
92 .opcode = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY), 115 .opcode = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY),
93 .escr_msr = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 }, 116 .escr_msr = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 },
117 .escr_emask =
118 P4_ESCR_EMASK_BIT(P4_EVENT_LOAD_PORT_REPLAY, SPLIT_LD),
94 .cntr = { {8, 9, -1}, {10, 11, -1} }, 119 .cntr = { {8, 9, -1}, {10, 11, -1} },
95 }, 120 },
96 [P4_EVENT_STORE_PORT_REPLAY] = { 121 [P4_EVENT_STORE_PORT_REPLAY] = {
97 .opcode = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY), 122 .opcode = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY),
98 .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 }, 123 .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
124 .escr_emask =
125 P4_ESCR_EMASK_BIT(P4_EVENT_STORE_PORT_REPLAY, SPLIT_ST),
99 .cntr = { {8, 9, -1}, {10, 11, -1} }, 126 .cntr = { {8, 9, -1}, {10, 11, -1} },
100 }, 127 },
101 [P4_EVENT_MOB_LOAD_REPLAY] = { 128 [P4_EVENT_MOB_LOAD_REPLAY] = {
102 .opcode = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY), 129 .opcode = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY),
103 .escr_msr = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 }, 130 .escr_msr = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 },
131 .escr_emask =
132 P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STA) |
133 P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STD) |
134 P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, PARTIAL_DATA) |
135 P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, UNALGN_ADDR),
104 .cntr = { {0, -1, -1}, {2, -1, -1} }, 136 .cntr = { {0, -1, -1}, {2, -1, -1} },
105 }, 137 },
106 [P4_EVENT_PAGE_WALK_TYPE] = { 138 [P4_EVENT_PAGE_WALK_TYPE] = {
107 .opcode = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE), 139 .opcode = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE),
108 .escr_msr = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 }, 140 .escr_msr = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 },
141 .escr_emask =
142 P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, DTMISS) |
143 P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, ITMISS),
144 .shared = 1,
109 .cntr = { {0, -1, -1}, {2, -1, -1} }, 145 .cntr = { {0, -1, -1}, {2, -1, -1} },
110 }, 146 },
111 [P4_EVENT_BSQ_CACHE_REFERENCE] = { 147 [P4_EVENT_BSQ_CACHE_REFERENCE] = {
112 .opcode = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE), 148 .opcode = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE),
113 .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 }, 149 .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
150 .escr_emask =
151 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) |
152 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) |
153 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) |
154 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) |
155 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) |
156 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM) |
157 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) |
158 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) |
159 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS),
114 .cntr = { {0, -1, -1}, {2, -1, -1} }, 160 .cntr = { {0, -1, -1}, {2, -1, -1} },
115 }, 161 },
116 [P4_EVENT_IOQ_ALLOCATION] = { 162 [P4_EVENT_IOQ_ALLOCATION] = {
117 .opcode = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION), 163 .opcode = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION),
118 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, 164 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
165 .escr_emask =
166 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, DEFAULT) |
167 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_READ) |
168 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_WRITE) |
169 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_UC) |
170 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WC) |
171 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WT) |
172 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WP) |
173 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WB) |
174 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OWN) |
175 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OTHER) |
176 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, PREFETCH),
119 .cntr = { {0, -1, -1}, {2, -1, -1} }, 177 .cntr = { {0, -1, -1}, {2, -1, -1} },
120 }, 178 },
121 [P4_EVENT_IOQ_ACTIVE_ENTRIES] = { /* shared ESCR */ 179 [P4_EVENT_IOQ_ACTIVE_ENTRIES] = { /* shared ESCR */
122 .opcode = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES), 180 .opcode = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES),
123 .escr_msr = { MSR_P4_FSB_ESCR1, MSR_P4_FSB_ESCR1 }, 181 .escr_msr = { MSR_P4_FSB_ESCR1, MSR_P4_FSB_ESCR1 },
182 .escr_emask =
183 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, DEFAULT) |
184 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_READ) |
185 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_WRITE) |
186 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_UC) |
187 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WC) |
188 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WT) |
189 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WP) |
190 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WB) |
191 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OWN) |
192 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OTHER) |
193 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, PREFETCH),
124 .cntr = { {2, -1, -1}, {3, -1, -1} }, 194 .cntr = { {2, -1, -1}, {3, -1, -1} },
125 }, 195 },
126 [P4_EVENT_FSB_DATA_ACTIVITY] = { 196 [P4_EVENT_FSB_DATA_ACTIVITY] = {
127 .opcode = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY), 197 .opcode = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY),
128 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, 198 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
199 .escr_emask =
200 P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) |
201 P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN) |
202 P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OTHER) |
203 P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_DRV) |
204 P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OWN) |
205 P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OTHER),
206 .shared = 1,
129 .cntr = { {0, -1, -1}, {2, -1, -1} }, 207 .cntr = { {0, -1, -1}, {2, -1, -1} },
130 }, 208 },
131 [P4_EVENT_BSQ_ALLOCATION] = { /* shared ESCR, broken CCCR1 */ 209 [P4_EVENT_BSQ_ALLOCATION] = { /* shared ESCR, broken CCCR1 */
132 .opcode = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION), 210 .opcode = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION),
133 .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 }, 211 .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 },
212 .escr_emask =
213 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE0) |
214 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE1) |
215 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN0) |
216 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN1) |
217 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_IO_TYPE) |
218 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LOCK_TYPE) |
219 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_CACHE_TYPE) |
220 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_SPLIT_TYPE) |
221 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_DEM_TYPE) |
222 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_ORD_TYPE) |
223 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE0) |
224 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE1) |
225 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE2),
134 .cntr = { {0, -1, -1}, {1, -1, -1} }, 226 .cntr = { {0, -1, -1}, {1, -1, -1} },
135 }, 227 },
136 [P4_EVENT_BSQ_ACTIVE_ENTRIES] = { /* shared ESCR */ 228 [P4_EVENT_BSQ_ACTIVE_ENTRIES] = { /* shared ESCR */
137 .opcode = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES), 229 .opcode = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES),
138 .escr_msr = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 }, 230 .escr_msr = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 },
231 .escr_emask =
232 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE0) |
233 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE1) |
234 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN0) |
235 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN1) |
236 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE) |
237 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE) |
238 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE) |
239 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE) |
240 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE) |
241 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE) |
242 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE0) |
243 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE1) |
244 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE2),
139 .cntr = { {2, -1, -1}, {3, -1, -1} }, 245 .cntr = { {2, -1, -1}, {3, -1, -1} },
140 }, 246 },
141 [P4_EVENT_SSE_INPUT_ASSIST] = { 247 [P4_EVENT_SSE_INPUT_ASSIST] = {
142 .opcode = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST), 248 .opcode = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST),
143 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, 249 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
250 .escr_emask =
251 P4_ESCR_EMASK_BIT(P4_EVENT_SSE_INPUT_ASSIST, ALL),
252 .shared = 1,
144 .cntr = { {8, 9, -1}, {10, 11, -1} }, 253 .cntr = { {8, 9, -1}, {10, 11, -1} },
145 }, 254 },
146 [P4_EVENT_PACKED_SP_UOP] = { 255 [P4_EVENT_PACKED_SP_UOP] = {
147 .opcode = P4_OPCODE(P4_EVENT_PACKED_SP_UOP), 256 .opcode = P4_OPCODE(P4_EVENT_PACKED_SP_UOP),
148 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, 257 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
258 .escr_emask =
259 P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_SP_UOP, ALL),
260 .shared = 1,
149 .cntr = { {8, 9, -1}, {10, 11, -1} }, 261 .cntr = { {8, 9, -1}, {10, 11, -1} },
150 }, 262 },
151 [P4_EVENT_PACKED_DP_UOP] = { 263 [P4_EVENT_PACKED_DP_UOP] = {
152 .opcode = P4_OPCODE(P4_EVENT_PACKED_DP_UOP), 264 .opcode = P4_OPCODE(P4_EVENT_PACKED_DP_UOP),
153 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, 265 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
266 .escr_emask =
267 P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_DP_UOP, ALL),
268 .shared = 1,
154 .cntr = { {8, 9, -1}, {10, 11, -1} }, 269 .cntr = { {8, 9, -1}, {10, 11, -1} },
155 }, 270 },
156 [P4_EVENT_SCALAR_SP_UOP] = { 271 [P4_EVENT_SCALAR_SP_UOP] = {
157 .opcode = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP), 272 .opcode = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP),
158 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, 273 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
274 .escr_emask =
275 P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_SP_UOP, ALL),
276 .shared = 1,
159 .cntr = { {8, 9, -1}, {10, 11, -1} }, 277 .cntr = { {8, 9, -1}, {10, 11, -1} },
160 }, 278 },
161 [P4_EVENT_SCALAR_DP_UOP] = { 279 [P4_EVENT_SCALAR_DP_UOP] = {
162 .opcode = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP), 280 .opcode = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP),
163 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, 281 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
282 .escr_emask =
283 P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_DP_UOP, ALL),
284 .shared = 1,
164 .cntr = { {8, 9, -1}, {10, 11, -1} }, 285 .cntr = { {8, 9, -1}, {10, 11, -1} },
165 }, 286 },
166 [P4_EVENT_64BIT_MMX_UOP] = { 287 [P4_EVENT_64BIT_MMX_UOP] = {
167 .opcode = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP), 288 .opcode = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP),
168 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, 289 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
290 .escr_emask =
291 P4_ESCR_EMASK_BIT(P4_EVENT_64BIT_MMX_UOP, ALL),
292 .shared = 1,
169 .cntr = { {8, 9, -1}, {10, 11, -1} }, 293 .cntr = { {8, 9, -1}, {10, 11, -1} },
170 }, 294 },
171 [P4_EVENT_128BIT_MMX_UOP] = { 295 [P4_EVENT_128BIT_MMX_UOP] = {
172 .opcode = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP), 296 .opcode = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP),
173 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, 297 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
298 .escr_emask =
299 P4_ESCR_EMASK_BIT(P4_EVENT_128BIT_MMX_UOP, ALL),
300 .shared = 1,
174 .cntr = { {8, 9, -1}, {10, 11, -1} }, 301 .cntr = { {8, 9, -1}, {10, 11, -1} },
175 }, 302 },
176 [P4_EVENT_X87_FP_UOP] = { 303 [P4_EVENT_X87_FP_UOP] = {
177 .opcode = P4_OPCODE(P4_EVENT_X87_FP_UOP), 304 .opcode = P4_OPCODE(P4_EVENT_X87_FP_UOP),
178 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, 305 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
306 .escr_emask =
307 P4_ESCR_EMASK_BIT(P4_EVENT_X87_FP_UOP, ALL),
308 .shared = 1,
179 .cntr = { {8, 9, -1}, {10, 11, -1} }, 309 .cntr = { {8, 9, -1}, {10, 11, -1} },
180 }, 310 },
181 [P4_EVENT_TC_MISC] = { 311 [P4_EVENT_TC_MISC] = {
182 .opcode = P4_OPCODE(P4_EVENT_TC_MISC), 312 .opcode = P4_OPCODE(P4_EVENT_TC_MISC),
183 .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 }, 313 .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
314 .escr_emask =
315 P4_ESCR_EMASK_BIT(P4_EVENT_TC_MISC, FLUSH),
184 .cntr = { {4, 5, -1}, {6, 7, -1} }, 316 .cntr = { {4, 5, -1}, {6, 7, -1} },
185 }, 317 },
186 [P4_EVENT_GLOBAL_POWER_EVENTS] = { 318 [P4_EVENT_GLOBAL_POWER_EVENTS] = {
187 .opcode = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS), 319 .opcode = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS),
188 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, 320 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
321 .escr_emask =
322 P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING),
189 .cntr = { {0, -1, -1}, {2, -1, -1} }, 323 .cntr = { {0, -1, -1}, {2, -1, -1} },
190 }, 324 },
191 [P4_EVENT_TC_MS_XFER] = { 325 [P4_EVENT_TC_MS_XFER] = {
192 .opcode = P4_OPCODE(P4_EVENT_TC_MS_XFER), 326 .opcode = P4_OPCODE(P4_EVENT_TC_MS_XFER),
193 .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 }, 327 .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
328 .escr_emask =
329 P4_ESCR_EMASK_BIT(P4_EVENT_TC_MS_XFER, CISC),
194 .cntr = { {4, 5, -1}, {6, 7, -1} }, 330 .cntr = { {4, 5, -1}, {6, 7, -1} },
195 }, 331 },
196 [P4_EVENT_UOP_QUEUE_WRITES] = { 332 [P4_EVENT_UOP_QUEUE_WRITES] = {
197 .opcode = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES), 333 .opcode = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES),
198 .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 }, 334 .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
335 .escr_emask =
336 P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_BUILD) |
337 P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_DELIVER) |
338 P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_ROM),
199 .cntr = { {4, 5, -1}, {6, 7, -1} }, 339 .cntr = { {4, 5, -1}, {6, 7, -1} },
200 }, 340 },
201 [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = { 341 [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = {
202 .opcode = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE), 342 .opcode = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE),
203 .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 }, 343 .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 },
344 .escr_emask =
345 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL) |
346 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CALL) |
347 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, RETURN) |
348 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT),
204 .cntr = { {4, 5, -1}, {6, 7, -1} }, 349 .cntr = { {4, 5, -1}, {6, 7, -1} },
205 }, 350 },
206 [P4_EVENT_RETIRED_BRANCH_TYPE] = { 351 [P4_EVENT_RETIRED_BRANCH_TYPE] = {
207 .opcode = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE), 352 .opcode = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE),
208 .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 }, 353 .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 },
354 .escr_emask =
355 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) |
356 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) |
357 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) |
358 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT),
209 .cntr = { {4, 5, -1}, {6, 7, -1} }, 359 .cntr = { {4, 5, -1}, {6, 7, -1} },
210 }, 360 },
211 [P4_EVENT_RESOURCE_STALL] = { 361 [P4_EVENT_RESOURCE_STALL] = {
212 .opcode = P4_OPCODE(P4_EVENT_RESOURCE_STALL), 362 .opcode = P4_OPCODE(P4_EVENT_RESOURCE_STALL),
213 .escr_msr = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 }, 363 .escr_msr = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 },
364 .escr_emask =
365 P4_ESCR_EMASK_BIT(P4_EVENT_RESOURCE_STALL, SBFULL),
214 .cntr = { {12, 13, 16}, {14, 15, 17} }, 366 .cntr = { {12, 13, 16}, {14, 15, 17} },
215 }, 367 },
216 [P4_EVENT_WC_BUFFER] = { 368 [P4_EVENT_WC_BUFFER] = {
217 .opcode = P4_OPCODE(P4_EVENT_WC_BUFFER), 369 .opcode = P4_OPCODE(P4_EVENT_WC_BUFFER),
218 .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 }, 370 .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
371 .escr_emask =
372 P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_EVICTS) |
373 P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_FULL_EVICTS),
374 .shared = 1,
219 .cntr = { {8, 9, -1}, {10, 11, -1} }, 375 .cntr = { {8, 9, -1}, {10, 11, -1} },
220 }, 376 },
221 [P4_EVENT_B2B_CYCLES] = { 377 [P4_EVENT_B2B_CYCLES] = {
222 .opcode = P4_OPCODE(P4_EVENT_B2B_CYCLES), 378 .opcode = P4_OPCODE(P4_EVENT_B2B_CYCLES),
223 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, 379 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
380 .escr_emask = 0,
224 .cntr = { {0, -1, -1}, {2, -1, -1} }, 381 .cntr = { {0, -1, -1}, {2, -1, -1} },
225 }, 382 },
226 [P4_EVENT_BNR] = { 383 [P4_EVENT_BNR] = {
227 .opcode = P4_OPCODE(P4_EVENT_BNR), 384 .opcode = P4_OPCODE(P4_EVENT_BNR),
228 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, 385 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
386 .escr_emask = 0,
229 .cntr = { {0, -1, -1}, {2, -1, -1} }, 387 .cntr = { {0, -1, -1}, {2, -1, -1} },
230 }, 388 },
231 [P4_EVENT_SNOOP] = { 389 [P4_EVENT_SNOOP] = {
232 .opcode = P4_OPCODE(P4_EVENT_SNOOP), 390 .opcode = P4_OPCODE(P4_EVENT_SNOOP),
233 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, 391 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
392 .escr_emask = 0,
234 .cntr = { {0, -1, -1}, {2, -1, -1} }, 393 .cntr = { {0, -1, -1}, {2, -1, -1} },
235 }, 394 },
236 [P4_EVENT_RESPONSE] = { 395 [P4_EVENT_RESPONSE] = {
237 .opcode = P4_OPCODE(P4_EVENT_RESPONSE), 396 .opcode = P4_OPCODE(P4_EVENT_RESPONSE),
238 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, 397 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
398 .escr_emask = 0,
239 .cntr = { {0, -1, -1}, {2, -1, -1} }, 399 .cntr = { {0, -1, -1}, {2, -1, -1} },
240 }, 400 },
241 [P4_EVENT_FRONT_END_EVENT] = { 401 [P4_EVENT_FRONT_END_EVENT] = {
242 .opcode = P4_OPCODE(P4_EVENT_FRONT_END_EVENT), 402 .opcode = P4_OPCODE(P4_EVENT_FRONT_END_EVENT),
243 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, 403 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
404 .escr_emask =
405 P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, NBOGUS) |
406 P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, BOGUS),
244 .cntr = { {12, 13, 16}, {14, 15, 17} }, 407 .cntr = { {12, 13, 16}, {14, 15, 17} },
245 }, 408 },
246 [P4_EVENT_EXECUTION_EVENT] = { 409 [P4_EVENT_EXECUTION_EVENT] = {
247 .opcode = P4_OPCODE(P4_EVENT_EXECUTION_EVENT), 410 .opcode = P4_OPCODE(P4_EVENT_EXECUTION_EVENT),
248 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, 411 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
412 .escr_emask =
413 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0) |
414 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1) |
415 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2) |
416 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3) |
417 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) |
418 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) |
419 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) |
420 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3),
249 .cntr = { {12, 13, 16}, {14, 15, 17} }, 421 .cntr = { {12, 13, 16}, {14, 15, 17} },
250 }, 422 },
251 [P4_EVENT_REPLAY_EVENT] = { 423 [P4_EVENT_REPLAY_EVENT] = {
252 .opcode = P4_OPCODE(P4_EVENT_REPLAY_EVENT), 424 .opcode = P4_OPCODE(P4_EVENT_REPLAY_EVENT),
253 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, 425 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
426 .escr_emask =
427 P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, NBOGUS) |
428 P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, BOGUS),
254 .cntr = { {12, 13, 16}, {14, 15, 17} }, 429 .cntr = { {12, 13, 16}, {14, 15, 17} },
255 }, 430 },
256 [P4_EVENT_INSTR_RETIRED] = { 431 [P4_EVENT_INSTR_RETIRED] = {
257 .opcode = P4_OPCODE(P4_EVENT_INSTR_RETIRED), 432 .opcode = P4_OPCODE(P4_EVENT_INSTR_RETIRED),
258 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, 433 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
434 .escr_emask =
435 P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) |
436 P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSTAG) |
437 P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG) |
438 P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSTAG),
259 .cntr = { {12, 13, 16}, {14, 15, 17} }, 439 .cntr = { {12, 13, 16}, {14, 15, 17} },
260 }, 440 },
261 [P4_EVENT_UOPS_RETIRED] = { 441 [P4_EVENT_UOPS_RETIRED] = {
262 .opcode = P4_OPCODE(P4_EVENT_UOPS_RETIRED), 442 .opcode = P4_OPCODE(P4_EVENT_UOPS_RETIRED),
263 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, 443 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
444 .escr_emask =
445 P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, NBOGUS) |
446 P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, BOGUS),
264 .cntr = { {12, 13, 16}, {14, 15, 17} }, 447 .cntr = { {12, 13, 16}, {14, 15, 17} },
265 }, 448 },
266 [P4_EVENT_UOP_TYPE] = { 449 [P4_EVENT_UOP_TYPE] = {
267 .opcode = P4_OPCODE(P4_EVENT_UOP_TYPE), 450 .opcode = P4_OPCODE(P4_EVENT_UOP_TYPE),
268 .escr_msr = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 }, 451 .escr_msr = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
452 .escr_emask =
453 P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGLOADS) |
454 P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGSTORES),
269 .cntr = { {12, 13, 16}, {14, 15, 17} }, 455 .cntr = { {12, 13, 16}, {14, 15, 17} },
270 }, 456 },
271 [P4_EVENT_BRANCH_RETIRED] = { 457 [P4_EVENT_BRANCH_RETIRED] = {
272 .opcode = P4_OPCODE(P4_EVENT_BRANCH_RETIRED), 458 .opcode = P4_OPCODE(P4_EVENT_BRANCH_RETIRED),
273 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, 459 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
460 .escr_emask =
461 P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNP) |
462 P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNM) |
463 P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTP) |
464 P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTM),
274 .cntr = { {12, 13, 16}, {14, 15, 17} }, 465 .cntr = { {12, 13, 16}, {14, 15, 17} },
275 }, 466 },
276 [P4_EVENT_MISPRED_BRANCH_RETIRED] = { 467 [P4_EVENT_MISPRED_BRANCH_RETIRED] = {
277 .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED), 468 .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED),
278 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, 469 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
470 .escr_emask =
471 P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS),
279 .cntr = { {12, 13, 16}, {14, 15, 17} }, 472 .cntr = { {12, 13, 16}, {14, 15, 17} },
280 }, 473 },
281 [P4_EVENT_X87_ASSIST] = { 474 [P4_EVENT_X87_ASSIST] = {
282 .opcode = P4_OPCODE(P4_EVENT_X87_ASSIST), 475 .opcode = P4_OPCODE(P4_EVENT_X87_ASSIST),
283 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, 476 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
477 .escr_emask =
478 P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSU) |
479 P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSO) |
480 P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAO) |
481 P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAU) |
482 P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, PREA),
284 .cntr = { {12, 13, 16}, {14, 15, 17} }, 483 .cntr = { {12, 13, 16}, {14, 15, 17} },
285 }, 484 },
286 [P4_EVENT_MACHINE_CLEAR] = { 485 [P4_EVENT_MACHINE_CLEAR] = {
287 .opcode = P4_OPCODE(P4_EVENT_MACHINE_CLEAR), 486 .opcode = P4_OPCODE(P4_EVENT_MACHINE_CLEAR),
288 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, 487 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
488 .escr_emask =
489 P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, CLEAR) |
490 P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, MOCLEAR) |
491 P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, SMCLEAR),
289 .cntr = { {12, 13, 16}, {14, 15, 17} }, 492 .cntr = { {12, 13, 16}, {14, 15, 17} },
290 }, 493 },
291 [P4_EVENT_INSTR_COMPLETED] = { 494 [P4_EVENT_INSTR_COMPLETED] = {
292 .opcode = P4_OPCODE(P4_EVENT_INSTR_COMPLETED), 495 .opcode = P4_OPCODE(P4_EVENT_INSTR_COMPLETED),
293 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, 496 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
497 .escr_emask =
498 P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, NBOGUS) |
499 P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, BOGUS),
294 .cntr = { {12, 13, 16}, {14, 15, 17} }, 500 .cntr = { {12, 13, 16}, {14, 15, 17} },
295 }, 501 },
296}; 502};
@@ -428,29 +634,73 @@ static u64 p4_pmu_event_map(int hw_event)
428 return config; 634 return config;
429} 635}
430 636
637/* check cpu model specifics */
638static bool p4_event_match_cpu_model(unsigned int event_idx)
639{
 640	/* INSTR_COMPLETED event only exists for models 3, 4, 6 (Prescott) */
641 if (event_idx == P4_EVENT_INSTR_COMPLETED) {
642 if (boot_cpu_data.x86_model != 3 &&
643 boot_cpu_data.x86_model != 4 &&
644 boot_cpu_data.x86_model != 6)
645 return false;
646 }
647
648 /*
649 * For info
650 * - IQ_ESCR0, IQ_ESCR1 only for models 1 and 2
651 */
652
653 return true;
654}
655
431static int p4_validate_raw_event(struct perf_event *event) 656static int p4_validate_raw_event(struct perf_event *event)
432{ 657{
433 unsigned int v; 658 unsigned int v, emask;
434 659
435 /* user data may have out-of-bound event index */ 660 /* User data may have out-of-bound event index */
436 v = p4_config_unpack_event(event->attr.config); 661 v = p4_config_unpack_event(event->attr.config);
437 if (v >= ARRAY_SIZE(p4_event_bind_map)) { 662 if (v >= ARRAY_SIZE(p4_event_bind_map))
438 pr_warning("P4 PMU: Unknown event code: %d\n", v); 663 return -EINVAL;
664
665 /* It may be unsupported: */
666 if (!p4_event_match_cpu_model(v))
439 return -EINVAL; 667 return -EINVAL;
668
669 /*
670 * NOTE: P4_CCCR_THREAD_ANY has not the same meaning as
671 * in Architectural Performance Monitoring, it means not
672 * on _which_ logical cpu to count but rather _when_, ie it
673 * depends on logical cpu state -- count event if one cpu active,
674 * none, both or any, so we just allow user to pass any value
675 * desired.
676 *
677 * In turn we always set Tx_OS/Tx_USR bits bound to logical
678 * cpu without their propagation to another cpu
679 */
680
681 /*
682 * if an event is shared across the logical threads
683 * the user needs special permissions to be able to use it
684 */
685 if (p4_ht_active() && p4_event_bind_map[v].shared) {
686 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
687 return -EACCES;
440 } 688 }
441 689
690 /* ESCR EventMask bits may be invalid */
691 emask = p4_config_unpack_escr(event->attr.config) & P4_ESCR_EVENTMASK_MASK;
692 if (emask & ~p4_event_bind_map[v].escr_emask)
693 return -EINVAL;
694
442 /* 695 /*
443 * it may have some screwed PEBS bits 696 * it may have some invalid PEBS bits
444 */ 697 */
445 if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) { 698 if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE))
446 pr_warning("P4 PMU: PEBS are not supported yet\n");
447 return -EINVAL; 699 return -EINVAL;
448 } 700
449 v = p4_config_unpack_metric(event->attr.config); 701 v = p4_config_unpack_metric(event->attr.config);
450 if (v >= ARRAY_SIZE(p4_pebs_bind_map)) { 702 if (v >= ARRAY_SIZE(p4_pebs_bind_map))
451 pr_warning("P4 PMU: Unknown metric code: %d\n", v);
452 return -EINVAL; 703 return -EINVAL;
453 }
454 704
455 return 0; 705 return 0;
456} 706}
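After this hunk a PERF_TYPE_RAW config on P4 is checked much more strictly: out-of-range event codes, events the CPU model does not implement, ESCR EventMask bits outside the event's escr_emask template, PEBS-enable bits and out-of-range metrics are all rejected with -EINVAL, and thread-shared events need CAP_SYS_ADMIN (under perf_paranoid_cpu) once hyper-threading is active. A hedged user-space sketch of how such a rejection surfaces; the raw config value below is a placeholder, not a known-good P4 encoding:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

/* thin wrapper; glibc provides no perf_event_open() symbol */
static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
			    int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	long fd;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_RAW;
	attr.size = sizeof(attr);
	/*
	 * Placeholder raw value: on a P4 the kernel unpacks ESCR/CCCR
	 * fields from this and, after the hunk above, returns EINVAL if
	 * any EventMask bit is not listed in the event's escr_emask.
	 */
	attr.config = 0x8000000000000023ULL;

	fd = perf_event_open(&attr, 0, -1, -1, 0);
	if (fd < 0)
		printf("rejected: %s\n", strerror(errno));
	else
		printf("accepted, fd=%ld\n", fd);
	return 0;
}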
@@ -477,28 +727,30 @@ static int p4_hw_config(struct perf_event *event)
477 event->hw.config = p4_set_ht_bit(event->hw.config); 727 event->hw.config = p4_set_ht_bit(event->hw.config);
478 728
479 if (event->attr.type == PERF_TYPE_RAW) { 729 if (event->attr.type == PERF_TYPE_RAW) {
730 struct p4_event_bind *bind;
731 unsigned int esel;
732 /*
733 * Clear bits we reserve to be managed by kernel itself
734 * and never allowed from a user space
735 */
736 event->attr.config &= P4_CONFIG_MASK;
480 737
481 rc = p4_validate_raw_event(event); 738 rc = p4_validate_raw_event(event);
482 if (rc) 739 if (rc)
483 goto out; 740 goto out;
484 741
485 /* 742 /*
486 * We don't control raw events so it's up to the caller
487 * to pass sane values (and we don't count the thread number
488 * on HT machine but allow HT-compatible specifics to be
489 * passed on)
490 *
491 * Note that for RAW events we allow user to use P4_CCCR_RESERVED 743 * Note that for RAW events we allow user to use P4_CCCR_RESERVED
492 * bits since we keep additional info here (for cache events and etc) 744 * bits since we keep additional info here (for cache events and etc)
493 *
494 * XXX: HT wide things should check perf_paranoid_cpu() &&
495 * CAP_SYS_ADMIN
496 */ 745 */
497 event->hw.config |= event->attr.config & 746 event->hw.config |= event->attr.config;
498 (p4_config_pack_escr(P4_ESCR_MASK_HT) | 747 bind = p4_config_get_bind(event->attr.config);
499 p4_config_pack_cccr(P4_CCCR_MASK_HT | P4_CCCR_RESERVED)); 748 if (!bind) {
500 749 rc = -EINVAL;
501 event->hw.config &= ~P4_CCCR_FORCE_OVF; 750 goto out;
751 }
752 esel = P4_OPCODE_ESEL(bind->opcode);
753 event->hw.config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel));
502 } 754 }
503 755
504 rc = x86_setup_perfctr(event); 756 rc = x86_setup_perfctr(event);
@@ -509,19 +761,27 @@ out:
509 761
510static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc) 762static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
511{ 763{
512 int overflow = 0; 764 u64 v;
513 u32 low, high;
514
515 rdmsr(hwc->config_base + hwc->idx, low, high);
516 765
517 /* we need to check high bit for unflagged overflows */ 766 /* an official way for overflow indication */
518 if ((low & P4_CCCR_OVF) || !(high & (1 << 31))) { 767 rdmsrl(hwc->config_base, v);
519 overflow = 1; 768 if (v & P4_CCCR_OVF) {
520 (void)checking_wrmsrl(hwc->config_base + hwc->idx, 769 wrmsrl(hwc->config_base, v & ~P4_CCCR_OVF);
521 ((u64)low) & ~P4_CCCR_OVF); 770 return 1;
522 } 771 }
523 772
524 return overflow; 773 /*
774 * In some circumstances the overflow might issue an NMI but did
775 * not set P4_CCCR_OVF bit. Because a counter holds a negative value
776 * we simply check for high bit being set, if it's cleared it means
777 * the counter has reached zero value and continued counting before
778 * real NMI signal was received:
779 */
780 rdmsrl(hwc->event_base, v);
781 if (!(v & ARCH_P4_UNFLAGGED_BIT))
782 return 1;
783
784 return 0;
525} 785}
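The rewritten p4_pmu_clear_cccr_ovf() first checks the architecturally defined P4_CCCR_OVF flag and, if it is not latched, falls back to testing the counter's top bit: counters are programmed with a negative value and count upwards, so a cleared ARCH_P4_UNFLAGGED_BIT means the counter already passed zero before the NMI arrived. A standalone sketch of that fallback test, assuming 40-bit counters:

/*
 * "Unflagged overflow" test: while the 40-bit counter is still below
 * zero its top bit (bit 39) is set; once it wraps through zero the bit
 * clears, which is the fallback indication used above.
 */
#include <stdio.h>
#include <stdint.h>

#define CNT_BITS	40
#define CNT_MASK	((1ULL << CNT_BITS) - 1)
#define HIGH_BIT	(1ULL << (CNT_BITS - 1))

static int counter_overflowed(uint64_t raw_counter)
{
	return !(raw_counter & HIGH_BIT);
}

int main(void)
{
	uint64_t period = 100000;
	uint64_t counter = (0 - period) & CNT_MASK;	/* start at -(period) mod 2^40 */

	printf("before: overflowed=%d\n", counter_overflowed(counter));
	counter = (counter + period + 1) & CNT_MASK;	/* counted past zero */
	printf("after:  overflowed=%d\n", counter_overflowed(counter));
	return 0;
}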
526 786
527static void p4_pmu_disable_pebs(void) 787static void p4_pmu_disable_pebs(void)
@@ -531,13 +791,13 @@ static void p4_pmu_disable_pebs(void)
531 * 791 *
532 * It's still allowed that two threads setup same cache 792 * It's still allowed that two threads setup same cache
533 * events so we can't simply clear metrics until we knew 793 * events so we can't simply clear metrics until we knew
534 * noone is depending on us, so we need kind of counter 794 * no one is depending on us, so we need kind of counter
535 * for "ReplayEvent" users. 795 * for "ReplayEvent" users.
536 * 796 *
537 * What is more complex -- RAW events, if user (for some 797 * What is more complex -- RAW events, if user (for some
538 * reason) will pass some cache event metric with improper 798 * reason) will pass some cache event metric with improper
539 * event opcode -- it's fine from hardware point of view 799 * event opcode -- it's fine from hardware point of view
540 * but completely nonsence from "meaning" of such action. 800 * but completely nonsense from "meaning" of such action.
541 * 801 *
542 * So at moment let leave metrics turned on forever -- it's 802 * So at moment let leave metrics turned on forever -- it's
543 * ok for now but need to be revisited! 803 * ok for now but need to be revisited!
@@ -556,7 +816,7 @@ static inline void p4_pmu_disable_event(struct perf_event *event)
556 * state we need to clear P4_CCCR_OVF, otherwise interrupt get 816 * state we need to clear P4_CCCR_OVF, otherwise interrupt get
557 * asserted again and again 817 * asserted again and again
558 */ 818 */
559 (void)checking_wrmsrl(hwc->config_base + hwc->idx, 819 (void)checking_wrmsrl(hwc->config_base,
560 (u64)(p4_config_unpack_cccr(hwc->config)) & 820 (u64)(p4_config_unpack_cccr(hwc->config)) &
561 ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED); 821 ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED);
562} 822}
@@ -626,7 +886,7 @@ static void p4_pmu_enable_event(struct perf_event *event)
626 p4_pmu_enable_pebs(hwc->config); 886 p4_pmu_enable_pebs(hwc->config);
627 887
628 (void)checking_wrmsrl(escr_addr, escr_conf); 888 (void)checking_wrmsrl(escr_addr, escr_conf);
629 (void)checking_wrmsrl(hwc->config_base + hwc->idx, 889 (void)checking_wrmsrl(hwc->config_base,
630 (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE); 890 (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
631} 891}
632 892
@@ -652,8 +912,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
652 int idx, handled = 0; 912 int idx, handled = 0;
653 u64 val; 913 u64 val;
654 914
655 data.addr = 0; 915 perf_sample_data_init(&data, 0);
656 data.raw = NULL;
657 916
658 cpuc = &__get_cpu_var(cpu_hw_events); 917 cpuc = &__get_cpu_var(cpu_hw_events);
659 918
@@ -687,14 +946,23 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
687 if (!x86_perf_event_set_period(event)) 946 if (!x86_perf_event_set_period(event))
688 continue; 947 continue;
689 if (perf_event_overflow(event, 1, &data, regs)) 948 if (perf_event_overflow(event, 1, &data, regs))
690 p4_pmu_disable_event(event); 949 x86_pmu_stop(event, 0);
691 } 950 }
692 951
693 if (handled) { 952 if (handled)
694 /* p4 quirk: unmask it again */
695 apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
696 inc_irq_stat(apic_perf_irqs); 953 inc_irq_stat(apic_perf_irqs);
697 } 954
955 /*
956 * When dealing with the unmasking of the LVTPC on P4 perf hw, it has
957 * been observed that the OVF bit flag has to be cleared first _before_
958 * the LVTPC can be unmasked.
959 *
960 * The reason is the NMI line will continue to be asserted while the OVF
961 * bit is set. This causes a second NMI to generate if the LVTPC is
962 * unmasked before the OVF bit is cleared, leading to unknown NMI
963 * messages.
964 */
965 apic_write(APIC_LVTPC, APIC_DM_NMI);
698 966
699 return handled; 967 return handled;
700} 968}
@@ -908,9 +1176,9 @@ static __initconst const struct x86_pmu p4_pmu = {
908 */ 1176 */
909 .num_counters = ARCH_P4_MAX_CCCR, 1177 .num_counters = ARCH_P4_MAX_CCCR,
910 .apic = 1, 1178 .apic = 1,
911 .cntval_bits = 40, 1179 .cntval_bits = ARCH_P4_CNTRVAL_BITS,
912 .cntval_mask = (1ULL << 40) - 1, 1180 .cntval_mask = ARCH_P4_CNTRVAL_MASK,
913 .max_period = (1ULL << 39) - 1, 1181 .max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1,
914 .hw_config = p4_hw_config, 1182 .hw_config = p4_hw_config,
915 .schedule_events = p4_pmu_schedule_events, 1183 .schedule_events = p4_pmu_schedule_events,
916 /* 1184 /*
@@ -928,7 +1196,7 @@ static __init int p4_pmu_init(void)
928{ 1196{
929 unsigned int low, high; 1197 unsigned int low, high;
930 1198
931 /* If we get stripped -- indexig fails */ 1199 /* If we get stripped -- indexing fails */
932 BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC); 1200 BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC);
933 1201
934 rdmsr(MSR_IA32_MISC_ENABLE, low, high); 1202 rdmsr(MSR_IA32_MISC_ENABLE, low, high);
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index 34ba07be2cda..20c097e33860 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -68,7 +68,7 @@ p6_pmu_disable_event(struct perf_event *event)
68 if (cpuc->enabled) 68 if (cpuc->enabled)
69 val |= ARCH_PERFMON_EVENTSEL_ENABLE; 69 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
70 70
71 (void)checking_wrmsrl(hwc->config_base + hwc->idx, val); 71 (void)checking_wrmsrl(hwc->config_base, val);
72} 72}
73 73
74static void p6_pmu_enable_event(struct perf_event *event) 74static void p6_pmu_enable_event(struct perf_event *event)
@@ -81,7 +81,7 @@ static void p6_pmu_enable_event(struct perf_event *event)
81 if (cpuc->enabled) 81 if (cpuc->enabled)
82 val |= ARCH_PERFMON_EVENTSEL_ENABLE; 82 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
83 83
84 (void)checking_wrmsrl(hwc->config_base + hwc->idx, val); 84 (void)checking_wrmsrl(hwc->config_base, val);
85} 85}
86 86
87static __initconst const struct x86_pmu p6_pmu = { 87static __initconst const struct x86_pmu p6_pmu = {
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index fb329e9f8494..966512b2cacf 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -16,32 +16,12 @@
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/bitops.h> 17#include <linux/bitops.h>
18#include <linux/smp.h> 18#include <linux/smp.h>
19#include <linux/nmi.h> 19#include <asm/nmi.h>
20#include <linux/kprobes.h> 20#include <linux/kprobes.h>
21 21
22#include <asm/apic.h> 22#include <asm/apic.h>
23#include <asm/perf_event.h> 23#include <asm/perf_event.h>
24 24
25struct nmi_watchdog_ctlblk {
26 unsigned int cccr_msr;
27 unsigned int perfctr_msr; /* the MSR to reset in NMI handler */
28 unsigned int evntsel_msr; /* the MSR to select the events to handle */
29};
30
31/* Interface defining a CPU specific perfctr watchdog */
32struct wd_ops {
33 int (*reserve)(void);
34 void (*unreserve)(void);
35 int (*setup)(unsigned nmi_hz);
36 void (*rearm)(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz);
37 void (*stop)(void);
38 unsigned perfctr;
39 unsigned evntsel;
40 u64 checkbit;
41};
42
43static const struct wd_ops *wd_ops;
44
45/* 25/*
46 * this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's 26 * this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
47 * offset from MSR_P4_BSU_ESCR0. 27 * offset from MSR_P4_BSU_ESCR0.
@@ -60,14 +40,14 @@ static const struct wd_ops *wd_ops;
60static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS); 40static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS);
61static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS); 41static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS);
62 42
63static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
64
65/* converts an msr to an appropriate reservation bit */ 43/* converts an msr to an appropriate reservation bit */
66static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) 44static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
67{ 45{
68 /* returns the bit offset of the performance counter register */ 46 /* returns the bit offset of the performance counter register */
69 switch (boot_cpu_data.x86_vendor) { 47 switch (boot_cpu_data.x86_vendor) {
70 case X86_VENDOR_AMD: 48 case X86_VENDOR_AMD:
49 if (msr >= MSR_F15H_PERF_CTR)
50 return (msr - MSR_F15H_PERF_CTR) >> 1;
71 return msr - MSR_K7_PERFCTR0; 51 return msr - MSR_K7_PERFCTR0;
72 case X86_VENDOR_INTEL: 52 case X86_VENDOR_INTEL:
73 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) 53 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
@@ -92,6 +72,8 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
92 /* returns the bit offset of the event selection register */ 72 /* returns the bit offset of the event selection register */
93 switch (boot_cpu_data.x86_vendor) { 73 switch (boot_cpu_data.x86_vendor) {
94 case X86_VENDOR_AMD: 74 case X86_VENDOR_AMD:
75 if (msr >= MSR_F15H_PERF_CTL)
76 return (msr - MSR_F15H_PERF_CTL) >> 1;
95 return msr - MSR_K7_EVNTSEL0; 77 return msr - MSR_K7_EVNTSEL0;
96 case X86_VENDOR_INTEL: 78 case X86_VENDOR_INTEL:
97 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) 79 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
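These two hunks extend the MSR-to-reservation-bit mapping to AMD Family 15h, whose event-select and counter MSRs interleave (CTL0, CTR0, CTL1, CTR1, ...), so the offset from the respective base is halved to get the counter index. A quick sketch of that mapping; the MSR base constants here are assumptions for illustration only:

/*
 * Family-15h mapping sketch: with interleaved CTL/CTR MSRs the
 * reservation-bit index is the offset from the base divided by two.
 */
#include <stdio.h>

#define MSR_F15H_PERF_CTL	0xc0010200u	/* assumed base, demo only */
#define MSR_F15H_PERF_CTR	0xc0010201u	/* assumed base, demo only */

static unsigned int perfctr_msr_to_bit(unsigned int msr)
{
	return (msr - MSR_F15H_PERF_CTR) >> 1;
}

int main(void)
{
	unsigned int msr;

	for (msr = MSR_F15H_PERF_CTR; msr <= MSR_F15H_PERF_CTR + 10; msr += 2)
		printf("MSR 0x%x -> bit %u\n", msr, perfctr_msr_to_bit(msr));
	return 0;
}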
@@ -172,624 +154,3 @@ void release_evntsel_nmi(unsigned int msr)
172 clear_bit(counter, evntsel_nmi_owner); 154 clear_bit(counter, evntsel_nmi_owner);
173} 155}
174EXPORT_SYMBOL(release_evntsel_nmi); 156EXPORT_SYMBOL(release_evntsel_nmi);
175
176void disable_lapic_nmi_watchdog(void)
177{
178 BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
179
180 if (atomic_read(&nmi_active) <= 0)
181 return;
182
183 on_each_cpu(stop_apic_nmi_watchdog, NULL, 1);
184
185 if (wd_ops)
186 wd_ops->unreserve();
187
188 BUG_ON(atomic_read(&nmi_active) != 0);
189}
190
191void enable_lapic_nmi_watchdog(void)
192{
193 BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
194
195 /* are we already enabled */
196 if (atomic_read(&nmi_active) != 0)
197 return;
198
199 /* are we lapic aware */
200 if (!wd_ops)
201 return;
202 if (!wd_ops->reserve()) {
203 printk(KERN_ERR "NMI watchdog: cannot reserve perfctrs\n");
204 return;
205 }
206
207 on_each_cpu(setup_apic_nmi_watchdog, NULL, 1);
208 touch_nmi_watchdog();
209}
210
211/*
212 * Activate the NMI watchdog via the local APIC.
213 */
214
215static unsigned int adjust_for_32bit_ctr(unsigned int hz)
216{
217 u64 counter_val;
218 unsigned int retval = hz;
219
220 /*
221 * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter
222 * are writable, with higher bits sign extending from bit 31.
223 * So, we can only program the counter with 31 bit values and
224 * 32nd bit should be 1, for 33.. to be 1.
225 * Find the appropriate nmi_hz
226 */
227 counter_val = (u64)cpu_khz * 1000;
228 do_div(counter_val, retval);
229 if (counter_val > 0x7fffffffULL) {
230 u64 count = (u64)cpu_khz * 1000;
231 do_div(count, 0x7fffffffUL);
232 retval = count + 1;
233 }
234 return retval;
235}
236
237static void write_watchdog_counter(unsigned int perfctr_msr,
238 const char *descr, unsigned nmi_hz)
239{
240 u64 count = (u64)cpu_khz * 1000;
241
242 do_div(count, nmi_hz);
243 if (descr)
244 pr_debug("setting %s to -0x%08Lx\n", descr, count);
245 wrmsrl(perfctr_msr, 0 - count);
246}
247
248static void write_watchdog_counter32(unsigned int perfctr_msr,
249 const char *descr, unsigned nmi_hz)
250{
251 u64 count = (u64)cpu_khz * 1000;
252
253 do_div(count, nmi_hz);
254 if (descr)
255 pr_debug("setting %s to -0x%08Lx\n", descr, count);
256 wrmsr(perfctr_msr, (u32)(-count), 0);
257}
258
259/*
260 * AMD K7/K8/Family10h/Family11h support.
261 * AMD keeps this interface nicely stable so there is not much variety
262 */
263#define K7_EVNTSEL_ENABLE (1 << 22)
264#define K7_EVNTSEL_INT (1 << 20)
265#define K7_EVNTSEL_OS (1 << 17)
266#define K7_EVNTSEL_USR (1 << 16)
267#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
268#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
269
270static int setup_k7_watchdog(unsigned nmi_hz)
271{
272 unsigned int perfctr_msr, evntsel_msr;
273 unsigned int evntsel;
274 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
275
276 perfctr_msr = wd_ops->perfctr;
277 evntsel_msr = wd_ops->evntsel;
278
279 wrmsrl(perfctr_msr, 0UL);
280
281 evntsel = K7_EVNTSEL_INT
282 | K7_EVNTSEL_OS
283 | K7_EVNTSEL_USR
284 | K7_NMI_EVENT;
285
286 /* setup the timer */
287 wrmsr(evntsel_msr, evntsel, 0);
288 write_watchdog_counter(perfctr_msr, "K7_PERFCTR0", nmi_hz);
289
290 /* initialize the wd struct before enabling */
291 wd->perfctr_msr = perfctr_msr;
292 wd->evntsel_msr = evntsel_msr;
293 wd->cccr_msr = 0; /* unused */
294
295 /* ok, everything is initialized, announce that we're set */
296 cpu_nmi_set_wd_enabled();
297
298 apic_write(APIC_LVTPC, APIC_DM_NMI);
299 evntsel |= K7_EVNTSEL_ENABLE;
300 wrmsr(evntsel_msr, evntsel, 0);
301
302 return 1;
303}
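
A small sketch, not part of the kernel code, that composes the same event-select word as setup_k7_watchdog() so the resulting bit pattern can be inspected; the K7_* values are copied from the definitions above:

#include <stdio.h>

#define K7_EVNTSEL_ENABLE	(1 << 22)
#define K7_EVNTSEL_INT		(1 << 20)
#define K7_EVNTSEL_OS		(1 << 17)
#define K7_EVNTSEL_USR		(1 << 16)
#define K7_NMI_EVENT		0x76	/* cycles-processor-is-running event */

int main(void)
{
	/* Same composition as setup_k7_watchdog(), minus the MSR writes. */
	unsigned int evntsel = K7_EVNTSEL_INT | K7_EVNTSEL_OS |
			       K7_EVNTSEL_USR | K7_NMI_EVENT;

	printf("evntsel before enable: 0x%08x\n", evntsel);
	printf("evntsel after enable:  0x%08x\n", evntsel | K7_EVNTSEL_ENABLE);
	return 0;
}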
304
305static void single_msr_stop_watchdog(void)
306{
307 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
308
309 wrmsr(wd->evntsel_msr, 0, 0);
310}
311
312static int single_msr_reserve(void)
313{
314 if (!reserve_perfctr_nmi(wd_ops->perfctr))
315 return 0;
316
317 if (!reserve_evntsel_nmi(wd_ops->evntsel)) {
318 release_perfctr_nmi(wd_ops->perfctr);
319 return 0;
320 }
321 return 1;
322}
323
324static void single_msr_unreserve(void)
325{
326 release_evntsel_nmi(wd_ops->evntsel);
327 release_perfctr_nmi(wd_ops->perfctr);
328}
329
330static void __kprobes
331single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
332{
333 /* start the cycle over again */
334 write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
335}
336
337static const struct wd_ops k7_wd_ops = {
338 .reserve = single_msr_reserve,
339 .unreserve = single_msr_unreserve,
340 .setup = setup_k7_watchdog,
341 .rearm = single_msr_rearm,
342 .stop = single_msr_stop_watchdog,
343 .perfctr = MSR_K7_PERFCTR0,
344 .evntsel = MSR_K7_EVNTSEL0,
345 .checkbit = 1ULL << 47,
346};
347
348/*
349 * Intel Model 6 (PPro+,P2,P3,P-M,Core1)
350 */
351#define P6_EVNTSEL0_ENABLE (1 << 22)
352#define P6_EVNTSEL_INT (1 << 20)
353#define P6_EVNTSEL_OS (1 << 17)
354#define P6_EVNTSEL_USR (1 << 16)
355#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79
356#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED
357
358static int setup_p6_watchdog(unsigned nmi_hz)
359{
360 unsigned int perfctr_msr, evntsel_msr;
361 unsigned int evntsel;
362 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
363
364 perfctr_msr = wd_ops->perfctr;
365 evntsel_msr = wd_ops->evntsel;
366
367 /* KVM doesn't implement this MSR */
368 if (wrmsr_safe(perfctr_msr, 0, 0) < 0)
369 return 0;
370
371 evntsel = P6_EVNTSEL_INT
372 | P6_EVNTSEL_OS
373 | P6_EVNTSEL_USR
374 | P6_NMI_EVENT;
375
376 /* setup the timer */
377 wrmsr(evntsel_msr, evntsel, 0);
378 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
379 write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0", nmi_hz);
380
381 /* initialize the wd struct before enabling */
382 wd->perfctr_msr = perfctr_msr;
383 wd->evntsel_msr = evntsel_msr;
384 wd->cccr_msr = 0; /* unused */
385
386 /* ok, everything is initialized, announce that we're set */
387 cpu_nmi_set_wd_enabled();
388
389 apic_write(APIC_LVTPC, APIC_DM_NMI);
390 evntsel |= P6_EVNTSEL0_ENABLE;
391 wrmsr(evntsel_msr, evntsel, 0);
392
393 return 1;
394}
395
396static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
397{
398 /*
399 * P6-based Pentium M needs to re-unmask
400 * the APIC vector, but doing so doesn't hurt
401 * other P6 variants.
402 * ArchPerfmon/Core Duo also needs this
403 */
404 apic_write(APIC_LVTPC, APIC_DM_NMI);
405
406 /* P6/ARCH_PERFMON has 32 bit counter write */
407 write_watchdog_counter32(wd->perfctr_msr, NULL, nmi_hz);
408}
409
410static const struct wd_ops p6_wd_ops = {
411 .reserve = single_msr_reserve,
412 .unreserve = single_msr_unreserve,
413 .setup = setup_p6_watchdog,
414 .rearm = p6_rearm,
415 .stop = single_msr_stop_watchdog,
416 .perfctr = MSR_P6_PERFCTR0,
417 .evntsel = MSR_P6_EVNTSEL0,
418 .checkbit = 1ULL << 39,
419};
420
421/*
422 * Intel P4 performance counters.
423 * By far the most complicated of all.
424 */
425#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1 << 7)
426#define P4_ESCR_EVENT_SELECT(N) ((N) << 25)
427#define P4_ESCR_OS (1 << 3)
428#define P4_ESCR_USR (1 << 2)
429#define P4_CCCR_OVF_PMI0 (1 << 26)
430#define P4_CCCR_OVF_PMI1 (1 << 27)
431#define P4_CCCR_THRESHOLD(N) ((N) << 20)
432#define P4_CCCR_COMPLEMENT (1 << 19)
433#define P4_CCCR_COMPARE (1 << 18)
434#define P4_CCCR_REQUIRED (3 << 16)
435#define P4_CCCR_ESCR_SELECT(N) ((N) << 13)
436#define P4_CCCR_ENABLE (1 << 12)
437#define P4_CCCR_OVF (1 << 31)
438
439#define P4_CONTROLS 18
440static unsigned int p4_controls[18] = {
441 MSR_P4_BPU_CCCR0,
442 MSR_P4_BPU_CCCR1,
443 MSR_P4_BPU_CCCR2,
444 MSR_P4_BPU_CCCR3,
445 MSR_P4_MS_CCCR0,
446 MSR_P4_MS_CCCR1,
447 MSR_P4_MS_CCCR2,
448 MSR_P4_MS_CCCR3,
449 MSR_P4_FLAME_CCCR0,
450 MSR_P4_FLAME_CCCR1,
451 MSR_P4_FLAME_CCCR2,
452 MSR_P4_FLAME_CCCR3,
453 MSR_P4_IQ_CCCR0,
454 MSR_P4_IQ_CCCR1,
455 MSR_P4_IQ_CCCR2,
456 MSR_P4_IQ_CCCR3,
457 MSR_P4_IQ_CCCR4,
458 MSR_P4_IQ_CCCR5,
459};
460/*
461 * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
462 * CRU_ESCR0 (with any non-null event selector) through a complemented
463 * max threshold. [IA32-Vol3, Section 14.9.9]
464 */
465static int setup_p4_watchdog(unsigned nmi_hz)
466{
467 unsigned int perfctr_msr, evntsel_msr, cccr_msr;
468 unsigned int evntsel, cccr_val;
469 unsigned int misc_enable, dummy;
470 unsigned int ht_num;
471 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
472
473 rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
474 if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
475 return 0;
476
477#ifdef CONFIG_SMP
478 /* detect which hyperthread we are on */
479 if (smp_num_siblings == 2) {
480 unsigned int ebx, apicid;
481
482 ebx = cpuid_ebx(1);
483 apicid = (ebx >> 24) & 0xff;
484 ht_num = apicid & 1;
485 } else
486#endif
487 ht_num = 0;
488
489 /*
490 * performance counters are shared resources;
491 * assign each hyperthread its own set
492 * (re-use the ESCR0 register, seems safe
493 * and keeps the cccr_val the same)
494 */
495 if (!ht_num) {
496 /* logical cpu 0 */
497 perfctr_msr = MSR_P4_IQ_PERFCTR0;
498 evntsel_msr = MSR_P4_CRU_ESCR0;
499 cccr_msr = MSR_P4_IQ_CCCR0;
500 cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
501
502 /*
503 * If we're on the kdump kernel (or in a similar situation), we may
504 * still have other performance counter registers set to
505 * interrupt and they'll keep interrupting forever because
506 * of the P4_CCCR_OVF quirk. So we need to ACK all the
507 * pending interrupts and disable all the registers here,
508 * before reenabling the NMI delivery. Refer to p4_rearm()
509 * about the P4_CCCR_OVF quirk.
510 */
511 if (reset_devices) {
512 unsigned int low, high;
513 int i;
514
515 for (i = 0; i < P4_CONTROLS; i++) {
516 rdmsr(p4_controls[i], low, high);
517 low &= ~(P4_CCCR_ENABLE | P4_CCCR_OVF);
518 wrmsr(p4_controls[i], low, high);
519 }
520 }
521 } else {
522 /* logical cpu 1 */
523 perfctr_msr = MSR_P4_IQ_PERFCTR1;
524 evntsel_msr = MSR_P4_CRU_ESCR0;
525 cccr_msr = MSR_P4_IQ_CCCR1;
526
527 /* Pentium 4 D processors don't support P4_CCCR_OVF_PMI1 */
528 if (boot_cpu_data.x86_model == 4 && boot_cpu_data.x86_mask == 4)
529 cccr_val = P4_CCCR_OVF_PMI0;
530 else
531 cccr_val = P4_CCCR_OVF_PMI1;
532 cccr_val |= P4_CCCR_ESCR_SELECT(4);
533 }
534
535 evntsel = P4_ESCR_EVENT_SELECT(0x3F)
536 | P4_ESCR_OS
537 | P4_ESCR_USR;
538
539 cccr_val |= P4_CCCR_THRESHOLD(15)
540 | P4_CCCR_COMPLEMENT
541 | P4_CCCR_COMPARE
542 | P4_CCCR_REQUIRED;
543
544 wrmsr(evntsel_msr, evntsel, 0);
545 wrmsr(cccr_msr, cccr_val, 0);
546 write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz);
547
548 wd->perfctr_msr = perfctr_msr;
549 wd->evntsel_msr = evntsel_msr;
550 wd->cccr_msr = cccr_msr;
551
552 /* ok, everything is initialized, announce that we're set */
553 cpu_nmi_set_wd_enabled();
554
555 apic_write(APIC_LVTPC, APIC_DM_NMI);
556 cccr_val |= P4_CCCR_ENABLE;
557 wrmsr(cccr_msr, cccr_val, 0);
558 return 1;
559}
560
561static void stop_p4_watchdog(void)
562{
563 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
564 wrmsr(wd->cccr_msr, 0, 0);
565 wrmsr(wd->evntsel_msr, 0, 0);
566}
567
568static int p4_reserve(void)
569{
570 if (!reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR0))
571 return 0;
572#ifdef CONFIG_SMP
573 if (smp_num_siblings > 1 && !reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR1))
574 goto fail1;
575#endif
576 if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0))
577 goto fail2;
578 /* RED-PEN why is ESCR1 not reserved here? */
579 return 1;
580 fail2:
581#ifdef CONFIG_SMP
582 if (smp_num_siblings > 1)
583 release_perfctr_nmi(MSR_P4_IQ_PERFCTR1);
584 fail1:
585#endif
586 release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
587 return 0;
588}
589
590static void p4_unreserve(void)
591{
592#ifdef CONFIG_SMP
593 if (smp_num_siblings > 1)
594 release_perfctr_nmi(MSR_P4_IQ_PERFCTR1);
595#endif
596 release_evntsel_nmi(MSR_P4_CRU_ESCR0);
597 release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
598}
599
600static void __kprobes p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
601{
602 unsigned dummy;
603 /*
604 * P4 quirks:
605 * - An overflown perfctr will assert its interrupt
606 * until the OVF flag in its CCCR is cleared.
607 * - LVTPC is masked on interrupt and must be
608 * unmasked by the LVTPC handler.
609 */
610 rdmsrl(wd->cccr_msr, dummy);
611 dummy &= ~P4_CCCR_OVF;
612 wrmsrl(wd->cccr_msr, dummy);
613 apic_write(APIC_LVTPC, APIC_DM_NMI);
614 /* start the cycle over again */
615 write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
616}
617
618static const struct wd_ops p4_wd_ops = {
619 .reserve = p4_reserve,
620 .unreserve = p4_unreserve,
621 .setup = setup_p4_watchdog,
622 .rearm = p4_rearm,
623 .stop = stop_p4_watchdog,
624 /* RED-PEN this is wrong for the other sibling */
625 .perfctr = MSR_P4_BPU_PERFCTR0,
626 .evntsel = MSR_P4_BSU_ESCR0,
627 .checkbit = 1ULL << 39,
628};
629
630/*
631 * Watchdog using the Intel architected PerfMon.
632 * Used for Core2 and hopefully all future Intel CPUs.
633 */
634#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
635#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
636
637static struct wd_ops intel_arch_wd_ops;
638
639static int setup_intel_arch_watchdog(unsigned nmi_hz)
640{
641 unsigned int ebx;
642 union cpuid10_eax eax;
643 unsigned int unused;
644 unsigned int perfctr_msr, evntsel_msr;
645 unsigned int evntsel;
646 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
647
648 /*
649 * Check whether the Architectural PerfMon supports
650 * Unhalted Core Cycles Event or not.
651 * NOTE: Corresponding bit = 0 in ebx indicates event present.
652 */
653 cpuid(10, &(eax.full), &ebx, &unused, &unused);
654 if ((eax.split.mask_length <
655 (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
656 (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
657 return 0;
658
659 perfctr_msr = wd_ops->perfctr;
660 evntsel_msr = wd_ops->evntsel;
661
662 wrmsrl(perfctr_msr, 0UL);
663
664 evntsel = ARCH_PERFMON_EVENTSEL_INT
665 | ARCH_PERFMON_EVENTSEL_OS
666 | ARCH_PERFMON_EVENTSEL_USR
667 | ARCH_PERFMON_NMI_EVENT_SEL
668 | ARCH_PERFMON_NMI_EVENT_UMASK;
669
670 /* setup the timer */
671 wrmsr(evntsel_msr, evntsel, 0);
672 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
673 write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz);
674
675 wd->perfctr_msr = perfctr_msr;
676 wd->evntsel_msr = evntsel_msr;
677 wd->cccr_msr = 0; /* unused */
678
679 /* ok, everything is initialized, announce that we're set */
680 cpu_nmi_set_wd_enabled();
681
682 apic_write(APIC_LVTPC, APIC_DM_NMI);
683 evntsel |= ARCH_PERFMON_EVENTSEL_ENABLE;
684 wrmsr(evntsel_msr, evntsel, 0);
685 intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
686 return 1;
687}
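
The CPUID leaf 0xa probe above can be sketched from user space as well; this assumes GCC/Clang's <cpuid.h> helper and only mirrors the availability check (a clear bit 0 in EBX means the unhalted-core-cycles event is present):

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(0xa, &eax, &ebx, &ecx, &edx))
		return 1;	/* leaf 0xa not supported */

	unsigned int version     = eax & 0xff;		/* arch perfmon version */
	unsigned int mask_length = (eax >> 24) & 0xff;	/* valid bits in ebx */

	/* A 0 bit in EBX means the corresponding event IS available. */
	int cycles_event = (mask_length >= 1) && !(ebx & 1);

	printf("perfmon v%u, unhalted-core-cycles event %savailable\n",
	       version, cycles_event ? "" : "not ");
	return 0;
}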
688
689static struct wd_ops intel_arch_wd_ops __read_mostly = {
690 .reserve = single_msr_reserve,
691 .unreserve = single_msr_unreserve,
692 .setup = setup_intel_arch_watchdog,
693 .rearm = p6_rearm,
694 .stop = single_msr_stop_watchdog,
695 .perfctr = MSR_ARCH_PERFMON_PERFCTR1,
696 .evntsel = MSR_ARCH_PERFMON_EVENTSEL1,
697};
698
699static void probe_nmi_watchdog(void)
700{
701 switch (boot_cpu_data.x86_vendor) {
702 case X86_VENDOR_AMD:
703 if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 &&
704 boot_cpu_data.x86 != 16 && boot_cpu_data.x86 != 17)
705 return;
706 wd_ops = &k7_wd_ops;
707 break;
708 case X86_VENDOR_INTEL:
709 /* Work around CPUs where perfctr1 doesn't have a working enable
710 * bit, as described in the following errata:
711 * AE49 Core Duo and Intel Core Solo 65 nm
712 * AN49 Intel Pentium Dual-Core
713 * AF49 Dual-Core Intel Xeon Processor LV
714 */
715 if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) ||
716 ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 15 &&
717 boot_cpu_data.x86_mask == 4))) {
718 intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0;
719 intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0;
720 }
721 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
722 wd_ops = &intel_arch_wd_ops;
723 break;
724 }
725 switch (boot_cpu_data.x86) {
726 case 6:
727 if (boot_cpu_data.x86_model > 13)
728 return;
729
730 wd_ops = &p6_wd_ops;
731 break;
732 case 15:
733 wd_ops = &p4_wd_ops;
734 break;
735 default:
736 return;
737 }
738 break;
739 }
740}
741
742/* Interface to nmi.c */
743
744int lapic_watchdog_init(unsigned nmi_hz)
745{
746 if (!wd_ops) {
747 probe_nmi_watchdog();
748 if (!wd_ops) {
749 printk(KERN_INFO "NMI watchdog: CPU not supported\n");
750 return -1;
751 }
752
753 if (!wd_ops->reserve()) {
754 printk(KERN_ERR
755 "NMI watchdog: cannot reserve perfctrs\n");
756 return -1;
757 }
758 }
759
760 if (!(wd_ops->setup(nmi_hz))) {
761 printk(KERN_ERR "Cannot setup NMI watchdog on CPU %d\n",
762 raw_smp_processor_id());
763 return -1;
764 }
765
766 return 0;
767}
768
769void lapic_watchdog_stop(void)
770{
771 if (wd_ops)
772 wd_ops->stop();
773}
774
775unsigned lapic_adjust_nmi_hz(unsigned hz)
776{
777 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
778 if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
779 wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1)
780 hz = adjust_for_32bit_ctr(hz);
781 return hz;
782}
783
784int __kprobes lapic_wd_event(unsigned nmi_hz)
785{
786 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
787 u64 ctr;
788
789 rdmsrl(wd->perfctr_msr, ctr);
790 if (ctr & wd_ops->checkbit) /* perfctr still running? */
791 return 0;
792
793 wd_ops->rearm(wd, nmi_hz);
794 return 1;
795}
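
A rough sketch, with made-up numbers, of why the checkbit test above works: the counter is armed with -period, so its top implemented bit stays set while it counts up toward zero and clears once it overflows and raises the NMI.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint64_t checkbit = 1ULL << 47;	/* K7 counter width, per k7_wd_ops */
	uint64_t period = 3000000000ULL;	/* e.g. one second at an assumed 3 GHz */

	/* The watchdog writes -period; the counter counts up toward 0. */
	uint64_t ctr = (uint64_t)(-(int64_t)period) & ((1ULL << 48) - 1);

	printf("just armed:   checkbit %s\n",
	       (ctr & checkbit) ? "set (still running)" : "clear (overflowed)");

	ctr = (ctr + period) & ((1ULL << 48) - 1);	/* simulate the overflow */
	printf("after period: checkbit %s\n",
	       (ctr & checkbit) ? "set (still running)" : "clear (overflowed)");
	return 0;
}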
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index d49079515122..c7f64e6f537a 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -44,6 +44,12 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
44 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, 44 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 },
45 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, 45 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 },
46 { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 }, 46 { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 },
47 { X86_FEATURE_TSCRATEMSR, CR_EDX, 4, 0x8000000a, 0 },
48 { X86_FEATURE_VMCBCLEAN, CR_EDX, 5, 0x8000000a, 0 },
49 { X86_FEATURE_FLUSHBYASID, CR_EDX, 6, 0x8000000a, 0 },
50 { X86_FEATURE_DECODEASSISTS, CR_EDX, 7, 0x8000000a, 0 },
51 { X86_FEATURE_PAUSEFILTER, CR_EDX,10, 0x8000000a, 0 },
52 { X86_FEATURE_PFTHRESHOLD, CR_EDX,12, 0x8000000a, 0 },
47 { 0, 0, 0, 0, 0 } 53 { 0, 0, 0, 0, 0 }
48 }; 54 };
49 55
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 227b0448960d..d22d0c4edcfd 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -86,7 +86,7 @@ static void __init vmware_platform_setup(void)
86} 86}
87 87
88/* 88/*
89 * While checking the dmi string infomation, just checking the product 89 * While checking the dmi string information, just checking the product
90 * serial key should be enough, as this will always have a VMware 90 * serial key should be enough, as this will always have a VMware
91 * specific string when running under VMware hypervisor. 91 * specific string when running under VMware hypervisor.
92 */ 92 */
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 1b7b31ab7d86..212a6a42527c 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -33,7 +33,6 @@
33#include <linux/init.h> 33#include <linux/init.h>
34#include <linux/poll.h> 34#include <linux/poll.h>
35#include <linux/smp.h> 35#include <linux/smp.h>
36#include <linux/smp_lock.h>
37#include <linux/major.h> 36#include <linux/major.h>
38#include <linux/fs.h> 37#include <linux/fs.h>
39#include <linux/device.h> 38#include <linux/device.h>
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index 67414550c3cc..642f75a68cd5 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -14,9 +14,6 @@
14 14
15static void *kdump_buf_page; 15static void *kdump_buf_page;
16 16
17/* Stores the physical address of elf header of crash image. */
18unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
19
20static inline bool is_crashed_pfn_valid(unsigned long pfn) 17static inline bool is_crashed_pfn_valid(unsigned long pfn)
21{ 18{
22#ifndef CONFIG_X86_PAE 19#ifndef CONFIG_X86_PAE
@@ -61,7 +58,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
61 if (!is_crashed_pfn_valid(pfn)) 58 if (!is_crashed_pfn_valid(pfn))
62 return -EFAULT; 59 return -EFAULT;
63 60
64 vaddr = kmap_atomic_pfn(pfn, KM_PTE0); 61 vaddr = kmap_atomic_pfn(pfn);
65 62
66 if (!userbuf) { 63 if (!userbuf) {
67 memcpy(buf, (vaddr + offset), csize); 64 memcpy(buf, (vaddr + offset), csize);
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 045b36cada65..afa64adb75ee 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -10,9 +10,6 @@
10#include <linux/uaccess.h> 10#include <linux/uaccess.h>
11#include <linux/io.h> 11#include <linux/io.h>
12 12
13/* Stores the physical address of elf header of crash image. */
14unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
15
16/** 13/**
17 * copy_oldmem_page - copy one page from "oldmem" 14 * copy_oldmem_page - copy one page from "oldmem"
18 * @pfn: page frame number to be copied 15 * @pfn: page frame number to be copied
@@ -34,7 +31,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
34 if (!csize) 31 if (!csize)
35 return 0; 32 return 0;
36 33
37 vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE); 34 vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
38 if (!vaddr) 35 if (!vaddr)
39 return -ENOMEM; 36 return -ENOMEM;
40 37
@@ -46,6 +43,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
46 } else 43 } else
47 memcpy(buf, vaddr + offset, csize); 44 memcpy(buf, vaddr + offset, csize);
48 45
46 set_iounmap_nonlazy();
49 iounmap(vaddr); 47 iounmap(vaddr);
50 return csize; 48 return csize;
51} 49}
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
new file mode 100644
index 000000000000..9aeb78a23de4
--- /dev/null
+++ b/arch/x86/kernel/devicetree.c
@@ -0,0 +1,452 @@
1/*
2 * Architecture specific OF callbacks.
3 */
4#include <linux/bootmem.h>
5#include <linux/io.h>
6#include <linux/interrupt.h>
7#include <linux/list.h>
8#include <linux/of.h>
9#include <linux/of_fdt.h>
10#include <linux/of_address.h>
11#include <linux/of_platform.h>
12#include <linux/of_irq.h>
13#include <linux/slab.h>
14#include <linux/pci.h>
15#include <linux/of_pci.h>
16#include <linux/initrd.h>
17
18#include <asm/hpet.h>
19#include <asm/irq_controller.h>
20#include <asm/apic.h>
21#include <asm/pci_x86.h>
22
23__initdata u64 initial_dtb;
24char __initdata cmd_line[COMMAND_LINE_SIZE];
25static LIST_HEAD(irq_domains);
26static DEFINE_RAW_SPINLOCK(big_irq_lock);
27
28int __initdata of_ioapic;
29
30#ifdef CONFIG_X86_IO_APIC
31static void add_interrupt_host(struct irq_domain *ih)
32{
33 unsigned long flags;
34
35 raw_spin_lock_irqsave(&big_irq_lock, flags);
36 list_add(&ih->l, &irq_domains);
37 raw_spin_unlock_irqrestore(&big_irq_lock, flags);
38}
39#endif
40
41static struct irq_domain *get_ih_from_node(struct device_node *controller)
42{
43 struct irq_domain *ih, *found = NULL;
44 unsigned long flags;
45
46 raw_spin_lock_irqsave(&big_irq_lock, flags);
47 list_for_each_entry(ih, &irq_domains, l) {
48 if (ih->controller == controller) {
49 found = ih;
50 break;
51 }
52 }
53 raw_spin_unlock_irqrestore(&big_irq_lock, flags);
54 return found;
55}
56
57unsigned int irq_create_of_mapping(struct device_node *controller,
58 const u32 *intspec, unsigned int intsize)
59{
60 struct irq_domain *ih;
61 u32 virq, type;
62 int ret;
63
64 ih = get_ih_from_node(controller);
65 if (!ih)
66 return 0;
67 ret = ih->xlate(ih, intspec, intsize, &virq, &type);
68 if (ret)
69 return 0;
70 if (type == IRQ_TYPE_NONE)
71 return virq;
72 irq_set_irq_type(virq, type);
73 return virq;
74}
75EXPORT_SYMBOL_GPL(irq_create_of_mapping);
76
77unsigned long pci_address_to_pio(phys_addr_t address)
78{
79 /*
80 * The ioport address can be directly used by inX / outX
81 */
82 BUG_ON(address >= (1 << 16));
83 return (unsigned long)address;
84}
85EXPORT_SYMBOL_GPL(pci_address_to_pio);
86
87void __init early_init_dt_scan_chosen_arch(unsigned long node)
88{
89 BUG();
90}
91
92void __init early_init_dt_add_memory_arch(u64 base, u64 size)
93{
94 BUG();
95}
96
97void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align)
98{
99 return __alloc_bootmem(size, align, __pa(MAX_DMA_ADDRESS));
100}
101
102#ifdef CONFIG_BLK_DEV_INITRD
103void __init early_init_dt_setup_initrd_arch(unsigned long start,
104 unsigned long end)
105{
106 initrd_start = (unsigned long)__va(start);
107 initrd_end = (unsigned long)__va(end);
108 initrd_below_start_ok = 1;
109}
110#endif
111
112void __init add_dtb(u64 data)
113{
114 initial_dtb = data + offsetof(struct setup_data, data);
115}
116
117/*
118 * CE4100 ids. Will be moved to machine_device_initcall() once we have it.
119 */
120static struct of_device_id __initdata ce4100_ids[] = {
121 { .compatible = "intel,ce4100-cp", },
122 { .compatible = "isa", },
123 { .compatible = "pci", },
124 {},
125};
126
127static int __init add_bus_probe(void)
128{
129 if (!of_have_populated_dt())
130 return 0;
131
132 return of_platform_bus_probe(NULL, ce4100_ids, NULL);
133}
134module_init(add_bus_probe);
135
136#ifdef CONFIG_PCI
137static int x86_of_pci_irq_enable(struct pci_dev *dev)
138{
139 struct of_irq oirq;
140 u32 virq;
141 int ret;
142 u8 pin;
143
144 ret = pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
145 if (ret)
146 return ret;
147 if (!pin)
148 return 0;
149
150 ret = of_irq_map_pci(dev, &oirq);
151 if (ret)
152 return ret;
153
154 virq = irq_create_of_mapping(oirq.controller, oirq.specifier,
155 oirq.size);
156 if (virq == 0)
157 return -EINVAL;
158 dev->irq = virq;
159 return 0;
160}
161
162static void x86_of_pci_irq_disable(struct pci_dev *dev)
163{
164}
165
166void __cpuinit x86_of_pci_init(void)
167{
168 struct device_node *np;
169
170 pcibios_enable_irq = x86_of_pci_irq_enable;
171 pcibios_disable_irq = x86_of_pci_irq_disable;
172
173 for_each_node_by_type(np, "pci") {
174 const void *prop;
175 struct pci_bus *bus;
176 unsigned int bus_min;
177 struct device_node *child;
178
179 prop = of_get_property(np, "bus-range", NULL);
180 if (!prop)
181 continue;
182 bus_min = be32_to_cpup(prop);
183
184 bus = pci_find_bus(0, bus_min);
185 if (!bus) {
186 printk(KERN_ERR "Can't find a node for bus %s.\n",
187 np->full_name);
188 continue;
189 }
190
191 if (bus->self)
192 bus->self->dev.of_node = np;
193 else
194 bus->dev.of_node = np;
195
196 for_each_child_of_node(np, child) {
197 struct pci_dev *dev;
198 u32 devfn;
199
200 prop = of_get_property(child, "reg", NULL);
201 if (!prop)
202 continue;
203
204 devfn = (be32_to_cpup(prop) >> 8) & 0xff;
205 dev = pci_get_slot(bus, devfn);
206 if (!dev)
207 continue;
208 dev->dev.of_node = child;
209 pci_dev_put(dev);
210 }
211 }
212}
213#endif
214
215static void __init dtb_setup_hpet(void)
216{
217#ifdef CONFIG_HPET_TIMER
218 struct device_node *dn;
219 struct resource r;
220 int ret;
221
222 dn = of_find_compatible_node(NULL, NULL, "intel,ce4100-hpet");
223 if (!dn)
224 return;
225 ret = of_address_to_resource(dn, 0, &r);
226 if (ret) {
227 WARN_ON(1);
228 return;
229 }
230 hpet_address = r.start;
231#endif
232}
233
234static void __init dtb_lapic_setup(void)
235{
236#ifdef CONFIG_X86_LOCAL_APIC
237 struct device_node *dn;
238 struct resource r;
239 int ret;
240
241 dn = of_find_compatible_node(NULL, NULL, "intel,ce4100-lapic");
242 if (!dn)
243 return;
244
245 ret = of_address_to_resource(dn, 0, &r);
246 if (WARN_ON(ret))
247 return;
248
249 /* Did the boot loader setup the local APIC ? */
250 if (!cpu_has_apic) {
251 if (apic_force_enable(r.start))
252 return;
253 }
254 smp_found_config = 1;
255 pic_mode = 1;
256 register_lapic_address(r.start);
257 generic_processor_info(boot_cpu_physical_apicid,
258 GET_APIC_VERSION(apic_read(APIC_LVR)));
259#endif
260}
261
262#ifdef CONFIG_X86_IO_APIC
263static unsigned int ioapic_id;
264
265static void __init dtb_add_ioapic(struct device_node *dn)
266{
267 struct resource r;
268 int ret;
269
270 ret = of_address_to_resource(dn, 0, &r);
271 if (ret) {
272 printk(KERN_ERR "Can't obtain address from node %s.\n",
273 dn->full_name);
274 return;
275 }
276 mp_register_ioapic(++ioapic_id, r.start, gsi_top);
277}
278
279static void __init dtb_ioapic_setup(void)
280{
281 struct device_node *dn;
282
283 for_each_compatible_node(dn, NULL, "intel,ce4100-ioapic")
284 dtb_add_ioapic(dn);
285
286 if (nr_ioapics) {
287 of_ioapic = 1;
288 return;
289 }
290 printk(KERN_ERR "Error: No information about IO-APIC in OF.\n");
291}
292#else
293static void __init dtb_ioapic_setup(void) {}
294#endif
295
296static void __init dtb_apic_setup(void)
297{
298 dtb_lapic_setup();
299 dtb_ioapic_setup();
300}
301
302#ifdef CONFIG_OF_FLATTREE
303static void __init x86_flattree_get_config(void)
304{
305 u32 size, map_len;
306 void *new_dtb;
307
308 if (!initial_dtb)
309 return;
310
311 map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK),
312 (u64)sizeof(struct boot_param_header));
313
314 initial_boot_params = early_memremap(initial_dtb, map_len);
315 size = be32_to_cpu(initial_boot_params->totalsize);
316 if (map_len < size) {
317 early_iounmap(initial_boot_params, map_len);
318 initial_boot_params = early_memremap(initial_dtb, size);
319 map_len = size;
320 }
321
322 new_dtb = alloc_bootmem(size);
323 memcpy(new_dtb, initial_boot_params, size);
324 early_iounmap(initial_boot_params, map_len);
325
326 initial_boot_params = new_dtb;
327
328 /* root level address cells */
329 of_scan_flat_dt(early_init_dt_scan_root, NULL);
330
331 unflatten_device_tree();
332}
333#else
334static inline void x86_flattree_get_config(void) { }
335#endif
336
337void __init x86_dtb_init(void)
338{
339 x86_flattree_get_config();
340
341 if (!of_have_populated_dt())
342 return;
343
344 dtb_setup_hpet();
345 dtb_apic_setup();
346}
347
348#ifdef CONFIG_X86_IO_APIC
349
350struct of_ioapic_type {
351 u32 out_type;
352 u32 trigger;
353 u32 polarity;
354};
355
356static struct of_ioapic_type of_ioapic_type[] =
357{
358 {
359 .out_type = IRQ_TYPE_EDGE_RISING,
360 .trigger = IOAPIC_EDGE,
361 .polarity = 1,
362 },
363 {
364 .out_type = IRQ_TYPE_LEVEL_LOW,
365 .trigger = IOAPIC_LEVEL,
366 .polarity = 0,
367 },
368 {
369 .out_type = IRQ_TYPE_LEVEL_HIGH,
370 .trigger = IOAPIC_LEVEL,
371 .polarity = 1,
372 },
373 {
374 .out_type = IRQ_TYPE_EDGE_FALLING,
375 .trigger = IOAPIC_EDGE,
376 .polarity = 0,
377 },
378};
379
380static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize,
381 u32 *out_hwirq, u32 *out_type)
382{
383 struct mp_ioapic_gsi *gsi_cfg;
384 struct io_apic_irq_attr attr;
385 struct of_ioapic_type *it;
386 u32 line, idx, type;
387
388 if (intsize < 2)
389 return -EINVAL;
390
391 line = *intspec;
392 idx = (u32) id->priv;
393 gsi_cfg = mp_ioapic_gsi_routing(idx);
394 *out_hwirq = line + gsi_cfg->gsi_base;
395
396 intspec++;
397 type = *intspec;
398
399 if (type >= ARRAY_SIZE(of_ioapic_type))
400 return -EINVAL;
401
402 it = of_ioapic_type + type;
403 *out_type = it->out_type;
404
405 set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity);
406
407 return io_apic_setup_irq_pin_once(*out_hwirq, cpu_to_node(0), &attr);
408}
409
410static void __init ioapic_add_ofnode(struct device_node *np)
411{
412 struct resource r;
413 int i, ret;
414
415 ret = of_address_to_resource(np, 0, &r);
416 if (ret) {
417 printk(KERN_ERR "Failed to obtain address for %s\n",
418 np->full_name);
419 return;
420 }
421
422 for (i = 0; i < nr_ioapics; i++) {
423 if (r.start == mpc_ioapic_addr(i)) {
424 struct irq_domain *id;
425
426 id = kzalloc(sizeof(*id), GFP_KERNEL);
427 BUG_ON(!id);
428 id->controller = np;
429 id->xlate = ioapic_xlate;
430 id->priv = (void *)i;
431 add_interrupt_host(id);
432 return;
433 }
434 }
435 printk(KERN_ERR "IOxAPIC at %s is not registered.\n", np->full_name);
436}
437
438void __init x86_add_irq_domains(void)
439{
440 struct device_node *dp;
441
442 if (!of_have_populated_dt())
443 return;
444
445 for_each_node_with_property(dp, "interrupt-controller") {
446 if (of_device_is_compatible(dp, "intel,ce4100-ioapic"))
447 ioapic_add_ofnode(dp);
448 }
449}
450#else
451void __init x86_add_irq_domains(void) { }
452#endif
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 6e8752c1bd52..1aae78f775fc 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -27,7 +27,7 @@ static int die_counter;
27 27
28void printk_address(unsigned long address, int reliable) 28void printk_address(unsigned long address, int reliable)
29{ 29{
30 printk(" [<%p>] %s%pS\n", (void *) address, 30 printk(" [<%p>] %s%pB\n", (void *) address,
31 reliable ? "" : "? ", (void *) address); 31 reliable ? "" : "? ", (void *) address);
32} 32}
33 33
@@ -135,20 +135,6 @@ print_context_stack_bp(struct thread_info *tinfo,
135} 135}
136EXPORT_SYMBOL_GPL(print_context_stack_bp); 136EXPORT_SYMBOL_GPL(print_context_stack_bp);
137 137
138
139static void
140print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
141{
142 printk(data);
143 print_symbol(msg, symbol);
144 printk("\n");
145}
146
147static void print_trace_warning(void *data, char *msg)
148{
149 printk("%s%s\n", (char *)data, msg);
150}
151
152static int print_trace_stack(void *data, char *name) 138static int print_trace_stack(void *data, char *name)
153{ 139{
154 printk("%s <%s> ", (char *)data, name); 140 printk("%s <%s> ", (char *)data, name);
@@ -166,8 +152,6 @@ static void print_trace_address(void *data, unsigned long addr, int reliable)
166} 152}
167 153
168static const struct stacktrace_ops print_trace_ops = { 154static const struct stacktrace_ops print_trace_ops = {
169 .warning = print_trace_warning,
170 .warning_symbol = print_trace_warning_symbol,
171 .stack = print_trace_stack, 155 .stack = print_trace_stack,
172 .address = print_trace_address, 156 .address = print_trace_address,
173 .walk_stack = print_context_stack, 157 .walk_stack = print_context_stack,
@@ -197,14 +181,10 @@ void show_stack(struct task_struct *task, unsigned long *sp)
197 */ 181 */
198void dump_stack(void) 182void dump_stack(void)
199{ 183{
200 unsigned long bp = 0; 184 unsigned long bp;
201 unsigned long stack; 185 unsigned long stack;
202 186
203#ifdef CONFIG_FRAME_POINTER 187 bp = stack_frame(current, NULL);
204 if (!bp)
205 get_bp(bp);
206#endif
207
208 printk("Pid: %d, comm: %.20s %s %s %.*s\n", 188 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
209 current->pid, current->comm, print_tainted(), 189 current->pid, current->comm, print_tainted(),
210 init_utsname()->release, 190 init_utsname()->release,
@@ -240,6 +220,7 @@ unsigned __kprobes long oops_begin(void)
240 bust_spinlocks(1); 220 bust_spinlocks(1);
241 return flags; 221 return flags;
242} 222}
223EXPORT_SYMBOL_GPL(oops_begin);
243 224
244void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) 225void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
245{ 226{
@@ -282,7 +263,6 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
282 printk("DEBUG_PAGEALLOC"); 263 printk("DEBUG_PAGEALLOC");
283#endif 264#endif
284 printk("\n"); 265 printk("\n");
285 sysfs_printk_last_file();
286 if (notify_die(DIE_OOPS, str, regs, err, 266 if (notify_die(DIE_OOPS, str, regs, err,
287 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) 267 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
288 return 1; 268 return 1;
@@ -325,41 +305,6 @@ void die(const char *str, struct pt_regs *regs, long err)
325 oops_end(flags, regs, sig); 305 oops_end(flags, regs, sig);
326} 306}
327 307
328void notrace __kprobes
329die_nmi(char *str, struct pt_regs *regs, int do_panic)
330{
331 unsigned long flags;
332
333 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
334 return;
335
336 /*
337 * We are in trouble anyway, lets at least try
338 * to get a message out.
339 */
340 flags = oops_begin();
341 printk(KERN_EMERG "%s", str);
342 printk(" on CPU%d, ip %08lx, registers:\n",
343 smp_processor_id(), regs->ip);
344 show_registers(regs);
345 oops_end(flags, regs, 0);
346 if (do_panic || panic_on_oops)
347 panic("Non maskable interrupt");
348 nmi_exit();
349 local_irq_enable();
350 do_exit(SIGBUS);
351}
352
353static int __init oops_setup(char *s)
354{
355 if (!s)
356 return -EINVAL;
357 if (!strcmp(s, "panic"))
358 panic_on_oops = 1;
359 return 0;
360}
361early_param("oops", oops_setup);
362
363static int __init kstack_setup(char *s) 308static int __init kstack_setup(char *s)
364{ 309{
365 if (!s) 310 if (!s)
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 0f6376ffa2d9..3b97a80ce329 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -34,17 +34,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
34 stack = (unsigned long *)task->thread.sp; 34 stack = (unsigned long *)task->thread.sp;
35 } 35 }
36 36
37#ifdef CONFIG_FRAME_POINTER 37 if (!bp)
38 if (!bp) { 38 bp = stack_frame(task, regs);
39 if (task == current) {
40 /* Grab bp right from our regs */
41 get_bp(bp);
42 } else {
43 /* bp is the last reg pushed by switch_to */
44 bp = *(unsigned long *) task->thread.sp;
45 }
46 }
47#endif
48 39
49 for (;;) { 40 for (;;) {
50 struct thread_info *context; 41 struct thread_info *context;
@@ -82,11 +73,11 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
82 if (kstack_end(stack)) 73 if (kstack_end(stack))
83 break; 74 break;
84 if (i && ((i % STACKSLOTS_PER_LINE) == 0)) 75 if (i && ((i % STACKSLOTS_PER_LINE) == 0))
85 printk("\n%s", log_lvl); 76 printk(KERN_CONT "\n");
86 printk(" %08lx", *stack++); 77 printk(KERN_CONT " %08lx", *stack++);
87 touch_nmi_watchdog(); 78 touch_nmi_watchdog();
88 } 79 }
89 printk("\n"); 80 printk(KERN_CONT "\n");
90 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 81 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
91} 82}
92 83
@@ -112,8 +103,7 @@ void show_registers(struct pt_regs *regs)
112 u8 *ip; 103 u8 *ip;
113 104
114 printk(KERN_EMERG "Stack:\n"); 105 printk(KERN_EMERG "Stack:\n");
115 show_stack_log_lvl(NULL, regs, &regs->sp, 106 show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
116 0, KERN_EMERG);
117 107
118 printk(KERN_EMERG "Code: "); 108 printk(KERN_EMERG "Code: ");
119 109
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 57a21f11c791..e71c98d3c0d2 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -149,29 +149,19 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
149 unsigned used = 0; 149 unsigned used = 0;
150 struct thread_info *tinfo; 150 struct thread_info *tinfo;
151 int graph = 0; 151 int graph = 0;
152 unsigned long dummy;
152 153
153 if (!task) 154 if (!task)
154 task = current; 155 task = current;
155 156
156 if (!stack) { 157 if (!stack) {
157 unsigned long dummy;
158 stack = &dummy; 158 stack = &dummy;
159 if (task && task != current) 159 if (task && task != current)
160 stack = (unsigned long *)task->thread.sp; 160 stack = (unsigned long *)task->thread.sp;
161 } 161 }
162 162
163#ifdef CONFIG_FRAME_POINTER 163 if (!bp)
164 if (!bp) { 164 bp = stack_frame(task, regs);
165 if (task == current) {
166 /* Grab bp right from our regs */
167 get_bp(bp);
168 } else {
169 /* bp is the last reg pushed by switch_to */
170 bp = *(unsigned long *) task->thread.sp;
171 }
172 }
173#endif
174
175 /* 165 /*
176 * Print function call entries in all stacks, starting at the 166 * Print function call entries in all stacks, starting at the
177 * current stack address. If the stacks consist of nested 167 * current stack address. If the stacks consist of nested
@@ -265,20 +255,20 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
265 if (stack >= irq_stack && stack <= irq_stack_end) { 255 if (stack >= irq_stack && stack <= irq_stack_end) {
266 if (stack == irq_stack_end) { 256 if (stack == irq_stack_end) {
267 stack = (unsigned long *) (irq_stack_end[-1]); 257 stack = (unsigned long *) (irq_stack_end[-1]);
268 printk(" <EOI> "); 258 printk(KERN_CONT " <EOI> ");
269 } 259 }
270 } else { 260 } else {
271 if (((long) stack & (THREAD_SIZE-1)) == 0) 261 if (((long) stack & (THREAD_SIZE-1)) == 0)
272 break; 262 break;
273 } 263 }
274 if (i && ((i % STACKSLOTS_PER_LINE) == 0)) 264 if (i && ((i % STACKSLOTS_PER_LINE) == 0))
275 printk("\n%s", log_lvl); 265 printk(KERN_CONT "\n");
276 printk(" %016lx", *stack++); 266 printk(KERN_CONT " %016lx", *stack++);
277 touch_nmi_watchdog(); 267 touch_nmi_watchdog();
278 } 268 }
279 preempt_enable(); 269 preempt_enable();
280 270
281 printk("\n"); 271 printk(KERN_CONT "\n");
282 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 272 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
283} 273}
284 274
@@ -308,7 +298,7 @@ void show_registers(struct pt_regs *regs)
308 298
309 printk(KERN_EMERG "Stack:\n"); 299 printk(KERN_EMERG "Stack:\n");
310 show_stack_log_lvl(NULL, regs, (unsigned long *)sp, 300 show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
311 regs->bp, KERN_EMERG); 301 0, KERN_EMERG);
312 302
313 printk(KERN_EMERG "Code: "); 303 printk(KERN_EMERG "Code: ");
314 304
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 0d6fc71bedb1..3e2ef8425316 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -11,10 +11,13 @@
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/types.h> 12#include <linux/types.h>
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/crash_dump.h>
14#include <linux/bootmem.h> 15#include <linux/bootmem.h>
15#include <linux/pfn.h> 16#include <linux/pfn.h>
16#include <linux/suspend.h> 17#include <linux/suspend.h>
18#include <linux/acpi.h>
17#include <linux/firmware-map.h> 19#include <linux/firmware-map.h>
20#include <linux/memblock.h>
18 21
19#include <asm/e820.h> 22#include <asm/e820.h>
20#include <asm/proto.h> 23#include <asm/proto.h>
@@ -665,21 +668,15 @@ __init void e820_setup_gap(void)
665 * boot_params.e820_map, others are passed via SETUP_E820_EXT node of 668 * boot_params.e820_map, others are passed via SETUP_E820_EXT node of
666 * linked list of struct setup_data, which is parsed here. 669 * linked list of struct setup_data, which is parsed here.
667 */ 670 */
668void __init parse_e820_ext(struct setup_data *sdata, unsigned long pa_data) 671void __init parse_e820_ext(struct setup_data *sdata)
669{ 672{
670 u32 map_len;
671 int entries; 673 int entries;
672 struct e820entry *extmap; 674 struct e820entry *extmap;
673 675
674 entries = sdata->len / sizeof(struct e820entry); 676 entries = sdata->len / sizeof(struct e820entry);
675 map_len = sdata->len + sizeof(struct setup_data);
676 if (map_len > PAGE_SIZE)
677 sdata = early_ioremap(pa_data, map_len);
678 extmap = (struct e820entry *)(sdata->data); 677 extmap = (struct e820entry *)(sdata->data);
679 __append_e820_map(extmap, entries); 678 __append_e820_map(extmap, entries);
680 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 679 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
681 if (map_len > PAGE_SIZE)
682 early_iounmap(sdata, map_len);
683 printk(KERN_INFO "extended physical RAM map:\n"); 680 printk(KERN_INFO "extended physical RAM map:\n");
684 e820_print_map("extended"); 681 e820_print_map("extended");
685} 682}
@@ -738,73 +735,7 @@ core_initcall(e820_mark_nvs_memory);
738#endif 735#endif
739 736
740/* 737/*
741 * Find a free area with specified alignment in a specific range. 738 * pre allocated 4k and reserved it in memblock and e820_saved
742 */
743u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
744{
745 int i;
746
747 for (i = 0; i < e820.nr_map; i++) {
748 struct e820entry *ei = &e820.map[i];
749 u64 addr;
750 u64 ei_start, ei_last;
751
752 if (ei->type != E820_RAM)
753 continue;
754
755 ei_last = ei->addr + ei->size;
756 ei_start = ei->addr;
757 addr = find_early_area(ei_start, ei_last, start, end,
758 size, align);
759
760 if (addr != -1ULL)
761 return addr;
762 }
763 return -1ULL;
764}
765
766u64 __init find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align)
767{
768 return find_e820_area(start, end, size, align);
769}
770
771u64 __init get_max_mapped(void)
772{
773 u64 end = max_pfn_mapped;
774
775 end <<= PAGE_SHIFT;
776
777 return end;
778}
779/*
780 * Find next free range after *start
781 */
782u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
783{
784 int i;
785
786 for (i = 0; i < e820.nr_map; i++) {
787 struct e820entry *ei = &e820.map[i];
788 u64 addr;
789 u64 ei_start, ei_last;
790
791 if (ei->type != E820_RAM)
792 continue;
793
794 ei_last = ei->addr + ei->size;
795 ei_start = ei->addr;
796 addr = find_early_area_size(ei_start, ei_last, start,
797 sizep, align);
798
799 if (addr != -1ULL)
800 return addr;
801 }
802
803 return -1ULL;
804}
805
806/*
807 * pre allocated 4k and reserved it in e820
808 */ 739 */
809u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align) 740u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
810{ 741{
@@ -813,8 +744,8 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
813 u64 start; 744 u64 start;
814 745
815 for (start = startt; ; start += size) { 746 for (start = startt; ; start += size) {
816 start = find_e820_area_size(start, &size, align); 747 start = memblock_x86_find_in_range_size(start, &size, align);
817 if (!(start + 1)) 748 if (start == MEMBLOCK_ERROR)
818 return 0; 749 return 0;
819 if (size >= sizet) 750 if (size >= sizet)
820 break; 751 break;
@@ -830,10 +761,9 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
830 addr = round_down(start + size - sizet, align); 761 addr = round_down(start + size - sizet, align);
831 if (addr < start) 762 if (addr < start)
832 return 0; 763 return 0;
833 e820_update_range(addr, sizet, E820_RAM, E820_RESERVED); 764 memblock_x86_reserve_range(addr, addr + sizet, "new next");
834 e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED); 765 e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
835 printk(KERN_INFO "update e820 for early_reserve_e820\n"); 766 printk(KERN_INFO "update e820_saved for early_reserve_e820\n");
836 update_e820();
837 update_e820_saved(); 767 update_e820_saved();
838 768
839 return addr; 769 return addr;
@@ -895,74 +825,6 @@ unsigned long __init e820_end_of_low_ram_pfn(void)
895{ 825{
896 return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM); 826 return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM);
897} 827}
898/*
899 * Finds an active region in the address range from start_pfn to last_pfn and
900 * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
901 */
902int __init e820_find_active_region(const struct e820entry *ei,
903 unsigned long start_pfn,
904 unsigned long last_pfn,
905 unsigned long *ei_startpfn,
906 unsigned long *ei_endpfn)
907{
908 u64 align = PAGE_SIZE;
909
910 *ei_startpfn = round_up(ei->addr, align) >> PAGE_SHIFT;
911 *ei_endpfn = round_down(ei->addr + ei->size, align) >> PAGE_SHIFT;
912
913 /* Skip map entries smaller than a page */
914 if (*ei_startpfn >= *ei_endpfn)
915 return 0;
916
917 /* Skip if map is outside the node */
918 if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
919 *ei_startpfn >= last_pfn)
920 return 0;
921
922 /* Check for overlaps */
923 if (*ei_startpfn < start_pfn)
924 *ei_startpfn = start_pfn;
925 if (*ei_endpfn > last_pfn)
926 *ei_endpfn = last_pfn;
927
928 return 1;
929}
930
931/* Walk the e820 map and register active regions within a node */
932void __init e820_register_active_regions(int nid, unsigned long start_pfn,
933 unsigned long last_pfn)
934{
935 unsigned long ei_startpfn;
936 unsigned long ei_endpfn;
937 int i;
938
939 for (i = 0; i < e820.nr_map; i++)
940 if (e820_find_active_region(&e820.map[i],
941 start_pfn, last_pfn,
942 &ei_startpfn, &ei_endpfn))
943 add_active_range(nid, ei_startpfn, ei_endpfn);
944}
945
946/*
947 * Find the hole size (in bytes) in the memory range.
948 * @start: starting address of the memory range to scan
949 * @end: ending address of the memory range to scan
950 */
951u64 __init e820_hole_size(u64 start, u64 end)
952{
953 unsigned long start_pfn = start >> PAGE_SHIFT;
954 unsigned long last_pfn = end >> PAGE_SHIFT;
955 unsigned long ei_startpfn, ei_endpfn, ram = 0;
956 int i;
957
958 for (i = 0; i < e820.nr_map; i++) {
959 if (e820_find_active_region(&e820.map[i],
960 start_pfn, last_pfn,
961 &ei_startpfn, &ei_endpfn))
962 ram += ei_endpfn - ei_startpfn;
963 }
964 return end - start - ((u64)ram << PAGE_SHIFT);
965}
966 828
967static void early_panic(char *msg) 829static void early_panic(char *msg)
968{ 830{
@@ -980,15 +842,21 @@ static int __init parse_memopt(char *p)
980 if (!p) 842 if (!p)
981 return -EINVAL; 843 return -EINVAL;
982 844
983#ifdef CONFIG_X86_32
984 if (!strcmp(p, "nopentium")) { 845 if (!strcmp(p, "nopentium")) {
846#ifdef CONFIG_X86_32
985 setup_clear_cpu_cap(X86_FEATURE_PSE); 847 setup_clear_cpu_cap(X86_FEATURE_PSE);
986 return 0; 848 return 0;
987 } 849#else
850 printk(KERN_WARNING "mem=nopentium ignored! (only supported on x86_32)\n");
851 return -EINVAL;
988#endif 852#endif
853 }
989 854
990 userdef = 1; 855 userdef = 1;
991 mem_size = memparse(p, &p); 856 mem_size = memparse(p, &p);
857 /* don't remove all of memory when handling "mem={invalid}" param */
858 if (mem_size == 0)
859 return -EINVAL;
992 e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); 860 e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
993 861
994 return 0; 862 return 0;
@@ -1210,3 +1078,48 @@ void __init setup_memory_map(void)
1210 printk(KERN_INFO "BIOS-provided physical RAM map:\n"); 1078 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
1211 e820_print_map(who); 1079 e820_print_map(who);
1212} 1080}
1081
1082void __init memblock_x86_fill(void)
1083{
1084 int i;
1085 u64 end;
1086
1087 /*
1088 * EFI may have more than 128 entries.
1089 * We are safe to enable resizing because memblock_x86_fill()
1090 * is called rather late on x86.
1091 */
1092 memblock_can_resize = 1;
1093
1094 for (i = 0; i < e820.nr_map; i++) {
1095 struct e820entry *ei = &e820.map[i];
1096
1097 end = ei->addr + ei->size;
1098 if (end != (resource_size_t)end)
1099 continue;
1100
1101 if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
1102 continue;
1103
1104 memblock_add(ei->addr, ei->size);
1105 }
1106
1107 memblock_analyze();
1108 memblock_dump_all();
1109}
1110
1111void __init memblock_find_dma_reserve(void)
1112{
1113#ifdef CONFIG_X86_64
1114 u64 free_size_pfn;
1115 u64 mem_size_pfn;
1116 /*
1117 * Need to find out the used area below MAX_DMA_PFN.
1118 * Use memblock to get the free size in [0, MAX_DMA_PFN]
1119 * first, and assume boot_mem will not take memory below MAX_DMA_PFN.
1120 */
1121 mem_size_pfn = memblock_x86_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT;
1122 free_size_pfn = memblock_x86_free_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT;
1123 set_dma_reserve(mem_size_pfn - free_size_pfn);
1124#endif
1125}
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index ebdb85cf2686..3755ef494390 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -97,7 +97,6 @@ static void __init nvidia_bugs(int num, int slot, int func)
97} 97}
98 98
99#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) 99#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
100#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
101static u32 __init ati_ixp4x0_rev(int num, int slot, int func) 100static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
102{ 101{
103 u32 d; 102 u32 d;
@@ -115,7 +114,6 @@ static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
115 d &= 0xff; 114 d &= 0xff;
116 return d; 115 return d;
117} 116}
118#endif
119 117
120static void __init ati_bugs(int num, int slot, int func) 118static void __init ati_bugs(int num, int slot, int func)
121{ 119{
@@ -145,15 +143,10 @@ static void __init ati_bugs(int num, int slot, int func)
145 143
146static u32 __init ati_sbx00_rev(int num, int slot, int func) 144static u32 __init ati_sbx00_rev(int num, int slot, int func)
147{ 145{
148 u32 old, d; 146 u32 d;
149 147
150 d = read_pci_config(num, slot, func, 0x70);
151 old = d;
152 d &= ~(1<<8);
153 write_pci_config(num, slot, func, 0x70, d);
154 d = read_pci_config(num, slot, func, 0x8); 148 d = read_pci_config(num, slot, func, 0x8);
155 d &= 0xff; 149 d &= 0xff;
156 write_pci_config(num, slot, func, 0x70, old);
157 150
158 return d; 151 return d;
159} 152}
@@ -162,11 +155,19 @@ static void __init ati_bugs_contd(int num, int slot, int func)
162{ 155{
163 u32 d, rev; 156 u32 d, rev;
164 157
165 if (acpi_use_timer_override) 158 rev = ati_sbx00_rev(num, slot, func);
159 if (rev >= 0x40)
160 acpi_fix_pin2_polarity = 1;
161
162 /*
163 * SB600: revisions 0x11, 0x12, 0x13, 0x14, ...
164 * SB700: revisions 0x39, 0x3a, ...
165 * SB800: revisions 0x40, 0x41, ...
166 */
167 if (rev >= 0x39)
166 return; 168 return;
167 169
168 rev = ati_sbx00_rev(num, slot, func); 170 if (acpi_use_timer_override)
169 if (rev > 0x13)
170 return; 171 return;
171 172
172 /* check for IRQ0 interrupt swap */ 173 /* check for IRQ0 interrupt swap */
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index fa99bae75ace..cd28a350f7f9 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -14,6 +14,7 @@
14#include <xen/hvc-console.h> 14#include <xen/hvc-console.h>
15#include <asm/pci-direct.h> 15#include <asm/pci-direct.h>
16#include <asm/fixmap.h> 16#include <asm/fixmap.h>
17#include <asm/mrst.h>
17#include <asm/pgtable.h> 18#include <asm/pgtable.h>
18#include <linux/usb/ehci_def.h> 19#include <linux/usb/ehci_def.h>
19 20
@@ -239,6 +240,17 @@ static int __init setup_early_printk(char *buf)
239 if (!strncmp(buf, "xen", 3)) 240 if (!strncmp(buf, "xen", 3))
240 early_console_register(&xenboot_console, keep); 241 early_console_register(&xenboot_console, keep);
241#endif 242#endif
243#ifdef CONFIG_EARLY_PRINTK_MRST
244 if (!strncmp(buf, "mrst", 4)) {
245 mrst_early_console_init();
246 early_console_register(&early_mrst_console, keep);
247 }
248
249 if (!strncmp(buf, "hsu", 3)) {
250 hsu_early_console_init();
251 early_console_register(&early_hsu_console, keep);
252 }
253#endif
242 buf++; 254 buf++;
243 } 255 }
244 return 0; 256 return 0;
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
deleted file mode 100644
index c2fa9b8b497e..000000000000
--- a/arch/x86/kernel/efi.c
+++ /dev/null
@@ -1,612 +0,0 @@
1/*
2 * Common EFI (Extensible Firmware Interface) support functions
3 * Based on Extensible Firmware Interface Specification version 1.0
4 *
5 * Copyright (C) 1999 VA Linux Systems
6 * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
7 * Copyright (C) 1999-2002 Hewlett-Packard Co.
8 * David Mosberger-Tang <davidm@hpl.hp.com>
9 * Stephane Eranian <eranian@hpl.hp.com>
10 * Copyright (C) 2005-2008 Intel Co.
11 * Fenghua Yu <fenghua.yu@intel.com>
12 * Bibo Mao <bibo.mao@intel.com>
13 * Chandramouli Narayanan <mouli@linux.intel.com>
14 * Huang Ying <ying.huang@intel.com>
15 *
16 * Copied from efi_32.c to eliminate the duplicated code between EFI
17 * 32/64 support code. --ying 2007-10-26
18 *
19 * All EFI Runtime Services are not implemented yet as EFI only
20 * supports physical mode addressing on SoftSDV. This is to be fixed
21 * in a future version. --drummond 1999-07-20
22 *
23 * Implemented EFI runtime services and virtual mode calls. --davidm
24 *
25 * Goutham Rao: <goutham.rao@intel.com>
26 * Skip non-WB memory and ignore empty memory ranges.
27 */
28
29#include <linux/kernel.h>
30#include <linux/init.h>
31#include <linux/efi.h>
32#include <linux/bootmem.h>
33#include <linux/spinlock.h>
34#include <linux/uaccess.h>
35#include <linux/time.h>
36#include <linux/io.h>
37#include <linux/reboot.h>
38#include <linux/bcd.h>
39
40#include <asm/setup.h>
41#include <asm/efi.h>
42#include <asm/time.h>
43#include <asm/cacheflush.h>
44#include <asm/tlbflush.h>
45#include <asm/x86_init.h>
46
47#define EFI_DEBUG 1
48#define PFX "EFI: "
49
50int efi_enabled;
51EXPORT_SYMBOL(efi_enabled);
52
53struct efi efi;
54EXPORT_SYMBOL(efi);
55
56struct efi_memory_map memmap;
57
58static struct efi efi_phys __initdata;
59static efi_system_table_t efi_systab __initdata;
60
61static int __init setup_noefi(char *arg)
62{
63 efi_enabled = 0;
64 return 0;
65}
66early_param("noefi", setup_noefi);
67
68int add_efi_memmap;
69EXPORT_SYMBOL(add_efi_memmap);
70
71static int __init setup_add_efi_memmap(char *arg)
72{
73 add_efi_memmap = 1;
74 return 0;
75}
76early_param("add_efi_memmap", setup_add_efi_memmap);
77
78
79static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc)
80{
81 return efi_call_virt2(get_time, tm, tc);
82}
83
84static efi_status_t virt_efi_set_time(efi_time_t *tm)
85{
86 return efi_call_virt1(set_time, tm);
87}
88
89static efi_status_t virt_efi_get_wakeup_time(efi_bool_t *enabled,
90 efi_bool_t *pending,
91 efi_time_t *tm)
92{
93 return efi_call_virt3(get_wakeup_time,
94 enabled, pending, tm);
95}
96
97static efi_status_t virt_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm)
98{
99 return efi_call_virt2(set_wakeup_time,
100 enabled, tm);
101}
102
103static efi_status_t virt_efi_get_variable(efi_char16_t *name,
104 efi_guid_t *vendor,
105 u32 *attr,
106 unsigned long *data_size,
107 void *data)
108{
109 return efi_call_virt5(get_variable,
110 name, vendor, attr,
111 data_size, data);
112}
113
114static efi_status_t virt_efi_get_next_variable(unsigned long *name_size,
115 efi_char16_t *name,
116 efi_guid_t *vendor)
117{
118 return efi_call_virt3(get_next_variable,
119 name_size, name, vendor);
120}
121
122static efi_status_t virt_efi_set_variable(efi_char16_t *name,
123 efi_guid_t *vendor,
124 unsigned long attr,
125 unsigned long data_size,
126 void *data)
127{
128 return efi_call_virt5(set_variable,
129 name, vendor, attr,
130 data_size, data);
131}
132
133static efi_status_t virt_efi_get_next_high_mono_count(u32 *count)
134{
135 return efi_call_virt1(get_next_high_mono_count, count);
136}
137
138static void virt_efi_reset_system(int reset_type,
139 efi_status_t status,
140 unsigned long data_size,
141 efi_char16_t *data)
142{
143 efi_call_virt4(reset_system, reset_type, status,
144 data_size, data);
145}
146
147static efi_status_t virt_efi_set_virtual_address_map(
148 unsigned long memory_map_size,
149 unsigned long descriptor_size,
150 u32 descriptor_version,
151 efi_memory_desc_t *virtual_map)
152{
153 return efi_call_virt4(set_virtual_address_map,
154 memory_map_size, descriptor_size,
155 descriptor_version, virtual_map);
156}
157
158static efi_status_t __init phys_efi_set_virtual_address_map(
159 unsigned long memory_map_size,
160 unsigned long descriptor_size,
161 u32 descriptor_version,
162 efi_memory_desc_t *virtual_map)
163{
164 efi_status_t status;
165
166 efi_call_phys_prelog();
167 status = efi_call_phys4(efi_phys.set_virtual_address_map,
168 memory_map_size, descriptor_size,
169 descriptor_version, virtual_map);
170 efi_call_phys_epilog();
171 return status;
172}
173
174static efi_status_t __init phys_efi_get_time(efi_time_t *tm,
175 efi_time_cap_t *tc)
176{
177 efi_status_t status;
178
179 efi_call_phys_prelog();
180 status = efi_call_phys2(efi_phys.get_time, tm, tc);
181 efi_call_phys_epilog();
182 return status;
183}
184
185int efi_set_rtc_mmss(unsigned long nowtime)
186{
187 int real_seconds, real_minutes;
188 efi_status_t status;
189 efi_time_t eft;
190 efi_time_cap_t cap;
191
192 status = efi.get_time(&eft, &cap);
193 if (status != EFI_SUCCESS) {
194 printk(KERN_ERR "Oops: efitime: can't read time!\n");
195 return -1;
196 }
197
198 real_seconds = nowtime % 60;
199 real_minutes = nowtime / 60;
200 if (((abs(real_minutes - eft.minute) + 15)/30) & 1)
201 real_minutes += 30;
202 real_minutes %= 60;
203 eft.minute = real_minutes;
204 eft.second = real_seconds;
205
206 status = efi.set_time(&eft);
207 if (status != EFI_SUCCESS) {
208 printk(KERN_ERR "Oops: efitime: can't write time!\n");
209 return -1;
210 }
211 return 0;
212}
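
A stand-alone sketch of the half-hour rounding used above, with hypothetical times; only the minute arithmetic is reproduced, not the EFI calls:

#include <stdio.h>
#include <stdlib.h>

/* Sketch of the rounding in efi_set_rtc_mmss() above: only minutes and
 * seconds are written, so if the RTC minute sits roughly half an hour
 * away from system time (e.g. a half-hour timezone offset), nudge the
 * new minute value by 30 before wrapping it into 0..59.
 */
static int rtc_minutes(unsigned long nowtime, int rtc_minute)
{
	int real_minutes = nowtime / 60;

	if (((abs(real_minutes - rtc_minute) + 15) / 30) & 1)
		real_minutes += 30;
	return real_minutes % 60;
}

int main(void)
{
	/* hypothetical: system time 12:05:30, RTC currently shows :35 */
	printf("%d\n", rtc_minutes(12 * 3600 + 5 * 60 + 30, 35));	/* -> 35 */
	/* RTC shows :06 -> keep :05 */
	printf("%d\n", rtc_minutes(12 * 3600 + 5 * 60 + 30, 6));	/* -> 5 */
	return 0;
}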
213
214unsigned long efi_get_time(void)
215{
216 efi_status_t status;
217 efi_time_t eft;
218 efi_time_cap_t cap;
219
220 status = efi.get_time(&eft, &cap);
221 if (status != EFI_SUCCESS)
222 printk(KERN_ERR "Oops: efitime: can't read time!\n");
223
224 return mktime(eft.year, eft.month, eft.day, eft.hour,
225 eft.minute, eft.second);
226}
227
228/*
229 * Tell the kernel about the EFI memory map. This might include
230 * more than the max 128 entries that can fit in the e820 legacy
231 * (zeropage) memory map.
232 */
233
234static void __init do_add_efi_memmap(void)
235{
236 void *p;
237
238 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
239 efi_memory_desc_t *md = p;
240 unsigned long long start = md->phys_addr;
241 unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
242 int e820_type;
243
244 switch (md->type) {
245 case EFI_LOADER_CODE:
246 case EFI_LOADER_DATA:
247 case EFI_BOOT_SERVICES_CODE:
248 case EFI_BOOT_SERVICES_DATA:
249 case EFI_CONVENTIONAL_MEMORY:
250 if (md->attribute & EFI_MEMORY_WB)
251 e820_type = E820_RAM;
252 else
253 e820_type = E820_RESERVED;
254 break;
255 case EFI_ACPI_RECLAIM_MEMORY:
256 e820_type = E820_ACPI;
257 break;
258 case EFI_ACPI_MEMORY_NVS:
259 e820_type = E820_NVS;
260 break;
261 case EFI_UNUSABLE_MEMORY:
262 e820_type = E820_UNUSABLE;
263 break;
264 default:
265 /*
266 * EFI_RESERVED_TYPE EFI_RUNTIME_SERVICES_CODE
267 * EFI_RUNTIME_SERVICES_DATA EFI_MEMORY_MAPPED_IO
268 * EFI_MEMORY_MAPPED_IO_PORT_SPACE EFI_PAL_CODE
269 */
270 e820_type = E820_RESERVED;
271 break;
272 }
273 e820_add_region(start, size, e820_type);
274 }
275 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
276}
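
Note that this loop, like every other walk over the map in this file, advances by memmap.desc_size rather than by sizeof(efi_memory_desc_t): firmware may hand out descriptors larger than the kernel's struct, which is exactly what the warning in efi_init() below checks for. A hedged sketch of the same idiom, with a made-up helper name:

	/*
	 * Sketch only: count write-back pages in the EFI map.  The stride must
	 * come from the firmware-reported desc_size, never from sizeof().
	 */
	static u64 __init count_efi_wb_pages(void)
	{
		efi_memory_desc_t *md;
		void *p;
		u64 pages = 0;

		for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
			md = p;
			if (md->attribute & EFI_MEMORY_WB)
				pages += md->num_pages;
		}
		return pages;
	}
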
277
278void __init efi_reserve_early(void)
279{
280 unsigned long pmap;
281
282#ifdef CONFIG_X86_32
283 pmap = boot_params.efi_info.efi_memmap;
284#else
285 pmap = (boot_params.efi_info.efi_memmap |
286 ((__u64)boot_params.efi_info.efi_memmap_hi<<32));
287#endif
288 memmap.phys_map = (void *)pmap;
289 memmap.nr_map = boot_params.efi_info.efi_memmap_size /
290 boot_params.efi_info.efi_memdesc_size;
291 memmap.desc_version = boot_params.efi_info.efi_memdesc_version;
292 memmap.desc_size = boot_params.efi_info.efi_memdesc_size;
293 reserve_early(pmap, pmap + memmap.nr_map * memmap.desc_size,
294 "EFI memmap");
295}
296
297#if EFI_DEBUG
298static void __init print_efi_memmap(void)
299{
300 efi_memory_desc_t *md;
301 void *p;
302 int i;
303
304 for (p = memmap.map, i = 0;
305 p < memmap.map_end;
306 p += memmap.desc_size, i++) {
307 md = p;
308 printk(KERN_INFO PFX "mem%02u: type=%u, attr=0x%llx, "
309 "range=[0x%016llx-0x%016llx) (%lluMB)\n",
310 i, md->type, md->attribute, md->phys_addr,
311 md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT),
312 (md->num_pages >> (20 - EFI_PAGE_SHIFT)));
313 }
314}
315#endif /* EFI_DEBUG */
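
The size printed in the last column follows from num_pages counting fixed 4 KiB EFI pages: shifting right by (20 - EFI_PAGE_SHIFT) = 8 divides by 256, and 256 * 4 KiB = 1 MiB. For example, a descriptor with num_pages = 0x10000 prints 0x10000 >> 8 = 0x100 = 256 MB, matching 0x10000 * 4 KiB.
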
316
317void __init efi_init(void)
318{
319 efi_config_table_t *config_tables;
320 efi_runtime_services_t *runtime;
321 efi_char16_t *c16;
322 char vendor[100] = "unknown";
323 int i = 0;
324 void *tmp;
325
326#ifdef CONFIG_X86_32
327 efi_phys.systab = (efi_system_table_t *)boot_params.efi_info.efi_systab;
328#else
329 efi_phys.systab = (efi_system_table_t *)
330 (boot_params.efi_info.efi_systab |
331 ((__u64)boot_params.efi_info.efi_systab_hi<<32));
332#endif
333
334 efi.systab = early_ioremap((unsigned long)efi_phys.systab,
335 sizeof(efi_system_table_t));
336 if (efi.systab == NULL)
337 printk(KERN_ERR "Couldn't map the EFI system table!\n");
338 memcpy(&efi_systab, efi.systab, sizeof(efi_system_table_t));
339 early_iounmap(efi.systab, sizeof(efi_system_table_t));
340 efi.systab = &efi_systab;
341
342 /*
343 * Verify the EFI Table
344 */
345 if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE)
346 printk(KERN_ERR "EFI system table signature incorrect!\n");
347 if ((efi.systab->hdr.revision >> 16) == 0)
348 printk(KERN_ERR "Warning: EFI system table version "
349 "%d.%02d, expected 1.00 or greater!\n",
350 efi.systab->hdr.revision >> 16,
351 efi.systab->hdr.revision & 0xffff);
352
353 /*
354 * Show what we know for posterity
355 */
356 c16 = tmp = early_ioremap(efi.systab->fw_vendor, 2);
357 if (c16) {
358 for (i = 0; i < sizeof(vendor) - 1 && *c16; ++i)
359 vendor[i] = *c16++;
360 vendor[i] = '\0';
361 } else
362 printk(KERN_ERR PFX "Could not map the firmware vendor!\n");
363 early_iounmap(tmp, 2);
364
365 printk(KERN_INFO "EFI v%u.%.02u by %s\n",
366 efi.systab->hdr.revision >> 16,
367 efi.systab->hdr.revision & 0xffff, vendor);
368
369 /*
370 * Let's see what config tables the firmware passed to us.
371 */
372 config_tables = early_ioremap(
373 efi.systab->tables,
374 efi.systab->nr_tables * sizeof(efi_config_table_t));
375 if (config_tables == NULL)
376 printk(KERN_ERR "Could not map EFI Configuration Table!\n");
377
378 printk(KERN_INFO);
379 for (i = 0; i < efi.systab->nr_tables; i++) {
380 if (!efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID)) {
381 efi.mps = config_tables[i].table;
382 printk(" MPS=0x%lx ", config_tables[i].table);
383 } else if (!efi_guidcmp(config_tables[i].guid,
384 ACPI_20_TABLE_GUID)) {
385 efi.acpi20 = config_tables[i].table;
386 printk(" ACPI 2.0=0x%lx ", config_tables[i].table);
387 } else if (!efi_guidcmp(config_tables[i].guid,
388 ACPI_TABLE_GUID)) {
389 efi.acpi = config_tables[i].table;
390 printk(" ACPI=0x%lx ", config_tables[i].table);
391 } else if (!efi_guidcmp(config_tables[i].guid,
392 SMBIOS_TABLE_GUID)) {
393 efi.smbios = config_tables[i].table;
394 printk(" SMBIOS=0x%lx ", config_tables[i].table);
395#ifdef CONFIG_X86_UV
396 } else if (!efi_guidcmp(config_tables[i].guid,
397 UV_SYSTEM_TABLE_GUID)) {
398 efi.uv_systab = config_tables[i].table;
399 printk(" UVsystab=0x%lx ", config_tables[i].table);
400#endif
401 } else if (!efi_guidcmp(config_tables[i].guid,
402 HCDP_TABLE_GUID)) {
403 efi.hcdp = config_tables[i].table;
404 printk(" HCDP=0x%lx ", config_tables[i].table);
405 } else if (!efi_guidcmp(config_tables[i].guid,
406 UGA_IO_PROTOCOL_GUID)) {
407 efi.uga = config_tables[i].table;
408 printk(" UGA=0x%lx ", config_tables[i].table);
409 }
410 }
411 printk("\n");
412 early_iounmap(config_tables,
413 efi.systab->nr_tables * sizeof(efi_config_table_t));
414
415 /*
416 * Check out the runtime services table. We need to map
417 * the runtime services table so that we can grab the physical
418 * address of several of the EFI runtime functions, needed to
419 * set the firmware into virtual mode.
420 */
421 runtime = early_ioremap((unsigned long)efi.systab->runtime,
422 sizeof(efi_runtime_services_t));
423 if (runtime != NULL) {
424 /*
425 * We will only need *early* access to the following
426 * two EFI runtime services before set_virtual_address_map
427 * is invoked.
428 */
429 efi_phys.get_time = (efi_get_time_t *)runtime->get_time;
430 efi_phys.set_virtual_address_map =
431 (efi_set_virtual_address_map_t *)
432 runtime->set_virtual_address_map;
433 /*
434 * Allow efi_get_time() to be called before entering
435 * virtual mode.
436 */
437 efi.get_time = phys_efi_get_time;
438 } else
439 printk(KERN_ERR "Could not map the EFI runtime service "
440 "table!\n");
441 early_iounmap(runtime, sizeof(efi_runtime_services_t));
442
443 /* Map the EFI memory map */
444 memmap.map = early_ioremap((unsigned long)memmap.phys_map,
445 memmap.nr_map * memmap.desc_size);
446 if (memmap.map == NULL)
447 printk(KERN_ERR "Could not map the EFI memory map!\n");
448 memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
449
450 if (memmap.desc_size != sizeof(efi_memory_desc_t))
451 printk(KERN_WARNING
452 "Kernel-defined memdesc doesn't match the one from EFI!\n");
453
454 if (add_efi_memmap)
455 do_add_efi_memmap();
456
457#ifdef CONFIG_X86_32
458 x86_platform.get_wallclock = efi_get_time;
459 x86_platform.set_wallclock = efi_set_rtc_mmss;
460#endif
461
462 /* Setup for EFI runtime service */
463 reboot_type = BOOT_EFI;
464
465#if EFI_DEBUG
466 print_efi_memmap();
467#endif
468}
469
470static void __init runtime_code_page_mkexec(void)
471{
472 efi_memory_desc_t *md;
473 void *p;
474 u64 addr, npages;
475
476 /* Make EFI runtime service code area executable */
477 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
478 md = p;
479
480 if (md->type != EFI_RUNTIME_SERVICES_CODE)
481 continue;
482
483 addr = md->virt_addr;
484 npages = md->num_pages;
485 memrange_efi_to_native(&addr, &npages);
486 set_memory_x(addr, npages);
487 }
488}
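
set_memory_x() above works on native PAGE_SIZE pages, while md->num_pages counts fixed 4 KiB EFI pages; reconciling the two is the job of memrange_efi_to_native(). A sketch of the conversion such a helper needs to perform (hypothetical name and body, the real one lives in asm/efi.h):

	/*
	 * Illustration only: widen a range given in 4 KiB EFI pages to whole
	 * native pages so it can be fed to set_memory_x()/set_memory_uc().
	 */
	static void efi_range_to_native_pages(u64 *addr, u64 *npages)
	{
		u64 end = *addr + (*npages << EFI_PAGE_SHIFT);

		*npages = PFN_UP(end) - PFN_DOWN(*addr);
		*addr = (u64)PFN_DOWN(*addr) << PAGE_SHIFT;
	}
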
489
490/*
491 * This function will switch the EFI runtime services to virtual mode.
492 * Essentially, look through the EFI memmap and map every region that
493 * has the runtime attribute bit set in its memory descriptor and update
494 * that memory descriptor with the virtual address obtained from ioremap().
495 * This enables the runtime services to be called without having to
496 * thunk back into physical mode for every invocation.
497 */
498void __init efi_enter_virtual_mode(void)
499{
500 efi_memory_desc_t *md;
501 efi_status_t status;
502 unsigned long size;
503 u64 end, systab, addr, npages, end_pfn;
504 void *p, *va;
505
506 efi.systab = NULL;
507 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
508 md = p;
509 if (!(md->attribute & EFI_MEMORY_RUNTIME))
510 continue;
511
512 size = md->num_pages << EFI_PAGE_SHIFT;
513 end = md->phys_addr + size;
514
515 end_pfn = PFN_UP(end);
516 if (end_pfn <= max_low_pfn_mapped
517 || (end_pfn > (1UL << (32 - PAGE_SHIFT))
518 && end_pfn <= max_pfn_mapped))
519 va = __va(md->phys_addr);
520 else
521 va = efi_ioremap(md->phys_addr, size, md->type);
522
523 md->virt_addr = (u64) (unsigned long) va;
524
525 if (!va) {
526 printk(KERN_ERR PFX "ioremap of 0x%llX failed!\n",
527 (unsigned long long)md->phys_addr);
528 continue;
529 }
530
531 if (!(md->attribute & EFI_MEMORY_WB)) {
532 addr = md->virt_addr;
533 npages = md->num_pages;
534 memrange_efi_to_native(&addr, &npages);
535 set_memory_uc(addr, npages);
536 }
537
538 systab = (u64) (unsigned long) efi_phys.systab;
539 if (md->phys_addr <= systab && systab < end) {
540 systab += md->virt_addr - md->phys_addr;
541 efi.systab = (efi_system_table_t *) (unsigned long) systab;
542 }
543 }
544
545 BUG_ON(!efi.systab);
546
547 status = phys_efi_set_virtual_address_map(
548 memmap.desc_size * memmap.nr_map,
549 memmap.desc_size,
550 memmap.desc_version,
551 memmap.phys_map);
552
553 if (status != EFI_SUCCESS) {
554 printk(KERN_ALERT "Unable to switch EFI into virtual mode "
555 "(status=%lx)!\n", status);
556 panic("EFI call to SetVirtualAddressMap() failed!");
557 }
558
559 /*
560 * Now that EFI is in virtual mode, update the function
561 * pointers in the runtime service table to the new virtual addresses.
562 *
563 * Call EFI services through wrapper functions.
564 */
565 efi.get_time = virt_efi_get_time;
566 efi.set_time = virt_efi_set_time;
567 efi.get_wakeup_time = virt_efi_get_wakeup_time;
568 efi.set_wakeup_time = virt_efi_set_wakeup_time;
569 efi.get_variable = virt_efi_get_variable;
570 efi.get_next_variable = virt_efi_get_next_variable;
571 efi.set_variable = virt_efi_set_variable;
572 efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count;
573 efi.reset_system = virt_efi_reset_system;
574 efi.set_virtual_address_map = virt_efi_set_virtual_address_map;
575 if (__supported_pte_mask & _PAGE_NX)
576 runtime_code_page_mkexec();
577 early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size);
578 memmap.map = NULL;
579}
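
Once these wrappers are installed, the rest of the kernel calls runtime services through the efi table without knowing whether the firmware still runs in physical mode. A hedged example of such a consumer (the variable name, GUID spelling and buffer handling are illustrative only, not taken from this tree):

	/* Illustration: read the firmware's "BootOrder" variable, if present. */
	static efi_status_t example_read_boot_order(void *buf, unsigned long *size)
	{
		/* L"BootOrder" spelled out as UCS-2, as get_variable expects. */
		static efi_char16_t name[] = {
			'B', 'o', 'o', 't', 'O', 'r', 'd', 'e', 'r', 0
		};
		/* EFI global variable vendor GUID; double-check against the spec. */
		efi_guid_t guid = EFI_GUID(0x8be4df61, 0x93ca, 0x11d2, 0xaa, 0x0d,
					   0x00, 0xe0, 0x98, 0x03, 0x2b, 0x8c);
		u32 attr;

		return efi.get_variable(name, &guid, &attr, size, buf);
	}
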
580
581/*
582 * Convenience functions to obtain memory types and attributes
583 */
584u32 efi_mem_type(unsigned long phys_addr)
585{
586 efi_memory_desc_t *md;
587 void *p;
588
589 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
590 md = p;
591 if ((md->phys_addr <= phys_addr) &&
592 (phys_addr < (md->phys_addr +
593 (md->num_pages << EFI_PAGE_SHIFT))))
594 return md->type;
595 }
596 return 0;
597}
598
599u64 efi_mem_attributes(unsigned long phys_addr)
600{
601 efi_memory_desc_t *md;
602 void *p;
603
604 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
605 md = p;
606 if ((md->phys_addr <= phys_addr) &&
607 (phys_addr < (md->phys_addr +
608 (md->num_pages << EFI_PAGE_SHIFT))))
609 return md->attribute;
610 }
611 return 0;
612}
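
A typical use of these two helpers is deciding whether a physical address may be treated as ordinary cached RAM (sketch; the caller name is made up):

	/* Sketch: is this address plain write-back RAM according to the EFI map? */
	static int phys_addr_is_wb_ram(unsigned long phys_addr)
	{
		return efi_mem_type(phys_addr) == EFI_CONVENTIONAL_MEMORY &&
		       (efi_mem_attributes(phys_addr) & EFI_MEMORY_WB);
	}
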
diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/kernel/efi_32.c
deleted file mode 100644
index 5cab48ee61a4..000000000000
--- a/arch/x86/kernel/efi_32.c
+++ /dev/null
@@ -1,112 +0,0 @@
1/*
2 * Extensible Firmware Interface
3 *
4 * Based on Extensible Firmware Interface Specification version 1.0
5 *
6 * Copyright (C) 1999 VA Linux Systems
7 * Copyright (C) 1999 Walt Drummond <drummond@valinux.com>
8 * Copyright (C) 1999-2002 Hewlett-Packard Co.
9 * David Mosberger-Tang <davidm@hpl.hp.com>
10 * Stephane Eranian <eranian@hpl.hp.com>
11 *
12 * All EFI Runtime Services are not implemented yet as EFI only
13 * supports physical mode addressing on SoftSDV. This is to be fixed
14 * in a future version. --drummond 1999-07-20
15 *
16 * Implemented EFI runtime services and virtual mode calls. --davidm
17 *
18 * Goutham Rao: <goutham.rao@intel.com>
19 * Skip non-WB memory and ignore empty memory ranges.
20 */
21
22#include <linux/kernel.h>
23#include <linux/types.h>
24#include <linux/ioport.h>
25#include <linux/efi.h>
26
27#include <asm/io.h>
28#include <asm/page.h>
29#include <asm/pgtable.h>
30#include <asm/tlbflush.h>
31#include <asm/efi.h>
32
33/*
34 * To call an EFI runtime service in physical addressing mode we need a
35 * prelog/epilog around the invocation: it disables interrupts, claims the
36 * EFI runtime service handler exclusively and duplicates the low memory
37 * mapping (0 - 3G).
38 */
39
40static unsigned long efi_rt_eflags;
41static pgd_t efi_bak_pg_dir_pointer[2];
42
43void efi_call_phys_prelog(void)
44{
45 unsigned long cr4;
46 unsigned long temp;
47 struct desc_ptr gdt_descr;
48
49 local_irq_save(efi_rt_eflags);
50
51 /*
52 * If I don't have PAE, I should just duplicate two entries in page
53 * directory. If I have PAE, I just need to duplicate one entry in
54 * page directory.
55 */
56 cr4 = read_cr4_safe();
57
58 if (cr4 & X86_CR4_PAE) {
59 efi_bak_pg_dir_pointer[0].pgd =
60 swapper_pg_dir[pgd_index(0)].pgd;
61 swapper_pg_dir[0].pgd =
62 swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd;
63 } else {
64 efi_bak_pg_dir_pointer[0].pgd =
65 swapper_pg_dir[pgd_index(0)].pgd;
66 efi_bak_pg_dir_pointer[1].pgd =
67 swapper_pg_dir[pgd_index(0x400000)].pgd;
68 swapper_pg_dir[pgd_index(0)].pgd =
69 swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd;
70 temp = PAGE_OFFSET + 0x400000;
71 swapper_pg_dir[pgd_index(0x400000)].pgd =
72 swapper_pg_dir[pgd_index(temp)].pgd;
73 }
74
75 /*
76 * Flush the TLB so the temporary low mapping set up above takes effect.
77 */
78 __flush_tlb_all();
79
80 gdt_descr.address = __pa(get_cpu_gdt_table(0));
81 gdt_descr.size = GDT_SIZE - 1;
82 load_gdt(&gdt_descr);
83}
84
85void efi_call_phys_epilog(void)
86{
87 unsigned long cr4;
88 struct desc_ptr gdt_descr;
89
90 gdt_descr.address = (unsigned long)get_cpu_gdt_table(0);
91 gdt_descr.size = GDT_SIZE - 1;
92 load_gdt(&gdt_descr);
93
94 cr4 = read_cr4_safe();
95
96 if (cr4 & X86_CR4_PAE) {
97 swapper_pg_dir[pgd_index(0)].pgd =
98 efi_bak_pg_dir_pointer[0].pgd;
99 } else {
100 swapper_pg_dir[pgd_index(0)].pgd =
101 efi_bak_pg_dir_pointer[0].pgd;
102 swapper_pg_dir[pgd_index(0x400000)].pgd =
103 efi_bak_pg_dir_pointer[1].pgd;
104 }
105
106 /*
107 * After the lock is released, the original page table is restored.
108 */
109 __flush_tlb_all();
110
111 local_irq_restore(efi_rt_eflags);
112}
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
deleted file mode 100644
index ac0621a7ac3d..000000000000
--- a/arch/x86/kernel/efi_64.c
+++ /dev/null
@@ -1,114 +0,0 @@
1/*
2 * x86_64 specific EFI support functions
3 * Based on Extensible Firmware Interface Specification version 1.0
4 *
5 * Copyright (C) 2005-2008 Intel Co.
6 * Fenghua Yu <fenghua.yu@intel.com>
7 * Bibo Mao <bibo.mao@intel.com>
8 * Chandramouli Narayanan <mouli@linux.intel.com>
9 * Huang Ying <ying.huang@intel.com>
10 *
11 * Code to convert the EFI memory map to an E820 map has been implemented in
12 * the elilo bootloader, based on an EFI patch by Edgar Hucek. Based on the
13 * E820 map, the page table is set up appropriately for EFI runtime code.
14 * - mouli 06/14/2007.
15 *
16 */
17
18#include <linux/kernel.h>
19#include <linux/init.h>
20#include <linux/mm.h>
21#include <linux/types.h>
22#include <linux/spinlock.h>
23#include <linux/bootmem.h>
24#include <linux/ioport.h>
25#include <linux/module.h>
26#include <linux/efi.h>
27#include <linux/uaccess.h>
28#include <linux/io.h>
29#include <linux/reboot.h>
30
31#include <asm/setup.h>
32#include <asm/page.h>
33#include <asm/e820.h>
34#include <asm/pgtable.h>
35#include <asm/tlbflush.h>
36#include <asm/proto.h>
37#include <asm/efi.h>
38#include <asm/cacheflush.h>
39#include <asm/fixmap.h>
40
41static pgd_t save_pgd __initdata;
42static unsigned long efi_flags __initdata;
43
44static void __init early_mapping_set_exec(unsigned long start,
45 unsigned long end,
46 int executable)
47{
48 unsigned long num_pages;
49
50 start &= PMD_MASK;
51 end = (end + PMD_SIZE - 1) & PMD_MASK;
52 num_pages = (end - start) >> PAGE_SHIFT;
53 if (executable)
54 set_memory_x((unsigned long)__va(start), num_pages);
55 else
56 set_memory_nx((unsigned long)__va(start), num_pages);
57}
58
59static void __init early_runtime_code_mapping_set_exec(int executable)
60{
61 efi_memory_desc_t *md;
62 void *p;
63
64 if (!(__supported_pte_mask & _PAGE_NX))
65 return;
66
67 /* Make EFI runtime service code area executable */
68 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
69 md = p;
70 if (md->type == EFI_RUNTIME_SERVICES_CODE) {
71 unsigned long end;
72 end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT);
73 early_mapping_set_exec(md->phys_addr, end, executable);
74 }
75 }
76}
77
78void __init efi_call_phys_prelog(void)
79{
80 unsigned long vaddress;
81
82 early_runtime_code_mapping_set_exec(1);
83 local_irq_save(efi_flags);
84 vaddress = (unsigned long)__va(0x0UL);
85 save_pgd = *pgd_offset_k(0x0UL);
86 set_pgd(pgd_offset_k(0x0UL), *pgd_offset_k(vaddress));
87 __flush_tlb_all();
88}
89
90void __init efi_call_phys_epilog(void)
91{
92 /*
93 * After the lock is released, the original page table is restored.
94 */
95 set_pgd(pgd_offset_k(0x0UL), save_pgd);
96 __flush_tlb_all();
97 local_irq_restore(efi_flags);
98 early_runtime_code_mapping_set_exec(0);
99}
100
101void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size,
102 u32 type)
103{
104 unsigned long last_map_pfn;
105
106 if (type == EFI_MEMORY_MAPPED_IO)
107 return ioremap(phys_addr, size);
108
109 last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size);
110 if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size)
111 return NULL;
112
113 return (void __iomem *)__va(phys_addr);
114}
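
A quick sanity check of the fallback above: init_memory_mapping() returns the last PFN it managed to map, so for, say, phys_addr = 0x7f000000 and size = 0x200000 the mapping must reach 0x7f200000, i.e. last_map_pfn must be at least 0x7f200; anything smaller makes (last_map_pfn << PAGE_SHIFT) < phys_addr + size true and NULL is returned, which the caller in efi_enter_virtual_mode() reports as an ioremap failure. (The numbers are made up for illustration.)
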
diff --git a/arch/x86/kernel/efi_stub_32.S b/arch/x86/kernel/efi_stub_32.S
deleted file mode 100644
index fbe66e626c09..000000000000
--- a/arch/x86/kernel/efi_stub_32.S
+++ /dev/null
@@ -1,123 +0,0 @@
1/*
2 * EFI call stub for IA32.
3 *
4 * This stub allows us to make EFI calls in physical mode with interrupts
5 * turned off.
6 */
7
8#include <linux/linkage.h>
9#include <asm/page_types.h>
10
11/*
12 * efi_call_phys(void *, ...) is a function with variable parameters.
13 * All the callers of this function assure that all the parameters are 4-bytes.
14 */
15
16/*
17 * In gcc calling convention, EBX, ESP, EBP, ESI and EDI are all callee save.
18 * So we'd better save all of them at the beginning of this function and restore
19 * them at the end no matter how many we use, because we cannot assume the EFI
20 * runtime service functions comply with the gcc calling convention.
21 */
22
23.text
24ENTRY(efi_call_phys)
25 /*
26 * 0. This function can only be called from the Linux kernel, so CS has
27 * been set to 0x0010 and DS and SS to 0x0018. In EFI the values of these
28 * registers are the same and the corresponding GDT entries are identical,
29 * so nothing needs to be done about the segment registers or the GDT
30 * itself; only the GDT base register is changed, in the prelog and epilog.
31 */
32
33 /*
34 * 1. Now I am running with EIP = <physical address> + PAGE_OFFSET.
35 * To switch smoothly from virtual mode to flat mode, the mapping of
36 * lower virtual memory has already been set up by the prelog (and is
37 * torn down again by the epilog).
38 */
39 movl $1f, %edx
40 subl $__PAGE_OFFSET, %edx
41 jmp *%edx
421:
43
44 /*
45 * 2. Now on the top of stack is the return
46 * address in the caller of efi_call_phys(), then parameter 1,
47 * parameter 2, ..., param n. To make things easy, we save the return
48 * address of efi_call_phys in a global variable.
49 */
50 popl %edx
51 movl %edx, saved_return_addr
52 /* get the function pointer into ECX*/
53 popl %ecx
54 movl %ecx, efi_rt_function_ptr
55 movl $2f, %edx
56 subl $__PAGE_OFFSET, %edx
57 pushl %edx
58
59 /*
60 * 3. Clear PG bit in %CR0.
61 */
62 movl %cr0, %edx
63 andl $0x7fffffff, %edx
64 movl %edx, %cr0
65 jmp 1f
661:
67
68 /*
69 * 4. Adjust stack pointer.
70 */
71 subl $__PAGE_OFFSET, %esp
72
73 /*
74 * 5. Call the physical function.
75 */
76 jmp *%ecx
77
782:
79 /*
80 * 6. After the EFI runtime service returns, control comes back to the
81 * following instruction. We'd better readjust the stack pointer first.
82 */
83 addl $__PAGE_OFFSET, %esp
84
85 /*
86 * 7. Restore PG bit
87 */
88 movl %cr0, %edx
89 orl $0x80000000, %edx
90 movl %edx, %cr0
91 jmp 1f
921:
93 /*
94 * 8. Now return to virtual mode from flat mode by
95 * adding PAGE_OFFSET to EIP.
96 */
97 movl $1f, %edx
98 jmp *%edx
991:
100
101 /*
102 * 9. Balance the stack. And because EAX contains the return value,
103 * we'd better not clobber it.
104 */
105 leal efi_rt_function_ptr, %edx
106 movl (%edx), %ecx
107 pushl %ecx
108
109 /*
110 * 10. Push the saved return address onto the stack and return.
111 */
112 leal saved_return_addr, %edx
113 movl (%edx), %ecx
114 pushl %ecx
115 ret
116ENDPROC(efi_call_phys)
117.previous
118
119.data
120saved_return_addr:
121 .long 0
122efi_rt_function_ptr:
123 .long 0
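
From C this stub is just a cdecl variadic function whose first argument is the EFI function pointer; the efi_call_physN() macros used earlier (for instance in phys_efi_get_time()) forward straight to it. A hedged sketch of the caller side, assuming a declaration roughly like the one in asm/efi.h (check the real header, this is an approximation):

	/* Sketch of the 32-bit C-side glue; all arguments are 4 bytes wide. */
	extern unsigned long efi_call_phys(void *fp, ...);

	static unsigned long example_phys_get_time(efi_time_t *tm, efi_time_cap_t *tc)
	{
		/* what efi_call_phys2(efi_phys.get_time, tm, tc) boils down to */
		return efi_call_phys(efi_phys.get_time, tm, tc);
	}
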
diff --git a/arch/x86/kernel/efi_stub_64.S b/arch/x86/kernel/efi_stub_64.S
deleted file mode 100644
index 4c07ccab8146..000000000000
--- a/arch/x86/kernel/efi_stub_64.S
+++ /dev/null
@@ -1,116 +0,0 @@
1/*
2 * Function calling ABI conversion from Linux to EFI for x86_64
3 *
4 * Copyright (C) 2007 Intel Corp
5 * Bibo Mao <bibo.mao@intel.com>
6 * Huang Ying <ying.huang@intel.com>
7 */
8
9#include <linux/linkage.h>
10
11#define SAVE_XMM \
12 mov %rsp, %rax; \
13 subq $0x70, %rsp; \
14 and $~0xf, %rsp; \
15 mov %rax, (%rsp); \
16 mov %cr0, %rax; \
17 clts; \
18 mov %rax, 0x8(%rsp); \
19 movaps %xmm0, 0x60(%rsp); \
20 movaps %xmm1, 0x50(%rsp); \
21 movaps %xmm2, 0x40(%rsp); \
22 movaps %xmm3, 0x30(%rsp); \
23 movaps %xmm4, 0x20(%rsp); \
24 movaps %xmm5, 0x10(%rsp)
25
26#define RESTORE_XMM \
27 movaps 0x60(%rsp), %xmm0; \
28 movaps 0x50(%rsp), %xmm1; \
29 movaps 0x40(%rsp), %xmm2; \
30 movaps 0x30(%rsp), %xmm3; \
31 movaps 0x20(%rsp), %xmm4; \
32 movaps 0x10(%rsp), %xmm5; \
33 mov 0x8(%rsp), %rsi; \
34 mov %rsi, %cr0; \
35 mov (%rsp), %rsp
36
37ENTRY(efi_call0)
38 SAVE_XMM
39 subq $32, %rsp
40 call *%rdi
41 addq $32, %rsp
42 RESTORE_XMM
43 ret
44ENDPROC(efi_call0)
45
46ENTRY(efi_call1)
47 SAVE_XMM
48 subq $32, %rsp
49 mov %rsi, %rcx
50 call *%rdi
51 addq $32, %rsp
52 RESTORE_XMM
53 ret
54ENDPROC(efi_call1)
55
56ENTRY(efi_call2)
57 SAVE_XMM
58 subq $32, %rsp
59 mov %rsi, %rcx
60 call *%rdi
61 addq $32, %rsp
62 RESTORE_XMM
63 ret
64ENDPROC(efi_call2)
65
66ENTRY(efi_call3)
67 SAVE_XMM
68 subq $32, %rsp
69 mov %rcx, %r8
70 mov %rsi, %rcx
71 call *%rdi
72 addq $32, %rsp
73 RESTORE_XMM
74 ret
75ENDPROC(efi_call3)
76
77ENTRY(efi_call4)
78 SAVE_XMM
79 subq $32, %rsp
80 mov %r8, %r9
81 mov %rcx, %r8
82 mov %rsi, %rcx
83 call *%rdi
84 addq $32, %rsp
85 RESTORE_XMM
86 ret
87ENDPROC(efi_call4)
88
89ENTRY(efi_call5)
90 SAVE_XMM
91 subq $48, %rsp
92 mov %r9, 32(%rsp)
93 mov %r8, %r9
94 mov %rcx, %r8
95 mov %rsi, %rcx
96 call *%rdi
97 addq $48, %rsp
98 RESTORE_XMM
99 ret
100ENDPROC(efi_call5)
101
102ENTRY(efi_call6)
103 SAVE_XMM
104 mov (%rsp), %rax
105 mov 8(%rax), %rax
106 subq $48, %rsp
107 mov %r9, 32(%rsp)
108 mov %rax, 40(%rsp)
109 mov %r8, %r9
110 mov %rcx, %r8
111 mov %rsi, %rcx
112 call *%rdi
113 addq $48, %rsp
114 RESTORE_XMM
115 ret
116ENDPROC(efi_call6)
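
The register shuffling in efi_call1..6 above is the SysV-to-Microsoft-x64 translation: arguments move from rdi/rsi/rdx/rcx/r8/r9 into rcx/rdx/r8/r9 plus stack slots, with 32 bytes of shadow space reserved. That is the same transformation a compiler performs by itself for an ms_abi call; a hedged illustration of the equivalence using gcc's ms_abi attribute (not how this tree calls EFI, just the same ABI expressed in C):

	/*
	 * Illustration only: an ms_abi-qualified call makes gcc emit the same
	 * Microsoft x64 argument placement and 32-byte shadow space that
	 * efi_call2 sets up by hand (the CR0.TS/XMM handling aside).
	 */
	typedef unsigned long (__attribute__((ms_abi)) *ms_call2_t)(void *, void *);

	static unsigned long call_ms_abi2(void *fn, void *arg1, void *arg2)
	{
		ms_call2_t f = fn;

		return f(arg1, arg2);	/* compiler handles the ABI switch */
	}
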
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 227d00920d2f..5c1a91974918 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -65,6 +65,8 @@
65#define sysexit_audit syscall_exit_work 65#define sysexit_audit syscall_exit_work
66#endif 66#endif
67 67
68 .section .entry.text, "ax"
69
68/* 70/*
69 * We use macros for low-level operations which need to be overridden 71 * We use macros for low-level operations which need to be overridden
70 * for paravirtualization. The following will never clobber any registers: 72 * for paravirtualization. The following will never clobber any registers:
@@ -115,8 +117,7 @@
115 117
116 /* unfortunately push/pop can't be no-op */ 118 /* unfortunately push/pop can't be no-op */
117.macro PUSH_GS 119.macro PUSH_GS
118 pushl $0 120 pushl_cfi $0
119 CFI_ADJUST_CFA_OFFSET 4
120.endm 121.endm
121.macro POP_GS pop=0 122.macro POP_GS pop=0
122 addl $(4 + \pop), %esp 123 addl $(4 + \pop), %esp
@@ -140,14 +141,12 @@
140#else /* CONFIG_X86_32_LAZY_GS */ 141#else /* CONFIG_X86_32_LAZY_GS */
141 142
142.macro PUSH_GS 143.macro PUSH_GS
143 pushl %gs 144 pushl_cfi %gs
144 CFI_ADJUST_CFA_OFFSET 4
145 /*CFI_REL_OFFSET gs, 0*/ 145 /*CFI_REL_OFFSET gs, 0*/
146.endm 146.endm
147 147
148.macro POP_GS pop=0 148.macro POP_GS pop=0
14998: popl %gs 14998: popl_cfi %gs
150 CFI_ADJUST_CFA_OFFSET -4
151 /*CFI_RESTORE gs*/ 150 /*CFI_RESTORE gs*/
152 .if \pop <> 0 151 .if \pop <> 0
153 add $\pop, %esp 152 add $\pop, %esp
@@ -195,35 +194,25 @@
195.macro SAVE_ALL 194.macro SAVE_ALL
196 cld 195 cld
197 PUSH_GS 196 PUSH_GS
198 pushl %fs 197 pushl_cfi %fs
199 CFI_ADJUST_CFA_OFFSET 4
200 /*CFI_REL_OFFSET fs, 0;*/ 198 /*CFI_REL_OFFSET fs, 0;*/
201 pushl %es 199 pushl_cfi %es
202 CFI_ADJUST_CFA_OFFSET 4
203 /*CFI_REL_OFFSET es, 0;*/ 200 /*CFI_REL_OFFSET es, 0;*/
204 pushl %ds 201 pushl_cfi %ds
205 CFI_ADJUST_CFA_OFFSET 4
206 /*CFI_REL_OFFSET ds, 0;*/ 202 /*CFI_REL_OFFSET ds, 0;*/
207 pushl %eax 203 pushl_cfi %eax
208 CFI_ADJUST_CFA_OFFSET 4
209 CFI_REL_OFFSET eax, 0 204 CFI_REL_OFFSET eax, 0
210 pushl %ebp 205 pushl_cfi %ebp
211 CFI_ADJUST_CFA_OFFSET 4
212 CFI_REL_OFFSET ebp, 0 206 CFI_REL_OFFSET ebp, 0
213 pushl %edi 207 pushl_cfi %edi
214 CFI_ADJUST_CFA_OFFSET 4
215 CFI_REL_OFFSET edi, 0 208 CFI_REL_OFFSET edi, 0
216 pushl %esi 209 pushl_cfi %esi
217 CFI_ADJUST_CFA_OFFSET 4
218 CFI_REL_OFFSET esi, 0 210 CFI_REL_OFFSET esi, 0
219 pushl %edx 211 pushl_cfi %edx
220 CFI_ADJUST_CFA_OFFSET 4
221 CFI_REL_OFFSET edx, 0 212 CFI_REL_OFFSET edx, 0
222 pushl %ecx 213 pushl_cfi %ecx
223 CFI_ADJUST_CFA_OFFSET 4
224 CFI_REL_OFFSET ecx, 0 214 CFI_REL_OFFSET ecx, 0
225 pushl %ebx 215 pushl_cfi %ebx
226 CFI_ADJUST_CFA_OFFSET 4
227 CFI_REL_OFFSET ebx, 0 216 CFI_REL_OFFSET ebx, 0
228 movl $(__USER_DS), %edx 217 movl $(__USER_DS), %edx
229 movl %edx, %ds 218 movl %edx, %ds
@@ -234,39 +223,29 @@
234.endm 223.endm
235 224
236.macro RESTORE_INT_REGS 225.macro RESTORE_INT_REGS
237 popl %ebx 226 popl_cfi %ebx
238 CFI_ADJUST_CFA_OFFSET -4
239 CFI_RESTORE ebx 227 CFI_RESTORE ebx
240 popl %ecx 228 popl_cfi %ecx
241 CFI_ADJUST_CFA_OFFSET -4
242 CFI_RESTORE ecx 229 CFI_RESTORE ecx
243 popl %edx 230 popl_cfi %edx
244 CFI_ADJUST_CFA_OFFSET -4
245 CFI_RESTORE edx 231 CFI_RESTORE edx
246 popl %esi 232 popl_cfi %esi
247 CFI_ADJUST_CFA_OFFSET -4
248 CFI_RESTORE esi 233 CFI_RESTORE esi
249 popl %edi 234 popl_cfi %edi
250 CFI_ADJUST_CFA_OFFSET -4
251 CFI_RESTORE edi 235 CFI_RESTORE edi
252 popl %ebp 236 popl_cfi %ebp
253 CFI_ADJUST_CFA_OFFSET -4
254 CFI_RESTORE ebp 237 CFI_RESTORE ebp
255 popl %eax 238 popl_cfi %eax
256 CFI_ADJUST_CFA_OFFSET -4
257 CFI_RESTORE eax 239 CFI_RESTORE eax
258.endm 240.endm
259 241
260.macro RESTORE_REGS pop=0 242.macro RESTORE_REGS pop=0
261 RESTORE_INT_REGS 243 RESTORE_INT_REGS
2621: popl %ds 2441: popl_cfi %ds
263 CFI_ADJUST_CFA_OFFSET -4
264 /*CFI_RESTORE ds;*/ 245 /*CFI_RESTORE ds;*/
2652: popl %es 2462: popl_cfi %es
266 CFI_ADJUST_CFA_OFFSET -4
267 /*CFI_RESTORE es;*/ 247 /*CFI_RESTORE es;*/
2683: popl %fs 2483: popl_cfi %fs
269 CFI_ADJUST_CFA_OFFSET -4
270 /*CFI_RESTORE fs;*/ 249 /*CFI_RESTORE fs;*/
271 POP_GS \pop 250 POP_GS \pop
272.pushsection .fixup, "ax" 251.pushsection .fixup, "ax"
@@ -320,16 +299,12 @@
320 299
321ENTRY(ret_from_fork) 300ENTRY(ret_from_fork)
322 CFI_STARTPROC 301 CFI_STARTPROC
323 pushl %eax 302 pushl_cfi %eax
324 CFI_ADJUST_CFA_OFFSET 4
325 call schedule_tail 303 call schedule_tail
326 GET_THREAD_INFO(%ebp) 304 GET_THREAD_INFO(%ebp)
327 popl %eax 305 popl_cfi %eax
328 CFI_ADJUST_CFA_OFFSET -4 306 pushl_cfi $0x0202 # Reset kernel eflags
329 pushl $0x0202 # Reset kernel eflags 307 popfl_cfi
330 CFI_ADJUST_CFA_OFFSET 4
331 popfl
332 CFI_ADJUST_CFA_OFFSET -4
333 jmp syscall_exit 308 jmp syscall_exit
334 CFI_ENDPROC 309 CFI_ENDPROC
335END(ret_from_fork) 310END(ret_from_fork)
@@ -409,29 +384,23 @@ sysenter_past_esp:
409 * enough kernel state to call TRACE_IRQS_OFF can be called - but 384 * enough kernel state to call TRACE_IRQS_OFF can be called - but
410 * we immediately enable interrupts at that point anyway. 385 * we immediately enable interrupts at that point anyway.
411 */ 386 */
412 pushl $(__USER_DS) 387 pushl_cfi $__USER_DS
413 CFI_ADJUST_CFA_OFFSET 4
414 /*CFI_REL_OFFSET ss, 0*/ 388 /*CFI_REL_OFFSET ss, 0*/
415 pushl %ebp 389 pushl_cfi %ebp
416 CFI_ADJUST_CFA_OFFSET 4
417 CFI_REL_OFFSET esp, 0 390 CFI_REL_OFFSET esp, 0
418 pushfl 391 pushfl_cfi
419 orl $X86_EFLAGS_IF, (%esp) 392 orl $X86_EFLAGS_IF, (%esp)
420 CFI_ADJUST_CFA_OFFSET 4 393 pushl_cfi $__USER_CS
421 pushl $(__USER_CS)
422 CFI_ADJUST_CFA_OFFSET 4
423 /*CFI_REL_OFFSET cs, 0*/ 394 /*CFI_REL_OFFSET cs, 0*/
424 /* 395 /*
425 * Push current_thread_info()->sysenter_return to the stack. 396 * Push current_thread_info()->sysenter_return to the stack.
426 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words 397 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
427 * pushed above; +8 corresponds to copy_thread's esp0 setting. 398 * pushed above; +8 corresponds to copy_thread's esp0 setting.
428 */ 399 */
429 pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) 400 pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp)
430 CFI_ADJUST_CFA_OFFSET 4
431 CFI_REL_OFFSET eip, 0 401 CFI_REL_OFFSET eip, 0
432 402
433 pushl %eax 403 pushl_cfi %eax
434 CFI_ADJUST_CFA_OFFSET 4
435 SAVE_ALL 404 SAVE_ALL
436 ENABLE_INTERRUPTS(CLBR_NONE) 405 ENABLE_INTERRUPTS(CLBR_NONE)
437 406
@@ -486,8 +455,7 @@ sysenter_audit:
486 movl %eax,%edx /* 2nd arg: syscall number */ 455 movl %eax,%edx /* 2nd arg: syscall number */
487 movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ 456 movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */
488 call audit_syscall_entry 457 call audit_syscall_entry
489 pushl %ebx 458 pushl_cfi %ebx
490 CFI_ADJUST_CFA_OFFSET 4
491 movl PT_EAX(%esp),%eax /* reload syscall number */ 459 movl PT_EAX(%esp),%eax /* reload syscall number */
492 jmp sysenter_do_call 460 jmp sysenter_do_call
493 461
@@ -529,8 +497,7 @@ ENDPROC(ia32_sysenter_target)
529 # system call handler stub 497 # system call handler stub
530ENTRY(system_call) 498ENTRY(system_call)
531 RING0_INT_FRAME # can't unwind into user space anyway 499 RING0_INT_FRAME # can't unwind into user space anyway
532 pushl %eax # save orig_eax 500 pushl_cfi %eax # save orig_eax
533 CFI_ADJUST_CFA_OFFSET 4
534 SAVE_ALL 501 SAVE_ALL
535 GET_THREAD_INFO(%ebp) 502 GET_THREAD_INFO(%ebp)
536 # system call tracing in operation / emulation 503 # system call tracing in operation / emulation
@@ -566,7 +533,6 @@ restore_all_notrace:
566 je ldt_ss # returning to user-space with LDT SS 533 je ldt_ss # returning to user-space with LDT SS
567restore_nocheck: 534restore_nocheck:
568 RESTORE_REGS 4 # skip orig_eax/error_code 535 RESTORE_REGS 4 # skip orig_eax/error_code
569 CFI_ADJUST_CFA_OFFSET -4
570irq_return: 536irq_return:
571 INTERRUPT_RETURN 537 INTERRUPT_RETURN
572.section .fixup,"ax" 538.section .fixup,"ax"
@@ -619,10 +585,8 @@ ldt_ss:
619 shr $16, %edx 585 shr $16, %edx
620 mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ 586 mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
621 mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ 587 mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
622 pushl $__ESPFIX_SS 588 pushl_cfi $__ESPFIX_SS
623 CFI_ADJUST_CFA_OFFSET 4 589 pushl_cfi %eax /* new kernel esp */
624 push %eax /* new kernel esp */
625 CFI_ADJUST_CFA_OFFSET 4
626 /* Disable interrupts, but do not irqtrace this section: we 590 /* Disable interrupts, but do not irqtrace this section: we
627 * will soon execute iret and the tracer was already set to 591 * will soon execute iret and the tracer was already set to
628 * the irqstate after the iret */ 592 * the irqstate after the iret */
@@ -666,11 +630,9 @@ work_notifysig: # deal with pending signals and
666 630
667 ALIGN 631 ALIGN
668work_notifysig_v86: 632work_notifysig_v86:
669 pushl %ecx # save ti_flags for do_notify_resume 633 pushl_cfi %ecx # save ti_flags for do_notify_resume
670 CFI_ADJUST_CFA_OFFSET 4
671 call save_v86_state # %eax contains pt_regs pointer 634 call save_v86_state # %eax contains pt_regs pointer
672 popl %ecx 635 popl_cfi %ecx
673 CFI_ADJUST_CFA_OFFSET -4
674 movl %eax, %esp 636 movl %eax, %esp
675#else 637#else
676 movl %esp, %eax 638 movl %esp, %eax
@@ -750,14 +712,18 @@ ptregs_##name: \
750#define PTREGSCALL3(name) \ 712#define PTREGSCALL3(name) \
751 ALIGN; \ 713 ALIGN; \
752ptregs_##name: \ 714ptregs_##name: \
715 CFI_STARTPROC; \
753 leal 4(%esp),%eax; \ 716 leal 4(%esp),%eax; \
754 pushl %eax; \ 717 pushl_cfi %eax; \
755 movl PT_EDX(%eax),%ecx; \ 718 movl PT_EDX(%eax),%ecx; \
756 movl PT_ECX(%eax),%edx; \ 719 movl PT_ECX(%eax),%edx; \
757 movl PT_EBX(%eax),%eax; \ 720 movl PT_EBX(%eax),%eax; \
758 call sys_##name; \ 721 call sys_##name; \
759 addl $4,%esp; \ 722 addl $4,%esp; \
760 ret 723 CFI_ADJUST_CFA_OFFSET -4; \
724 ret; \
725 CFI_ENDPROC; \
726ENDPROC(ptregs_##name)
761 727
762PTREGSCALL1(iopl) 728PTREGSCALL1(iopl)
763PTREGSCALL0(fork) 729PTREGSCALL0(fork)
@@ -772,15 +738,19 @@ PTREGSCALL1(vm86old)
772/* Clone is an oddball. The 4th arg is in %edi */ 738/* Clone is an oddball. The 4th arg is in %edi */
773 ALIGN; 739 ALIGN;
774ptregs_clone: 740ptregs_clone:
741 CFI_STARTPROC
775 leal 4(%esp),%eax 742 leal 4(%esp),%eax
776 pushl %eax 743 pushl_cfi %eax
777 pushl PT_EDI(%eax) 744 pushl_cfi PT_EDI(%eax)
778 movl PT_EDX(%eax),%ecx 745 movl PT_EDX(%eax),%ecx
779 movl PT_ECX(%eax),%edx 746 movl PT_ECX(%eax),%edx
780 movl PT_EBX(%eax),%eax 747 movl PT_EBX(%eax),%eax
781 call sys_clone 748 call sys_clone
782 addl $8,%esp 749 addl $8,%esp
750 CFI_ADJUST_CFA_OFFSET -8
783 ret 751 ret
752 CFI_ENDPROC
753ENDPROC(ptregs_clone)
784 754
785.macro FIXUP_ESPFIX_STACK 755.macro FIXUP_ESPFIX_STACK
786/* 756/*
@@ -795,10 +765,8 @@ ptregs_clone:
795 mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ 765 mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
796 shl $16, %eax 766 shl $16, %eax
797 addl %esp, %eax /* the adjusted stack pointer */ 767 addl %esp, %eax /* the adjusted stack pointer */
798 pushl $__KERNEL_DS 768 pushl_cfi $__KERNEL_DS
799 CFI_ADJUST_CFA_OFFSET 4 769 pushl_cfi %eax
800 pushl %eax
801 CFI_ADJUST_CFA_OFFSET 4
802 lss (%esp), %esp /* switch to the normal stack segment */ 770 lss (%esp), %esp /* switch to the normal stack segment */
803 CFI_ADJUST_CFA_OFFSET -8 771 CFI_ADJUST_CFA_OFFSET -8
804.endm 772.endm
@@ -822,7 +790,7 @@ ptregs_clone:
822 */ 790 */
823.section .init.rodata,"a" 791.section .init.rodata,"a"
824ENTRY(interrupt) 792ENTRY(interrupt)
825.text 793.section .entry.text, "ax"
826 .p2align 5 794 .p2align 5
827 .p2align CONFIG_X86_L1_CACHE_SHIFT 795 .p2align CONFIG_X86_L1_CACHE_SHIFT
828ENTRY(irq_entries_start) 796ENTRY(irq_entries_start)
@@ -835,14 +803,13 @@ vector=FIRST_EXTERNAL_VECTOR
835 .if vector <> FIRST_EXTERNAL_VECTOR 803 .if vector <> FIRST_EXTERNAL_VECTOR
836 CFI_ADJUST_CFA_OFFSET -4 804 CFI_ADJUST_CFA_OFFSET -4
837 .endif 805 .endif
8381: pushl $(~vector+0x80) /* Note: always in signed byte range */ 8061: pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */
839 CFI_ADJUST_CFA_OFFSET 4
840 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 807 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
841 jmp 2f 808 jmp 2f
842 .endif 809 .endif
843 .previous 810 .previous
844 .long 1b 811 .long 1b
845 .text 812 .section .entry.text, "ax"
846vector=vector+1 813vector=vector+1
847 .endif 814 .endif
848 .endr 815 .endr
@@ -876,8 +843,7 @@ ENDPROC(common_interrupt)
876#define BUILD_INTERRUPT3(name, nr, fn) \ 843#define BUILD_INTERRUPT3(name, nr, fn) \
877ENTRY(name) \ 844ENTRY(name) \
878 RING0_INT_FRAME; \ 845 RING0_INT_FRAME; \
879 pushl $~(nr); \ 846 pushl_cfi $~(nr); \
880 CFI_ADJUST_CFA_OFFSET 4; \
881 SAVE_ALL; \ 847 SAVE_ALL; \
882 TRACE_IRQS_OFF \ 848 TRACE_IRQS_OFF \
883 movl %esp,%eax; \ 849 movl %esp,%eax; \
@@ -893,21 +859,18 @@ ENDPROC(name)
893 859
894ENTRY(coprocessor_error) 860ENTRY(coprocessor_error)
895 RING0_INT_FRAME 861 RING0_INT_FRAME
896 pushl $0 862 pushl_cfi $0
897 CFI_ADJUST_CFA_OFFSET 4 863 pushl_cfi $do_coprocessor_error
898 pushl $do_coprocessor_error
899 CFI_ADJUST_CFA_OFFSET 4
900 jmp error_code 864 jmp error_code
901 CFI_ENDPROC 865 CFI_ENDPROC
902END(coprocessor_error) 866END(coprocessor_error)
903 867
904ENTRY(simd_coprocessor_error) 868ENTRY(simd_coprocessor_error)
905 RING0_INT_FRAME 869 RING0_INT_FRAME
906 pushl $0 870 pushl_cfi $0
907 CFI_ADJUST_CFA_OFFSET 4
908#ifdef CONFIG_X86_INVD_BUG 871#ifdef CONFIG_X86_INVD_BUG
909 /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ 872 /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
910661: pushl $do_general_protection 873661: pushl_cfi $do_general_protection
911662: 874662:
912.section .altinstructions,"a" 875.section .altinstructions,"a"
913 .balign 4 876 .balign 4
@@ -922,19 +885,16 @@ ENTRY(simd_coprocessor_error)
922664: 885664:
923.previous 886.previous
924#else 887#else
925 pushl $do_simd_coprocessor_error 888 pushl_cfi $do_simd_coprocessor_error
926#endif 889#endif
927 CFI_ADJUST_CFA_OFFSET 4
928 jmp error_code 890 jmp error_code
929 CFI_ENDPROC 891 CFI_ENDPROC
930END(simd_coprocessor_error) 892END(simd_coprocessor_error)
931 893
932ENTRY(device_not_available) 894ENTRY(device_not_available)
933 RING0_INT_FRAME 895 RING0_INT_FRAME
934 pushl $-1 # mark this as an int 896 pushl_cfi $-1 # mark this as an int
935 CFI_ADJUST_CFA_OFFSET 4 897 pushl_cfi $do_device_not_available
936 pushl $do_device_not_available
937 CFI_ADJUST_CFA_OFFSET 4
938 jmp error_code 898 jmp error_code
939 CFI_ENDPROC 899 CFI_ENDPROC
940END(device_not_available) 900END(device_not_available)
@@ -956,82 +916,68 @@ END(native_irq_enable_sysexit)
956 916
957ENTRY(overflow) 917ENTRY(overflow)
958 RING0_INT_FRAME 918 RING0_INT_FRAME
959 pushl $0 919 pushl_cfi $0
960 CFI_ADJUST_CFA_OFFSET 4 920 pushl_cfi $do_overflow
961 pushl $do_overflow
962 CFI_ADJUST_CFA_OFFSET 4
963 jmp error_code 921 jmp error_code
964 CFI_ENDPROC 922 CFI_ENDPROC
965END(overflow) 923END(overflow)
966 924
967ENTRY(bounds) 925ENTRY(bounds)
968 RING0_INT_FRAME 926 RING0_INT_FRAME
969 pushl $0 927 pushl_cfi $0
970 CFI_ADJUST_CFA_OFFSET 4 928 pushl_cfi $do_bounds
971 pushl $do_bounds
972 CFI_ADJUST_CFA_OFFSET 4
973 jmp error_code 929 jmp error_code
974 CFI_ENDPROC 930 CFI_ENDPROC
975END(bounds) 931END(bounds)
976 932
977ENTRY(invalid_op) 933ENTRY(invalid_op)
978 RING0_INT_FRAME 934 RING0_INT_FRAME
979 pushl $0 935 pushl_cfi $0
980 CFI_ADJUST_CFA_OFFSET 4 936 pushl_cfi $do_invalid_op
981 pushl $do_invalid_op
982 CFI_ADJUST_CFA_OFFSET 4
983 jmp error_code 937 jmp error_code
984 CFI_ENDPROC 938 CFI_ENDPROC
985END(invalid_op) 939END(invalid_op)
986 940
987ENTRY(coprocessor_segment_overrun) 941ENTRY(coprocessor_segment_overrun)
988 RING0_INT_FRAME 942 RING0_INT_FRAME
989 pushl $0 943 pushl_cfi $0
990 CFI_ADJUST_CFA_OFFSET 4 944 pushl_cfi $do_coprocessor_segment_overrun
991 pushl $do_coprocessor_segment_overrun
992 CFI_ADJUST_CFA_OFFSET 4
993 jmp error_code 945 jmp error_code
994 CFI_ENDPROC 946 CFI_ENDPROC
995END(coprocessor_segment_overrun) 947END(coprocessor_segment_overrun)
996 948
997ENTRY(invalid_TSS) 949ENTRY(invalid_TSS)
998 RING0_EC_FRAME 950 RING0_EC_FRAME
999 pushl $do_invalid_TSS 951 pushl_cfi $do_invalid_TSS
1000 CFI_ADJUST_CFA_OFFSET 4
1001 jmp error_code 952 jmp error_code
1002 CFI_ENDPROC 953 CFI_ENDPROC
1003END(invalid_TSS) 954END(invalid_TSS)
1004 955
1005ENTRY(segment_not_present) 956ENTRY(segment_not_present)
1006 RING0_EC_FRAME 957 RING0_EC_FRAME
1007 pushl $do_segment_not_present 958 pushl_cfi $do_segment_not_present
1008 CFI_ADJUST_CFA_OFFSET 4
1009 jmp error_code 959 jmp error_code
1010 CFI_ENDPROC 960 CFI_ENDPROC
1011END(segment_not_present) 961END(segment_not_present)
1012 962
1013ENTRY(stack_segment) 963ENTRY(stack_segment)
1014 RING0_EC_FRAME 964 RING0_EC_FRAME
1015 pushl $do_stack_segment 965 pushl_cfi $do_stack_segment
1016 CFI_ADJUST_CFA_OFFSET 4
1017 jmp error_code 966 jmp error_code
1018 CFI_ENDPROC 967 CFI_ENDPROC
1019END(stack_segment) 968END(stack_segment)
1020 969
1021ENTRY(alignment_check) 970ENTRY(alignment_check)
1022 RING0_EC_FRAME 971 RING0_EC_FRAME
1023 pushl $do_alignment_check 972 pushl_cfi $do_alignment_check
1024 CFI_ADJUST_CFA_OFFSET 4
1025 jmp error_code 973 jmp error_code
1026 CFI_ENDPROC 974 CFI_ENDPROC
1027END(alignment_check) 975END(alignment_check)
1028 976
1029ENTRY(divide_error) 977ENTRY(divide_error)
1030 RING0_INT_FRAME 978 RING0_INT_FRAME
1031 pushl $0 # no error code 979 pushl_cfi $0 # no error code
1032 CFI_ADJUST_CFA_OFFSET 4 980 pushl_cfi $do_divide_error
1033 pushl $do_divide_error
1034 CFI_ADJUST_CFA_OFFSET 4
1035 jmp error_code 981 jmp error_code
1036 CFI_ENDPROC 982 CFI_ENDPROC
1037END(divide_error) 983END(divide_error)
@@ -1039,10 +985,8 @@ END(divide_error)
1039#ifdef CONFIG_X86_MCE 985#ifdef CONFIG_X86_MCE
1040ENTRY(machine_check) 986ENTRY(machine_check)
1041 RING0_INT_FRAME 987 RING0_INT_FRAME
1042 pushl $0 988 pushl_cfi $0
1043 CFI_ADJUST_CFA_OFFSET 4 989 pushl_cfi machine_check_vector
1044 pushl machine_check_vector
1045 CFI_ADJUST_CFA_OFFSET 4
1046 jmp error_code 990 jmp error_code
1047 CFI_ENDPROC 991 CFI_ENDPROC
1048END(machine_check) 992END(machine_check)
@@ -1050,10 +994,8 @@ END(machine_check)
1050 994
1051ENTRY(spurious_interrupt_bug) 995ENTRY(spurious_interrupt_bug)
1052 RING0_INT_FRAME 996 RING0_INT_FRAME
1053 pushl $0 997 pushl_cfi $0
1054 CFI_ADJUST_CFA_OFFSET 4 998 pushl_cfi $do_spurious_interrupt_bug
1055 pushl $do_spurious_interrupt_bug
1056 CFI_ADJUST_CFA_OFFSET 4
1057 jmp error_code 999 jmp error_code
1058 CFI_ENDPROC 1000 CFI_ENDPROC
1059END(spurious_interrupt_bug) 1001END(spurious_interrupt_bug)
@@ -1084,8 +1026,7 @@ ENTRY(xen_sysenter_target)
1084 1026
1085ENTRY(xen_hypervisor_callback) 1027ENTRY(xen_hypervisor_callback)
1086 CFI_STARTPROC 1028 CFI_STARTPROC
1087 pushl $0 1029 pushl_cfi $0
1088 CFI_ADJUST_CFA_OFFSET 4
1089 SAVE_ALL 1030 SAVE_ALL
1090 TRACE_IRQS_OFF 1031 TRACE_IRQS_OFF
1091 1032
@@ -1121,23 +1062,20 @@ ENDPROC(xen_hypervisor_callback)
1121# We distinguish between categories by maintaining a status value in EAX. 1062# We distinguish between categories by maintaining a status value in EAX.
1122ENTRY(xen_failsafe_callback) 1063ENTRY(xen_failsafe_callback)
1123 CFI_STARTPROC 1064 CFI_STARTPROC
1124 pushl %eax 1065 pushl_cfi %eax
1125 CFI_ADJUST_CFA_OFFSET 4
1126 movl $1,%eax 1066 movl $1,%eax
11271: mov 4(%esp),%ds 10671: mov 4(%esp),%ds
11282: mov 8(%esp),%es 10682: mov 8(%esp),%es
11293: mov 12(%esp),%fs 10693: mov 12(%esp),%fs
11304: mov 16(%esp),%gs 10704: mov 16(%esp),%gs
1131 testl %eax,%eax 1071 testl %eax,%eax
1132 popl %eax 1072 popl_cfi %eax
1133 CFI_ADJUST_CFA_OFFSET -4
1134 lea 16(%esp),%esp 1073 lea 16(%esp),%esp
1135 CFI_ADJUST_CFA_OFFSET -16 1074 CFI_ADJUST_CFA_OFFSET -16
1136 jz 5f 1075 jz 5f
1137 addl $16,%esp 1076 addl $16,%esp
1138 jmp iret_exc # EAX != 0 => Category 2 (Bad IRET) 1077 jmp iret_exc # EAX != 0 => Category 2 (Bad IRET)
11395: pushl $0 # EAX == 0 => Category 1 (Bad segment) 10785: pushl_cfi $0 # EAX == 0 => Category 1 (Bad segment)
1140 CFI_ADJUST_CFA_OFFSET 4
1141 SAVE_ALL 1079 SAVE_ALL
1142 jmp ret_from_exception 1080 jmp ret_from_exception
1143 CFI_ENDPROC 1081 CFI_ENDPROC
@@ -1287,40 +1225,29 @@ syscall_table_size=(.-sys_call_table)
1287 1225
1288ENTRY(page_fault) 1226ENTRY(page_fault)
1289 RING0_EC_FRAME 1227 RING0_EC_FRAME
1290 pushl $do_page_fault 1228 pushl_cfi $do_page_fault
1291 CFI_ADJUST_CFA_OFFSET 4
1292 ALIGN 1229 ALIGN
1293error_code: 1230error_code:
1294 /* the function address is in %gs's slot on the stack */ 1231 /* the function address is in %gs's slot on the stack */
1295 pushl %fs 1232 pushl_cfi %fs
1296 CFI_ADJUST_CFA_OFFSET 4
1297 /*CFI_REL_OFFSET fs, 0*/ 1233 /*CFI_REL_OFFSET fs, 0*/
1298 pushl %es 1234 pushl_cfi %es
1299 CFI_ADJUST_CFA_OFFSET 4
1300 /*CFI_REL_OFFSET es, 0*/ 1235 /*CFI_REL_OFFSET es, 0*/
1301 pushl %ds 1236 pushl_cfi %ds
1302 CFI_ADJUST_CFA_OFFSET 4
1303 /*CFI_REL_OFFSET ds, 0*/ 1237 /*CFI_REL_OFFSET ds, 0*/
1304 pushl %eax 1238 pushl_cfi %eax
1305 CFI_ADJUST_CFA_OFFSET 4
1306 CFI_REL_OFFSET eax, 0 1239 CFI_REL_OFFSET eax, 0
1307 pushl %ebp 1240 pushl_cfi %ebp
1308 CFI_ADJUST_CFA_OFFSET 4
1309 CFI_REL_OFFSET ebp, 0 1241 CFI_REL_OFFSET ebp, 0
1310 pushl %edi 1242 pushl_cfi %edi
1311 CFI_ADJUST_CFA_OFFSET 4
1312 CFI_REL_OFFSET edi, 0 1243 CFI_REL_OFFSET edi, 0
1313 pushl %esi 1244 pushl_cfi %esi
1314 CFI_ADJUST_CFA_OFFSET 4
1315 CFI_REL_OFFSET esi, 0 1245 CFI_REL_OFFSET esi, 0
1316 pushl %edx 1246 pushl_cfi %edx
1317 CFI_ADJUST_CFA_OFFSET 4
1318 CFI_REL_OFFSET edx, 0 1247 CFI_REL_OFFSET edx, 0
1319 pushl %ecx 1248 pushl_cfi %ecx
1320 CFI_ADJUST_CFA_OFFSET 4
1321 CFI_REL_OFFSET ecx, 0 1249 CFI_REL_OFFSET ecx, 0
1322 pushl %ebx 1250 pushl_cfi %ebx
1323 CFI_ADJUST_CFA_OFFSET 4
1324 CFI_REL_OFFSET ebx, 0 1251 CFI_REL_OFFSET ebx, 0
1325 cld 1252 cld
1326 movl $(__KERNEL_PERCPU), %ecx 1253 movl $(__KERNEL_PERCPU), %ecx
@@ -1362,12 +1289,9 @@ END(page_fault)
1362 movl TSS_sysenter_sp0 + \offset(%esp), %esp 1289 movl TSS_sysenter_sp0 + \offset(%esp), %esp
1363 CFI_DEF_CFA esp, 0 1290 CFI_DEF_CFA esp, 0
1364 CFI_UNDEFINED eip 1291 CFI_UNDEFINED eip
1365 pushfl 1292 pushfl_cfi
1366 CFI_ADJUST_CFA_OFFSET 4 1293 pushl_cfi $__KERNEL_CS
1367 pushl $__KERNEL_CS 1294 pushl_cfi $sysenter_past_esp
1368 CFI_ADJUST_CFA_OFFSET 4
1369 pushl $sysenter_past_esp
1370 CFI_ADJUST_CFA_OFFSET 4
1371 CFI_REL_OFFSET eip, 0 1295 CFI_REL_OFFSET eip, 0
1372.endm 1296.endm
1373 1297
@@ -1377,8 +1301,7 @@ ENTRY(debug)
1377 jne debug_stack_correct 1301 jne debug_stack_correct
1378 FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn 1302 FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
1379debug_stack_correct: 1303debug_stack_correct:
1380 pushl $-1 # mark this as an int 1304 pushl_cfi $-1 # mark this as an int
1381 CFI_ADJUST_CFA_OFFSET 4
1382 SAVE_ALL 1305 SAVE_ALL
1383 TRACE_IRQS_OFF 1306 TRACE_IRQS_OFF
1384 xorl %edx,%edx # error code 0 1307 xorl %edx,%edx # error code 0
@@ -1398,32 +1321,27 @@ END(debug)
1398 */ 1321 */
1399ENTRY(nmi) 1322ENTRY(nmi)
1400 RING0_INT_FRAME 1323 RING0_INT_FRAME
1401 pushl %eax 1324 pushl_cfi %eax
1402 CFI_ADJUST_CFA_OFFSET 4
1403 movl %ss, %eax 1325 movl %ss, %eax
1404 cmpw $__ESPFIX_SS, %ax 1326 cmpw $__ESPFIX_SS, %ax
1405 popl %eax 1327 popl_cfi %eax
1406 CFI_ADJUST_CFA_OFFSET -4
1407 je nmi_espfix_stack 1328 je nmi_espfix_stack
1408 cmpl $ia32_sysenter_target,(%esp) 1329 cmpl $ia32_sysenter_target,(%esp)
1409 je nmi_stack_fixup 1330 je nmi_stack_fixup
1410 pushl %eax 1331 pushl_cfi %eax
1411 CFI_ADJUST_CFA_OFFSET 4
1412 movl %esp,%eax 1332 movl %esp,%eax
1413 /* Do not access memory above the end of our stack page, 1333 /* Do not access memory above the end of our stack page,
1414 * it might not exist. 1334 * it might not exist.
1415 */ 1335 */
1416 andl $(THREAD_SIZE-1),%eax 1336 andl $(THREAD_SIZE-1),%eax
1417 cmpl $(THREAD_SIZE-20),%eax 1337 cmpl $(THREAD_SIZE-20),%eax
1418 popl %eax 1338 popl_cfi %eax
1419 CFI_ADJUST_CFA_OFFSET -4
1420 jae nmi_stack_correct 1339 jae nmi_stack_correct
1421 cmpl $ia32_sysenter_target,12(%esp) 1340 cmpl $ia32_sysenter_target,12(%esp)
1422 je nmi_debug_stack_check 1341 je nmi_debug_stack_check
1423nmi_stack_correct: 1342nmi_stack_correct:
1424 /* We have a RING0_INT_FRAME here */ 1343 /* We have a RING0_INT_FRAME here */
1425 pushl %eax 1344 pushl_cfi %eax
1426 CFI_ADJUST_CFA_OFFSET 4
1427 SAVE_ALL 1345 SAVE_ALL
1428 xorl %edx,%edx # zero error code 1346 xorl %edx,%edx # zero error code
1429 movl %esp,%eax # pt_regs pointer 1347 movl %esp,%eax # pt_regs pointer
@@ -1452,18 +1370,14 @@ nmi_espfix_stack:
1452 * 1370 *
1453 * create the pointer to lss back 1371 * create the pointer to lss back
1454 */ 1372 */
1455 pushl %ss 1373 pushl_cfi %ss
1456 CFI_ADJUST_CFA_OFFSET 4 1374 pushl_cfi %esp
1457 pushl %esp
1458 CFI_ADJUST_CFA_OFFSET 4
1459 addl $4, (%esp) 1375 addl $4, (%esp)
1460 /* copy the iret frame of 12 bytes */ 1376 /* copy the iret frame of 12 bytes */
1461 .rept 3 1377 .rept 3
1462 pushl 16(%esp) 1378 pushl_cfi 16(%esp)
1463 CFI_ADJUST_CFA_OFFSET 4
1464 .endr 1379 .endr
1465 pushl %eax 1380 pushl_cfi %eax
1466 CFI_ADJUST_CFA_OFFSET 4
1467 SAVE_ALL 1381 SAVE_ALL
1468 FIXUP_ESPFIX_STACK # %eax == %esp 1382 FIXUP_ESPFIX_STACK # %eax == %esp
1469 xorl %edx,%edx # zero error code 1383 xorl %edx,%edx # zero error code
@@ -1477,8 +1391,7 @@ END(nmi)
1477 1391
1478ENTRY(int3) 1392ENTRY(int3)
1479 RING0_INT_FRAME 1393 RING0_INT_FRAME
1480 pushl $-1 # mark this as an int 1394 pushl_cfi $-1 # mark this as an int
1481 CFI_ADJUST_CFA_OFFSET 4
1482 SAVE_ALL 1395 SAVE_ALL
1483 TRACE_IRQS_OFF 1396 TRACE_IRQS_OFF
1484 xorl %edx,%edx # zero error code 1397 xorl %edx,%edx # zero error code
@@ -1490,12 +1403,20 @@ END(int3)
1490 1403
1491ENTRY(general_protection) 1404ENTRY(general_protection)
1492 RING0_EC_FRAME 1405 RING0_EC_FRAME
1493 pushl $do_general_protection 1406 pushl_cfi $do_general_protection
1494 CFI_ADJUST_CFA_OFFSET 4
1495 jmp error_code 1407 jmp error_code
1496 CFI_ENDPROC 1408 CFI_ENDPROC
1497END(general_protection) 1409END(general_protection)
1498 1410
1411#ifdef CONFIG_KVM_GUEST
1412ENTRY(async_page_fault)
1413 RING0_EC_FRAME
1414 pushl_cfi $do_async_page_fault
1415 jmp error_code
1416 CFI_ENDPROC
1417END(async_page_fault)
1418#endif
1419
1499/* 1420/*
1500 * End of kprobes section 1421 * End of kprobes section
1501 */ 1422 */
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 115e8951e8c8..47a4bcd2e503 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -18,7 +18,7 @@
18 * A note on terminology: 18 * A note on terminology:
19 * - top of stack: Architecture defined interrupt frame from SS to RIP 19 * - top of stack: Architecture defined interrupt frame from SS to RIP
20 * at the top of the kernel process stack. 20 * at the top of the kernel process stack.
21 * - partial stack frame: partially saved registers upto R11. 21 * - partial stack frame: partially saved registers up to R11.
22 * - full stack frame: Like partial stack frame, but all register saved. 22 * - full stack frame: Like partial stack frame, but all register saved.
23 * 23 *
24 * Some macro usage: 24 * Some macro usage:
@@ -61,6 +61,8 @@
61#define __AUDIT_ARCH_LE 0x40000000 61#define __AUDIT_ARCH_LE 0x40000000
62 62
63 .code64 63 .code64
64 .section .entry.text, "ax"
65
64#ifdef CONFIG_FUNCTION_TRACER 66#ifdef CONFIG_FUNCTION_TRACER
65#ifdef CONFIG_DYNAMIC_FTRACE 67#ifdef CONFIG_DYNAMIC_FTRACE
66ENTRY(mcount) 68ENTRY(mcount)
@@ -213,23 +215,17 @@ ENDPROC(native_usergs_sysret64)
213 .macro FAKE_STACK_FRAME child_rip 215 .macro FAKE_STACK_FRAME child_rip
214 /* push in order ss, rsp, eflags, cs, rip */ 216 /* push in order ss, rsp, eflags, cs, rip */
215 xorl %eax, %eax 217 xorl %eax, %eax
216 pushq $__KERNEL_DS /* ss */ 218 pushq_cfi $__KERNEL_DS /* ss */
217 CFI_ADJUST_CFA_OFFSET 8
218 /*CFI_REL_OFFSET ss,0*/ 219 /*CFI_REL_OFFSET ss,0*/
219 pushq %rax /* rsp */ 220 pushq_cfi %rax /* rsp */
220 CFI_ADJUST_CFA_OFFSET 8
221 CFI_REL_OFFSET rsp,0 221 CFI_REL_OFFSET rsp,0
222 pushq $X86_EFLAGS_IF /* eflags - interrupts on */ 222 pushq_cfi $X86_EFLAGS_IF /* eflags - interrupts on */
223 CFI_ADJUST_CFA_OFFSET 8
224 /*CFI_REL_OFFSET rflags,0*/ 223 /*CFI_REL_OFFSET rflags,0*/
225 pushq $__KERNEL_CS /* cs */ 224 pushq_cfi $__KERNEL_CS /* cs */
226 CFI_ADJUST_CFA_OFFSET 8
227 /*CFI_REL_OFFSET cs,0*/ 225 /*CFI_REL_OFFSET cs,0*/
228 pushq \child_rip /* rip */ 226 pushq_cfi \child_rip /* rip */
229 CFI_ADJUST_CFA_OFFSET 8
230 CFI_REL_OFFSET rip,0 227 CFI_REL_OFFSET rip,0
231 pushq %rax /* orig rax */ 228 pushq_cfi %rax /* orig rax */
232 CFI_ADJUST_CFA_OFFSET 8
233 .endm 229 .endm
234 230
235 .macro UNFAKE_STACK_FRAME 231 .macro UNFAKE_STACK_FRAME
@@ -301,20 +297,25 @@ ENDPROC(native_usergs_sysret64)
301 .endm 297 .endm
302 298
303/* save partial stack frame */ 299/* save partial stack frame */
300 .pushsection .kprobes.text, "ax"
304ENTRY(save_args) 301ENTRY(save_args)
305 XCPT_FRAME 302 XCPT_FRAME
306 cld 303 cld
307 movq_cfi rdi, RDI+16-ARGOFFSET 304 /*
308 movq_cfi rsi, RSI+16-ARGOFFSET 305 * start from rbp in pt_regs and jump over
309 movq_cfi rdx, RDX+16-ARGOFFSET 306 * return address.
310 movq_cfi rcx, RCX+16-ARGOFFSET 307 */
311 movq_cfi rax, RAX+16-ARGOFFSET 308 movq_cfi rdi, RDI+8-RBP
312 movq_cfi r8, R8+16-ARGOFFSET 309 movq_cfi rsi, RSI+8-RBP
313 movq_cfi r9, R9+16-ARGOFFSET 310 movq_cfi rdx, RDX+8-RBP
314 movq_cfi r10, R10+16-ARGOFFSET 311 movq_cfi rcx, RCX+8-RBP
315 movq_cfi r11, R11+16-ARGOFFSET 312 movq_cfi rax, RAX+8-RBP
316 313 movq_cfi r8, R8+8-RBP
317 leaq -ARGOFFSET+16(%rsp),%rdi /* arg1 for handler */ 314 movq_cfi r9, R9+8-RBP
315 movq_cfi r10, R10+8-RBP
316 movq_cfi r11, R11+8-RBP
317
318 leaq -RBP+8(%rsp),%rdi /* arg1 for handler */
318 movq_cfi rbp, 8 /* push %rbp */ 319 movq_cfi rbp, 8 /* push %rbp */
319 leaq 8(%rsp), %rbp /* mov %rsp, %ebp */ 320 leaq 8(%rsp), %rbp /* mov %rsp, %ebp */
320 testl $3, CS(%rdi) 321 testl $3, CS(%rdi)
@@ -340,6 +341,7 @@ ENTRY(save_args)
340 ret 341 ret
341 CFI_ENDPROC 342 CFI_ENDPROC
342END(save_args) 343END(save_args)
344 .popsection
343 345
344ENTRY(save_rest) 346ENTRY(save_rest)
345 PARTIAL_FRAME 1 REST_SKIP+8 347 PARTIAL_FRAME 1 REST_SKIP+8
@@ -398,10 +400,8 @@ ENTRY(ret_from_fork)
398 400
399 LOCK ; btr $TIF_FORK,TI_flags(%r8) 401 LOCK ; btr $TIF_FORK,TI_flags(%r8)
400 402
401 push kernel_eflags(%rip) 403 pushq_cfi kernel_eflags(%rip)
402 CFI_ADJUST_CFA_OFFSET 8 404 popfq_cfi # reset kernel eflags
403 popf # reset kernel eflags
404 CFI_ADJUST_CFA_OFFSET -8
405 405
406 call schedule_tail # rdi: 'prev' task parameter 406 call schedule_tail # rdi: 'prev' task parameter
407 407
@@ -422,7 +422,7 @@ ENTRY(ret_from_fork)
422END(ret_from_fork) 422END(ret_from_fork)
423 423
424/* 424/*
425 * System call entry. Upto 6 arguments in registers are supported. 425 * System call entry. Up to 6 arguments in registers are supported.
426 * 426 *
427 * SYSCALL does not save anything on the stack and does not change the 427 * SYSCALL does not save anything on the stack and does not change the
428 * stack pointer. 428 * stack pointer.
@@ -521,11 +521,9 @@ sysret_careful:
521 jnc sysret_signal 521 jnc sysret_signal
522 TRACE_IRQS_ON 522 TRACE_IRQS_ON
523 ENABLE_INTERRUPTS(CLBR_NONE) 523 ENABLE_INTERRUPTS(CLBR_NONE)
524 pushq %rdi 524 pushq_cfi %rdi
525 CFI_ADJUST_CFA_OFFSET 8
526 call schedule 525 call schedule
527 popq %rdi 526 popq_cfi %rdi
528 CFI_ADJUST_CFA_OFFSET -8
529 jmp sysret_check 527 jmp sysret_check
530 528
531 /* Handle a signal */ 529 /* Handle a signal */
@@ -634,11 +632,9 @@ int_careful:
634 jnc int_very_careful 632 jnc int_very_careful
635 TRACE_IRQS_ON 633 TRACE_IRQS_ON
636 ENABLE_INTERRUPTS(CLBR_NONE) 634 ENABLE_INTERRUPTS(CLBR_NONE)
637 pushq %rdi 635 pushq_cfi %rdi
638 CFI_ADJUST_CFA_OFFSET 8
639 call schedule 636 call schedule
640 popq %rdi 637 popq_cfi %rdi
641 CFI_ADJUST_CFA_OFFSET -8
642 DISABLE_INTERRUPTS(CLBR_NONE) 638 DISABLE_INTERRUPTS(CLBR_NONE)
643 TRACE_IRQS_OFF 639 TRACE_IRQS_OFF
644 jmp int_with_check 640 jmp int_with_check
@@ -652,12 +648,10 @@ int_check_syscall_exit_work:
652 /* Check for syscall exit trace */ 648 /* Check for syscall exit trace */
653 testl $_TIF_WORK_SYSCALL_EXIT,%edx 649 testl $_TIF_WORK_SYSCALL_EXIT,%edx
654 jz int_signal 650 jz int_signal
655 pushq %rdi 651 pushq_cfi %rdi
656 CFI_ADJUST_CFA_OFFSET 8
657 leaq 8(%rsp),%rdi # &ptregs -> arg1 652 leaq 8(%rsp),%rdi # &ptregs -> arg1
658 call syscall_trace_leave 653 call syscall_trace_leave
659 popq %rdi 654 popq_cfi %rdi
660 CFI_ADJUST_CFA_OFFSET -8
661 andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi 655 andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
662 jmp int_restore_rest 656 jmp int_restore_rest
663 657
@@ -714,9 +708,8 @@ END(ptregscall_common)
714 708
715ENTRY(stub_execve) 709ENTRY(stub_execve)
716 CFI_STARTPROC 710 CFI_STARTPROC
717 popq %r11 711 addq $8, %rsp
718 CFI_ADJUST_CFA_OFFSET -8 712 PARTIAL_FRAME 0
719 CFI_REGISTER rip, r11
720 SAVE_REST 713 SAVE_REST
721 FIXUP_TOP_OF_STACK %r11 714 FIXUP_TOP_OF_STACK %r11
722 movq %rsp, %rcx 715 movq %rsp, %rcx
@@ -735,7 +728,7 @@ END(stub_execve)
735ENTRY(stub_rt_sigreturn) 728ENTRY(stub_rt_sigreturn)
736 CFI_STARTPROC 729 CFI_STARTPROC
737 addq $8, %rsp 730 addq $8, %rsp
738 CFI_ADJUST_CFA_OFFSET -8 731 PARTIAL_FRAME 0
739 SAVE_REST 732 SAVE_REST
740 movq %rsp,%rdi 733 movq %rsp,%rdi
741 FIXUP_TOP_OF_STACK %r11 734 FIXUP_TOP_OF_STACK %r11
@@ -753,7 +746,7 @@ END(stub_rt_sigreturn)
753 */ 746 */
754 .section .init.rodata,"a" 747 .section .init.rodata,"a"
755ENTRY(interrupt) 748ENTRY(interrupt)
756 .text 749 .section .entry.text
757 .p2align 5 750 .p2align 5
758 .p2align CONFIG_X86_L1_CACHE_SHIFT 751 .p2align CONFIG_X86_L1_CACHE_SHIFT
759ENTRY(irq_entries_start) 752ENTRY(irq_entries_start)
@@ -766,14 +759,13 @@ vector=FIRST_EXTERNAL_VECTOR
766 .if vector <> FIRST_EXTERNAL_VECTOR 759 .if vector <> FIRST_EXTERNAL_VECTOR
767 CFI_ADJUST_CFA_OFFSET -8 760 CFI_ADJUST_CFA_OFFSET -8
768 .endif 761 .endif
7691: pushq $(~vector+0x80) /* Note: always in signed byte range */ 7621: pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */
770 CFI_ADJUST_CFA_OFFSET 8
771 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 763 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
772 jmp 2f 764 jmp 2f
773 .endif 765 .endif
774 .previous 766 .previous
775 .quad 1b 767 .quad 1b
776 .text 768 .section .entry.text
777vector=vector+1 769vector=vector+1
778 .endif 770 .endif
779 .endr 771 .endr
@@ -796,8 +788,9 @@ END(interrupt)
796 788
797/* 0(%rsp): ~(interrupt number) */ 789/* 0(%rsp): ~(interrupt number) */
798 .macro interrupt func 790 .macro interrupt func
799 subq $10*8, %rsp 791 /* reserve pt_regs for scratch regs and rbp */
800 CFI_ADJUST_CFA_OFFSET 10*8 792 subq $ORIG_RAX-RBP, %rsp
793 CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
801 call save_args 794 call save_args
802 PARTIAL_FRAME 0 795 PARTIAL_FRAME 0
803 call \func 796 call \func
@@ -822,8 +815,14 @@ ret_from_intr:
822 TRACE_IRQS_OFF 815 TRACE_IRQS_OFF
823 decl PER_CPU_VAR(irq_count) 816 decl PER_CPU_VAR(irq_count)
824 leaveq 817 leaveq
818
819 CFI_RESTORE rbp
825 CFI_DEF_CFA_REGISTER rsp 820 CFI_DEF_CFA_REGISTER rsp
826 CFI_ADJUST_CFA_OFFSET -8 821 CFI_ADJUST_CFA_OFFSET -8
822
823 /* we did not save rbx, restore only from ARGOFFSET */
824 addq $8, %rsp
825 CFI_ADJUST_CFA_OFFSET -8
827exit_intr: 826exit_intr:
828 GET_THREAD_INFO(%rcx) 827 GET_THREAD_INFO(%rcx)
829 testl $3,CS-ARGOFFSET(%rsp) 828 testl $3,CS-ARGOFFSET(%rsp)
@@ -903,11 +902,9 @@ retint_careful:
903 jnc retint_signal 902 jnc retint_signal
904 TRACE_IRQS_ON 903 TRACE_IRQS_ON
905 ENABLE_INTERRUPTS(CLBR_NONE) 904 ENABLE_INTERRUPTS(CLBR_NONE)
906 pushq %rdi 905 pushq_cfi %rdi
907 CFI_ADJUST_CFA_OFFSET 8
908 call schedule 906 call schedule
909 popq %rdi 907 popq_cfi %rdi
910 CFI_ADJUST_CFA_OFFSET -8
911 GET_THREAD_INFO(%rcx) 908 GET_THREAD_INFO(%rcx)
912 DISABLE_INTERRUPTS(CLBR_NONE) 909 DISABLE_INTERRUPTS(CLBR_NONE)
913 TRACE_IRQS_OFF 910 TRACE_IRQS_OFF
@@ -956,8 +953,7 @@ END(common_interrupt)
956.macro apicinterrupt num sym do_sym 953.macro apicinterrupt num sym do_sym
957ENTRY(\sym) 954ENTRY(\sym)
958 INTR_FRAME 955 INTR_FRAME
959 pushq $~(\num) 956 pushq_cfi $~(\num)
960 CFI_ADJUST_CFA_OFFSET 8
961 interrupt \do_sym 957 interrupt \do_sym
962 jmp ret_from_intr 958 jmp ret_from_intr
963 CFI_ENDPROC 959 CFI_ENDPROC
@@ -981,22 +977,13 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \
981 x86_platform_ipi smp_x86_platform_ipi 977 x86_platform_ipi smp_x86_platform_ipi
982 978
983#ifdef CONFIG_SMP 979#ifdef CONFIG_SMP
984apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ 980.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
985 invalidate_interrupt0 smp_invalidate_interrupt 981 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
986apicinterrupt INVALIDATE_TLB_VECTOR_START+1 \ 982.if NUM_INVALIDATE_TLB_VECTORS > \idx
987 invalidate_interrupt1 smp_invalidate_interrupt 983apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \
988apicinterrupt INVALIDATE_TLB_VECTOR_START+2 \ 984 invalidate_interrupt\idx smp_invalidate_interrupt
989 invalidate_interrupt2 smp_invalidate_interrupt 985.endif
990apicinterrupt INVALIDATE_TLB_VECTOR_START+3 \ 986.endr
991 invalidate_interrupt3 smp_invalidate_interrupt
992apicinterrupt INVALIDATE_TLB_VECTOR_START+4 \
993 invalidate_interrupt4 smp_invalidate_interrupt
994apicinterrupt INVALIDATE_TLB_VECTOR_START+5 \
995 invalidate_interrupt5 smp_invalidate_interrupt
996apicinterrupt INVALIDATE_TLB_VECTOR_START+6 \
997 invalidate_interrupt6 smp_invalidate_interrupt
998apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \
999 invalidate_interrupt7 smp_invalidate_interrupt
1000#endif 987#endif
1001 988
1002apicinterrupt THRESHOLD_APIC_VECTOR \ 989apicinterrupt THRESHOLD_APIC_VECTOR \
@@ -1025,9 +1012,9 @@ apicinterrupt ERROR_APIC_VECTOR \
1025apicinterrupt SPURIOUS_APIC_VECTOR \ 1012apicinterrupt SPURIOUS_APIC_VECTOR \
1026 spurious_interrupt smp_spurious_interrupt 1013 spurious_interrupt smp_spurious_interrupt
1027 1014
1028#ifdef CONFIG_PERF_EVENTS 1015#ifdef CONFIG_IRQ_WORK
1029apicinterrupt LOCAL_PENDING_VECTOR \ 1016apicinterrupt IRQ_WORK_VECTOR \
1030 perf_pending_interrupt smp_perf_pending_interrupt 1017 irq_work_interrupt smp_irq_work_interrupt
1031#endif 1018#endif
1032 1019
1033/* 1020/*
@@ -1038,8 +1025,8 @@ ENTRY(\sym)
1038 INTR_FRAME 1025 INTR_FRAME
1039 PARAVIRT_ADJUST_EXCEPTION_FRAME 1026 PARAVIRT_ADJUST_EXCEPTION_FRAME
1040 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ 1027 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1041 subq $15*8,%rsp 1028 subq $ORIG_RAX-R15, %rsp
1042 CFI_ADJUST_CFA_OFFSET 15*8 1029 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1043 call error_entry 1030 call error_entry
1044 DEFAULT_FRAME 0 1031 DEFAULT_FRAME 0
1045 movq %rsp,%rdi /* pt_regs pointer */ 1032 movq %rsp,%rdi /* pt_regs pointer */
@@ -1054,9 +1041,9 @@ END(\sym)
1054ENTRY(\sym) 1041ENTRY(\sym)
1055 INTR_FRAME 1042 INTR_FRAME
1056 PARAVIRT_ADJUST_EXCEPTION_FRAME 1043 PARAVIRT_ADJUST_EXCEPTION_FRAME
1057 pushq $-1 /* ORIG_RAX: no syscall to restart */ 1044 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1058 CFI_ADJUST_CFA_OFFSET 8 1045 subq $ORIG_RAX-R15, %rsp
1059 subq $15*8, %rsp 1046 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1060 call save_paranoid 1047 call save_paranoid
1061 TRACE_IRQS_OFF 1048 TRACE_IRQS_OFF
1062 movq %rsp,%rdi /* pt_regs pointer */ 1049 movq %rsp,%rdi /* pt_regs pointer */
@@ -1072,9 +1059,9 @@ END(\sym)
1072ENTRY(\sym) 1059ENTRY(\sym)
1073 INTR_FRAME 1060 INTR_FRAME
1074 PARAVIRT_ADJUST_EXCEPTION_FRAME 1061 PARAVIRT_ADJUST_EXCEPTION_FRAME
1075 pushq $-1 /* ORIG_RAX: no syscall to restart */ 1062 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1076 CFI_ADJUST_CFA_OFFSET 8 1063 subq $ORIG_RAX-R15, %rsp
1077 subq $15*8, %rsp 1064 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1078 call save_paranoid 1065 call save_paranoid
1079 TRACE_IRQS_OFF 1066 TRACE_IRQS_OFF
1080 movq %rsp,%rdi /* pt_regs pointer */ 1067 movq %rsp,%rdi /* pt_regs pointer */
@@ -1091,8 +1078,8 @@ END(\sym)
1091ENTRY(\sym) 1078ENTRY(\sym)
1092 XCPT_FRAME 1079 XCPT_FRAME
1093 PARAVIRT_ADJUST_EXCEPTION_FRAME 1080 PARAVIRT_ADJUST_EXCEPTION_FRAME
1094 subq $15*8,%rsp 1081 subq $ORIG_RAX-R15, %rsp
1095 CFI_ADJUST_CFA_OFFSET 15*8 1082 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1096 call error_entry 1083 call error_entry
1097 DEFAULT_FRAME 0 1084 DEFAULT_FRAME 0
1098 movq %rsp,%rdi /* pt_regs pointer */ 1085 movq %rsp,%rdi /* pt_regs pointer */
@@ -1109,8 +1096,8 @@ END(\sym)
1109ENTRY(\sym) 1096ENTRY(\sym)
1110 XCPT_FRAME 1097 XCPT_FRAME
1111 PARAVIRT_ADJUST_EXCEPTION_FRAME 1098 PARAVIRT_ADJUST_EXCEPTION_FRAME
1112 subq $15*8,%rsp 1099 subq $ORIG_RAX-R15, %rsp
1113 CFI_ADJUST_CFA_OFFSET 15*8 1100 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1114 call save_paranoid 1101 call save_paranoid
1115 DEFAULT_FRAME 0 1102 DEFAULT_FRAME 0
1116 TRACE_IRQS_OFF 1103 TRACE_IRQS_OFF
@@ -1141,16 +1128,14 @@ zeroentry simd_coprocessor_error do_simd_coprocessor_error
1141 /* edi: new selector */ 1128 /* edi: new selector */
1142ENTRY(native_load_gs_index) 1129ENTRY(native_load_gs_index)
1143 CFI_STARTPROC 1130 CFI_STARTPROC
1144 pushf 1131 pushfq_cfi
1145 CFI_ADJUST_CFA_OFFSET 8
1146 DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) 1132 DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
1147 SWAPGS 1133 SWAPGS
1148gs_change: 1134gs_change:
1149 movl %edi,%gs 1135 movl %edi,%gs
11502: mfence /* workaround */ 11362: mfence /* workaround */
1151 SWAPGS 1137 SWAPGS
1152 popf 1138 popfq_cfi
1153 CFI_ADJUST_CFA_OFFSET -8
1154 ret 1139 ret
1155 CFI_ENDPROC 1140 CFI_ENDPROC
1156END(native_load_gs_index) 1141END(native_load_gs_index)
@@ -1217,8 +1202,7 @@ END(kernel_execve)
1217/* Call softirq on interrupt stack. Interrupts are off. */ 1202/* Call softirq on interrupt stack. Interrupts are off. */
1218ENTRY(call_softirq) 1203ENTRY(call_softirq)
1219 CFI_STARTPROC 1204 CFI_STARTPROC
1220 push %rbp 1205 pushq_cfi %rbp
1221 CFI_ADJUST_CFA_OFFSET 8
1222 CFI_REL_OFFSET rbp,0 1206 CFI_REL_OFFSET rbp,0
1223 mov %rsp,%rbp 1207 mov %rsp,%rbp
1224 CFI_DEF_CFA_REGISTER rbp 1208 CFI_DEF_CFA_REGISTER rbp
@@ -1227,6 +1211,7 @@ ENTRY(call_softirq)
1227 push %rbp # backlink for old unwinder 1211 push %rbp # backlink for old unwinder
1228 call __do_softirq 1212 call __do_softirq
1229 leaveq 1213 leaveq
1214 CFI_RESTORE rbp
1230 CFI_DEF_CFA_REGISTER rsp 1215 CFI_DEF_CFA_REGISTER rsp
1231 CFI_ADJUST_CFA_OFFSET -8 1216 CFI_ADJUST_CFA_OFFSET -8
1232 decl PER_CPU_VAR(irq_count) 1217 decl PER_CPU_VAR(irq_count)
@@ -1270,7 +1255,7 @@ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
1270 decl PER_CPU_VAR(irq_count) 1255 decl PER_CPU_VAR(irq_count)
1271 jmp error_exit 1256 jmp error_exit
1272 CFI_ENDPROC 1257 CFI_ENDPROC
1273END(do_hypervisor_callback) 1258END(xen_do_hypervisor_callback)
1274 1259
1275/* 1260/*
1276 * Hypervisor uses this for application faults while it executes. 1261 * Hypervisor uses this for application faults while it executes.
@@ -1351,6 +1336,9 @@ errorentry xen_stack_segment do_stack_segment
1351#endif 1336#endif
1352errorentry general_protection do_general_protection 1337errorentry general_protection do_general_protection
1353errorentry page_fault do_page_fault 1338errorentry page_fault do_page_fault
1339#ifdef CONFIG_KVM_GUEST
1340errorentry async_page_fault do_async_page_fault
1341#endif
1354#ifdef CONFIG_X86_MCE 1342#ifdef CONFIG_X86_MCE
1355paranoidzeroentry machine_check *machine_check_vector(%rip) 1343paranoidzeroentry machine_check *machine_check_vector(%rip)
1356#endif 1344#endif
@@ -1370,7 +1358,7 @@ paranoidzeroentry machine_check *machine_check_vector(%rip)
1370 1358
1371 /* ebx: no swapgs flag */ 1359 /* ebx: no swapgs flag */
1372ENTRY(paranoid_exit) 1360ENTRY(paranoid_exit)
1373 INTR_FRAME 1361 DEFAULT_FRAME
1374 DISABLE_INTERRUPTS(CLBR_NONE) 1362 DISABLE_INTERRUPTS(CLBR_NONE)
1375 TRACE_IRQS_OFF 1363 TRACE_IRQS_OFF
1376 testl %ebx,%ebx /* swapgs needed? */ 1364 testl %ebx,%ebx /* swapgs needed? */
@@ -1447,7 +1435,6 @@ error_swapgs:
1447error_sti: 1435error_sti:
1448 TRACE_IRQS_OFF 1436 TRACE_IRQS_OFF
1449 ret 1437 ret
1450 CFI_ENDPROC
1451 1438
1452/* 1439/*
1453 * There are two places in the kernel that can potentially fault with 1440 * There are two places in the kernel that can potentially fault with
@@ -1472,6 +1459,7 @@ bstep_iret:
1472 /* Fix truncated RIP */ 1459 /* Fix truncated RIP */
1473 movq %rcx,RIP+8(%rsp) 1460 movq %rcx,RIP+8(%rsp)
1474 jmp error_swapgs 1461 jmp error_swapgs
1462 CFI_ENDPROC
1475END(error_entry) 1463END(error_entry)
1476 1464
1477 1465
@@ -1500,8 +1488,8 @@ ENTRY(nmi)
1500 INTR_FRAME 1488 INTR_FRAME
1501 PARAVIRT_ADJUST_EXCEPTION_FRAME 1489 PARAVIRT_ADJUST_EXCEPTION_FRAME
1502 pushq_cfi $-1 1490 pushq_cfi $-1
1503 subq $15*8, %rsp 1491 subq $ORIG_RAX-R15, %rsp
1504 CFI_ADJUST_CFA_OFFSET 15*8 1492 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1505 call save_paranoid 1493 call save_paranoid
1506 DEFAULT_FRAME 0 1494 DEFAULT_FRAME 0
1507 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ 1495 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
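Most of the entry_64.S churn above is mechanical: push/CFI_ADJUST_CFA_OFFSET pairs collapse into the pushq_cfi/popq_cfi/pushfq_cfi helpers, and hard-coded frame sizes such as 15*8 become differences of asm-offsets constants like ORIG_RAX-R15 or ORIG_RAX-RBP. The sketch below is an illustration only of where those constants come from: fake_pt_regs is a local stand-in for the x86-64 pt_regs register order rather than the kernel's <asm/ptrace.h>, and the printed values show why ORIG_RAX-R15 is exactly the 15 saved general-purpose registers and ORIG_RAX-RBP the slots above rbp that the reworked interrupt macro reserves.

#include <stdio.h>
#include <stddef.h>

/*
 * Illustration only: a local mirror of the x86-64 pt_regs register
 * order, not the kernel header.  The asm-offsets constants used above
 * (R15, RBP, ORIG_RAX, ...) are byte offsets into the real structure,
 * so the differences printed here are the frame sizes the entry code
 * now spells out symbolically.
 */
struct fake_pt_regs {
	unsigned long r15, r14, r13, r12, bp, bx;
	unsigned long r11, r10, r9, r8, ax, cx, dx, si, di;
	unsigned long orig_ax;	/* filled by the entry stubs: syscall nr, error code, or -1 */
	unsigned long ip, cs, flags, sp, ss;
};

int main(void)
{
	size_t r15      = offsetof(struct fake_pt_regs, r15);
	size_t rbp      = offsetof(struct fake_pt_regs, bp);
	size_t orig_rax = offsetof(struct fake_pt_regs, orig_ax);

	/* 120 bytes, i.e. the old hard-coded "subq $15*8, %rsp" */
	printf("ORIG_RAX-R15 = %zu\n", orig_rax - r15);
	/* 88 bytes: rbp, rbx and the nine scratch registers */
	printf("ORIG_RAX-RBP = %zu\n", orig_rax - rbp);
	return 0;
}

Writing the subtractions symbolically keeps the entry code correct if pt_regs ever grows or is reordered, which is the point of replacing the literal 15*8 and 10*8 adjustments.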
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index cd37469b54ee..c9a281f272fd 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -19,6 +19,7 @@
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/init.h> 20#include <linux/init.h>
21#include <linux/list.h> 21#include <linux/list.h>
22#include <linux/module.h>
22 23
23#include <trace/syscall.h> 24#include <trace/syscall.h>
24 25
@@ -49,6 +50,7 @@ static DEFINE_PER_CPU(int, save_modifying_code);
49int ftrace_arch_code_modify_prepare(void) 50int ftrace_arch_code_modify_prepare(void)
50{ 51{
51 set_kernel_text_rw(); 52 set_kernel_text_rw();
53 set_all_modules_text_rw();
52 modifying_code = 1; 54 modifying_code = 1;
53 return 0; 55 return 0;
54} 56}
@@ -56,6 +58,7 @@ int ftrace_arch_code_modify_prepare(void)
56int ftrace_arch_code_modify_post_process(void) 58int ftrace_arch_code_modify_post_process(void)
57{ 59{
58 modifying_code = 0; 60 modifying_code = 0;
61 set_all_modules_text_ro();
59 set_kernel_text_ro(); 62 set_kernel_text_ro();
60 return 0; 63 return 0;
61} 64}
@@ -120,7 +123,7 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
120static atomic_t nmi_running = ATOMIC_INIT(0); 123static atomic_t nmi_running = ATOMIC_INIT(0);
121static int mod_code_status; /* holds return value of text write */ 124static int mod_code_status; /* holds return value of text write */
122static void *mod_code_ip; /* holds the IP to write to */ 125static void *mod_code_ip; /* holds the IP to write to */
123static void *mod_code_newcode; /* holds the text to write to the IP */ 126static const void *mod_code_newcode; /* holds the text to write to the IP */
124 127
125static unsigned nmi_wait_count; 128static unsigned nmi_wait_count;
126static atomic_t nmi_update_count = ATOMIC_INIT(0); 129static atomic_t nmi_update_count = ATOMIC_INIT(0);
@@ -167,9 +170,9 @@ static void ftrace_mod_code(void)
167 170
168void ftrace_nmi_enter(void) 171void ftrace_nmi_enter(void)
169{ 172{
170 __get_cpu_var(save_modifying_code) = modifying_code; 173 __this_cpu_write(save_modifying_code, modifying_code);
171 174
172 if (!__get_cpu_var(save_modifying_code)) 175 if (!__this_cpu_read(save_modifying_code))
173 return; 176 return;
174 177
175 if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) { 178 if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) {
@@ -183,7 +186,7 @@ void ftrace_nmi_enter(void)
183 186
184void ftrace_nmi_exit(void) 187void ftrace_nmi_exit(void)
185{ 188{
186 if (!__get_cpu_var(save_modifying_code)) 189 if (!__this_cpu_read(save_modifying_code))
187 return; 190 return;
188 191
189 /* Finish all executions before clearing nmi_running */ 192 /* Finish all executions before clearing nmi_running */
@@ -222,7 +225,7 @@ within(unsigned long addr, unsigned long start, unsigned long end)
222} 225}
223 226
224static int 227static int
225do_ftrace_mod_code(unsigned long ip, void *new_code) 228do_ftrace_mod_code(unsigned long ip, const void *new_code)
226{ 229{
227 /* 230 /*
228 * On x86_64, kernel text mappings are mapped read-only with 231 * On x86_64, kernel text mappings are mapped read-only with
@@ -257,19 +260,14 @@ do_ftrace_mod_code(unsigned long ip, void *new_code)
257 return mod_code_status; 260 return mod_code_status;
258} 261}
259 262
260 263static const unsigned char *ftrace_nop_replace(void)
261
262
263static unsigned char ftrace_nop[MCOUNT_INSN_SIZE];
264
265static unsigned char *ftrace_nop_replace(void)
266{ 264{
267 return ftrace_nop; 265 return ideal_nops[NOP_ATOMIC5];
268} 266}
269 267
270static int 268static int
271ftrace_modify_code(unsigned long ip, unsigned char *old_code, 269ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
272 unsigned char *new_code) 270 unsigned const char *new_code)
273{ 271{
274 unsigned char replaced[MCOUNT_INSN_SIZE]; 272 unsigned char replaced[MCOUNT_INSN_SIZE];
275 273
@@ -303,7 +301,7 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
303int ftrace_make_nop(struct module *mod, 301int ftrace_make_nop(struct module *mod,
304 struct dyn_ftrace *rec, unsigned long addr) 302 struct dyn_ftrace *rec, unsigned long addr)
305{ 303{
306 unsigned char *new, *old; 304 unsigned const char *new, *old;
307 unsigned long ip = rec->ip; 305 unsigned long ip = rec->ip;
308 306
309 old = ftrace_call_replace(ip, addr); 307 old = ftrace_call_replace(ip, addr);
@@ -314,7 +312,7 @@ int ftrace_make_nop(struct module *mod,
314 312
315int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) 313int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
316{ 314{
317 unsigned char *new, *old; 315 unsigned const char *new, *old;
318 unsigned long ip = rec->ip; 316 unsigned long ip = rec->ip;
319 317
320 old = ftrace_nop_replace(); 318 old = ftrace_nop_replace();
@@ -338,62 +336,6 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
338 336
339int __init ftrace_dyn_arch_init(void *data) 337int __init ftrace_dyn_arch_init(void *data)
340{ 338{
341 extern const unsigned char ftrace_test_p6nop[];
342 extern const unsigned char ftrace_test_nop5[];
343 extern const unsigned char ftrace_test_jmp[];
344 int faulted = 0;
345
346 /*
347 * There is no good nop for all x86 archs.
348 * We will default to using the P6_NOP5, but first we
349 * will test to make sure that the nop will actually
350 * work on this CPU. If it faults, we will then
351 * go to a lesser efficient 5 byte nop. If that fails
352 * we then just use a jmp as our nop. This isn't the most
353 * efficient nop, but we can not use a multi part nop
354 * since we would then risk being preempted in the middle
355 * of that nop, and if we enabled tracing then, it might
356 * cause a system crash.
357 *
358 * TODO: check the cpuid to determine the best nop.
359 */
360 asm volatile (
361 "ftrace_test_jmp:"
362 "jmp ftrace_test_p6nop\n"
363 "nop\n"
364 "nop\n"
365 "nop\n" /* 2 byte jmp + 3 bytes */
366 "ftrace_test_p6nop:"
367 P6_NOP5
368 "jmp 1f\n"
369 "ftrace_test_nop5:"
370 ".byte 0x66,0x66,0x66,0x66,0x90\n"
371 "1:"
372 ".section .fixup, \"ax\"\n"
373 "2: movl $1, %0\n"
374 " jmp ftrace_test_nop5\n"
375 "3: movl $2, %0\n"
376 " jmp 1b\n"
377 ".previous\n"
378 _ASM_EXTABLE(ftrace_test_p6nop, 2b)
379 _ASM_EXTABLE(ftrace_test_nop5, 3b)
380 : "=r"(faulted) : "0" (faulted));
381
382 switch (faulted) {
383 case 0:
384 pr_info("converting mcount calls to 0f 1f 44 00 00\n");
385 memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE);
386 break;
387 case 1:
388 pr_info("converting mcount calls to 66 66 66 66 90\n");
389 memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE);
390 break;
391 case 2:
392 pr_info("converting mcount calls to jmp . + 5\n");
393 memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE);
394 break;
395 }
396
397 /* The return code is retured via data */ 339 /* The return code is retured via data */
398 *(unsigned long *)data = 0; 340 *(unsigned long *)data = 0;
399 341
@@ -495,18 +437,19 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
495 return; 437 return;
496 } 438 }
497 439
498 if (ftrace_push_return_trace(old, self_addr, &trace.depth,
499 frame_pointer) == -EBUSY) {
500 *parent = old;
501 return;
502 }
503
504 trace.func = self_addr; 440 trace.func = self_addr;
441 trace.depth = current->curr_ret_stack + 1;
505 442
506 /* Only trace if the calling function expects to */ 443 /* Only trace if the calling function expects to */
507 if (!ftrace_graph_entry(&trace)) { 444 if (!ftrace_graph_entry(&trace)) {
508 current->curr_ret_stack--;
509 *parent = old; 445 *parent = old;
446 return;
447 }
448
449 if (ftrace_push_return_trace(old, self_addr, &trace.depth,
450 frame_pointer) == -EBUSY) {
451 *parent = old;
452 return;
510 } 453 }
511} 454}
512#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 455#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
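The ftrace.c hunks drop the boot-time NOP probing in favour of ideal_nops[NOP_ATOMIC5], constify the instruction buffers, and move the ftrace_graph_entry() filter check ahead of ftrace_push_return_trace(). The patching discipline in ftrace_modify_code() is unchanged: read the bytes currently at the call site, refuse to touch them unless they match what is expected, then write the replacement. The fragment below restates only that check against ordinary memory; patch_site() and INSN_SIZE are invented for the example, and the kernel reads live text with probe_kernel_read() and writes it through the NMI-safe do_ftrace_mod_code() path rather than plain memcpy().

#include <string.h>
#include <errno.h>

#define INSN_SIZE 5	/* stands in for MCOUNT_INSN_SIZE */

/* verify-then-patch: only rewrite a site whose current bytes are known */
static int patch_site(unsigned char *ip,
		      const unsigned char *old_code,
		      const unsigned char *new_code)
{
	unsigned char cur[INSN_SIZE];

	memcpy(cur, ip, INSN_SIZE);		/* snapshot what is there now */
	if (memcmp(cur, old_code, INSN_SIZE))	/* refuse to patch unexpected bytes */
		return -EINVAL;

	memcpy(ip, new_code, INSN_SIZE);	/* install the nop or the call */
	return 0;
}

int main(void)
{
	unsigned char text[INSN_SIZE]       = { 0xe8, 0x00, 0x00, 0x00, 0x00 }; /* stand-in call rel32 */
	const unsigned char call[INSN_SIZE] = { 0xe8, 0x00, 0x00, 0x00, 0x00 };
	const unsigned char nop5[INSN_SIZE] = { 0x0f, 0x1f, 0x44, 0x00, 0x00 }; /* the 5-byte P6 nop */

	return patch_site(text, call, nop5);	/* 0 on success */
}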
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c
index 3e66bd364a9d..af0699ba48cf 100644
--- a/arch/x86/kernel/head.c
+++ b/arch/x86/kernel/head.c
@@ -1,5 +1,6 @@
1#include <linux/kernel.h> 1#include <linux/kernel.h>
2#include <linux/init.h> 2#include <linux/init.h>
3#include <linux/memblock.h>
3 4
4#include <asm/setup.h> 5#include <asm/setup.h>
5#include <asm/bios_ebda.h> 6#include <asm/bios_ebda.h>
@@ -51,5 +52,5 @@ void __init reserve_ebda_region(void)
51 lowmem = 0x9f000; 52 lowmem = 0x9f000;
52 53
53 /* reserve all memory between lowmem and the 1MB mark */ 54 /* reserve all memory between lowmem and the 1MB mark */
54 reserve_early_overlap_ok(lowmem, 0x100000, "BIOS reserved"); 55 memblock_x86_reserve_range(lowmem, 0x100000, "* BIOS reserved");
55} 56}
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 784360c0625c..3bb08509a7a1 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -8,6 +8,7 @@
8#include <linux/init.h> 8#include <linux/init.h>
9#include <linux/start_kernel.h> 9#include <linux/start_kernel.h>
10#include <linux/mm.h> 10#include <linux/mm.h>
11#include <linux/memblock.h>
11 12
12#include <asm/setup.h> 13#include <asm/setup.h>
13#include <asm/sections.h> 14#include <asm/sections.h>
@@ -17,11 +18,11 @@
17#include <asm/apic.h> 18#include <asm/apic.h>
18#include <asm/io_apic.h> 19#include <asm/io_apic.h>
19#include <asm/bios_ebda.h> 20#include <asm/bios_ebda.h>
21#include <asm/tlbflush.h>
20 22
21static void __init i386_default_early_setup(void) 23static void __init i386_default_early_setup(void)
22{ 24{
23 /* Initialize 32bit specific setup functions */ 25 /* Initialize 32bit specific setup functions */
24 x86_init.resources.probe_roms = probe_roms;
25 x86_init.resources.reserve_resources = i386_reserve_resources; 26 x86_init.resources.reserve_resources = i386_reserve_resources;
26 x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; 27 x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc;
27 28
@@ -30,17 +31,9 @@ static void __init i386_default_early_setup(void)
30 31
31void __init i386_start_kernel(void) 32void __init i386_start_kernel(void)
32{ 33{
33#ifdef CONFIG_X86_TRAMPOLINE 34 memblock_init();
34 /*
35 * But first pinch a few for the stack/trampoline stuff
36 * FIXME: Don't need the extra page at 4K, but need to fix
37 * trampoline before removing it. (see the GDT stuff)
38 */
39 reserve_early_overlap_ok(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE,
40 "EX TRAMPOLINE");
41#endif
42 35
43 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); 36 memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
44 37
45#ifdef CONFIG_BLK_DEV_INITRD 38#ifdef CONFIG_BLK_DEV_INITRD
46 /* Reserve INITRD */ 39 /* Reserve INITRD */
@@ -49,7 +42,7 @@ void __init i386_start_kernel(void)
49 u64 ramdisk_image = boot_params.hdr.ramdisk_image; 42 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
50 u64 ramdisk_size = boot_params.hdr.ramdisk_size; 43 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
51 u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); 44 u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
52 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); 45 memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK");
53 } 46 }
54#endif 47#endif
55 48
@@ -58,6 +51,9 @@ void __init i386_start_kernel(void)
58 case X86_SUBARCH_MRST: 51 case X86_SUBARCH_MRST:
59 x86_mrst_early_setup(); 52 x86_mrst_early_setup();
60 break; 53 break;
54 case X86_SUBARCH_CE4100:
55 x86_ce4100_early_setup();
56 break;
61 default: 57 default:
62 i386_default_early_setup(); 58 i386_default_early_setup();
63 break; 59 break;
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 7147143fd614..5655c2272adb 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -12,6 +12,7 @@
12#include <linux/percpu.h> 12#include <linux/percpu.h>
13#include <linux/start_kernel.h> 13#include <linux/start_kernel.h>
14#include <linux/io.h> 14#include <linux/io.h>
15#include <linux/memblock.h>
15 16
16#include <asm/processor.h> 17#include <asm/processor.h>
17#include <asm/proto.h> 18#include <asm/proto.h>
@@ -76,8 +77,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
76 /* Make NULL pointers segfault */ 77 /* Make NULL pointers segfault */
77 zap_identity_mappings(); 78 zap_identity_mappings();
78 79
79 /* Cleanup the over mapped high alias */ 80 max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
80 cleanup_highmap();
81 81
82 for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) { 82 for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
83#ifdef CONFIG_EARLY_PRINTK 83#ifdef CONFIG_EARLY_PRINTK
@@ -98,7 +98,9 @@ void __init x86_64_start_reservations(char *real_mode_data)
98{ 98{
99 copy_bootdata(__va(real_mode_data)); 99 copy_bootdata(__va(real_mode_data));
100 100
101 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); 101 memblock_init();
102
103 memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
102 104
103#ifdef CONFIG_BLK_DEV_INITRD 105#ifdef CONFIG_BLK_DEV_INITRD
104 /* Reserve INITRD */ 106 /* Reserve INITRD */
@@ -107,7 +109,7 @@ void __init x86_64_start_reservations(char *real_mode_data)
107 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; 109 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
108 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; 110 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
109 unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); 111 unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
110 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); 112 memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK");
111 } 113 }
112#endif 114#endif
113 115
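head.c, head32.c and head64.c all make the same conversion: the ad-hoc reserve_early()/reserve_early_overlap_ok() calls become memblock reservations, with memblock_init() run before the first memblock_x86_reserve_range(). The fragment below is a kernel-context sketch of that ordering only, condensed from the hunks above; reserve_boot_ranges() is a made-up name, the initrd guard is simplified, and it is not meant to build outside the kernel tree.

#include <linux/init.h>
#include <linux/memblock.h>
#include <asm/sections.h>
#include <asm/setup.h>

static void __init reserve_boot_ranges(void)
{
	u64 ramdisk_image = boot_params.hdr.ramdisk_image;
	u64 ramdisk_size  = boot_params.hdr.ramdisk_size;

	memblock_init();	/* bring up the memblock allocator first */

	/* kernel text/data/bss, as in i386_start_kernel() above */
	memblock_x86_reserve_range(__pa_symbol(&_text),
				   __pa_symbol(&__bss_stop), "TEXT DATA BSS");

	if (ramdisk_image && ramdisk_size)	/* simplified initrd check */
		memblock_x86_reserve_range(ramdisk_image,
				PAGE_ALIGN(ramdisk_image + ramdisk_size),
				"RAMDISK");
}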
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index fa8c1b8e09fb..ce0be7cd085e 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -60,18 +60,20 @@
60#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD) 60#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
61#endif 61#endif
62 62
63/* Number of possible pages in the lowmem region */
64LOWMEM_PAGES = (((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT)
65
63/* Enough space to fit pagetables for the low memory linear map */ 66/* Enough space to fit pagetables for the low memory linear map */
64MAPPING_BEYOND_END = \ 67MAPPING_BEYOND_END = PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT
65 PAGE_TABLE_SIZE(((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) << PAGE_SHIFT
66 68
67/* 69/*
68 * Worst-case size of the kernel mapping we need to make: 70 * Worst-case size of the kernel mapping we need to make:
69 * the worst-case size of the kernel itself, plus the extra we need 71 * a relocatable kernel can live anywhere in lowmem, so we need to be able
70 * to map for the linear map. 72 * to map all of lowmem.
71 */ 73 */
72KERNEL_PAGES = (KERNEL_IMAGE_SIZE + MAPPING_BEYOND_END)>>PAGE_SHIFT 74KERNEL_PAGES = LOWMEM_PAGES
73 75
74INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE_asm 76INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE
75RESERVE_BRK(pagetables, INIT_MAP_SIZE) 77RESERVE_BRK(pagetables, INIT_MAP_SIZE)
76 78
77/* 79/*
@@ -83,6 +85,8 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE)
83 */ 85 */
84__HEAD 86__HEAD
85ENTRY(startup_32) 87ENTRY(startup_32)
88 movl pa(stack_start),%ecx
89
86 /* test KEEP_SEGMENTS flag to see if the bootloader is asking 90 /* test KEEP_SEGMENTS flag to see if the bootloader is asking
87 us to not reload segments */ 91 us to not reload segments */
88 testb $(1<<6), BP_loadflags(%esi) 92 testb $(1<<6), BP_loadflags(%esi)
@@ -97,7 +101,9 @@ ENTRY(startup_32)
97 movl %eax,%es 101 movl %eax,%es
98 movl %eax,%fs 102 movl %eax,%fs
99 movl %eax,%gs 103 movl %eax,%gs
104 movl %eax,%ss
1002: 1052:
106 leal -__PAGE_OFFSET(%ecx),%esp
101 107
102/* 108/*
103 * Clear BSS first so that there are no surprises... 109 * Clear BSS first so that there are no surprises...
@@ -124,72 +130,35 @@ ENTRY(startup_32)
124 movsl 130 movsl
125 movl pa(boot_params) + NEW_CL_POINTER,%esi 131 movl pa(boot_params) + NEW_CL_POINTER,%esi
126 andl %esi,%esi 132 andl %esi,%esi
127 jz 1f # No comand line 133 jz 1f # No command line
128 movl $pa(boot_command_line),%edi 134 movl $pa(boot_command_line),%edi
129 movl $(COMMAND_LINE_SIZE/4),%ecx 135 movl $(COMMAND_LINE_SIZE/4),%ecx
130 rep 136 rep
131 movsl 137 movsl
1321: 1381:
133 139
134#ifdef CONFIG_OLPC_OPENFIRMWARE 140#ifdef CONFIG_OLPC
135 /* save OFW's pgdir table for later use when calling into OFW */ 141 /* save OFW's pgdir table for later use when calling into OFW */
136 movl %cr3, %eax 142 movl %cr3, %eax
137 movl %eax, pa(olpc_ofw_pgd) 143 movl %eax, pa(olpc_ofw_pgd)
138#endif 144#endif
139 145
140#ifdef CONFIG_PARAVIRT
141 /* This is can only trip for a broken bootloader... */
142 cmpw $0x207, pa(boot_params + BP_version)
143 jb default_entry
144
145 /* Paravirt-compatible boot parameters. Look to see what architecture
146 we're booting under. */
147 movl pa(boot_params + BP_hardware_subarch), %eax
148 cmpl $num_subarch_entries, %eax
149 jae bad_subarch
150
151 movl pa(subarch_entries)(,%eax,4), %eax
152 subl $__PAGE_OFFSET, %eax
153 jmp *%eax
154
155bad_subarch:
156WEAK(lguest_entry)
157WEAK(xen_entry)
158 /* Unknown implementation; there's really
159 nothing we can do at this point. */
160 ud2a
161
162 __INITDATA
163
164subarch_entries:
165 .long default_entry /* normal x86/PC */
166 .long lguest_entry /* lguest hypervisor */
167 .long xen_entry /* Xen hypervisor */
168 .long default_entry /* Moorestown MID */
169num_subarch_entries = (. - subarch_entries) / 4
170.previous
171#endif /* CONFIG_PARAVIRT */
172
173/* 146/*
174 * Initialize page tables. This creates a PDE and a set of page 147 * Initialize page tables. This creates a PDE and a set of page
175 * tables, which are located immediately beyond __brk_base. The variable 148 * tables, which are located immediately beyond __brk_base. The variable
176 * _brk_end is set up to point to the first "safe" location. 149 * _brk_end is set up to point to the first "safe" location.
177 * Mappings are created both at virtual address 0 (identity mapping) 150 * Mappings are created both at virtual address 0 (identity mapping)
178 * and PAGE_OFFSET for up to _end. 151 * and PAGE_OFFSET for up to _end.
179 *
180 * Note that the stack is not yet set up!
181 */ 152 */
182default_entry:
183#ifdef CONFIG_X86_PAE 153#ifdef CONFIG_X86_PAE
184 154
185 /* 155 /*
186 * In PAE mode swapper_pg_dir is statically defined to contain enough 156 * In PAE mode initial_page_table is statically defined to contain
187 * entries to cover the VMSPLIT option (that is the top 1, 2 or 3 157 * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3
188 * entries). The identity mapping is handled by pointing two PGD 158 * entries). The identity mapping is handled by pointing two PGD entries
189 * entries to the first kernel PMD. 159 * to the first kernel PMD.
190 * 160 *
191 * Note the upper half of each PMD or PTE are always zero at 161 * Note the upper half of each PMD or PTE are always zero at this stage.
192 * this stage.
193 */ 162 */
194 163
195#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */ 164#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */
@@ -197,7 +166,7 @@ default_entry:
197 xorl %ebx,%ebx /* %ebx is kept at zero */ 166 xorl %ebx,%ebx /* %ebx is kept at zero */
198 167
199 movl $pa(__brk_base), %edi 168 movl $pa(__brk_base), %edi
200 movl $pa(swapper_pg_pmd), %edx 169 movl $pa(initial_pg_pmd), %edx
201 movl $PTE_IDENT_ATTR, %eax 170 movl $PTE_IDENT_ATTR, %eax
20210: 17110:
203 leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */ 172 leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */
@@ -226,14 +195,14 @@ default_entry:
226 movl %eax, pa(max_pfn_mapped) 195 movl %eax, pa(max_pfn_mapped)
227 196
228 /* Do early initialization of the fixmap area */ 197 /* Do early initialization of the fixmap area */
229 movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax 198 movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
230 movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8) 199 movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8)
231#else /* Not PAE */ 200#else /* Not PAE */
232 201
233page_pde_offset = (__PAGE_OFFSET >> 20); 202page_pde_offset = (__PAGE_OFFSET >> 20);
234 203
235 movl $pa(__brk_base), %edi 204 movl $pa(__brk_base), %edi
236 movl $pa(swapper_pg_dir), %edx 205 movl $pa(initial_page_table), %edx
237 movl $PTE_IDENT_ATTR, %eax 206 movl $PTE_IDENT_ATTR, %eax
23810: 20710:
239 leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */ 208 leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */
@@ -257,10 +226,45 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
257 movl %eax, pa(max_pfn_mapped) 226 movl %eax, pa(max_pfn_mapped)
258 227
259 /* Do early initialization of the fixmap area */ 228 /* Do early initialization of the fixmap area */
260 movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax 229 movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
261 movl %eax,pa(swapper_pg_dir+0xffc) 230 movl %eax,pa(initial_page_table+0xffc)
262#endif 231#endif
263 jmp 3f 232
233#ifdef CONFIG_PARAVIRT
234 /* This is can only trip for a broken bootloader... */
235 cmpw $0x207, pa(boot_params + BP_version)
236 jb default_entry
237
238 /* Paravirt-compatible boot parameters. Look to see what architecture
239 we're booting under. */
240 movl pa(boot_params + BP_hardware_subarch), %eax
241 cmpl $num_subarch_entries, %eax
242 jae bad_subarch
243
244 movl pa(subarch_entries)(,%eax,4), %eax
245 subl $__PAGE_OFFSET, %eax
246 jmp *%eax
247
248bad_subarch:
249WEAK(lguest_entry)
250WEAK(xen_entry)
251 /* Unknown implementation; there's really
252 nothing we can do at this point. */
253 ud2a
254
255 __INITDATA
256
257subarch_entries:
258 .long default_entry /* normal x86/PC */
259 .long lguest_entry /* lguest hypervisor */
260 .long xen_entry /* Xen hypervisor */
261 .long default_entry /* Moorestown MID */
262num_subarch_entries = (. - subarch_entries) / 4
263.previous
264#else
265 jmp default_entry
266#endif /* CONFIG_PARAVIRT */
267
264/* 268/*
265 * Non-boot CPU entry point; entered from trampoline.S 269 * Non-boot CPU entry point; entered from trampoline.S
266 * We can't lgdt here, because lgdt itself uses a data segment, but 270 * We can't lgdt here, because lgdt itself uses a data segment, but
@@ -280,8 +284,11 @@ ENTRY(startup_32_smp)
280 movl %eax,%es 284 movl %eax,%es
281 movl %eax,%fs 285 movl %eax,%fs
282 movl %eax,%gs 286 movl %eax,%gs
287 movl pa(stack_start),%ecx
288 movl %eax,%ss
289 leal -__PAGE_OFFSET(%ecx),%esp
283#endif /* CONFIG_SMP */ 290#endif /* CONFIG_SMP */
2843: 291default_entry:
285 292
286/* 293/*
287 * New page tables may be in 4Mbyte page mode and may 294 * New page tables may be in 4Mbyte page mode and may
@@ -315,6 +322,10 @@ ENTRY(startup_32_smp)
315 subl $0x80000001, %eax 322 subl $0x80000001, %eax
316 cmpl $(0x8000ffff-0x80000001), %eax 323 cmpl $(0x8000ffff-0x80000001), %eax
317 ja 6f 324 ja 6f
325
326 /* Clear bogus XD_DISABLE bits */
327 call verify_cpu
328
318 mov $0x80000001, %eax 329 mov $0x80000001, %eax
319 cpuid 330 cpuid
320 /* Execute Disable bit supported? */ 331 /* Execute Disable bit supported? */
@@ -334,15 +345,15 @@ ENTRY(startup_32_smp)
334/* 345/*
335 * Enable paging 346 * Enable paging
336 */ 347 */
337 movl pa(initial_page_table), %eax 348 movl $pa(initial_page_table), %eax
338 movl %eax,%cr3 /* set the page table pointer.. */ 349 movl %eax,%cr3 /* set the page table pointer.. */
339 movl %cr0,%eax 350 movl %cr0,%eax
340 orl $X86_CR0_PG,%eax 351 orl $X86_CR0_PG,%eax
341 movl %eax,%cr0 /* ..and set paging (PG) bit */ 352 movl %eax,%cr0 /* ..and set paging (PG) bit */
342 ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */ 353 ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */
3431: 3541:
344 /* Set up the stack pointer */ 355 /* Shift the stack pointer to a virtual address */
345 lss stack_start,%esp 356 addl $__PAGE_OFFSET, %esp
346 357
347/* 358/*
348 * Initialize eflags. Some BIOS's leave bits like NT set. This would 359 * Initialize eflags. Some BIOS's leave bits like NT set. This would
@@ -354,9 +365,7 @@ ENTRY(startup_32_smp)
354 365
355#ifdef CONFIG_SMP 366#ifdef CONFIG_SMP
356 cmpb $0, ready 367 cmpb $0, ready
357 jz 1f /* Initial CPU cleans BSS */ 368 jnz checkCPUtype
358 jmp checkCPUtype
3591:
360#endif /* CONFIG_SMP */ 369#endif /* CONFIG_SMP */
361 370
362/* 371/*
@@ -464,14 +473,7 @@ is386: movl $2,%ecx # set MP
464 473
465 cld # gcc2 wants the direction flag cleared at all times 474 cld # gcc2 wants the direction flag cleared at all times
466 pushl $0 # fake return address for unwinder 475 pushl $0 # fake return address for unwinder
467#ifdef CONFIG_SMP
468 movb ready, %cl
469 movb $1, ready 476 movb $1, ready
470 cmpb $0,%cl # the first CPU calls start_kernel
471 je 1f
472 movl (stack_start), %esp
4731:
474#endif /* CONFIG_SMP */
475 jmp *(initial_code) 477 jmp *(initial_code)
476 478
477/* 479/*
@@ -610,33 +612,31 @@ ignore_int:
610#endif 612#endif
611 iret 613 iret
612 614
615#include "verify_cpu.S"
616
613 __REFDATA 617 __REFDATA
614.align 4 618.align 4
615ENTRY(initial_code) 619ENTRY(initial_code)
616 .long i386_start_kernel 620 .long i386_start_kernel
617ENTRY(initial_page_table)
618 .long pa(swapper_pg_dir)
619 621
620/* 622/*
621 * BSS section 623 * BSS section
622 */ 624 */
623__PAGE_ALIGNED_BSS 625__PAGE_ALIGNED_BSS
624 .align PAGE_SIZE_asm 626 .align PAGE_SIZE
625#ifdef CONFIG_X86_PAE 627#ifdef CONFIG_X86_PAE
626swapper_pg_pmd: 628initial_pg_pmd:
627 .fill 1024*KPMDS,4,0 629 .fill 1024*KPMDS,4,0
628#else 630#else
629ENTRY(swapper_pg_dir) 631ENTRY(initial_page_table)
630 .fill 1024,4,0 632 .fill 1024,4,0
631#endif 633#endif
632swapper_pg_fixmap: 634initial_pg_fixmap:
633 .fill 1024,4,0
634#ifdef CONFIG_X86_TRAMPOLINE
635ENTRY(trampoline_pg_dir)
636 .fill 1024,4,0 635 .fill 1024,4,0
637#endif
638ENTRY(empty_zero_page) 636ENTRY(empty_zero_page)
639 .fill 4096,1,0 637 .fill 4096,1,0
638ENTRY(swapper_pg_dir)
639 .fill 1024,4,0
640 640
641/* 641/*
642 * This starts the data section. 642 * This starts the data section.
@@ -644,37 +644,37 @@ ENTRY(empty_zero_page)
644#ifdef CONFIG_X86_PAE 644#ifdef CONFIG_X86_PAE
645__PAGE_ALIGNED_DATA 645__PAGE_ALIGNED_DATA
646 /* Page-aligned for the benefit of paravirt? */ 646 /* Page-aligned for the benefit of paravirt? */
647 .align PAGE_SIZE_asm 647 .align PAGE_SIZE
648ENTRY(swapper_pg_dir) 648ENTRY(initial_page_table)
649 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */ 649 .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */
650# if KPMDS == 3 650# if KPMDS == 3
651 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 651 .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0
652 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0 652 .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
653 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x2000),0 653 .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x2000),0
654# elif KPMDS == 2 654# elif KPMDS == 2
655 .long 0,0 655 .long 0,0
656 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 656 .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0
657 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0 657 .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
658# elif KPMDS == 1 658# elif KPMDS == 1
659 .long 0,0 659 .long 0,0
660 .long 0,0 660 .long 0,0
661 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 661 .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0
662# else 662# else
663# error "Kernel PMDs should be 1, 2 or 3" 663# error "Kernel PMDs should be 1, 2 or 3"
664# endif 664# endif
665 .align PAGE_SIZE_asm /* needs to be page-sized too */ 665 .align PAGE_SIZE /* needs to be page-sized too */
666#endif 666#endif
667 667
668.data 668.data
669.balign 4
669ENTRY(stack_start) 670ENTRY(stack_start)
670 .long init_thread_union+THREAD_SIZE 671 .long init_thread_union+THREAD_SIZE
671 .long __BOOT_DS
672
673ready: .byte 0
674 672
675early_recursion_flag: 673early_recursion_flag:
676 .long 0 674 .long 0
677 675
676ready: .byte 0
677
678int_msg: 678int_msg:
679 .asciz "Unknown interrupt or fault at: %p %p %p\n" 679 .asciz "Unknown interrupt or fault at: %p %p %p\n"
680 680
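Beyond the swapper_pg_dir/initial_page_table split and the earlier stack setup, head_32.S now sizes its brk reservation from LOWMEM_PAGES, that is from the whole lowmem linear map rather than KERNEL_IMAGE_SIZE plus slack, because a relocatable kernel may sit anywhere in lowmem. A quick worked example for the usual non-PAE 3G/1G split (__PAGE_OFFSET = 0xC0000000) follows; the constants are spelled out locally instead of coming from the real page table headers.

#include <stdio.h>

#define PAGE_OFFSET   0xC0000000UL
#define PAGE_SHIFT    12
#define PAGE_SIZE     (1UL << PAGE_SHIFT)
#define PTRS_PER_PGD  1024	/* non-PAE */

#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)

int main(void)
{
	unsigned long lowmem_pages = ((1ULL << 32) - PAGE_OFFSET) >> PAGE_SHIFT;
	unsigned long init_map     = PAGE_TABLE_SIZE(lowmem_pages) * PAGE_SIZE;

	/* 262144 pages of lowmem -> 256 page tables -> 1 MiB of brk */
	printf("LOWMEM_PAGES  = %lu\n", lowmem_pages);
	printf("INIT_MAP_SIZE = %lu bytes\n", init_map);
	return 0;
}

Under PAE the PAGE_TABLE_SIZE() macro differs, so the reserved size changes, but the shape of the calculation is the same.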
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 239046bd447f..e11e39478a49 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -136,10 +136,9 @@ ident_complete:
136 /* Fixup phys_base */ 136 /* Fixup phys_base */
137 addq %rbp, phys_base(%rip) 137 addq %rbp, phys_base(%rip)
138 138
139#ifdef CONFIG_X86_TRAMPOLINE 139 /* Fixup trampoline */
140 addq %rbp, trampoline_level4_pgt + 0(%rip) 140 addq %rbp, trampoline_level4_pgt + 0(%rip)
141 addq %rbp, trampoline_level4_pgt + (511*8)(%rip) 141 addq %rbp, trampoline_level4_pgt + (511*8)(%rip)
142#endif
143 142
144 /* Due to ENTRY(), sometimes the empty space gets filled with 143 /* Due to ENTRY(), sometimes the empty space gets filled with
145 * zeros. Better take a jmp than relying on empty space being 144 * zeros. Better take a jmp than relying on empty space being
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 7494999141b3..6781765b3a0d 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -27,6 +27,9 @@
27#define HPET_DEV_FSB_CAP 0x1000 27#define HPET_DEV_FSB_CAP 0x1000
28#define HPET_DEV_PERI_CAP 0x2000 28#define HPET_DEV_PERI_CAP 0x2000
29 29
30#define HPET_MIN_CYCLES 128
31#define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1))
32
30#define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt) 33#define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt)
31 34
32/* 35/*
@@ -214,7 +217,7 @@ static void hpet_reserve_platform_timers(unsigned int id) { }
214/* 217/*
215 * Common hpet info 218 * Common hpet info
216 */ 219 */
217static unsigned long hpet_period; 220static unsigned long hpet_freq;
218 221
219static void hpet_legacy_set_mode(enum clock_event_mode mode, 222static void hpet_legacy_set_mode(enum clock_event_mode mode,
220 struct clock_event_device *evt); 223 struct clock_event_device *evt);
@@ -229,7 +232,6 @@ static struct clock_event_device hpet_clockevent = {
229 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, 232 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
230 .set_mode = hpet_legacy_set_mode, 233 .set_mode = hpet_legacy_set_mode,
231 .set_next_event = hpet_legacy_next_event, 234 .set_next_event = hpet_legacy_next_event,
232 .shift = 32,
233 .irq = 0, 235 .irq = 0,
234 .rating = 50, 236 .rating = 50,
235}; 237};
@@ -287,27 +289,12 @@ static void hpet_legacy_clockevent_register(void)
287 hpet_enable_legacy_int(); 289 hpet_enable_legacy_int();
288 290
289 /* 291 /*
290 * The mult factor is defined as (include/linux/clockchips.h)
291 * mult/2^shift = cyc/ns (in contrast to ns/cyc in clocksource.h)
292 * hpet_period is in units of femtoseconds (per cycle), so
293 * mult/2^shift = cyc/ns = 10^6/hpet_period
294 * mult = (10^6 * 2^shift)/hpet_period
295 * mult = (FSEC_PER_NSEC << hpet_clockevent.shift)/hpet_period
296 */
297 hpet_clockevent.mult = div_sc((unsigned long) FSEC_PER_NSEC,
298 hpet_period, hpet_clockevent.shift);
299 /* Calculate the min / max delta */
300 hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
301 &hpet_clockevent);
302 /* 5 usec minimum reprogramming delta. */
303 hpet_clockevent.min_delta_ns = 5000;
304
305 /*
306 * Start hpet with the boot cpu mask and make it 292 * Start hpet with the boot cpu mask and make it
307 * global after the IO_APIC has been initialized. 293 * global after the IO_APIC has been initialized.
308 */ 294 */
309 hpet_clockevent.cpumask = cpumask_of(smp_processor_id()); 295 hpet_clockevent.cpumask = cpumask_of(smp_processor_id());
310 clockevents_register_device(&hpet_clockevent); 296 clockevents_config_and_register(&hpet_clockevent, hpet_freq,
297 HPET_MIN_PROG_DELTA, 0x7FFFFFFF);
311 global_clock_event = &hpet_clockevent; 298 global_clock_event = &hpet_clockevent;
312 printk(KERN_DEBUG "hpet clockevent registered\n"); 299 printk(KERN_DEBUG "hpet clockevent registered\n");
313} 300}
@@ -380,44 +367,37 @@ static int hpet_next_event(unsigned long delta,
380 struct clock_event_device *evt, int timer) 367 struct clock_event_device *evt, int timer)
381{ 368{
382 u32 cnt; 369 u32 cnt;
370 s32 res;
383 371
384 cnt = hpet_readl(HPET_COUNTER); 372 cnt = hpet_readl(HPET_COUNTER);
385 cnt += (u32) delta; 373 cnt += (u32) delta;
386 hpet_writel(cnt, HPET_Tn_CMP(timer)); 374 hpet_writel(cnt, HPET_Tn_CMP(timer));
387 375
388 /* 376 /*
389 * We need to read back the CMP register on certain HPET 377 * HPETs are a complete disaster. The compare register is
390 * implementations (ATI chipsets) which seem to delay the 378 * based on a equal comparison and neither provides a less
391 * transfer of the compare register into the internal compare 379 * than or equal functionality (which would require to take
392 * logic. With small deltas this might actually be too late as 380 * the wraparound into account) nor a simple count down event
393 * the counter could already be higher than the compare value 381 * mode. Further the write to the comparator register is
394 * at that point and we would wait for the next hpet interrupt 382 * delayed internally up to two HPET clock cycles in certain
395 * forever. We found out that reading the CMP register back 383 * chipsets (ATI, ICH9,10). Some newer AMD chipsets have even
396 * forces the transfer so we can rely on the comparison with 384 * longer delays. We worked around that by reading back the
397 * the counter register below. If the read back from the 385 * compare register, but that required another workaround for
398 * compare register does not match the value we programmed 386 * ICH9,10 chips where the first readout after write can
399 * then we might have a real hardware problem. We can not do 387 * return the old stale value. We already had a minimum
400 * much about it here, but at least alert the user/admin with 388 * programming delta of 5us enforced, but a NMI or SMI hitting
401 * a prominent warning. 389 * between the counter readout and the comparator write can
402 * 390 * move us behind that point easily. Now instead of reading
403 * An erratum on some chipsets (ICH9,..), results in 391 * the compare register back several times, we make the ETIME
404 * comparator read immediately following a write returning old 392 * decision based on the following: Return ETIME if the
405 * value. Workaround for this is to read this value second 393 * counter value after the write is less than HPET_MIN_CYCLES
406 * time, when first read returns old value. 394 * away from the event or if the counter is already ahead of
407 * 395 * the event. The minimum programming delta for the generic
408 * In fact the write to the comparator register is delayed up 396 * clockevents code is set to 1.5 * HPET_MIN_CYCLES.
409 * to two HPET cycles so the workaround we tried to restrict
410 * the readback to those known to be borked ATI chipsets
411 * failed miserably. So we give up on optimizations forever
412 * and penalize all HPET incarnations unconditionally.
413 */ 397 */
414 if (unlikely((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt)) { 398 res = (s32)(cnt - hpet_readl(HPET_COUNTER));
415 if (hpet_readl(HPET_Tn_CMP(timer)) != cnt)
416 printk_once(KERN_WARNING
417 "hpet: compare register read back failed.\n");
418 }
419 399
420 return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; 400 return res < HPET_MIN_CYCLES ? -ETIME : 0;
421} 401}
422 402
423static void hpet_legacy_set_mode(enum clock_event_mode mode, 403static void hpet_legacy_set_mode(enum clock_event_mode mode,
@@ -440,9 +420,9 @@ static int hpet_legacy_next_event(unsigned long delta,
440static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev); 420static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev);
441static struct hpet_dev *hpet_devs; 421static struct hpet_dev *hpet_devs;
442 422
443void hpet_msi_unmask(unsigned int irq) 423void hpet_msi_unmask(struct irq_data *data)
444{ 424{
445 struct hpet_dev *hdev = get_irq_data(irq); 425 struct hpet_dev *hdev = data->handler_data;
446 unsigned int cfg; 426 unsigned int cfg;
447 427
448 /* unmask it */ 428 /* unmask it */
@@ -451,10 +431,10 @@ void hpet_msi_unmask(unsigned int irq)
451 hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); 431 hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
452} 432}
453 433
454void hpet_msi_mask(unsigned int irq) 434void hpet_msi_mask(struct irq_data *data)
455{ 435{
436 struct hpet_dev *hdev = data->handler_data;
456 unsigned int cfg; 437 unsigned int cfg;
457 struct hpet_dev *hdev = get_irq_data(irq);
458 438
459 /* mask it */ 439 /* mask it */
460 cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); 440 cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
@@ -462,18 +442,14 @@ void hpet_msi_mask(unsigned int irq)
462 hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); 442 hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
463} 443}
464 444
465void hpet_msi_write(unsigned int irq, struct msi_msg *msg) 445void hpet_msi_write(struct hpet_dev *hdev, struct msi_msg *msg)
466{ 446{
467 struct hpet_dev *hdev = get_irq_data(irq);
468
469 hpet_writel(msg->data, HPET_Tn_ROUTE(hdev->num)); 447 hpet_writel(msg->data, HPET_Tn_ROUTE(hdev->num));
470 hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hdev->num) + 4); 448 hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hdev->num) + 4);
471} 449}
472 450
473void hpet_msi_read(unsigned int irq, struct msi_msg *msg) 451void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg)
474{ 452{
475 struct hpet_dev *hdev = get_irq_data(irq);
476
477 msg->data = hpet_readl(HPET_Tn_ROUTE(hdev->num)); 453 msg->data = hpet_readl(HPET_Tn_ROUTE(hdev->num));
478 msg->address_lo = hpet_readl(HPET_Tn_ROUTE(hdev->num) + 4); 454 msg->address_lo = hpet_readl(HPET_Tn_ROUTE(hdev->num) + 4);
479 msg->address_hi = 0; 455 msg->address_hi = 0;
@@ -510,7 +486,7 @@ static int hpet_assign_irq(struct hpet_dev *dev)
510 if (!irq) 486 if (!irq)
511 return -EINVAL; 487 return -EINVAL;
512 488
513 set_irq_data(irq, dev); 489 irq_set_handler_data(irq, dev);
514 490
515 if (hpet_setup_msi_irq(irq)) 491 if (hpet_setup_msi_irq(irq))
516 return -EINVAL; 492 return -EINVAL;
@@ -556,7 +532,6 @@ static int hpet_setup_irq(struct hpet_dev *dev)
556static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) 532static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
557{ 533{
558 struct clock_event_device *evt = &hdev->evt; 534 struct clock_event_device *evt = &hdev->evt;
559 uint64_t hpet_freq;
560 535
561 WARN_ON(cpu != smp_processor_id()); 536 WARN_ON(cpu != smp_processor_id());
562 if (!(hdev->flags & HPET_DEV_VALID)) 537 if (!(hdev->flags & HPET_DEV_VALID))
@@ -578,24 +553,10 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
578 553
579 evt->set_mode = hpet_msi_set_mode; 554 evt->set_mode = hpet_msi_set_mode;
580 evt->set_next_event = hpet_msi_next_event; 555 evt->set_next_event = hpet_msi_next_event;
581 evt->shift = 32;
582
583 /*
584 * The period is a femto seconds value. We need to calculate the
585 * scaled math multiplication factor for nanosecond to hpet tick
586 * conversion.
587 */
588 hpet_freq = FSEC_PER_SEC;
589 do_div(hpet_freq, hpet_period);
590 evt->mult = div_sc((unsigned long) hpet_freq,
591 NSEC_PER_SEC, evt->shift);
592 /* Calculate the max delta */
593 evt->max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, evt);
594 /* 5 usec minimum reprogramming delta. */
595 evt->min_delta_ns = 5000;
596
597 evt->cpumask = cpumask_of(hdev->cpu); 556 evt->cpumask = cpumask_of(hdev->cpu);
598 clockevents_register_device(evt); 557
558 clockevents_config_and_register(evt, hpet_freq, HPET_MIN_PROG_DELTA,
559 0x7FFFFFFF);
599} 560}
600 561
601#ifdef CONFIG_HPET 562#ifdef CONFIG_HPET
@@ -726,7 +687,7 @@ static int hpet_cpuhp_notify(struct notifier_block *n,
726 687
727 switch (action & 0xf) { 688 switch (action & 0xf) {
728 case CPU_ONLINE: 689 case CPU_ONLINE:
729 INIT_DELAYED_WORK_ON_STACK(&work.work, hpet_work); 690 INIT_DELAYED_WORK_ONSTACK(&work.work, hpet_work);
730 init_completion(&work.complete); 691 init_completion(&work.complete);
731 /* FIXME: add schedule_work_on() */ 692 /* FIXME: add schedule_work_on() */
732 schedule_delayed_work_on(cpu, &work.work, 0); 693 schedule_delayed_work_on(cpu, &work.work, 0);
@@ -799,7 +760,6 @@ static struct clocksource clocksource_hpet = {
799static int hpet_clocksource_register(void) 760static int hpet_clocksource_register(void)
800{ 761{
801 u64 start, now; 762 u64 start, now;
802 u64 hpet_freq;
803 cycle_t t1; 763 cycle_t t1;
804 764
805 /* Start the counter */ 765 /* Start the counter */
@@ -826,24 +786,7 @@ static int hpet_clocksource_register(void)
826 return -ENODEV; 786 return -ENODEV;
827 } 787 }
828 788
829 /*
830 * The definition of mult is (include/linux/clocksource.h)
831 * mult/2^shift = ns/cyc and hpet_period is in units of fsec/cyc
832 * so we first need to convert hpet_period to ns/cyc units:
833 * mult/2^shift = ns/cyc = hpet_period/10^6
834 * mult = (hpet_period * 2^shift)/10^6
835 * mult = (hpet_period << shift)/FSEC_PER_NSEC
836 */
837
838 /* Need to convert hpet_period (fsec/cyc) to cyc/sec:
839 *
840 * cyc/sec = FSEC_PER_SEC/hpet_period(fsec/cyc)
841 * cyc/sec = (FSEC_PER_NSEC * NSEC_PER_SEC)/hpet_period
842 */
843 hpet_freq = FSEC_PER_SEC;
844 do_div(hpet_freq, hpet_period);
845 clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq); 789 clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq);
846
847 return 0; 790 return 0;
848} 791}
849 792
@@ -852,7 +795,9 @@ static int hpet_clocksource_register(void)
852 */ 795 */
853int __init hpet_enable(void) 796int __init hpet_enable(void)
854{ 797{
798 unsigned long hpet_period;
855 unsigned int id; 799 unsigned int id;
800 u64 freq;
856 int i; 801 int i;
857 802
858 if (!is_hpet_capable()) 803 if (!is_hpet_capable())
@@ -891,6 +836,14 @@ int __init hpet_enable(void)
891 goto out_nohpet; 836 goto out_nohpet;
892 837
893 /* 838 /*
839 * The period is a femto seconds value. Convert it to a
840 * frequency.
841 */
842 freq = FSEC_PER_SEC;
843 do_div(freq, hpet_period);
844 hpet_freq = freq;
845
846 /*
894 * Read the HPET ID register to retrieve the IRQ routing 847 * Read the HPET ID register to retrieve the IRQ routing
895 * information and the number of channels 848 * information and the number of channels
896 */ 849 */
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index ff15c9dcc25d..02f07634d265 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -122,7 +122,7 @@ int arch_install_hw_breakpoint(struct perf_event *bp)
122 return -EBUSY; 122 return -EBUSY;
123 123
124 set_debugreg(info->address, i); 124 set_debugreg(info->address, i);
125 __get_cpu_var(cpu_debugreg[i]) = info->address; 125 __this_cpu_write(cpu_debugreg[i], info->address);
126 126
127 dr7 = &__get_cpu_var(cpu_dr7); 127 dr7 = &__get_cpu_var(cpu_dr7);
128 *dr7 |= encode_dr7(i, info->len, info->type); 128 *dr7 |= encode_dr7(i, info->len, info->type);
@@ -397,12 +397,12 @@ void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
397 397
398void hw_breakpoint_restore(void) 398void hw_breakpoint_restore(void)
399{ 399{
400 set_debugreg(__get_cpu_var(cpu_debugreg[0]), 0); 400 set_debugreg(__this_cpu_read(cpu_debugreg[0]), 0);
401 set_debugreg(__get_cpu_var(cpu_debugreg[1]), 1); 401 set_debugreg(__this_cpu_read(cpu_debugreg[1]), 1);
402 set_debugreg(__get_cpu_var(cpu_debugreg[2]), 2); 402 set_debugreg(__this_cpu_read(cpu_debugreg[2]), 2);
403 set_debugreg(__get_cpu_var(cpu_debugreg[3]), 3); 403 set_debugreg(__this_cpu_read(cpu_debugreg[3]), 3);
404 set_debugreg(current->thread.debugreg6, 6); 404 set_debugreg(current->thread.debugreg6, 6);
405 set_debugreg(__get_cpu_var(cpu_dr7), 7); 405 set_debugreg(__this_cpu_read(cpu_dr7), 7);
406} 406}
407EXPORT_SYMBOL_GPL(hw_breakpoint_restore); 407EXPORT_SYMBOL_GPL(hw_breakpoint_restore);
408 408
@@ -433,6 +433,10 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
433 dr6_p = (unsigned long *)ERR_PTR(args->err); 433 dr6_p = (unsigned long *)ERR_PTR(args->err);
434 dr6 = *dr6_p; 434 dr6 = *dr6_p;
435 435
436 /* If it's a single step, TRAP bits are random */
437 if (dr6 & DR_STEP)
438 return NOTIFY_DONE;
439
436 /* Do an early return if no trap bits are set in DR6 */ 440 /* Do an early return if no trap bits are set in DR6 */
437 if ((dr6 & DR_TRAP_BITS) == 0) 441 if ((dr6 & DR_TRAP_BITS) == 0)
438 return NOTIFY_DONE; 442 return NOTIFY_DONE;
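Both ftrace.c and hw_breakpoint.c convert __get_cpu_var() accesses into __this_cpu_read()/__this_cpu_write(). The kernel-context sketch below shows the shape of that conversion with a made-up per-cpu variable and helper; the accessors are the real interfaces, and on x86 they typically compile to a single %gs-relative memory access instead of first computing the per-cpu address the way __get_cpu_var() does.

#include <linux/percpu.h>

static DEFINE_PER_CPU(int, example_saved_flag);	/* illustrative variable */

static void example_enter(int flag)
{
	/* old style:  __get_cpu_var(example_saved_flag) = flag; */
	__this_cpu_write(example_saved_flag, flag);

	/* old style:  if (!__get_cpu_var(example_saved_flag)) return; */
	if (!__this_cpu_read(example_saved_flag))
		return;

	/* ... per-cpu work, as in ftrace_nmi_enter() above ... */
}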
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index a46cb3522c0c..12aff2537682 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -68,19 +68,22 @@ static void __cpuinit init_thread_xstate(void)
68 */ 68 */
69 69
70 if (!HAVE_HWFP) { 70 if (!HAVE_HWFP) {
71 /*
72 * Disable xsave as we do not support it if i387
73 * emulation is enabled.
74 */
75 setup_clear_cpu_cap(X86_FEATURE_XSAVE);
76 setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
71 xstate_size = sizeof(struct i387_soft_struct); 77 xstate_size = sizeof(struct i387_soft_struct);
72 return; 78 return;
73 } 79 }
74 80
75 if (cpu_has_fxsr) 81 if (cpu_has_fxsr)
76 xstate_size = sizeof(struct i387_fxsave_struct); 82 xstate_size = sizeof(struct i387_fxsave_struct);
77#ifdef CONFIG_X86_32
78 else 83 else
79 xstate_size = sizeof(struct i387_fsave_struct); 84 xstate_size = sizeof(struct i387_fsave_struct);
80#endif
81} 85}
82 86
83#ifdef CONFIG_X86_64
84/* 87/*
85 * Called at bootup to set up the initial FPU state that is later cloned 88 * Called at bootup to set up the initial FPU state that is later cloned
86 * into all processes. 89 * into all processes.
@@ -88,12 +91,21 @@ static void __cpuinit init_thread_xstate(void)
88 91
89void __cpuinit fpu_init(void) 92void __cpuinit fpu_init(void)
90{ 93{
91 unsigned long oldcr0 = read_cr0(); 94 unsigned long cr0;
92 95 unsigned long cr4_mask = 0;
93 set_in_cr4(X86_CR4_OSFXSR);
94 set_in_cr4(X86_CR4_OSXMMEXCPT);
95 96
96 write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ 97 if (cpu_has_fxsr)
98 cr4_mask |= X86_CR4_OSFXSR;
99 if (cpu_has_xmm)
100 cr4_mask |= X86_CR4_OSXMMEXCPT;
101 if (cr4_mask)
102 set_in_cr4(cr4_mask);
103
104 cr0 = read_cr0();
105 cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */
106 if (!HAVE_HWFP)
107 cr0 |= X86_CR0_EM;
108 write_cr0(cr0);
97 109
98 if (!smp_processor_id()) 110 if (!smp_processor_id())
99 init_thread_xstate(); 111 init_thread_xstate();
@@ -104,24 +116,12 @@ void __cpuinit fpu_init(void)
104 clear_used_math(); 116 clear_used_math();
105} 117}
106 118
107#else /* CONFIG_X86_64 */
108
109void __cpuinit fpu_init(void)
110{
111 if (!smp_processor_id())
112 init_thread_xstate();
113}
114
115#endif /* CONFIG_X86_32 */
116
117void fpu_finit(struct fpu *fpu) 119void fpu_finit(struct fpu *fpu)
118{ 120{
119#ifdef CONFIG_X86_32
120 if (!HAVE_HWFP) { 121 if (!HAVE_HWFP) {
121 finit_soft_fpu(&fpu->state->soft); 122 finit_soft_fpu(&fpu->state->soft);
122 return; 123 return;
123 } 124 }
124#endif
125 125
126 if (cpu_has_fxsr) { 126 if (cpu_has_fxsr) {
127 struct i387_fxsave_struct *fx = &fpu->state->fxsave; 127 struct i387_fxsave_struct *fx = &fpu->state->fxsave;
@@ -145,7 +145,7 @@ EXPORT_SYMBOL_GPL(fpu_finit);
145 * The _current_ task is using the FPU for the first time 145 * The _current_ task is using the FPU for the first time
146 * so initialize it and set the mxcsr to its default 146 * so initialize it and set the mxcsr to its default
147 * value at reset if we support XMM instructions and then 147 * value at reset if we support XMM instructions and then
148 * remeber the current task has used the FPU. 148 * remember the current task has used the FPU.
149 */ 149 */
150int init_fpu(struct task_struct *tsk) 150int init_fpu(struct task_struct *tsk)
151{ 151{
@@ -169,6 +169,7 @@ int init_fpu(struct task_struct *tsk)
169 set_stopped_child_used_math(tsk); 169 set_stopped_child_used_math(tsk);
170 return 0; 170 return 0;
171} 171}
172EXPORT_SYMBOL_GPL(init_fpu);
172 173
173/* 174/*
174 * The xstateregs_active() routine is the same as the fpregs_active() routine, 175 * The xstateregs_active() routine is the same as the fpregs_active() routine,
@@ -386,19 +387,17 @@ convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
386#ifdef CONFIG_X86_64 387#ifdef CONFIG_X86_64
387 env->fip = fxsave->rip; 388 env->fip = fxsave->rip;
388 env->foo = fxsave->rdp; 389 env->foo = fxsave->rdp;
390 /*
391 * should be actually ds/cs at fpu exception time, but
392 * that information is not available in 64bit mode.
393 */
394 env->fcs = task_pt_regs(tsk)->cs;
389 if (tsk == current) { 395 if (tsk == current) {
390 /* 396 savesegment(ds, env->fos);
391 * should be actually ds/cs at fpu exception time, but
392 * that information is not available in 64bit mode.
393 */
394 asm("mov %%ds, %[fos]" : [fos] "=r" (env->fos));
395 asm("mov %%cs, %[fcs]" : [fcs] "=r" (env->fcs));
396 } else { 397 } else {
397 struct pt_regs *regs = task_pt_regs(tsk); 398 env->fos = tsk->thread.ds;
398
399 env->fos = 0xffff0000 | tsk->thread.ds;
400 env->fcs = regs->cs;
401 } 399 }
400 env->fos |= 0xffff0000;
402#else 401#else
403 env->fip = fxsave->fip; 402 env->fip = fxsave->fip;
404 env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16); 403 env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16);
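
Note: the reworked fpu_init() above collects the CR4 feature bits into a mask and applies them in one shot, and forces CR0.EM when there is no hardware FPU. A toy user-space sketch of the "collect bits, apply once" part follows; the *_supported flags are pretend CPUID results, though the CR4 bit positions shown are the real ones.

/* Illustrative only: conditional mask building as in the new fpu_init(). */
#include <stdio.h>

#define X86_CR4_OSFXSR     (1u << 9)    /* enable FXSAVE/FXRSTOR */
#define X86_CR4_OSXMMEXCPT (1u << 10)   /* enable unmasked SSE exceptions */

int main(void)
{
	int fxsr_supported = 1, xmm_supported = 1;   /* pretend CPUID results */
	unsigned int cr4_mask = 0;

	if (fxsr_supported)
		cr4_mask |= X86_CR4_OSFXSR;
	if (xmm_supported)
		cr4_mask |= X86_CR4_OSXMMEXCPT;

	if (cr4_mask)
		printf("would set CR4 bits: %#x\n", cr4_mask);
	return 0;
}
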
diff --git a/arch/x86/kernel/i8237.c b/arch/x86/kernel/i8237.c
index b42ca694dc68..8eeaa81de066 100644
--- a/arch/x86/kernel/i8237.c
+++ b/arch/x86/kernel/i8237.c
@@ -10,7 +10,7 @@
10 */ 10 */
11 11
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/sysdev.h> 13#include <linux/syscore_ops.h>
14 14
15#include <asm/dma.h> 15#include <asm/dma.h>
16 16
@@ -21,7 +21,7 @@
21 * in asm/dma.h. 21 * in asm/dma.h.
22 */ 22 */
23 23
24static int i8237A_resume(struct sys_device *dev) 24static void i8237A_resume(void)
25{ 25{
26 unsigned long flags; 26 unsigned long flags;
27 int i; 27 int i;
@@ -41,31 +41,15 @@ static int i8237A_resume(struct sys_device *dev)
41 enable_dma(4); 41 enable_dma(4);
42 42
43 release_dma_lock(flags); 43 release_dma_lock(flags);
44
45 return 0;
46} 44}
47 45
48static int i8237A_suspend(struct sys_device *dev, pm_message_t state) 46static struct syscore_ops i8237_syscore_ops = {
49{
50 return 0;
51}
52
53static struct sysdev_class i8237_sysdev_class = {
54 .name = "i8237",
55 .suspend = i8237A_suspend,
56 .resume = i8237A_resume, 47 .resume = i8237A_resume,
57}; 48};
58 49
59static struct sys_device device_i8237A = { 50static int __init i8237A_init_ops(void)
60 .id = 0,
61 .cls = &i8237_sysdev_class,
62};
63
64static int __init i8237A_init_sysfs(void)
65{ 51{
66 int error = sysdev_class_register(&i8237_sysdev_class); 52 register_syscore_ops(&i8237_syscore_ops);
67 if (!error) 53 return 0;
68 error = sysdev_register(&device_i8237A);
69 return error;
70} 54}
71device_initcall(i8237A_init_sysfs); 55device_initcall(i8237A_init_ops);
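
Note: the i8237 patch above is a straight sysdev-to-syscore conversion. The pattern it adopts is a single global ops structure registered once at boot, sketched below with invented "demo" names; this is the shape of the API, not the i8237 driver itself.

/* Minimal sketch of the sysdev -> syscore_ops conversion pattern. */
#include <linux/init.h>
#include <linux/syscore_ops.h>

static int demo_suspend(void)
{
	/* save device state; return 0 on success */
	return 0;
}

static void demo_resume(void)
{
	/* reprogram the device after resume */
}

static struct syscore_ops demo_syscore_ops = {
	.suspend = demo_suspend,
	.resume  = demo_resume,
};

static int __init demo_init(void)
{
	register_syscore_ops(&demo_syscore_ops);
	return 0;
}
device_initcall(demo_init);
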
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 2dfd31597443..fb66dc9e36cb 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -93,7 +93,6 @@ static struct clock_event_device pit_ce = {
93 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, 93 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
94 .set_mode = init_pit_timer, 94 .set_mode = init_pit_timer,
95 .set_next_event = pit_next_event, 95 .set_next_event = pit_next_event,
96 .shift = 32,
97 .irq = 0, 96 .irq = 0,
98}; 97};
99 98
@@ -108,90 +107,12 @@ void __init setup_pit_timer(void)
108 * IO_APIC has been initialized. 107 * IO_APIC has been initialized.
109 */ 108 */
110 pit_ce.cpumask = cpumask_of(smp_processor_id()); 109 pit_ce.cpumask = cpumask_of(smp_processor_id());
111 pit_ce.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, pit_ce.shift);
112 pit_ce.max_delta_ns = clockevent_delta2ns(0x7FFF, &pit_ce);
113 pit_ce.min_delta_ns = clockevent_delta2ns(0xF, &pit_ce);
114 110
115 clockevents_register_device(&pit_ce); 111 clockevents_config_and_register(&pit_ce, CLOCK_TICK_RATE, 0xF, 0x7FFF);
116 global_clock_event = &pit_ce; 112 global_clock_event = &pit_ce;
117} 113}
118 114
119#ifndef CONFIG_X86_64 115#ifndef CONFIG_X86_64
120/*
121 * Since the PIT overflows every tick, its not very useful
122 * to just read by itself. So use jiffies to emulate a free
123 * running counter:
124 */
125static cycle_t pit_read(struct clocksource *cs)
126{
127 static int old_count;
128 static u32 old_jifs;
129 unsigned long flags;
130 int count;
131 u32 jifs;
132
133 raw_spin_lock_irqsave(&i8253_lock, flags);
134 /*
135 * Although our caller may have the read side of xtime_lock,
136 * this is now a seqlock, and we are cheating in this routine
137 * by having side effects on state that we cannot undo if
138 * there is a collision on the seqlock and our caller has to
139 * retry. (Namely, old_jifs and old_count.) So we must treat
140 * jiffies as volatile despite the lock. We read jiffies
141 * before latching the timer count to guarantee that although
142 * the jiffies value might be older than the count (that is,
143 * the counter may underflow between the last point where
144 * jiffies was incremented and the point where we latch the
145 * count), it cannot be newer.
146 */
147 jifs = jiffies;
148 outb_pit(0x00, PIT_MODE); /* latch the count ASAP */
149 count = inb_pit(PIT_CH0); /* read the latched count */
150 count |= inb_pit(PIT_CH0) << 8;
151
152 /* VIA686a test code... reset the latch if count > max + 1 */
153 if (count > LATCH) {
154 outb_pit(0x34, PIT_MODE);
155 outb_pit(LATCH & 0xff, PIT_CH0);
156 outb_pit(LATCH >> 8, PIT_CH0);
157 count = LATCH - 1;
158 }
159
160 /*
161 * It's possible for count to appear to go the wrong way for a
162 * couple of reasons:
163 *
164 * 1. The timer counter underflows, but we haven't handled the
165 * resulting interrupt and incremented jiffies yet.
166 * 2. Hardware problem with the timer, not giving us continuous time,
167 * the counter does small "jumps" upwards on some Pentium systems,
168 * (see c't 95/10 page 335 for Neptun bug.)
169 *
170 * Previous attempts to handle these cases intelligently were
171 * buggy, so we just do the simple thing now.
172 */
173 if (count > old_count && jifs == old_jifs)
174 count = old_count;
175
176 old_count = count;
177 old_jifs = jifs;
178
179 raw_spin_unlock_irqrestore(&i8253_lock, flags);
180
181 count = (LATCH - 1) - count;
182
183 return (cycle_t)(jifs * LATCH) + count;
184}
185
186static struct clocksource pit_cs = {
187 .name = "pit",
188 .rating = 110,
189 .read = pit_read,
190 .mask = CLOCKSOURCE_MASK(32),
191 .mult = 0,
192 .shift = 20,
193};
194
195static int __init init_pit_clocksource(void) 116static int __init init_pit_clocksource(void)
196{ 117{
197 /* 118 /*
@@ -205,10 +126,7 @@ static int __init init_pit_clocksource(void)
205 pit_ce.mode != CLOCK_EVT_MODE_PERIODIC) 126 pit_ce.mode != CLOCK_EVT_MODE_PERIODIC)
206 return 0; 127 return 0;
207 128
208 pit_cs.mult = clocksource_hz2mult(CLOCK_TICK_RATE, pit_cs.shift); 129 return clocksource_i8253_init();
209
210 return clocksource_register(&pit_cs);
211} 130}
212arch_initcall(init_pit_clocksource); 131arch_initcall(init_pit_clocksource);
213
214#endif /* !CONFIG_X86_64 */ 132#endif /* !CONFIG_X86_64 */
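
Note: the i8253 changes above drop the hand-computed mult/shift and delta limits in favour of clockevents_config_and_register(), which derives them from the raw frequency and counter range. A hedged sketch of that registration pattern follows; "demo_ce", the 1 MHz rate and the 16-bit counter limits are invented.

/* Sketch of clockevents_config_and_register() usage, not a real driver. */
#include <linux/clockchips.h>
#include <linux/cpumask.h>
#include <linux/smp.h>

static int demo_next_event(unsigned long delta, struct clock_event_device *ce)
{
	/* program the hardware to fire after "delta" counter ticks */
	return 0;
}

static struct clock_event_device demo_ce = {
	.name           = "demo",
	.features       = CLOCK_EVT_FEAT_ONESHOT,
	.set_next_event = demo_next_event,
};

static void demo_timer_setup(void)
{
	demo_ce.cpumask = cpumask_of(smp_processor_id());
	/* freq = 1 MHz, programmable range 2..0xffff counter ticks */
	clockevents_config_and_register(&demo_ce, 1000000, 2, 0xffff);
}
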
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index cafa7c80ac95..65b8f5c2eebf 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -8,7 +8,7 @@
8#include <linux/random.h> 8#include <linux/random.h>
9#include <linux/init.h> 9#include <linux/init.h>
10#include <linux/kernel_stat.h> 10#include <linux/kernel_stat.h>
11#include <linux/sysdev.h> 11#include <linux/syscore_ops.h>
12#include <linux/bitops.h> 12#include <linux/bitops.h>
13#include <linux/acpi.h> 13#include <linux/acpi.h>
14#include <linux/io.h> 14#include <linux/io.h>
@@ -29,24 +29,10 @@
29 * plus some generic x86 specific things if generic specifics makes 29 * plus some generic x86 specific things if generic specifics makes
30 * any sense at all. 30 * any sense at all.
31 */ 31 */
32static void init_8259A(int auto_eoi);
32 33
33static int i8259A_auto_eoi; 34static int i8259A_auto_eoi;
34DEFINE_RAW_SPINLOCK(i8259A_lock); 35DEFINE_RAW_SPINLOCK(i8259A_lock);
35static void mask_and_ack_8259A(unsigned int);
36static void mask_8259A(void);
37static void unmask_8259A(void);
38static void disable_8259A_irq(unsigned int irq);
39static void enable_8259A_irq(unsigned int irq);
40static void init_8259A(int auto_eoi);
41static int i8259A_irq_pending(unsigned int irq);
42
43struct irq_chip i8259A_chip = {
44 .name = "XT-PIC",
45 .mask = disable_8259A_irq,
46 .disable = disable_8259A_irq,
47 .unmask = enable_8259A_irq,
48 .mask_ack = mask_and_ack_8259A,
49};
50 36
51/* 37/*
52 * 8259A PIC functions to handle ISA devices: 38 * 8259A PIC functions to handle ISA devices:
@@ -68,7 +54,7 @@ unsigned int cached_irq_mask = 0xffff;
68 */ 54 */
69unsigned long io_apic_irqs; 55unsigned long io_apic_irqs;
70 56
71static void disable_8259A_irq(unsigned int irq) 57static void mask_8259A_irq(unsigned int irq)
72{ 58{
73 unsigned int mask = 1 << irq; 59 unsigned int mask = 1 << irq;
74 unsigned long flags; 60 unsigned long flags;
@@ -82,7 +68,12 @@ static void disable_8259A_irq(unsigned int irq)
82 raw_spin_unlock_irqrestore(&i8259A_lock, flags); 68 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
83} 69}
84 70
85static void enable_8259A_irq(unsigned int irq) 71static void disable_8259A_irq(struct irq_data *data)
72{
73 mask_8259A_irq(data->irq);
74}
75
76static void unmask_8259A_irq(unsigned int irq)
86{ 77{
87 unsigned int mask = ~(1 << irq); 78 unsigned int mask = ~(1 << irq);
88 unsigned long flags; 79 unsigned long flags;
@@ -96,6 +87,11 @@ static void enable_8259A_irq(unsigned int irq)
96 raw_spin_unlock_irqrestore(&i8259A_lock, flags); 87 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
97} 88}
98 89
90static void enable_8259A_irq(struct irq_data *data)
91{
92 unmask_8259A_irq(data->irq);
93}
94
99static int i8259A_irq_pending(unsigned int irq) 95static int i8259A_irq_pending(unsigned int irq)
100{ 96{
101 unsigned int mask = 1<<irq; 97 unsigned int mask = 1<<irq;
@@ -116,8 +112,8 @@ static void make_8259A_irq(unsigned int irq)
116{ 112{
117 disable_irq_nosync(irq); 113 disable_irq_nosync(irq);
118 io_apic_irqs &= ~(1<<irq); 114 io_apic_irqs &= ~(1<<irq);
119 set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq, 115 irq_set_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
120 "XT"); 116 i8259A_chip.name);
121 enable_irq(irq); 117 enable_irq(irq);
122} 118}
123 119
@@ -150,8 +146,9 @@ static inline int i8259A_irq_real(unsigned int irq)
150 * first, _then_ send the EOI, and the order of EOI 146 * first, _then_ send the EOI, and the order of EOI
151 * to the two 8259s is important! 147 * to the two 8259s is important!
152 */ 148 */
153static void mask_and_ack_8259A(unsigned int irq) 149static void mask_and_ack_8259A(struct irq_data *data)
154{ 150{
151 unsigned int irq = data->irq;
155 unsigned int irqmask = 1 << irq; 152 unsigned int irqmask = 1 << irq;
156 unsigned long flags; 153 unsigned long flags;
157 154
@@ -223,6 +220,14 @@ spurious_8259A_irq:
223 } 220 }
224} 221}
225 222
223struct irq_chip i8259A_chip = {
224 .name = "XT-PIC",
225 .irq_mask = disable_8259A_irq,
226 .irq_disable = disable_8259A_irq,
227 .irq_unmask = enable_8259A_irq,
228 .irq_mask_ack = mask_and_ack_8259A,
229};
230
226static char irq_trigger[2]; 231static char irq_trigger[2];
227/** 232/**
228 * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ 233 * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
@@ -240,20 +245,19 @@ static void save_ELCR(char *trigger)
240 trigger[1] = inb(0x4d1) & 0xDE; 245 trigger[1] = inb(0x4d1) & 0xDE;
241} 246}
242 247
243static int i8259A_resume(struct sys_device *dev) 248static void i8259A_resume(void)
244{ 249{
245 init_8259A(i8259A_auto_eoi); 250 init_8259A(i8259A_auto_eoi);
246 restore_ELCR(irq_trigger); 251 restore_ELCR(irq_trigger);
247 return 0;
248} 252}
249 253
250static int i8259A_suspend(struct sys_device *dev, pm_message_t state) 254static int i8259A_suspend(void)
251{ 255{
252 save_ELCR(irq_trigger); 256 save_ELCR(irq_trigger);
253 return 0; 257 return 0;
254} 258}
255 259
256static int i8259A_shutdown(struct sys_device *dev) 260static void i8259A_shutdown(void)
257{ 261{
258 /* Put the i8259A into a quiescent state that 262 /* Put the i8259A into a quiescent state that
259 * the kernel initialization code can get it 263 * the kernel initialization code can get it
@@ -261,21 +265,14 @@ static int i8259A_shutdown(struct sys_device *dev)
261 */ 265 */
262 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ 266 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
263 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */ 267 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */
264 return 0;
265} 268}
266 269
267static struct sysdev_class i8259_sysdev_class = { 270static struct syscore_ops i8259_syscore_ops = {
268 .name = "i8259",
269 .suspend = i8259A_suspend, 271 .suspend = i8259A_suspend,
270 .resume = i8259A_resume, 272 .resume = i8259A_resume,
271 .shutdown = i8259A_shutdown, 273 .shutdown = i8259A_shutdown,
272}; 274};
273 275
274static struct sys_device device_i8259A = {
275 .id = 0,
276 .cls = &i8259_sysdev_class,
277};
278
279static void mask_8259A(void) 276static void mask_8259A(void)
280{ 277{
281 unsigned long flags; 278 unsigned long flags;
@@ -342,9 +339,9 @@ static void init_8259A(int auto_eoi)
342 * In AEOI mode we just have to mask the interrupt 339 * In AEOI mode we just have to mask the interrupt
343 * when acking. 340 * when acking.
344 */ 341 */
345 i8259A_chip.mask_ack = disable_8259A_irq; 342 i8259A_chip.irq_mask_ack = disable_8259A_irq;
346 else 343 else
347 i8259A_chip.mask_ack = mask_and_ack_8259A; 344 i8259A_chip.irq_mask_ack = mask_and_ack_8259A;
348 345
349 udelay(100); /* wait for 8259A to initialize */ 346 udelay(100); /* wait for 8259A to initialize */
350 347
@@ -363,14 +360,6 @@ static void init_8259A(int auto_eoi)
363static void legacy_pic_noop(void) { }; 360static void legacy_pic_noop(void) { };
364static void legacy_pic_uint_noop(unsigned int unused) { }; 361static void legacy_pic_uint_noop(unsigned int unused) { };
365static void legacy_pic_int_noop(int unused) { }; 362static void legacy_pic_int_noop(int unused) { };
366
367static struct irq_chip dummy_pic_chip = {
368 .name = "dummy pic",
369 .mask = legacy_pic_uint_noop,
370 .unmask = legacy_pic_uint_noop,
371 .disable = legacy_pic_uint_noop,
372 .mask_ack = legacy_pic_uint_noop,
373};
374static int legacy_pic_irq_pending_noop(unsigned int irq) 363static int legacy_pic_irq_pending_noop(unsigned int irq)
375{ 364{
376 return 0; 365 return 0;
@@ -378,7 +367,9 @@ static int legacy_pic_irq_pending_noop(unsigned int irq)
378 367
379struct legacy_pic null_legacy_pic = { 368struct legacy_pic null_legacy_pic = {
380 .nr_legacy_irqs = 0, 369 .nr_legacy_irqs = 0,
381 .chip = &dummy_pic_chip, 370 .chip = &dummy_irq_chip,
371 .mask = legacy_pic_uint_noop,
372 .unmask = legacy_pic_uint_noop,
382 .mask_all = legacy_pic_noop, 373 .mask_all = legacy_pic_noop,
383 .restore_mask = legacy_pic_noop, 374 .restore_mask = legacy_pic_noop,
384 .init = legacy_pic_int_noop, 375 .init = legacy_pic_int_noop,
@@ -389,7 +380,9 @@ struct legacy_pic null_legacy_pic = {
389struct legacy_pic default_legacy_pic = { 380struct legacy_pic default_legacy_pic = {
390 .nr_legacy_irqs = NR_IRQS_LEGACY, 381 .nr_legacy_irqs = NR_IRQS_LEGACY,
391 .chip = &i8259A_chip, 382 .chip = &i8259A_chip,
392 .mask_all = mask_8259A, 383 .mask = mask_8259A_irq,
384 .unmask = unmask_8259A_irq,
385 .mask_all = mask_8259A,
393 .restore_mask = unmask_8259A, 386 .restore_mask = unmask_8259A,
394 .init = init_8259A, 387 .init = init_8259A,
395 .irq_pending = i8259A_irq_pending, 388 .irq_pending = i8259A_irq_pending,
@@ -398,17 +391,12 @@ struct legacy_pic default_legacy_pic = {
398 391
399struct legacy_pic *legacy_pic = &default_legacy_pic; 392struct legacy_pic *legacy_pic = &default_legacy_pic;
400 393
401static int __init i8259A_init_sysfs(void) 394static int __init i8259A_init_ops(void)
402{ 395{
403 int error; 396 if (legacy_pic == &default_legacy_pic)
404 397 register_syscore_ops(&i8259_syscore_ops);
405 if (legacy_pic != &default_legacy_pic)
406 return 0;
407 398
408 error = sysdev_class_register(&i8259_sysdev_class); 399 return 0;
409 if (!error)
410 error = sysdev_register(&device_i8259A);
411 return error;
412} 400}
413 401
414device_initcall(i8259A_init_sysfs); 402device_initcall(i8259A_init_ops);
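
Note: the i8259 rework above moves the irq_chip callbacks from the old unsigned-int-irq signatures (.mask/.unmask/.mask_ack) to the irq_data-based ones (.irq_mask/.irq_unmask/.irq_mask_ack). A minimal sketch of a chip written against the new callbacks follows; the "demo" names are invented and the bodies only log.

/* Sketch of an irq_chip using the irq_data-based methods. */
#include <linux/kernel.h>
#include <linux/irq.h>

static void demo_mask_irq(struct irq_data *data)
{
	pr_debug("masking irq %u\n", data->irq);
	/* ... write the controller's mask register here ... */
}

static void demo_unmask_irq(struct irq_data *data)
{
	pr_debug("unmasking irq %u\n", data->irq);
	/* ... clear the controller's mask bit here ... */
}

static struct irq_chip demo_chip = {
	.name       = "demo-pic",
	.irq_mask   = demo_mask_irq,
	.irq_unmask = demo_unmask_irq,
};

static void demo_setup_irq(unsigned int irq)
{
	irq_set_chip_and_handler_name(irq, &demo_chip, handle_level_irq,
				      demo_chip.name);
}
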
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 8eec0ec59af2..8c968974253d 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -14,22 +14,9 @@
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/thread_info.h> 15#include <linux/thread_info.h>
16#include <linux/syscalls.h> 16#include <linux/syscalls.h>
17#include <linux/bitmap.h>
17#include <asm/syscalls.h> 18#include <asm/syscalls.h>
18 19
19/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
20static void set_bitmap(unsigned long *bitmap, unsigned int base,
21 unsigned int extent, int new_value)
22{
23 unsigned int i;
24
25 for (i = base; i < base + extent; i++) {
26 if (new_value)
27 __set_bit(i, bitmap);
28 else
29 __clear_bit(i, bitmap);
30 }
31}
32
33/* 20/*
34 * this changes the io permissions bitmap in the current task. 21 * this changes the io permissions bitmap in the current task.
35 */ 22 */
@@ -69,7 +56,10 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
69 */ 56 */
70 tss = &per_cpu(init_tss, get_cpu()); 57 tss = &per_cpu(init_tss, get_cpu());
71 58
72 set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); 59 if (turn_on)
60 bitmap_clear(t->io_bitmap_ptr, from, num);
61 else
62 bitmap_set(t->io_bitmap_ptr, from, num);
73 63
74 /* 64 /*
75 * Search for a (possibly new) maximum. This is simple and stupid, 65 * Search for a (possibly new) maximum. This is simple and stupid,
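
Note: the ioport.c change above replaces a hand-rolled loop over __set_bit()/__clear_bit() with the bitmap_set()/bitmap_clear() range helpers. Sketch of that usage below, with an invented bitmap; as in the I/O permission bitmap, a cleared bit means "access allowed".

/* Sketch of the bitmap range helpers, not the real sys_ioperm() code. */
#include <linux/bitmap.h>

#define DEMO_BITS 1024

static DECLARE_BITMAP(demo_map, DEMO_BITS);

static void demo_allow_ports(unsigned int from, unsigned int num, int turn_on)
{
	if (turn_on)
		bitmap_clear(demo_map, from, num);	/* grant access */
	else
		bitmap_set(demo_map, from, num);	/* revoke access */
}
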
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 91fd0c70a18a..6c0802eb2f7f 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -4,9 +4,11 @@
4#include <linux/cpu.h> 4#include <linux/cpu.h>
5#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/kernel_stat.h> 6#include <linux/kernel_stat.h>
7#include <linux/of.h>
7#include <linux/seq_file.h> 8#include <linux/seq_file.h>
8#include <linux/smp.h> 9#include <linux/smp.h>
9#include <linux/ftrace.h> 10#include <linux/ftrace.h>
11#include <linux/delay.h>
10 12
11#include <asm/apic.h> 13#include <asm/apic.h>
12#include <asm/io_apic.h> 14#include <asm/io_apic.h>
@@ -43,9 +45,9 @@ void ack_bad_irq(unsigned int irq)
43 45
44#define irq_stats(x) (&per_cpu(irq_stat, x)) 46#define irq_stats(x) (&per_cpu(irq_stat, x))
45/* 47/*
46 * /proc/interrupts printing: 48 * /proc/interrupts printing for arch specific interrupts
47 */ 49 */
48static int show_other_interrupts(struct seq_file *p, int prec) 50int arch_show_interrupts(struct seq_file *p, int prec)
49{ 51{
50 int j; 52 int j;
51 53
@@ -67,10 +69,10 @@ static int show_other_interrupts(struct seq_file *p, int prec)
67 for_each_online_cpu(j) 69 for_each_online_cpu(j)
68 seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); 70 seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
69 seq_printf(p, " Performance monitoring interrupts\n"); 71 seq_printf(p, " Performance monitoring interrupts\n");
70 seq_printf(p, "%*s: ", prec, "PND"); 72 seq_printf(p, "%*s: ", prec, "IWI");
71 for_each_online_cpu(j) 73 for_each_online_cpu(j)
72 seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); 74 seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs);
73 seq_printf(p, " Performance pending work\n"); 75 seq_printf(p, " IRQ work interrupts\n");
74#endif 76#endif
75 if (x86_platform_ipi_callback) { 77 if (x86_platform_ipi_callback) {
76 seq_printf(p, "%*s: ", prec, "PLT"); 78 seq_printf(p, "%*s: ", prec, "PLT");
@@ -121,59 +123,6 @@ static int show_other_interrupts(struct seq_file *p, int prec)
121 return 0; 123 return 0;
122} 124}
123 125
124int show_interrupts(struct seq_file *p, void *v)
125{
126 unsigned long flags, any_count = 0;
127 int i = *(loff_t *) v, j, prec;
128 struct irqaction *action;
129 struct irq_desc *desc;
130
131 if (i > nr_irqs)
132 return 0;
133
134 for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
135 j *= 10;
136
137 if (i == nr_irqs)
138 return show_other_interrupts(p, prec);
139
140 /* print header */
141 if (i == 0) {
142 seq_printf(p, "%*s", prec + 8, "");
143 for_each_online_cpu(j)
144 seq_printf(p, "CPU%-8d", j);
145 seq_putc(p, '\n');
146 }
147
148 desc = irq_to_desc(i);
149 if (!desc)
150 return 0;
151
152 raw_spin_lock_irqsave(&desc->lock, flags);
153 for_each_online_cpu(j)
154 any_count |= kstat_irqs_cpu(i, j);
155 action = desc->action;
156 if (!action && !any_count)
157 goto out;
158
159 seq_printf(p, "%*d: ", prec, i);
160 for_each_online_cpu(j)
161 seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
162 seq_printf(p, " %8s", desc->chip->name);
163 seq_printf(p, "-%-8s", desc->name);
164
165 if (action) {
166 seq_printf(p, " %s", action->name);
167 while ((action = action->next) != NULL)
168 seq_printf(p, ", %s", action->name);
169 }
170
171 seq_putc(p, '\n');
172out:
173 raw_spin_unlock_irqrestore(&desc->lock, flags);
174 return 0;
175}
176
177/* 126/*
178 * /proc/stat helpers 127 * /proc/stat helpers
179 */ 128 */
@@ -185,7 +134,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
185 sum += irq_stats(cpu)->apic_timer_irqs; 134 sum += irq_stats(cpu)->apic_timer_irqs;
186 sum += irq_stats(cpu)->irq_spurious_count; 135 sum += irq_stats(cpu)->irq_spurious_count;
187 sum += irq_stats(cpu)->apic_perf_irqs; 136 sum += irq_stats(cpu)->apic_perf_irqs;
188 sum += irq_stats(cpu)->apic_pending_irqs; 137 sum += irq_stats(cpu)->apic_irq_work_irqs;
189#endif 138#endif
190 if (x86_platform_ipi_callback) 139 if (x86_platform_ipi_callback)
191 sum += irq_stats(cpu)->x86_platform_ipis; 140 sum += irq_stats(cpu)->x86_platform_ipis;
@@ -234,7 +183,7 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
234 exit_idle(); 183 exit_idle();
235 irq_enter(); 184 irq_enter();
236 185
237 irq = __get_cpu_var(vector_irq)[vector]; 186 irq = __this_cpu_read(vector_irq[vector]);
238 187
239 if (!handle_irq(irq, regs)) { 188 if (!handle_irq(irq, regs)) {
240 ack_APIC_irq(); 189 ack_APIC_irq();
@@ -282,6 +231,8 @@ void fixup_irqs(void)
282 unsigned int irq, vector; 231 unsigned int irq, vector;
283 static int warned; 232 static int warned;
284 struct irq_desc *desc; 233 struct irq_desc *desc;
234 struct irq_data *data;
235 struct irq_chip *chip;
285 236
286 for_each_irq_desc(irq, desc) { 237 for_each_irq_desc(irq, desc) {
287 int break_affinity = 0; 238 int break_affinity = 0;
@@ -296,9 +247,10 @@ void fixup_irqs(void)
296 /* interrupt's are disabled at this point */ 247 /* interrupt's are disabled at this point */
297 raw_spin_lock(&desc->lock); 248 raw_spin_lock(&desc->lock);
298 249
299 affinity = desc->affinity; 250 data = irq_desc_get_irq_data(desc);
300 if (!irq_has_action(irq) || 251 affinity = data->affinity;
301 cpumask_equal(affinity, cpu_online_mask)) { 252 if (!irq_has_action(irq) || irqd_is_per_cpu(data) ||
253 cpumask_subset(affinity, cpu_online_mask)) {
302 raw_spin_unlock(&desc->lock); 254 raw_spin_unlock(&desc->lock);
303 continue; 255 continue;
304 } 256 }
@@ -315,16 +267,18 @@ void fixup_irqs(void)
315 affinity = cpu_all_mask; 267 affinity = cpu_all_mask;
316 } 268 }
317 269
318 if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->mask) 270 chip = irq_data_get_irq_chip(data);
319 desc->chip->mask(irq); 271 if (!irqd_can_move_in_process_context(data) && chip->irq_mask)
272 chip->irq_mask(data);
320 273
321 if (desc->chip->set_affinity) 274 if (chip->irq_set_affinity)
322 desc->chip->set_affinity(irq, affinity); 275 chip->irq_set_affinity(data, affinity, true);
323 else if (!(warned++)) 276 else if (!(warned++))
324 set_affinity = 0; 277 set_affinity = 0;
325 278
326 if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->unmask) 279 if (!irqd_can_move_in_process_context(data) &&
327 desc->chip->unmask(irq); 280 !irqd_irq_disabled(data) && chip->irq_unmask)
281 chip->irq_unmask(data);
328 282
329 raw_spin_unlock(&desc->lock); 283 raw_spin_unlock(&desc->lock);
330 284
@@ -348,17 +302,19 @@ void fixup_irqs(void)
348 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { 302 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
349 unsigned int irr; 303 unsigned int irr;
350 304
351 if (__get_cpu_var(vector_irq)[vector] < 0) 305 if (__this_cpu_read(vector_irq[vector]) < 0)
352 continue; 306 continue;
353 307
354 irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); 308 irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
355 if (irr & (1 << (vector % 32))) { 309 if (irr & (1 << (vector % 32))) {
356 irq = __get_cpu_var(vector_irq)[vector]; 310 irq = __this_cpu_read(vector_irq[vector]);
357 311
358 desc = irq_to_desc(irq); 312 desc = irq_to_desc(irq);
313 data = irq_desc_get_irq_data(desc);
314 chip = irq_data_get_irq_chip(data);
359 raw_spin_lock(&desc->lock); 315 raw_spin_lock(&desc->lock);
360 if (desc->chip->retrigger) 316 if (chip->irq_retrigger)
361 desc->chip->retrigger(irq); 317 chip->irq_retrigger(data);
362 raw_spin_unlock(&desc->lock); 318 raw_spin_unlock(&desc->lock);
363 } 319 }
364 } 320 }
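
Note: fixup_irqs() above stops poking desc->chip directly and instead reaches the chip and its per-IRQ state through irq_data accessors. The sketch below shows just that accessor chain; it is purely illustrative and "demo_mask_one" is an invented helper.

/* Sketch of the irq_data/irq_chip accessor pattern. */
#include <linux/irq.h>
#include <linux/irqdesc.h>

static void demo_mask_one(struct irq_desc *desc)
{
	struct irq_data *data = irq_desc_get_irq_data(desc);
	struct irq_chip *chip = irq_data_get_irq_chip(data);

	if (chip->irq_mask)
		chip->irq_mask(data);
}
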
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 10709f29d166..72090705a656 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -17,6 +17,7 @@
17#include <linux/delay.h> 17#include <linux/delay.h>
18#include <linux/uaccess.h> 18#include <linux/uaccess.h>
19#include <linux/percpu.h> 19#include <linux/percpu.h>
20#include <linux/mm.h>
20 21
21#include <asm/apic.h> 22#include <asm/apic.h>
22 23
@@ -49,21 +50,17 @@ static inline int check_stack_overflow(void) { return 0; }
49static inline void print_stack_overflow(void) { } 50static inline void print_stack_overflow(void) { }
50#endif 51#endif
51 52
52#ifdef CONFIG_4KSTACKS
53/* 53/*
54 * per-CPU IRQ handling contexts (thread information and stack) 54 * per-CPU IRQ handling contexts (thread information and stack)
55 */ 55 */
56union irq_ctx { 56union irq_ctx {
57 struct thread_info tinfo; 57 struct thread_info tinfo;
58 u32 stack[THREAD_SIZE/sizeof(u32)]; 58 u32 stack[THREAD_SIZE/sizeof(u32)];
59} __attribute__((aligned(PAGE_SIZE))); 59} __attribute__((aligned(THREAD_SIZE)));
60 60
61static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx); 61static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx);
62static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx); 62static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx);
63 63
64static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, hardirq_stack);
65static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, softirq_stack);
66
67static void call_on_stack(void *func, void *stack) 64static void call_on_stack(void *func, void *stack)
68{ 65{
69 asm volatile("xchgl %%ebx,%%esp \n" 66 asm volatile("xchgl %%ebx,%%esp \n"
@@ -82,7 +79,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
82 u32 *isp, arg1, arg2; 79 u32 *isp, arg1, arg2;
83 80
84 curctx = (union irq_ctx *) current_thread_info(); 81 curctx = (union irq_ctx *) current_thread_info();
85 irqctx = __get_cpu_var(hardirq_ctx); 82 irqctx = __this_cpu_read(hardirq_ctx);
86 83
87 /* 84 /*
88 * this is where we switch to the IRQ stack. However, if we are 85 * this is where we switch to the IRQ stack. However, if we are
@@ -129,20 +126,21 @@ void __cpuinit irq_ctx_init(int cpu)
129 if (per_cpu(hardirq_ctx, cpu)) 126 if (per_cpu(hardirq_ctx, cpu))
130 return; 127 return;
131 128
132 irqctx = &per_cpu(hardirq_stack, cpu); 129 irqctx = page_address(alloc_pages_node(cpu_to_node(cpu),
133 irqctx->tinfo.task = NULL; 130 THREAD_FLAGS,
134 irqctx->tinfo.exec_domain = NULL; 131 THREAD_ORDER));
132 memset(&irqctx->tinfo, 0, sizeof(struct thread_info));
135 irqctx->tinfo.cpu = cpu; 133 irqctx->tinfo.cpu = cpu;
136 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; 134 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
137 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); 135 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
138 136
139 per_cpu(hardirq_ctx, cpu) = irqctx; 137 per_cpu(hardirq_ctx, cpu) = irqctx;
140 138
141 irqctx = &per_cpu(softirq_stack, cpu); 139 irqctx = page_address(alloc_pages_node(cpu_to_node(cpu),
142 irqctx->tinfo.task = NULL; 140 THREAD_FLAGS,
143 irqctx->tinfo.exec_domain = NULL; 141 THREAD_ORDER));
142 memset(&irqctx->tinfo, 0, sizeof(struct thread_info));
144 irqctx->tinfo.cpu = cpu; 143 irqctx->tinfo.cpu = cpu;
145 irqctx->tinfo.preempt_count = 0;
146 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); 144 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
147 145
148 per_cpu(softirq_ctx, cpu) = irqctx; 146 per_cpu(softirq_ctx, cpu) = irqctx;
@@ -151,11 +149,6 @@ void __cpuinit irq_ctx_init(int cpu)
151 cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu)); 149 cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu));
152} 150}
153 151
154void irq_ctx_exit(int cpu)
155{
156 per_cpu(hardirq_ctx, cpu) = NULL;
157}
158
159asmlinkage void do_softirq(void) 152asmlinkage void do_softirq(void)
160{ 153{
161 unsigned long flags; 154 unsigned long flags;
@@ -170,7 +163,7 @@ asmlinkage void do_softirq(void)
170 163
171 if (local_softirq_pending()) { 164 if (local_softirq_pending()) {
172 curctx = current_thread_info(); 165 curctx = current_thread_info();
173 irqctx = __get_cpu_var(softirq_ctx); 166 irqctx = __this_cpu_read(softirq_ctx);
174 irqctx->tinfo.task = curctx->task; 167 irqctx->tinfo.task = curctx->task;
175 irqctx->tinfo.previous_esp = current_stack_pointer; 168 irqctx->tinfo.previous_esp = current_stack_pointer;
176 169
@@ -179,7 +172,7 @@ asmlinkage void do_softirq(void)
179 172
180 call_on_stack(__do_softirq, isp); 173 call_on_stack(__do_softirq, isp);
181 /* 174 /*
182 * Shouldnt happen, we returned above if in_interrupt(): 175 * Shouldn't happen, we returned above if in_interrupt():
183 */ 176 */
184 WARN_ON_ONCE(softirq_count()); 177 WARN_ON_ONCE(softirq_count());
185 } 178 }
@@ -187,11 +180,6 @@ asmlinkage void do_softirq(void)
187 local_irq_restore(flags); 180 local_irq_restore(flags);
188} 181}
189 182
190#else
191static inline int
192execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; }
193#endif
194
195bool handle_irq(unsigned irq, struct pt_regs *regs) 183bool handle_irq(unsigned irq, struct pt_regs *regs)
196{ 184{
197 struct irq_desc *desc; 185 struct irq_desc *desc;
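
Note: irq_ctx_init() above now takes the per-CPU IRQ stacks from the page allocator on the CPU's own node instead of static per-CPU arrays. A hedged sketch of that allocation step follows; DEMO_ORDER/DEMO_FLAGS stand in for the x86 THREAD_ORDER/THREAD_FLAGS and the helper name is invented.

/* Sketch of a node-local, page-allocator-backed IRQ stack allocation. */
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/topology.h>

#define DEMO_ORDER 1				/* 2 pages, i.e. an 8 KB stack */
#define DEMO_FLAGS (GFP_KERNEL | __GFP_ZERO)

static void *demo_alloc_irq_stack(int cpu)
{
	struct page *page;

	page = alloc_pages_node(cpu_to_node(cpu), DEMO_FLAGS, DEMO_ORDER);
	return page ? page_address(page) : NULL;
}
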
diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c
new file mode 100644
index 000000000000..ca8f703a1e70
--- /dev/null
+++ b/arch/x86/kernel/irq_work.c
@@ -0,0 +1,30 @@
1/*
2 * x86 specific code for irq_work
3 *
4 * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
5 */
6
7#include <linux/kernel.h>
8#include <linux/irq_work.h>
9#include <linux/hardirq.h>
10#include <asm/apic.h>
11
12void smp_irq_work_interrupt(struct pt_regs *regs)
13{
14 irq_enter();
15 ack_APIC_irq();
16 inc_irq_stat(apic_irq_work_irqs);
17 irq_work_run();
18 irq_exit();
19}
20
21void arch_irq_work_raise(void)
22{
23#ifdef CONFIG_X86_LOCAL_APIC
24 if (!cpu_has_apic)
25 return;
26
27 apic->send_IPI_self(IRQ_WORK_VECTOR);
28 apic_wait_icr_idle();
29#endif
30}
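
Note: the new file above only wires the x86 side of irq_work: a self-IPI vector and its handler. A usage sketch follows, assuming the generic irq_work API of this kernel generation (init_irq_work() and irq_work_queue() from <linux/irq_work.h>); the "demo" names are invented.

/* Sketch: defer work from NMI-like context via a self-IPI. */
#include <linux/irq_work.h>
#include <linux/kernel.h>

static void demo_irq_work_func(struct irq_work *work)
{
	pr_info("ran in IRQ context after the self-IPI\n");
}

static struct irq_work demo_work;

static void demo_poke_from_nmi(void)
{
	init_irq_work(&demo_work, demo_irq_work_func);
	irq_work_queue(&demo_work);	/* arch_irq_work_raise() sends the IPI */
}
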
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 9772b1a0f9a4..48acf71c6534 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -25,6 +25,7 @@
25#include <asm/setup.h> 25#include <asm/setup.h>
26#include <asm/i8259.h> 26#include <asm/i8259.h>
27#include <asm/traps.h> 27#include <asm/traps.h>
28#include <asm/prom.h>
28 29
29/* 30/*
30 * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: 31 * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
@@ -71,6 +72,7 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id)
71static struct irqaction fpu_irq = { 72static struct irqaction fpu_irq = {
72 .handler = math_error_irq, 73 .handler = math_error_irq,
73 .name = "fpu", 74 .name = "fpu",
75 .flags = IRQF_NO_THREAD,
74}; 76};
75#endif 77#endif
76 78
@@ -80,6 +82,7 @@ static struct irqaction fpu_irq = {
80static struct irqaction irq2 = { 82static struct irqaction irq2 = {
81 .handler = no_action, 83 .handler = no_action,
82 .name = "cascade", 84 .name = "cascade",
85 .flags = IRQF_NO_THREAD,
83}; 86};
84 87
85DEFINE_PER_CPU(vector_irq_t, vector_irq) = { 88DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
@@ -100,6 +103,8 @@ int vector_used_by_percpu_irq(unsigned int vector)
100 103
101void __init init_ISA_irqs(void) 104void __init init_ISA_irqs(void)
102{ 105{
106 struct irq_chip *chip = legacy_pic->chip;
107 const char *name = chip->name;
103 int i; 108 int i;
104 109
105#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) 110#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
@@ -107,19 +112,8 @@ void __init init_ISA_irqs(void)
107#endif 112#endif
108 legacy_pic->init(0); 113 legacy_pic->init(0);
109 114
110 /* 115 for (i = 0; i < legacy_pic->nr_legacy_irqs; i++)
111 * 16 old-style INTA-cycle interrupts: 116 irq_set_chip_and_handler_name(i, chip, handle_level_irq, name);
112 */
113 for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) {
114 struct irq_desc *desc = irq_to_desc(i);
115
116 desc->status = IRQ_DISABLED;
117 desc->action = NULL;
118 desc->depth = 1;
119
120 set_irq_chip_and_handler_name(i, &i8259A_chip,
121 handle_level_irq, "XT");
122 }
123} 117}
124 118
125void __init init_IRQ(void) 119void __init init_IRQ(void)
@@ -127,6 +121,12 @@ void __init init_IRQ(void)
127 int i; 121 int i;
128 122
129 /* 123 /*
124 * We probably need a better place for this, but it works for
125 * now ...
126 */
127 x86_add_irq_domains();
128
129 /*
130 * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15. 130 * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15.
131 * If these IRQ's are handled by legacy interrupt-controllers like PIC, 131 * If these IRQ's are handled by legacy interrupt-controllers like PIC,
132 * then this configuration will likely be static after the boot. If 132 * then this configuration will likely be static after the boot. If
@@ -173,14 +173,77 @@ static void __init smp_intr_init(void)
173 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); 173 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
174 174
175 /* IPIs for invalidation */ 175 /* IPIs for invalidation */
176 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); 176#define ALLOC_INVTLB_VEC(NR) \
177 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); 177 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+NR, \
178 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); 178 invalidate_interrupt##NR)
179 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); 179
180 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); 180 switch (NUM_INVALIDATE_TLB_VECTORS) {
181 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); 181 default:
182 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); 182 ALLOC_INVTLB_VEC(31);
183 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); 183 case 31:
184 ALLOC_INVTLB_VEC(30);
185 case 30:
186 ALLOC_INVTLB_VEC(29);
187 case 29:
188 ALLOC_INVTLB_VEC(28);
189 case 28:
190 ALLOC_INVTLB_VEC(27);
191 case 27:
192 ALLOC_INVTLB_VEC(26);
193 case 26:
194 ALLOC_INVTLB_VEC(25);
195 case 25:
196 ALLOC_INVTLB_VEC(24);
197 case 24:
198 ALLOC_INVTLB_VEC(23);
199 case 23:
200 ALLOC_INVTLB_VEC(22);
201 case 22:
202 ALLOC_INVTLB_VEC(21);
203 case 21:
204 ALLOC_INVTLB_VEC(20);
205 case 20:
206 ALLOC_INVTLB_VEC(19);
207 case 19:
208 ALLOC_INVTLB_VEC(18);
209 case 18:
210 ALLOC_INVTLB_VEC(17);
211 case 17:
212 ALLOC_INVTLB_VEC(16);
213 case 16:
214 ALLOC_INVTLB_VEC(15);
215 case 15:
216 ALLOC_INVTLB_VEC(14);
217 case 14:
218 ALLOC_INVTLB_VEC(13);
219 case 13:
220 ALLOC_INVTLB_VEC(12);
221 case 12:
222 ALLOC_INVTLB_VEC(11);
223 case 11:
224 ALLOC_INVTLB_VEC(10);
225 case 10:
226 ALLOC_INVTLB_VEC(9);
227 case 9:
228 ALLOC_INVTLB_VEC(8);
229 case 8:
230 ALLOC_INVTLB_VEC(7);
231 case 7:
232 ALLOC_INVTLB_VEC(6);
233 case 6:
234 ALLOC_INVTLB_VEC(5);
235 case 5:
236 ALLOC_INVTLB_VEC(4);
237 case 4:
238 ALLOC_INVTLB_VEC(3);
239 case 3:
240 ALLOC_INVTLB_VEC(2);
241 case 2:
242 ALLOC_INVTLB_VEC(1);
243 case 1:
244 ALLOC_INVTLB_VEC(0);
245 break;
246 }
184 247
185 /* IPI for generic function call */ 248 /* IPI for generic function call */
186 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); 249 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
@@ -227,9 +290,9 @@ static void __init apic_intr_init(void)
227 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 290 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
228 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 291 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
229 292
230 /* Performance monitoring interrupts: */ 293 /* IRQ work interrupts: */
231# ifdef CONFIG_PERF_EVENTS 294# ifdef CONFIG_IRQ_WORK
232 alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); 295 alloc_intr_gate(IRQ_WORK_VECTOR, irq_work_interrupt);
233# endif 296# endif
234 297
235#endif 298#endif
@@ -255,7 +318,7 @@ void __init native_init_IRQ(void)
255 set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); 318 set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
256 } 319 }
257 320
258 if (!acpi_ioapic) 321 if (!acpi_ioapic && !of_ioapic)
259 setup_irq(2, &irq2); 322 setup_irq(2, &irq2);
260 323
261#ifdef CONFIG_X86_32 324#ifdef CONFIG_X86_32
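
Note: the ALLOC_INVTLB_VEC block above uses token pasting plus an intentional switch fall-through so that NUM_INVALIDATE_TLB_VECTORS gates handlers N-1 down to 0 are all registered. The toy below reduces that construct to user space; handler0..handler3 and ALLOC_DEMO_VEC are invented stand-ins.

/* Illustrative only: token pasting + deliberate fall-through. */
#include <stdio.h>

static void handler0(void) { puts("vector 0 wired up"); }
static void handler1(void) { puts("vector 1 wired up"); }
static void handler2(void) { puts("vector 2 wired up"); }
static void handler3(void) { puts("vector 3 wired up"); }

/* token pasting picks handler<NR>, like invalidate_interrupt##NR above */
#define ALLOC_DEMO_VEC(NR) handler##NR()

int main(void)
{
	int nr_vectors = 3;	/* stands in for NUM_INVALIDATE_TLB_VECTORS */

	switch (nr_vectors) {
	default:
		ALLOC_DEMO_VEC(3);	/* fall through */
	case 3:
		ALLOC_DEMO_VEC(2);	/* fall through */
	case 2:
		ALLOC_DEMO_VEC(1);	/* fall through */
	case 1:
		ALLOC_DEMO_VEC(0);
		break;
	}
	return 0;
}
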
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
new file mode 100644
index 000000000000..3fee346ef545
--- /dev/null
+++ b/arch/x86/kernel/jump_label.c
@@ -0,0 +1,51 @@
1/*
2 * jump label x86 support
3 *
4 * Copyright (C) 2009 Jason Baron <jbaron@redhat.com>
5 *
6 */
7#include <linux/jump_label.h>
8#include <linux/memory.h>
9#include <linux/uaccess.h>
10#include <linux/module.h>
11#include <linux/list.h>
12#include <linux/jhash.h>
13#include <linux/cpu.h>
14#include <asm/kprobes.h>
15#include <asm/alternative.h>
16
17#ifdef HAVE_JUMP_LABEL
18
19union jump_code_union {
20 char code[JUMP_LABEL_NOP_SIZE];
21 struct {
22 char jump;
23 int offset;
24 } __attribute__((packed));
25};
26
27void arch_jump_label_transform(struct jump_entry *entry,
28 enum jump_label_type type)
29{
30 union jump_code_union code;
31
32 if (type == JUMP_LABEL_ENABLE) {
33 code.jump = 0xe9;
34 code.offset = entry->target -
35 (entry->code + JUMP_LABEL_NOP_SIZE);
36 } else
37 memcpy(&code, ideal_nops[NOP_ATOMIC5], JUMP_LABEL_NOP_SIZE);
38 get_online_cpus();
39 mutex_lock(&text_mutex);
40 text_poke_smp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE);
41 mutex_unlock(&text_mutex);
42 put_online_cpus();
43}
44
45void arch_jump_label_text_poke_early(jump_label_t addr)
46{
47 text_poke_early((void *)addr, ideal_nops[NOP_ATOMIC5],
48 JUMP_LABEL_NOP_SIZE);
49}
50
51#endif
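
Note: arch_jump_label_transform() above patches the site with either a NOP or a 5-byte "jmp rel32". The stand-alone sketch below shows how that jump is encoded and why the displacement is computed as target - (code + JUMP_LABEL_NOP_SIZE); the example addresses are arbitrary.

/* Illustrative only: assembling a 5-byte jmp rel32. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define JUMP_LABEL_NOP_SIZE 5

int main(void)
{
	uint8_t insn[JUMP_LABEL_NOP_SIZE];
	uint64_t code   = 0x1000;	/* address being patched (example) */
	uint64_t target = 0x1234;	/* branch destination (example)   */
	/* displacement is relative to the end of the instruction */
	int32_t rel = (int32_t)(target - (code + JUMP_LABEL_NOP_SIZE));

	insn[0] = 0xe9;				/* jmp rel32 opcode */
	memcpy(&insn[1], &rel, sizeof(rel));	/* little-endian displacement */

	printf("jmp encoding: %02x %02x %02x %02x %02x (rel32=%d)\n",
	       insn[0], insn[1], insn[2], insn[3], insn[4], rel);
	return 0;
}
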
diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c
deleted file mode 100644
index 0f7bc20cfcde..000000000000
--- a/arch/x86/kernel/k8.c
+++ /dev/null
@@ -1,137 +0,0 @@
1/*
2 * Shared support code for AMD K8 northbridges and derivates.
3 * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2.
4 */
5#include <linux/types.h>
6#include <linux/slab.h>
7#include <linux/init.h>
8#include <linux/errno.h>
9#include <linux/module.h>
10#include <linux/spinlock.h>
11#include <asm/k8.h>
12
13int num_k8_northbridges;
14EXPORT_SYMBOL(num_k8_northbridges);
15
16static u32 *flush_words;
17
18struct pci_device_id k8_nb_ids[] = {
19 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
20 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
21 {}
22};
23EXPORT_SYMBOL(k8_nb_ids);
24
25struct pci_dev **k8_northbridges;
26EXPORT_SYMBOL(k8_northbridges);
27
28static struct pci_dev *next_k8_northbridge(struct pci_dev *dev)
29{
30 do {
31 dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
32 if (!dev)
33 break;
34 } while (!pci_match_id(&k8_nb_ids[0], dev));
35 return dev;
36}
37
38int cache_k8_northbridges(void)
39{
40 int i;
41 struct pci_dev *dev;
42
43 if (num_k8_northbridges)
44 return 0;
45
46 dev = NULL;
47 while ((dev = next_k8_northbridge(dev)) != NULL)
48 num_k8_northbridges++;
49
50 k8_northbridges = kmalloc((num_k8_northbridges + 1) * sizeof(void *),
51 GFP_KERNEL);
52 if (!k8_northbridges)
53 return -ENOMEM;
54
55 if (!num_k8_northbridges) {
56 k8_northbridges[0] = NULL;
57 return 0;
58 }
59
60 flush_words = kmalloc(num_k8_northbridges * sizeof(u32), GFP_KERNEL);
61 if (!flush_words) {
62 kfree(k8_northbridges);
63 return -ENOMEM;
64 }
65
66 dev = NULL;
67 i = 0;
68 while ((dev = next_k8_northbridge(dev)) != NULL) {
69 k8_northbridges[i] = dev;
70 pci_read_config_dword(dev, 0x9c, &flush_words[i++]);
71 }
72 k8_northbridges[i] = NULL;
73 return 0;
74}
75EXPORT_SYMBOL_GPL(cache_k8_northbridges);
76
77/* Ignores subdevice/subvendor but as far as I can figure out
78 they're useless anyways */
79int __init early_is_k8_nb(u32 device)
80{
81 struct pci_device_id *id;
82 u32 vendor = device & 0xffff;
83 device >>= 16;
84 for (id = k8_nb_ids; id->vendor; id++)
85 if (vendor == id->vendor && device == id->device)
86 return 1;
87 return 0;
88}
89
90void k8_flush_garts(void)
91{
92 int flushed, i;
93 unsigned long flags;
94 static DEFINE_SPINLOCK(gart_lock);
95
96 /* Avoid races between AGP and IOMMU. In theory it's not needed
97 but I'm not sure if the hardware won't lose flush requests
98 when another is pending. This whole thing is so expensive anyways
99 that it doesn't matter to serialize more. -AK */
100 spin_lock_irqsave(&gart_lock, flags);
101 flushed = 0;
102 for (i = 0; i < num_k8_northbridges; i++) {
103 pci_write_config_dword(k8_northbridges[i], 0x9c,
104 flush_words[i]|1);
105 flushed++;
106 }
107 for (i = 0; i < num_k8_northbridges; i++) {
108 u32 w;
109 /* Make sure the hardware actually executed the flush*/
110 for (;;) {
111 pci_read_config_dword(k8_northbridges[i],
112 0x9c, &w);
113 if (!(w & 1))
114 break;
115 cpu_relax();
116 }
117 }
118 spin_unlock_irqrestore(&gart_lock, flags);
119 if (!flushed)
120 printk("nothing to flush?\n");
121}
122EXPORT_SYMBOL_GPL(k8_flush_garts);
123
124static __init int init_k8_nbs(void)
125{
126 int err = 0;
127
128 err = cache_k8_northbridges();
129
130 if (err < 0)
131 printk(KERN_NOTICE "K8 NB: Cannot enumerate AMD northbridges.\n");
132
133 return err;
134}
135
136/* This has to go after the PCI subsystem */
137fs_initcall(init_k8_nbs);
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index 8afd9f321f10..90fcf62854bb 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -78,6 +78,7 @@ static int setup_data_open(struct inode *inode, struct file *file)
78static const struct file_operations fops_setup_data = { 78static const struct file_operations fops_setup_data = {
79 .read = setup_data_read, 79 .read = setup_data_read,
80 .open = setup_data_open, 80 .open = setup_data_open,
81 .llseek = default_llseek,
81}; 82};
82 83
83static int __init 84static int __init
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 852b81967a37..5f9ecff328b5 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -48,6 +48,7 @@
48#include <asm/apicdef.h> 48#include <asm/apicdef.h>
49#include <asm/system.h> 49#include <asm/system.h>
50#include <asm/apic.h> 50#include <asm/apic.h>
51#include <asm/nmi.h>
51 52
52struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = 53struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
53{ 54{
@@ -120,8 +121,8 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
120 memcpy(mem, (void *)regs + dbg_reg_def[regno].offset, 121 memcpy(mem, (void *)regs + dbg_reg_def[regno].offset,
121 dbg_reg_def[regno].size); 122 dbg_reg_def[regno].size);
122 123
123 switch (regno) {
124#ifdef CONFIG_X86_32 124#ifdef CONFIG_X86_32
125 switch (regno) {
125 case GDB_SS: 126 case GDB_SS:
126 if (!user_mode_vm(regs)) 127 if (!user_mode_vm(regs))
127 *(unsigned long *)mem = __KERNEL_DS; 128 *(unsigned long *)mem = __KERNEL_DS;
@@ -134,8 +135,8 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
134 case GDB_FS: 135 case GDB_FS:
135 *(unsigned long *)mem = 0xFFFF; 136 *(unsigned long *)mem = 0xFFFF;
136 break; 137 break;
137#endif
138 } 138 }
139#endif
139 return dbg_reg_def[regno].name; 140 return dbg_reg_def[regno].name;
140} 141}
141 142
@@ -277,7 +278,7 @@ static int hw_break_release_slot(int breakno)
277 pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); 278 pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
278 if (dbg_release_bp_slot(*pevent)) 279 if (dbg_release_bp_slot(*pevent))
279 /* 280 /*
280 * The debugger is responisble for handing the retry on 281 * The debugger is responsible for handing the retry on
281 * remove failure. 282 * remove failure.
282 */ 283 */
283 return -1; 284 return -1;
@@ -315,14 +316,18 @@ static void kgdb_remove_all_hw_break(void)
315 if (!breakinfo[i].enabled) 316 if (!breakinfo[i].enabled)
316 continue; 317 continue;
317 bp = *per_cpu_ptr(breakinfo[i].pev, cpu); 318 bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
318 if (bp->attr.disabled == 1) 319 if (!bp->attr.disabled) {
320 arch_uninstall_hw_breakpoint(bp);
321 bp->attr.disabled = 1;
319 continue; 322 continue;
323 }
320 if (dbg_is_early) 324 if (dbg_is_early)
321 early_dr7 &= ~encode_dr7(i, breakinfo[i].len, 325 early_dr7 &= ~encode_dr7(i, breakinfo[i].len,
322 breakinfo[i].type); 326 breakinfo[i].type);
323 else 327 else if (hw_break_release_slot(i))
324 arch_uninstall_hw_breakpoint(bp); 328 printk(KERN_ERR "KGDB: hw bpt remove failed %lx\n",
325 bp->attr.disabled = 1; 329 breakinfo[i].addr);
330 breakinfo[i].enabled = 0;
326 } 331 }
327} 332}
328 333
@@ -387,7 +392,7 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
387 * disable hardware debugging while it is processing gdb packets or 392 * disable hardware debugging while it is processing gdb packets or
388 * handling exception. 393 * handling exception.
389 */ 394 */
390void kgdb_disable_hw_debug(struct pt_regs *regs) 395static void kgdb_disable_hw_debug(struct pt_regs *regs)
391{ 396{
392 int i; 397 int i;
393 int cpu = raw_smp_processor_id(); 398 int cpu = raw_smp_processor_id();
@@ -477,8 +482,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
477 raw_smp_processor_id()); 482 raw_smp_processor_id());
478 } 483 }
479 484
480 kgdb_correct_hw_break();
481
482 return 0; 485 return 0;
483 } 486 }
484 487
@@ -523,10 +526,6 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
523 } 526 }
524 return NOTIFY_DONE; 527 return NOTIFY_DONE;
525 528
526 case DIE_NMI_IPI:
527 /* Just ignore, we will handle the roundup on DIE_NMI. */
528 return NOTIFY_DONE;
529
530 case DIE_NMIUNKNOWN: 529 case DIE_NMIUNKNOWN:
531 if (was_in_debug_nmi[raw_smp_processor_id()]) { 530 if (was_in_debug_nmi[raw_smp_processor_id()]) {
532 was_in_debug_nmi[raw_smp_processor_id()] = 0; 531 was_in_debug_nmi[raw_smp_processor_id()] = 0;
@@ -534,15 +533,6 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
534 } 533 }
535 return NOTIFY_DONE; 534 return NOTIFY_DONE;
536 535
537 case DIE_NMIWATCHDOG:
538 if (atomic_read(&kgdb_active) != -1) {
539 /* KGDB CPU roundup: */
540 kgdb_nmicallback(raw_smp_processor_id(), regs);
541 return NOTIFY_STOP;
542 }
543 /* Enter debugger: */
544 break;
545
546 case DIE_DEBUG: 536 case DIE_DEBUG:
547 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) { 537 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
548 if (user_mode(regs)) 538 if (user_mode(regs))
@@ -604,7 +594,7 @@ static struct notifier_block kgdb_notifier = {
604 /* 594 /*
605 * Lowest-prio notifier priority, we want to be notified last: 595 * Lowest-prio notifier priority, we want to be notified last:
606 */ 596 */
607 .priority = -INT_MAX, 597 .priority = NMI_LOCAL_LOW_PRIOR,
608}; 598};
609 599
610/** 600/**
@@ -621,7 +611,12 @@ int kgdb_arch_init(void)
621static void kgdb_hw_overflow_handler(struct perf_event *event, int nmi, 611static void kgdb_hw_overflow_handler(struct perf_event *event, int nmi,
622 struct perf_sample_data *data, struct pt_regs *regs) 612 struct perf_sample_data *data, struct pt_regs *regs)
623{ 613{
624 kgdb_ll_trap(DIE_DEBUG, "debug", regs, 0, 0, SIGTRAP); 614 struct task_struct *tsk = current;
615 int i;
616
617 for (i = 0; i < 4; i++)
618 if (breakinfo[i].enabled)
619 tsk->thread.debugreg6 |= (DR_TRAP0 << i);
625} 620}
626 621
627void kgdb_arch_late(void) 622void kgdb_arch_late(void)
@@ -644,7 +639,7 @@ void kgdb_arch_late(void)
644 if (breakinfo[i].pev) 639 if (breakinfo[i].pev)
645 continue; 640 continue;
646 breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); 641 breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL);
647 if (IS_ERR(breakinfo[i].pev)) { 642 if (IS_ERR((void * __force)breakinfo[i].pev)) {
648 printk(KERN_ERR "kgdb: Could not allocate hw" 643 printk(KERN_ERR "kgdb: Could not allocate hw"
649 "breakpoints\nDisabling the kernel debugger\n"); 644 "breakpoints\nDisabling the kernel debugger\n");
650 breakinfo[i].pev = NULL; 645 breakinfo[i].pev = NULL;
@@ -721,6 +716,7 @@ struct kgdb_arch arch_kgdb_ops = {
721 .flags = KGDB_HW_BREAKPOINT, 716 .flags = KGDB_HW_BREAKPOINT,
722 .set_hw_breakpoint = kgdb_set_hw_break, 717 .set_hw_breakpoint = kgdb_set_hw_break,
723 .remove_hw_breakpoint = kgdb_remove_hw_break, 718 .remove_hw_breakpoint = kgdb_remove_hw_break,
719 .disable_hw_break = kgdb_disable_hw_debug,
724 .remove_all_hw_break = kgdb_remove_all_hw_break, 720 .remove_all_hw_break = kgdb_remove_all_hw_break,
725 .correct_hw_break = kgdb_correct_hw_break, 721 .correct_hw_break = kgdb_correct_hw_break,
726}; 722};
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 770ebfb349e9..f1a6244d7d93 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -230,9 +230,6 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
230 return 0; 230 return 0;
231} 231}
232 232
233/* Dummy buffers for kallsyms_lookup */
234static char __dummy_buf[KSYM_NAME_LEN];
235
236/* Check if paddr is at an instruction boundary */ 233/* Check if paddr is at an instruction boundary */
237static int __kprobes can_probe(unsigned long paddr) 234static int __kprobes can_probe(unsigned long paddr)
238{ 235{
@@ -241,7 +238,7 @@ static int __kprobes can_probe(unsigned long paddr)
241 struct insn insn; 238 struct insn insn;
242 kprobe_opcode_t buf[MAX_INSN_SIZE]; 239 kprobe_opcode_t buf[MAX_INSN_SIZE];
243 240
244 if (!kallsyms_lookup(paddr, NULL, &offset, NULL, __dummy_buf)) 241 if (!kallsyms_lookup_size_offset(paddr, NULL, &offset))
245 return 0; 242 return 0;
246 243
247 /* Decode instructions */ 244 /* Decode instructions */
@@ -406,7 +403,7 @@ static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
406 403
407static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) 404static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
408{ 405{
409 __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp; 406 __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
410 kcb->kprobe_status = kcb->prev_kprobe.status; 407 kcb->kprobe_status = kcb->prev_kprobe.status;
411 kcb->kprobe_old_flags = kcb->prev_kprobe.old_flags; 408 kcb->kprobe_old_flags = kcb->prev_kprobe.old_flags;
412 kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags; 409 kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags;
@@ -415,7 +412,7 @@ static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
415static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, 412static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
416 struct kprobe_ctlblk *kcb) 413 struct kprobe_ctlblk *kcb)
417{ 414{
418 __get_cpu_var(current_kprobe) = p; 415 __this_cpu_write(current_kprobe, p);
419 kcb->kprobe_saved_flags = kcb->kprobe_old_flags 416 kcb->kprobe_saved_flags = kcb->kprobe_old_flags
420 = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF)); 417 = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
421 if (is_IF_modifier(p->ainsn.insn)) 418 if (is_IF_modifier(p->ainsn.insn))
@@ -589,7 +586,7 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
589 preempt_enable_no_resched(); 586 preempt_enable_no_resched();
590 return 1; 587 return 1;
591 } else if (kprobe_running()) { 588 } else if (kprobe_running()) {
592 p = __get_cpu_var(current_kprobe); 589 p = __this_cpu_read(current_kprobe);
593 if (p->break_handler && p->break_handler(p, regs)) { 590 if (p->break_handler && p->break_handler(p, regs)) {
594 setup_singlestep(p, regs, kcb, 0); 591 setup_singlestep(p, regs, kcb, 0);
595 return 1; 592 return 1;
@@ -762,11 +759,11 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
762 759
763 orig_ret_address = (unsigned long)ri->ret_addr; 760 orig_ret_address = (unsigned long)ri->ret_addr;
764 if (ri->rp && ri->rp->handler) { 761 if (ri->rp && ri->rp->handler) {
765 __get_cpu_var(current_kprobe) = &ri->rp->kp; 762 __this_cpu_write(current_kprobe, &ri->rp->kp);
766 get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; 763 get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
767 ri->ret_addr = correct_ret_addr; 764 ri->ret_addr = correct_ret_addr;
768 ri->rp->handler(ri, regs); 765 ri->rp->handler(ri, regs);
769 __get_cpu_var(current_kprobe) = NULL; 766 __this_cpu_write(current_kprobe, NULL);
770 } 767 }
771 768
772 recycle_rp_inst(ri, &empty_rp); 769 recycle_rp_inst(ri, &empty_rp);
@@ -1129,7 +1126,7 @@ static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr,
1129 *(unsigned long *)addr = val; 1126 *(unsigned long *)addr = val;
1130} 1127}
1131 1128
1132void __kprobes kprobes_optinsn_template_holder(void) 1129static void __used __kprobes kprobes_optinsn_template_holder(void)
1133{ 1130{
1134 asm volatile ( 1131 asm volatile (
1135 ".global optprobe_template_entry\n" 1132 ".global optprobe_template_entry\n"
@@ -1186,8 +1183,13 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op,
1186 struct pt_regs *regs) 1183 struct pt_regs *regs)
1187{ 1184{
1188 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); 1185 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
1186 unsigned long flags;
1189 1187
1190 preempt_disable(); 1188 /* This can happen if op is under delayed unoptimization */
1189 if (kprobe_disabled(&op->kp))
1190 return;
1191
1192 local_irq_save(flags);
1191 if (kprobe_running()) { 1193 if (kprobe_running()) {
1192 kprobes_inc_nmissed_count(&op->kp); 1194 kprobes_inc_nmissed_count(&op->kp);
1193 } else { 1195 } else {
@@ -1201,12 +1203,12 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op,
1201 regs->ip = (unsigned long)op->kp.addr + INT3_SIZE; 1203 regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
1202 regs->orig_ax = ~0UL; 1204 regs->orig_ax = ~0UL;
1203 1205
1204 __get_cpu_var(current_kprobe) = &op->kp; 1206 __this_cpu_write(current_kprobe, &op->kp);
1205 kcb->kprobe_status = KPROBE_HIT_ACTIVE; 1207 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
1206 opt_pre_handler(&op->kp, regs); 1208 opt_pre_handler(&op->kp, regs);
1207 __get_cpu_var(current_kprobe) = NULL; 1209 __this_cpu_write(current_kprobe, NULL);
1208 } 1210 }
1209 preempt_enable_no_resched(); 1211 local_irq_restore(flags);
1210} 1212}
1211 1213
1212static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src) 1214static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
@@ -1221,7 +1223,8 @@ static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
1221 } 1223 }
1222 /* Check whether the address range is reserved */ 1224 /* Check whether the address range is reserved */
1223 if (ftrace_text_reserved(src, src + len - 1) || 1225 if (ftrace_text_reserved(src, src + len - 1) ||
1224 alternatives_text_reserved(src, src + len - 1)) 1226 alternatives_text_reserved(src, src + len - 1) ||
1227 jump_label_text_reserved(src, src + len - 1))
1225 return -EBUSY; 1228 return -EBUSY;
1226 1229
1227 return len; 1230 return len;
@@ -1269,11 +1272,17 @@ static int __kprobes can_optimize(unsigned long paddr)
1269 unsigned long addr, size = 0, offset = 0; 1272 unsigned long addr, size = 0, offset = 0;
1270 struct insn insn; 1273 struct insn insn;
1271 kprobe_opcode_t buf[MAX_INSN_SIZE]; 1274 kprobe_opcode_t buf[MAX_INSN_SIZE];
1272 /* Dummy buffers for lookup_symbol_attrs */
1273 static char __dummy_buf[KSYM_NAME_LEN];
1274 1275
1275 /* Lookup symbol including addr */ 1276 /* Lookup symbol including addr */
1276 if (!kallsyms_lookup(paddr, &size, &offset, NULL, __dummy_buf)) 1277 if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
1278 return 0;
1279
1280 /*
1281 * Do not optimize in the entry code due to the unstable
1282 * stack handling.
1283 */
1284 if ((paddr >= (unsigned long )__entry_text_start) &&
1285 (paddr < (unsigned long )__entry_text_end))
1277 return 0; 1286 return 0;
1278 1287
1279 /* Check there is enough space for a relative jump. */ 1288 /* Check there is enough space for a relative jump. */
@@ -1405,10 +1414,16 @@ int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
1405 return 0; 1414 return 0;
1406} 1415}
1407 1416
1408/* Replace a breakpoint (int3) with a relative jump. */ 1417#define MAX_OPTIMIZE_PROBES 256
1409int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op) 1418static struct text_poke_param *jump_poke_params;
1419static struct jump_poke_buffer {
1420 u8 buf[RELATIVEJUMP_SIZE];
1421} *jump_poke_bufs;
1422
1423static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm,
1424 u8 *insn_buf,
1425 struct optimized_kprobe *op)
1410{ 1426{
1411 unsigned char jmp_code[RELATIVEJUMP_SIZE];
1412 s32 rel = (s32)((long)op->optinsn.insn - 1427 s32 rel = (s32)((long)op->optinsn.insn -
1413 ((long)op->kp.addr + RELATIVEJUMP_SIZE)); 1428 ((long)op->kp.addr + RELATIVEJUMP_SIZE));
1414 1429
@@ -1416,16 +1431,79 @@ int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op)
1416 memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, 1431 memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
1417 RELATIVE_ADDR_SIZE); 1432 RELATIVE_ADDR_SIZE);
1418 1433
1419 jmp_code[0] = RELATIVEJUMP_OPCODE; 1434 insn_buf[0] = RELATIVEJUMP_OPCODE;
1420 *(s32 *)(&jmp_code[1]) = rel; 1435 *(s32 *)(&insn_buf[1]) = rel;
1436
1437 tprm->addr = op->kp.addr;
1438 tprm->opcode = insn_buf;
1439 tprm->len = RELATIVEJUMP_SIZE;
1440}
1441
1442/*
1443 * Replace breakpoints (int3) with relative jumps.
1444 * Caller must hold kprobe_mutex and text_mutex.
1445 */
1446void __kprobes arch_optimize_kprobes(struct list_head *oplist)
1447{
1448 struct optimized_kprobe *op, *tmp;
1449 int c = 0;
1450
1451 list_for_each_entry_safe(op, tmp, oplist, list) {
1452 WARN_ON(kprobe_disabled(&op->kp));
1453 /* Setup param */
1454 setup_optimize_kprobe(&jump_poke_params[c],
1455 jump_poke_bufs[c].buf, op);
1456 list_del_init(&op->list);
1457 if (++c >= MAX_OPTIMIZE_PROBES)
1458 break;
1459 }
1421 1460
1422 /* 1461 /*
1423 * text_poke_smp doesn't support NMI/MCE code modifying. 1462 * text_poke_smp doesn't support NMI/MCE code modifying.
1424 * However, since kprobes itself also doesn't support NMI/MCE 1463 * However, since kprobes itself also doesn't support NMI/MCE
1425 * code probing, it's not a problem. 1464 * code probing, it's not a problem.
1426 */ 1465 */
1427 text_poke_smp(op->kp.addr, jmp_code, RELATIVEJUMP_SIZE); 1466 text_poke_smp_batch(jump_poke_params, c);
1428 return 0; 1467}
1468
1469static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm,
1470 u8 *insn_buf,
1471 struct optimized_kprobe *op)
1472{
1473 /* Set int3 to first byte for kprobes */
1474 insn_buf[0] = BREAKPOINT_INSTRUCTION;
1475 memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
1476
1477 tprm->addr = op->kp.addr;
1478 tprm->opcode = insn_buf;
1479 tprm->len = RELATIVEJUMP_SIZE;
1480}
1481
1482/*
1483 * Recover original instructions and breakpoints from relative jumps.
1484 * Caller must hold kprobe_mutex.
1485 */
1486extern void arch_unoptimize_kprobes(struct list_head *oplist,
1487 struct list_head *done_list)
1488{
1489 struct optimized_kprobe *op, *tmp;
1490 int c = 0;
1491
1492 list_for_each_entry_safe(op, tmp, oplist, list) {
1493 /* Setup param */
1494 setup_unoptimize_kprobe(&jump_poke_params[c],
1495 jump_poke_bufs[c].buf, op);
1496 list_move(&op->list, done_list);
1497 if (++c >= MAX_OPTIMIZE_PROBES)
1498 break;
1499 }
1500
1501 /*
1502 * text_poke_smp doesn't support NMI/MCE code modifying.
1503 * However, since kprobes itself also doesn't support NMI/MCE
1504 * code probing, it's not a problem.
1505 */
1506 text_poke_smp_batch(jump_poke_params, c);
1429} 1507}
1430 1508
1431/* Replace a relative jump with a breakpoint (int3). */ 1509/* Replace a relative jump with a breakpoint (int3). */
@@ -1457,11 +1535,35 @@ static int __kprobes setup_detour_execution(struct kprobe *p,
1457 } 1535 }
1458 return 0; 1536 return 0;
1459} 1537}
1538
1539static int __kprobes init_poke_params(void)
1540{
1541 /* Allocate code buffer and parameter array */
1542 jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) *
1543 MAX_OPTIMIZE_PROBES, GFP_KERNEL);
1544 if (!jump_poke_bufs)
1545 return -ENOMEM;
1546
1547 jump_poke_params = kmalloc(sizeof(struct text_poke_param) *
1548 MAX_OPTIMIZE_PROBES, GFP_KERNEL);
1549 if (!jump_poke_params) {
1550 kfree(jump_poke_bufs);
1551 jump_poke_bufs = NULL;
1552 return -ENOMEM;
1553 }
1554
1555 return 0;
1556}
1557#else /* !CONFIG_OPTPROBES */
1558static int __kprobes init_poke_params(void)
1559{
1560 return 0;
1561}
1460#endif 1562#endif
1461 1563
1462int __init arch_init_kprobes(void) 1564int __init arch_init_kprobes(void)
1463{ 1565{
1464 return 0; 1566 return init_poke_params();
1465} 1567}
1466 1568
1467int __kprobes arch_trampoline_kprobe(struct kprobe *p) 1569int __kprobes arch_trampoline_kprobe(struct kprobe *p)
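
The hunk above batches probe optimization: each probe fills a text_poke_param (addr, opcode, len) and a single text_poke_smp_batch() call flushes up to MAX_OPTIMIZE_PROBES of them, instead of one text_poke_smp() per probe. The opcode buffer is a plain x86 near jump whose rel32 displacement is measured from the end of the 5-byte instruction. A minimal user-space sketch of that displacement arithmetic, assuming little-endian byte order as on x86 (the addresses are made up; this is an illustration, not the kernel code):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define RELATIVEJUMP_OPCODE 0xe9   /* x86 "jmp rel32" */
#define RELATIVEJUMP_SIZE   5      /* opcode byte + 4-byte displacement */

/* Encode "jmp rel32" from probe_addr to detour_addr into a 5-byte buffer. */
static void encode_reljump(uint8_t buf[RELATIVEJUMP_SIZE],
                           unsigned long probe_addr, unsigned long detour_addr)
{
        /* The displacement is relative to the end of the jump instruction. */
        int32_t rel = (int32_t)((long)detour_addr -
                                ((long)probe_addr + RELATIVEJUMP_SIZE));

        buf[0] = RELATIVEJUMP_OPCODE;
        memcpy(&buf[1], &rel, sizeof(rel));     /* little-endian, as x86 encodes it */
}

int main(void)
{
        uint8_t buf[RELATIVEJUMP_SIZE];

        encode_reljump(buf, 0x1000, 0x1200);    /* rel = 0x1200 - 0x1005 = 0x1fb */
        printf("%02x %02x %02x %02x %02x\n",    /* prints: e9 fb 01 00 00 */
               buf[0], buf[1], buf[2], buf[3], buf[4]);
        return 0;
}
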
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 63b0ec8d3d4a..33c07b0b122e 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -27,16 +27,37 @@
27#include <linux/mm.h> 27#include <linux/mm.h>
28#include <linux/highmem.h> 28#include <linux/highmem.h>
29#include <linux/hardirq.h> 29#include <linux/hardirq.h>
30#include <linux/notifier.h>
31#include <linux/reboot.h>
32#include <linux/hash.h>
33#include <linux/sched.h>
34#include <linux/slab.h>
35#include <linux/kprobes.h>
30#include <asm/timer.h> 36#include <asm/timer.h>
37#include <asm/cpu.h>
38#include <asm/traps.h>
39#include <asm/desc.h>
40#include <asm/tlbflush.h>
31 41
32#define MMU_QUEUE_SIZE 1024 42#define MMU_QUEUE_SIZE 1024
33 43
44static int kvmapf = 1;
45
46static int parse_no_kvmapf(char *arg)
47{
48 kvmapf = 0;
49 return 0;
50}
51
52early_param("no-kvmapf", parse_no_kvmapf);
53
34struct kvm_para_state { 54struct kvm_para_state {
35 u8 mmu_queue[MMU_QUEUE_SIZE]; 55 u8 mmu_queue[MMU_QUEUE_SIZE];
36 int mmu_queue_len; 56 int mmu_queue_len;
37}; 57};
38 58
39static DEFINE_PER_CPU(struct kvm_para_state, para_state); 59static DEFINE_PER_CPU(struct kvm_para_state, para_state);
60static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
40 61
41static struct kvm_para_state *kvm_para_state(void) 62static struct kvm_para_state *kvm_para_state(void)
42{ 63{
@@ -50,6 +71,195 @@ static void kvm_io_delay(void)
50{ 71{
51} 72}
52 73
74#define KVM_TASK_SLEEP_HASHBITS 8
75#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)
76
77struct kvm_task_sleep_node {
78 struct hlist_node link;
79 wait_queue_head_t wq;
80 u32 token;
81 int cpu;
82 bool halted;
83 struct mm_struct *mm;
84};
85
86static struct kvm_task_sleep_head {
87 spinlock_t lock;
88 struct hlist_head list;
89} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
90
91static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
92 u32 token)
93{
94 struct hlist_node *p;
95
96 hlist_for_each(p, &b->list) {
97 struct kvm_task_sleep_node *n =
98 hlist_entry(p, typeof(*n), link);
99 if (n->token == token)
100 return n;
101 }
102
103 return NULL;
104}
105
106void kvm_async_pf_task_wait(u32 token)
107{
108 u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
109 struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
110 struct kvm_task_sleep_node n, *e;
111 DEFINE_WAIT(wait);
112 int cpu, idle;
113
114 cpu = get_cpu();
115 idle = idle_cpu(cpu);
116 put_cpu();
117
118 spin_lock(&b->lock);
119 e = _find_apf_task(b, token);
120 if (e) {
121 /* dummy entry exists -> wake-up was delivered ahead of PF */
122 hlist_del(&e->link);
123 kfree(e);
124 spin_unlock(&b->lock);
125 return;
126 }
127
128 n.token = token;
129 n.cpu = smp_processor_id();
130 n.mm = current->active_mm;
131 n.halted = idle || preempt_count() > 1;
132 atomic_inc(&n.mm->mm_count);
133 init_waitqueue_head(&n.wq);
134 hlist_add_head(&n.link, &b->list);
135 spin_unlock(&b->lock);
136
137 for (;;) {
138 if (!n.halted)
139 prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
140 if (hlist_unhashed(&n.link))
141 break;
142
143 if (!n.halted) {
144 local_irq_enable();
145 schedule();
146 local_irq_disable();
147 } else {
148 /*
149 * We cannot reschedule. So halt.
150 */
151 native_safe_halt();
152 local_irq_disable();
153 }
154 }
155 if (!n.halted)
156 finish_wait(&n.wq, &wait);
157
158 return;
159}
160EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);
161
162static void apf_task_wake_one(struct kvm_task_sleep_node *n)
163{
164 hlist_del_init(&n->link);
165 if (!n->mm)
166 return;
167 mmdrop(n->mm);
168 if (n->halted)
169 smp_send_reschedule(n->cpu);
170 else if (waitqueue_active(&n->wq))
171 wake_up(&n->wq);
172}
173
174static void apf_task_wake_all(void)
175{
176 int i;
177
178 for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
179 struct hlist_node *p, *next;
180 struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
181 spin_lock(&b->lock);
182 hlist_for_each_safe(p, next, &b->list) {
183 struct kvm_task_sleep_node *n =
184 hlist_entry(p, typeof(*n), link);
185 if (n->cpu == smp_processor_id())
186 apf_task_wake_one(n);
187 }
188 spin_unlock(&b->lock);
189 }
190}
191
192void kvm_async_pf_task_wake(u32 token)
193{
194 u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
195 struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
196 struct kvm_task_sleep_node *n;
197
198 if (token == ~0) {
199 apf_task_wake_all();
200 return;
201 }
202
203again:
204 spin_lock(&b->lock);
205 n = _find_apf_task(b, token);
206 if (!n) {
207 /*
208 * The async PF has not been handled yet.
209 * Add a dummy entry for the token.
210 */
211 n = kmalloc(sizeof(*n), GFP_ATOMIC);
212 if (!n) {
213 /*
214 * Allocation failed! Busy-wait while another cpu
215 * handles the async PF.
216 */
217 spin_unlock(&b->lock);
218 cpu_relax();
219 goto again;
220 }
221 n->token = token;
222 n->cpu = smp_processor_id();
223 n->mm = NULL;
224 init_waitqueue_head(&n->wq);
225 hlist_add_head(&n->link, &b->list);
226 } else
227 apf_task_wake_one(n);
228 spin_unlock(&b->lock);
229 return;
230}
231EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
232
233u32 kvm_read_and_reset_pf_reason(void)
234{
235 u32 reason = 0;
236
237 if (__get_cpu_var(apf_reason).enabled) {
238 reason = __get_cpu_var(apf_reason).reason;
239 __get_cpu_var(apf_reason).reason = 0;
240 }
241
242 return reason;
243}
244EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
245
246dotraplinkage void __kprobes
247do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
248{
249 switch (kvm_read_and_reset_pf_reason()) {
250 default:
251 do_page_fault(regs, error_code);
252 break;
253 case KVM_PV_REASON_PAGE_NOT_PRESENT:
254 /* page is swapped out by the host. */
255 kvm_async_pf_task_wait((u32)read_cr2());
256 break;
257 case KVM_PV_REASON_PAGE_READY:
258 kvm_async_pf_task_wake((u32)read_cr2());
259 break;
260 }
261}
262
53static void kvm_mmu_op(void *buffer, unsigned len) 263static void kvm_mmu_op(void *buffer, unsigned len)
54{ 264{
55 int r; 265 int r;
@@ -231,10 +441,117 @@ static void __init paravirt_ops_setup(void)
231#endif 441#endif
232} 442}
233 443
444void __cpuinit kvm_guest_cpu_init(void)
445{
446 if (!kvm_para_available())
447 return;
448
449 if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
450 u64 pa = __pa(&__get_cpu_var(apf_reason));
451
452#ifdef CONFIG_PREEMPT
453 pa |= KVM_ASYNC_PF_SEND_ALWAYS;
454#endif
455 wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED);
456 __get_cpu_var(apf_reason).enabled = 1;
457 printk(KERN_INFO"KVM setup async PF for cpu %d\n",
458 smp_processor_id());
459 }
460}
461
462static void kvm_pv_disable_apf(void *unused)
463{
464 if (!__get_cpu_var(apf_reason).enabled)
465 return;
466
467 wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
468 __get_cpu_var(apf_reason).enabled = 0;
469
470 printk(KERN_INFO"Unregister pv shared memory for cpu %d\n",
471 smp_processor_id());
472}
473
474static int kvm_pv_reboot_notify(struct notifier_block *nb,
475 unsigned long code, void *unused)
476{
477 if (code == SYS_RESTART)
478 on_each_cpu(kvm_pv_disable_apf, NULL, 1);
479 return NOTIFY_DONE;
480}
481
482static struct notifier_block kvm_pv_reboot_nb = {
483 .notifier_call = kvm_pv_reboot_notify,
484};
485
486#ifdef CONFIG_SMP
487static void __init kvm_smp_prepare_boot_cpu(void)
488{
489#ifdef CONFIG_KVM_CLOCK
490 WARN_ON(kvm_register_clock("primary cpu clock"));
491#endif
492 kvm_guest_cpu_init();
493 native_smp_prepare_boot_cpu();
494}
495
496static void __cpuinit kvm_guest_cpu_online(void *dummy)
497{
498 kvm_guest_cpu_init();
499}
500
501static void kvm_guest_cpu_offline(void *dummy)
502{
503 kvm_pv_disable_apf(NULL);
504 apf_task_wake_all();
505}
506
507static int __cpuinit kvm_cpu_notify(struct notifier_block *self,
508 unsigned long action, void *hcpu)
509{
510 int cpu = (unsigned long)hcpu;
511 switch (action) {
512 case CPU_ONLINE:
513 case CPU_DOWN_FAILED:
514 case CPU_ONLINE_FROZEN:
515 smp_call_function_single(cpu, kvm_guest_cpu_online, NULL, 0);
516 break;
517 case CPU_DOWN_PREPARE:
518 case CPU_DOWN_PREPARE_FROZEN:
519 smp_call_function_single(cpu, kvm_guest_cpu_offline, NULL, 1);
520 break;
521 default:
522 break;
523 }
524 return NOTIFY_OK;
525}
526
527static struct notifier_block __cpuinitdata kvm_cpu_notifier = {
528 .notifier_call = kvm_cpu_notify,
529};
530#endif
531
532static void __init kvm_apf_trap_init(void)
533{
534 set_intr_gate(14, &async_page_fault);
535}
536
234void __init kvm_guest_init(void) 537void __init kvm_guest_init(void)
235{ 538{
539 int i;
540
236 if (!kvm_para_available()) 541 if (!kvm_para_available())
237 return; 542 return;
238 543
239 paravirt_ops_setup(); 544 paravirt_ops_setup();
545 register_reboot_notifier(&kvm_pv_reboot_nb);
546 for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
547 spin_lock_init(&async_pf_sleepers[i].lock);
548 if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
549 x86_init.irqs.trap_init = kvm_apf_trap_init;
550
551#ifdef CONFIG_SMP
552 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
553 register_cpu_notifier(&kvm_cpu_notifier);
554#else
555 kvm_guest_cpu_init();
556#endif
240} 557}
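
The async PF plumbing above keys an array of spinlocked hlist buckets by hash_32(token, KVM_TASK_SLEEP_HASHBITS); a wake-up that arrives before the matching fault leaves a dummy node behind so the fault path knows not to sleep. A stripped-down, single-threaded user-space model of that handshake (no locking or waitqueues; the multiplicative hash merely stands in for hash_32(), and all names below are illustrative):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define TASK_SLEEP_HASHBITS 8
#define TASK_SLEEP_HASHSIZE (1u << TASK_SLEEP_HASHBITS)

struct sleep_node {
        struct sleep_node *next;
        uint32_t token;
        bool dummy;             /* wake-up arrived before the fault did */
};

static struct sleep_node *buckets[TASK_SLEEP_HASHSIZE];

/* Simple multiplicative hash standing in for the kernel's hash_32(). */
static unsigned int bucket_of(uint32_t token)
{
        return (token * 0x9e370001u) >> (32 - TASK_SLEEP_HASHBITS);
}

static struct sleep_node *find_node(uint32_t token)
{
        struct sleep_node *n;

        for (n = buckets[bucket_of(token)]; n; n = n->next)
                if (n->token == token)
                        return n;
        return NULL;
}

/* "Page ready" side: wake the waiter, or leave a dummy if it has not slept yet. */
static void task_wake(uint32_t token)
{
        struct sleep_node *n = find_node(token);

        if (!n) {
                n = calloc(1, sizeof(*n));
                if (!n)
                        return;         /* the kernel busy-waits here instead */
                n->token = token;
                n->dummy = true;
                n->next = buckets[bucket_of(token)];
                buckets[bucket_of(token)] = n;
                return;
        }
        printf("wake waiter for token %u\n", token);
}

/* "Page not present" side: a dummy means the wake-up already happened. */
static void task_wait(uint32_t token)
{
        struct sleep_node *n = find_node(token);

        if (n && n->dummy) {
                printf("token %u already completed, no sleep needed\n", token);
                return;         /* the kernel unlinks and frees the dummy here */
        }
        printf("token %u: would enqueue and sleep here\n", token);
}

int main(void)
{
        task_wake(42);          /* completion delivered first... */
        task_wait(42);          /* ...so the fault path does not sleep */
        return 0;
}
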
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index eb9b76c716c2..6389a6bca11b 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -26,8 +26,6 @@
26#include <asm/x86_init.h> 26#include <asm/x86_init.h>
27#include <asm/reboot.h> 27#include <asm/reboot.h>
28 28
29#define KVM_SCALE 22
30
31static int kvmclock = 1; 29static int kvmclock = 1;
32static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; 30static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
33static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; 31static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
@@ -120,21 +118,21 @@ static struct clocksource kvm_clock = {
120 .read = kvm_clock_get_cycles, 118 .read = kvm_clock_get_cycles,
121 .rating = 400, 119 .rating = 400,
122 .mask = CLOCKSOURCE_MASK(64), 120 .mask = CLOCKSOURCE_MASK(64),
123 .mult = 1 << KVM_SCALE,
124 .shift = KVM_SCALE,
125 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 121 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
126}; 122};
127 123
128static int kvm_register_clock(char *txt) 124int kvm_register_clock(char *txt)
129{ 125{
130 int cpu = smp_processor_id(); 126 int cpu = smp_processor_id();
131 int low, high; 127 int low, high, ret;
128
132 low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1; 129 low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;
133 high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); 130 high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
131 ret = native_write_msr_safe(msr_kvm_system_time, low, high);
134 printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", 132 printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
135 cpu, high, low, txt); 133 cpu, high, low, txt);
136 134
137 return native_write_msr_safe(msr_kvm_system_time, low, high); 135 return ret;
138} 136}
139 137
140#ifdef CONFIG_X86_LOCAL_APIC 138#ifdef CONFIG_X86_LOCAL_APIC
@@ -150,14 +148,6 @@ static void __cpuinit kvm_setup_secondary_clock(void)
150} 148}
151#endif 149#endif
152 150
153#ifdef CONFIG_SMP
154static void __init kvm_smp_prepare_boot_cpu(void)
155{
156 WARN_ON(kvm_register_clock("primary cpu clock"));
157 native_smp_prepare_boot_cpu();
158}
159#endif
160
161/* 151/*
162 * After the clock is registered, the host will keep writing to the 152 * After the clock is registered, the host will keep writing to the
163 * registered memory location. If the guest happens to shutdown, this memory 153 * registered memory location. If the guest happens to shutdown, this memory
@@ -204,15 +194,12 @@ void __init kvmclock_init(void)
204 x86_cpuinit.setup_percpu_clockev = 194 x86_cpuinit.setup_percpu_clockev =
205 kvm_setup_secondary_clock; 195 kvm_setup_secondary_clock;
206#endif 196#endif
207#ifdef CONFIG_SMP
208 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
209#endif
210 machine_ops.shutdown = kvm_shutdown; 197 machine_ops.shutdown = kvm_shutdown;
211#ifdef CONFIG_KEXEC 198#ifdef CONFIG_KEXEC
212 machine_ops.crash_shutdown = kvm_crash_shutdown; 199 machine_ops.crash_shutdown = kvm_crash_shutdown;
213#endif 200#endif
214 kvm_get_preset_lpj(); 201 kvm_get_preset_lpj();
215 clocksource_register(&kvm_clock); 202 clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
216 pv_info.paravirt_enabled = 1; 203 pv_info.paravirt_enabled = 1;
217 pv_info.name = "KVM"; 204 pv_info.name = "KVM";
218 205
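
kvm_register_clock() above splits the per-cpu hv_clock physical address into the two 32-bit MSR halves, ORing 1 into the low half before writing MSR_KVM_SYSTEM_TIME, and the driver now lets clocksource_register_hz() derive mult/shift instead of hard-coding KVM_SCALE. A small sketch of the address split (the address is made up; calling bit 0 the enable flag is the conventional reading of the ORed-in 1, not something stated in the hunk):

#include <stdint.h>
#include <stdio.h>

/* Split a 64-bit physical address into the MSR's low/high halves, setting bit 0. */
static void split_system_time_pa(uint64_t pa, uint32_t *low, uint32_t *high)
{
        *low  = (uint32_t)pa | 1;       /* the patch ORs in 1 (enable bit) */
        *high = (uint32_t)(pa >> 32);
}

int main(void)
{
        uint32_t low, high;

        split_system_time_pa(0x123456789000ULL, &low, &high);
        printf("msr %x:%x\n", (unsigned)high, (unsigned)low);  /* high:low, as the kernel prints */
        return 0;
}
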
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 035c8c529181..b3ea9db39db6 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -36,7 +36,7 @@ static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
36 if (!page) 36 if (!page)
37 goto out; 37 goto out;
38 pud = (pud_t *)page_address(page); 38 pud = (pud_t *)page_address(page);
39 memset(pud, 0, PAGE_SIZE); 39 clear_page(pud);
40 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); 40 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
41 } 41 }
42 pud = pud_offset(pgd, addr); 42 pud = pud_offset(pgd, addr);
@@ -45,7 +45,7 @@ static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
45 if (!page) 45 if (!page)
46 goto out; 46 goto out;
47 pmd = (pmd_t *)page_address(page); 47 pmd = (pmd_t *)page_address(page);
48 memset(pmd, 0, PAGE_SIZE); 48 clear_page(pmd);
49 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); 49 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
50 } 50 }
51 pmd = pmd_offset(pud, addr); 51 pmd = pmd_offset(pud, addr);
diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c
index 63eaf6596233..177183cbb6ae 100644
--- a/arch/x86/kernel/mca_32.c
+++ b/arch/x86/kernel/mca_32.c
@@ -259,7 +259,7 @@ static int __init mca_init(void)
259 /* 259 /*
260 * WARNING: Be careful when making changes here. Putting an adapter 260 * WARNING: Be careful when making changes here. Putting an adapter
261 * and the motherboard simultaneously into setup mode may result in 261 * and the motherboard simultaneously into setup mode may result in
262 * damage to chips (according to The Indispensible PC Hardware Book 262 * damage to chips (according to The Indispensable PC Hardware Book
263 * by Hans-Peter Messmer). Also, we disable system interrupts (so 263 * by Hans-Peter Messmer). Also, we disable system interrupts (so
264 * that we are not disturbed in the middle of this). 264 * that we are not disturbed in the middle of this).
265 */ 265 */
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index e1af7c055c7d..c5610384ab16 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -66,7 +66,6 @@ struct microcode_amd {
66 unsigned int mpb[0]; 66 unsigned int mpb[0];
67}; 67};
68 68
69#define UCODE_MAX_SIZE 2048
70#define UCODE_CONTAINER_SECTION_HDR 8 69#define UCODE_CONTAINER_SECTION_HDR 8
71#define UCODE_CONTAINER_HEADER_SIZE 12 70#define UCODE_CONTAINER_HEADER_SIZE 12
72 71
@@ -77,20 +76,20 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
77 struct cpuinfo_x86 *c = &cpu_data(cpu); 76 struct cpuinfo_x86 *c = &cpu_data(cpu);
78 u32 dummy; 77 u32 dummy;
79 78
80 memset(csig, 0, sizeof(*csig));
81 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { 79 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
82 pr_warning("microcode: CPU%d: AMD CPU family 0x%x not " 80 pr_warning("CPU%d: family %d not supported\n", cpu, c->x86);
83 "supported\n", cpu, c->x86);
84 return -1; 81 return -1;
85 } 82 }
83
86 rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); 84 rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy);
87 pr_info("CPU%d: patch_level=0x%x\n", cpu, csig->rev); 85 pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev);
86
88 return 0; 87 return 0;
89} 88}
90 89
91static int get_matching_microcode(int cpu, void *mc, int rev) 90static int get_matching_microcode(int cpu, struct microcode_header_amd *mc_hdr,
91 int rev)
92{ 92{
93 struct microcode_header_amd *mc_header = mc;
94 unsigned int current_cpu_id; 93 unsigned int current_cpu_id;
95 u16 equiv_cpu_id = 0; 94 u16 equiv_cpu_id = 0;
96 unsigned int i = 0; 95 unsigned int i = 0;
@@ -109,17 +108,17 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
109 if (!equiv_cpu_id) 108 if (!equiv_cpu_id)
110 return 0; 109 return 0;
111 110
112 if (mc_header->processor_rev_id != equiv_cpu_id) 111 if (mc_hdr->processor_rev_id != equiv_cpu_id)
113 return 0; 112 return 0;
114 113
115 /* ucode might be chipset specific -- currently we don't support this */ 114 /* ucode might be chipset specific -- currently we don't support this */
116 if (mc_header->nb_dev_id || mc_header->sb_dev_id) { 115 if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) {
117 pr_err("CPU%d: loading of chipset specific code not yet supported\n", 116 pr_err("CPU%d: chipset specific code not yet supported\n",
118 cpu); 117 cpu);
119 return 0; 118 return 0;
120 } 119 }
121 120
122 if (mc_header->patch_id <= rev) 121 if (mc_hdr->patch_id <= rev)
123 return 0; 122 return 0;
124 123
125 return 1; 124 return 1;
@@ -144,85 +143,93 @@ static int apply_microcode_amd(int cpu)
144 143
145 /* check current patch id and patch's id for match */ 144 /* check current patch id and patch's id for match */
146 if (rev != mc_amd->hdr.patch_id) { 145 if (rev != mc_amd->hdr.patch_id) {
147 pr_err("CPU%d: update failed (for patch_level=0x%x)\n", 146 pr_err("CPU%d: update failed for patch_level=0x%08x\n",
148 cpu, mc_amd->hdr.patch_id); 147 cpu, mc_amd->hdr.patch_id);
149 return -1; 148 return -1;
150 } 149 }
151 150
152 pr_info("CPU%d: updated (new patch_level=0x%x)\n", cpu, rev); 151 pr_info("CPU%d: new patch_level=0x%08x\n", cpu, rev);
153 uci->cpu_sig.rev = rev; 152 uci->cpu_sig.rev = rev;
154 153
155 return 0; 154 return 0;
156} 155}
157 156
158static int get_ucode_data(void *to, const u8 *from, size_t n) 157static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size)
159{ 158{
160 memcpy(to, from, n); 159 struct cpuinfo_x86 *c = &cpu_data(cpu);
161 return 0; 160 unsigned int max_size, actual_size;
161
162#define F1XH_MPB_MAX_SIZE 2048
163#define F14H_MPB_MAX_SIZE 1824
164#define F15H_MPB_MAX_SIZE 4096
165
166 switch (c->x86) {
167 case 0x14:
168 max_size = F14H_MPB_MAX_SIZE;
169 break;
170 case 0x15:
171 max_size = F15H_MPB_MAX_SIZE;
172 break;
173 default:
174 max_size = F1XH_MPB_MAX_SIZE;
175 break;
176 }
177
178 actual_size = buf[4] + (buf[5] << 8);
179
180 if (actual_size > size || actual_size > max_size) {
181 pr_err("section size mismatch\n");
182 return 0;
183 }
184
185 return actual_size;
162} 186}
163 187
164static void * 188static struct microcode_header_amd *
165get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) 189get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size)
166{ 190{
167 unsigned int total_size; 191 struct microcode_header_amd *mc = NULL;
168 u8 section_hdr[UCODE_CONTAINER_SECTION_HDR]; 192 unsigned int actual_size = 0;
169 void *mc;
170 193
171 if (get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR)) 194 if (buf[0] != UCODE_UCODE_TYPE) {
172 return NULL; 195 pr_err("invalid type field in container file section header\n");
173 196 goto out;
174 if (section_hdr[0] != UCODE_UCODE_TYPE) {
175 pr_err("error: invalid type field in container file section header\n");
176 return NULL;
177 } 197 }
178 198
179 total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); 199 actual_size = verify_ucode_size(cpu, buf, size);
200 if (!actual_size)
201 goto out;
180 202
181 if (total_size > size || total_size > UCODE_MAX_SIZE) { 203 mc = vzalloc(actual_size);
182 pr_err("error: size mismatch\n"); 204 if (!mc)
183 return NULL; 205 goto out;
184 }
185 206
186 mc = vmalloc(UCODE_MAX_SIZE); 207 get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, actual_size);
187 if (mc) { 208 *mc_size = actual_size + UCODE_CONTAINER_SECTION_HDR;
188 memset(mc, 0, UCODE_MAX_SIZE); 209
189 if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, 210out:
190 total_size)) {
191 vfree(mc);
192 mc = NULL;
193 } else
194 *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR;
195 }
196 return mc; 211 return mc;
197} 212}
198 213
199static int install_equiv_cpu_table(const u8 *buf) 214static int install_equiv_cpu_table(const u8 *buf)
200{ 215{
201 u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE]; 216 unsigned int *ibuf = (unsigned int *)buf;
202 unsigned int *buf_pos = (unsigned int *)container_hdr; 217 unsigned int type = ibuf[1];
203 unsigned long size; 218 unsigned int size = ibuf[2];
204 219
205 if (get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE)) 220 if (type != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
206 return 0; 221 pr_err("empty section/"
207 222 "invalid type field in container file section header\n");
208 size = buf_pos[2]; 223 return -EINVAL;
209
210 if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
211 pr_err("error: invalid type field in container file section header\n");
212 return 0;
213 } 224 }
214 225
215 equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size); 226 equiv_cpu_table = vmalloc(size);
216 if (!equiv_cpu_table) { 227 if (!equiv_cpu_table) {
217 pr_err("failed to allocate equivalent CPU table\n"); 228 pr_err("failed to allocate equivalent CPU table\n");
218 return 0; 229 return -ENOMEM;
219 } 230 }
220 231
221 buf += UCODE_CONTAINER_HEADER_SIZE; 232 get_ucode_data(equiv_cpu_table, buf + UCODE_CONTAINER_HEADER_SIZE, size);
222 if (get_ucode_data(equiv_cpu_table, buf, size)) {
223 vfree(equiv_cpu_table);
224 return 0;
225 }
226 233
227 return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */ 234 return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */
228} 235}
@@ -237,16 +244,16 @@ static enum ucode_state
237generic_load_microcode(int cpu, const u8 *data, size_t size) 244generic_load_microcode(int cpu, const u8 *data, size_t size)
238{ 245{
239 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 246 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
247 struct microcode_header_amd *mc_hdr = NULL;
248 unsigned int mc_size, leftover;
249 int offset;
240 const u8 *ucode_ptr = data; 250 const u8 *ucode_ptr = data;
241 void *new_mc = NULL; 251 void *new_mc = NULL;
242 void *mc; 252 unsigned int new_rev = uci->cpu_sig.rev;
243 int new_rev = uci->cpu_sig.rev;
244 unsigned int leftover;
245 unsigned long offset;
246 enum ucode_state state = UCODE_OK; 253 enum ucode_state state = UCODE_OK;
247 254
248 offset = install_equiv_cpu_table(ucode_ptr); 255 offset = install_equiv_cpu_table(ucode_ptr);
249 if (!offset) { 256 if (offset < 0) {
250 pr_err("failed to create equivalent cpu table\n"); 257 pr_err("failed to create equivalent cpu table\n");
251 return UCODE_ERROR; 258 return UCODE_ERROR;
252 } 259 }
@@ -255,64 +262,65 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
255 leftover = size - offset; 262 leftover = size - offset;
256 263
257 while (leftover) { 264 while (leftover) {
258 unsigned int uninitialized_var(mc_size); 265 mc_hdr = get_next_ucode(cpu, ucode_ptr, leftover, &mc_size);
259 struct microcode_header_amd *mc_header; 266 if (!mc_hdr)
260
261 mc = get_next_ucode(ucode_ptr, leftover, &mc_size);
262 if (!mc)
263 break; 267 break;
264 268
265 mc_header = (struct microcode_header_amd *)mc; 269 if (get_matching_microcode(cpu, mc_hdr, new_rev)) {
266 if (get_matching_microcode(cpu, mc, new_rev)) {
267 vfree(new_mc); 270 vfree(new_mc);
268 new_rev = mc_header->patch_id; 271 new_rev = mc_hdr->patch_id;
269 new_mc = mc; 272 new_mc = mc_hdr;
270 } else 273 } else
271 vfree(mc); 274 vfree(mc_hdr);
272 275
273 ucode_ptr += mc_size; 276 ucode_ptr += mc_size;
274 leftover -= mc_size; 277 leftover -= mc_size;
275 } 278 }
276 279
277 if (new_mc) { 280 if (!new_mc) {
278 if (!leftover) {
279 vfree(uci->mc);
280 uci->mc = new_mc;
281 pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
282 cpu, new_rev, uci->cpu_sig.rev);
283 } else {
284 vfree(new_mc);
285 state = UCODE_ERROR;
286 }
287 } else
288 state = UCODE_NFOUND; 281 state = UCODE_NFOUND;
282 goto free_table;
283 }
289 284
285 if (!leftover) {
286 vfree(uci->mc);
287 uci->mc = new_mc;
288 pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n",
289 cpu, uci->cpu_sig.rev, new_rev);
290 } else {
291 vfree(new_mc);
292 state = UCODE_ERROR;
293 }
294
295free_table:
290 free_equiv_cpu_table(); 296 free_equiv_cpu_table();
291 297
292 return state; 298 return state;
293} 299}
294 300
295static enum ucode_state request_microcode_fw(int cpu, struct device *device) 301static enum ucode_state request_microcode_amd(int cpu, struct device *device)
296{ 302{
297 const char *fw_name = "amd-ucode/microcode_amd.bin"; 303 const char *fw_name = "amd-ucode/microcode_amd.bin";
298 const struct firmware *firmware; 304 const struct firmware *fw;
299 enum ucode_state ret; 305 enum ucode_state ret = UCODE_NFOUND;
300 306
301 if (request_firmware(&firmware, fw_name, device)) { 307 if (request_firmware(&fw, fw_name, device)) {
302 printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); 308 pr_err("failed to load file %s\n", fw_name);
303 return UCODE_NFOUND; 309 goto out;
304 } 310 }
305 311
306 if (*(u32 *)firmware->data != UCODE_MAGIC) { 312 ret = UCODE_ERROR;
307 pr_err("invalid UCODE_MAGIC (0x%08x)\n", 313 if (*(u32 *)fw->data != UCODE_MAGIC) {
308 *(u32 *)firmware->data); 314 pr_err("invalid magic value (0x%08x)\n", *(u32 *)fw->data);
309 return UCODE_ERROR; 315 goto fw_release;
310 } 316 }
311 317
312 ret = generic_load_microcode(cpu, firmware->data, firmware->size); 318 ret = generic_load_microcode(cpu, fw->data, fw->size);
313 319
314 release_firmware(firmware); 320fw_release:
321 release_firmware(fw);
315 322
323out:
316 return ret; 324 return ret;
317} 325}
318 326
@@ -333,7 +341,7 @@ static void microcode_fini_cpu_amd(int cpu)
333 341
334static struct microcode_ops microcode_amd_ops = { 342static struct microcode_ops microcode_amd_ops = {
335 .request_microcode_user = request_microcode_user, 343 .request_microcode_user = request_microcode_user,
336 .request_microcode_fw = request_microcode_fw, 344 .request_microcode_fw = request_microcode_amd,
337 .collect_cpu_info = collect_cpu_info_amd, 345 .collect_cpu_info = collect_cpu_info_amd,
338 .apply_microcode = apply_microcode_amd, 346 .apply_microcode = apply_microcode_amd,
339 .microcode_fini_cpu = microcode_fini_cpu_amd, 347 .microcode_fini_cpu = microcode_fini_cpu_amd,
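
verify_ucode_size() above caps each patch at a per-family maximum (1824 bytes for family 0x14, 4096 for 0x15, 2048 otherwise) and takes the actual size from bytes 4 and 5 of the section header, little-endian. A user-space sketch of that check, with the constants copied from the hunk and nothing else of the header format modeled:

#include <stdint.h>
#include <stdio.h>

#define F1XH_MPB_MAX_SIZE 2048
#define F14H_MPB_MAX_SIZE 1824
#define F15H_MPB_MAX_SIZE 4096

/* Return the patch size, or 0 if it exceeds the family limit or the remaining buffer. */
static unsigned int verify_size(unsigned int family, const uint8_t *buf,
                                unsigned int buf_size)
{
        unsigned int max_size, actual_size;

        switch (family) {
        case 0x14: max_size = F14H_MPB_MAX_SIZE; break;
        case 0x15: max_size = F15H_MPB_MAX_SIZE; break;
        default:   max_size = F1XH_MPB_MAX_SIZE; break;
        }

        /* bytes 4 and 5 of the section header hold the size, little-endian */
        actual_size = buf[4] + (buf[5] << 8);

        if (actual_size > buf_size || actual_size > max_size)
                return 0;
        return actual_size;
}

int main(void)
{
        /* buf_size models bytes left in the container, not this array's length */
        uint8_t hdr[8] = { 0, 0, 0, 0, 0x00, 0x08, 0, 0 };      /* size = 0x0800 */

        printf("family 0x15: %u\n", verify_size(0x15, hdr, 4096)); /* 2048 */
        printf("family 0x14: %u\n", verify_size(0x14, hdr, 4096)); /* 0: over 1824 */
        return 0;
}
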
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index fa6551d36c10..f9242800bc84 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -12,7 +12,7 @@
12 * Software Developer's Manual 12 * Software Developer's Manual
13 * Order Number 253668 or free download from: 13 * Order Number 253668 or free download from:
14 * 14 *
15 * http://developer.intel.com/design/pentium4/manuals/253668.htm 15 * http://developer.intel.com/Assets/PDF/manual/253668.pdf
16 * 16 *
17 * For more information, go to http://www.urbanmyth.org/microcode 17 * For more information, go to http://www.urbanmyth.org/microcode
18 * 18 *
@@ -82,6 +82,7 @@
82#include <linux/cpu.h> 82#include <linux/cpu.h>
83#include <linux/fs.h> 83#include <linux/fs.h>
84#include <linux/mm.h> 84#include <linux/mm.h>
85#include <linux/syscore_ops.h>
85 86
86#include <asm/microcode.h> 87#include <asm/microcode.h>
87#include <asm/processor.h> 88#include <asm/processor.h>
@@ -232,6 +233,7 @@ static const struct file_operations microcode_fops = {
232 .owner = THIS_MODULE, 233 .owner = THIS_MODULE,
233 .write = microcode_write, 234 .write = microcode_write,
234 .open = microcode_open, 235 .open = microcode_open,
236 .llseek = no_llseek,
235}; 237};
236 238
237static struct miscdevice microcode_dev = { 239static struct miscdevice microcode_dev = {
@@ -416,8 +418,10 @@ static int mc_sysdev_add(struct sys_device *sys_dev)
416 if (err) 418 if (err)
417 return err; 419 return err;
418 420
419 if (microcode_init_cpu(cpu) == UCODE_ERROR) 421 if (microcode_init_cpu(cpu) == UCODE_ERROR) {
420 err = -EINVAL; 422 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
423 return -EINVAL;
424 }
421 425
422 return err; 426 return err;
423} 427}
@@ -435,33 +439,25 @@ static int mc_sysdev_remove(struct sys_device *sys_dev)
435 return 0; 439 return 0;
436} 440}
437 441
438static int mc_sysdev_resume(struct sys_device *dev) 442static struct sysdev_driver mc_sysdev_driver = {
443 .add = mc_sysdev_add,
444 .remove = mc_sysdev_remove,
445};
446
447/**
448 * mc_bp_resume - Update boot CPU microcode during resume.
449 */
450static void mc_bp_resume(void)
439{ 451{
440 int cpu = dev->id; 452 int cpu = smp_processor_id();
441 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 453 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
442 454
443 if (!cpu_online(cpu))
444 return 0;
445
446 /*
447 * All non-bootup cpus are still disabled,
448 * so only CPU 0 will apply ucode here.
449 *
450 * Moreover, there can be no concurrent
451 * updates from any other places at this point.
452 */
453 WARN_ON(cpu != 0);
454
455 if (uci->valid && uci->mc) 455 if (uci->valid && uci->mc)
456 microcode_ops->apply_microcode(cpu); 456 microcode_ops->apply_microcode(cpu);
457
458 return 0;
459} 457}
460 458
461static struct sysdev_driver mc_sysdev_driver = { 459static struct syscore_ops mc_syscore_ops = {
462 .add = mc_sysdev_add, 460 .resume = mc_bp_resume,
463 .remove = mc_sysdev_remove,
464 .resume = mc_sysdev_resume,
465}; 461};
466 462
467static __cpuinit int 463static __cpuinit int
@@ -539,6 +535,7 @@ static int __init microcode_init(void)
539 if (error) 535 if (error)
540 return error; 536 return error;
541 537
538 register_syscore_ops(&mc_syscore_ops);
542 register_hotcpu_notifier(&mc_cpu_notifier); 539 register_hotcpu_notifier(&mc_cpu_notifier);
543 540
544 pr_info("Microcode Update Driver: v" MICROCODE_VERSION 541 pr_info("Microcode Update Driver: v" MICROCODE_VERSION
@@ -553,6 +550,7 @@ static void __exit microcode_exit(void)
553 microcode_dev_exit(); 550 microcode_dev_exit();
554 551
555 unregister_hotcpu_notifier(&mc_cpu_notifier); 552 unregister_hotcpu_notifier(&mc_cpu_notifier);
553 unregister_syscore_ops(&mc_syscore_ops);
556 554
557 get_online_cpus(); 555 get_online_cpus();
558 mutex_lock(&microcode_mutex); 556 mutex_lock(&microcode_mutex);
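
The core driver above moves the resume-time reapplication of cached microcode from a per-device sysdev callback to a syscore resume hook, which runs with only the boot CPU online. A rough user-space analog of the pattern (struct syscore_ops and register_syscore_ops() are the real interfaces named in the hunk; everything below just illustrates an ops table with a resume function pointer, not kernel code):

#include <stdio.h>

/* User-space analog of a syscore-style ops table with only a resume hook. */
struct syscore_like_ops {
        void (*resume)(void);
};

static void mc_bp_resume_demo(void)
{
        /* stands in for "reapply the cached update on the boot CPU" */
        printf("resume: reapplying cached update\n");
}

static struct syscore_like_ops mc_ops = { .resume = mc_bp_resume_demo };

int main(void)
{
        /* the registration/invocation that register_syscore_ops() would hide */
        if (mc_ops.resume)
                mc_ops.resume();
        return 0;
}
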
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 356170262a93..1a1b606d3e92 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -12,7 +12,7 @@
12 * Software Developer's Manual 12 * Software Developer's Manual
13 * Order Number 253668 or free download from: 13 * Order Number 253668 or free download from:
14 * 14 *
15 * http://developer.intel.com/design/pentium4/manuals/253668.htm 15 * http://developer.intel.com/Assets/PDF/manual/253668.pdf
16 * 16 *
17 * For more information, go to http://www.urbanmyth.org/microcode 17 * For more information, go to http://www.urbanmyth.org/microcode
18 * 18 *
@@ -364,8 +364,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
364 364
365 /* For performance reasons, reuse mc area when possible */ 365 /* For performance reasons, reuse mc area when possible */
366 if (!mc || mc_size > curr_mc_size) { 366 if (!mc || mc_size > curr_mc_size) {
367 if (mc) 367 vfree(mc);
368 vfree(mc);
369 mc = vmalloc(mc_size); 368 mc = vmalloc(mc_size);
370 if (!mc) 369 if (!mc)
371 break; 370 break;
@@ -374,13 +373,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
374 373
375 if (get_ucode_data(mc, ucode_ptr, mc_size) || 374 if (get_ucode_data(mc, ucode_ptr, mc_size) ||
376 microcode_sanity_check(mc) < 0) { 375 microcode_sanity_check(mc) < 0) {
377 vfree(mc);
378 break; 376 break;
379 } 377 }
380 378
381 if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) { 379 if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) {
382 if (new_mc) 380 vfree(new_mc);
383 vfree(new_mc);
384 new_rev = mc_header.rev; 381 new_rev = mc_header.rev;
385 new_mc = mc; 382 new_mc = mc;
386 mc = NULL; /* trigger new vmalloc */ 383 mc = NULL; /* trigger new vmalloc */
@@ -390,12 +387,10 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
390 leftover -= mc_size; 387 leftover -= mc_size;
391 } 388 }
392 389
393 if (mc) 390 vfree(mc);
394 vfree(mc);
395 391
396 if (leftover) { 392 if (leftover) {
397 if (new_mc) 393 vfree(new_mc);
398 vfree(new_mc);
399 state = UCODE_ERROR; 394 state = UCODE_ERROR;
400 goto out; 395 goto out;
401 } 396 }
@@ -405,8 +400,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
405 goto out; 400 goto out;
406 } 401 }
407 402
408 if (uci->mc) 403 vfree(uci->mc);
409 vfree(uci->mc);
410 uci->mc = (struct microcode_intel *)new_mc; 404 uci->mc = (struct microcode_intel *)new_mc;
411 405
412 pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", 406 pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
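
The cleanups above drop the if (x) checks before vfree() because vfree(NULL) is a no-op, which is what makes the unconditional calls safe; standard C free() follows the same convention, as this one-liner shows:

#include <stdlib.h>

int main(void)
{
        void *p = NULL;

        free(p);        /* free(NULL) is defined to do nothing, like vfree(NULL) */
        return 0;
}
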
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index 71825806cd44..ac861b8348e2 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -25,7 +25,6 @@ struct pci_hostbridge_probe {
25}; 25};
26 26
27static u64 __cpuinitdata fam10h_pci_mmconf_base; 27static u64 __cpuinitdata fam10h_pci_mmconf_base;
28static int __cpuinitdata fam10h_pci_mmconf_base_status;
29 28
30static struct pci_hostbridge_probe pci_probes[] __cpuinitdata = { 29static struct pci_hostbridge_probe pci_probes[] __cpuinitdata = {
31 { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 }, 30 { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 },
@@ -44,10 +43,12 @@ static int __cpuinit cmp_range(const void *x1, const void *x2)
44 return start1 - start2; 43 return start1 - start2;
45} 44}
46 45
47/*[47:0] */ 46#define MMCONF_UNIT (1ULL << FAM10H_MMIO_CONF_BASE_SHIFT)
48/* need to avoid (0xfd<<32) and (0xfe<<32), ht used space */ 47#define MMCONF_MASK (~(MMCONF_UNIT - 1))
48#define MMCONF_SIZE (MMCONF_UNIT << 8)
49/* need to avoid (0xfd<<32), (0xfe<<32), and (0xff<<32), ht used space */
49#define FAM10H_PCI_MMCONF_BASE (0xfcULL<<32) 50#define FAM10H_PCI_MMCONF_BASE (0xfcULL<<32)
50#define BASE_VALID(b) ((b != (0xfdULL << 32)) && (b != (0xfeULL << 32))) 51#define BASE_VALID(b) ((b) + MMCONF_SIZE <= (0xfdULL<<32) || (b) >= (1ULL<<40))
51static void __cpuinit get_fam10h_pci_mmconf_base(void) 52static void __cpuinit get_fam10h_pci_mmconf_base(void)
52{ 53{
53 int i; 54 int i;
@@ -64,12 +65,11 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
64 struct range range[8]; 65 struct range range[8];
65 66
66 /* only try to get setting from BSP */ 67 /* only try to get setting from BSP */
67 /* -1 or 1 */ 68 if (fam10h_pci_mmconf_base)
68 if (fam10h_pci_mmconf_base_status)
69 return; 69 return;
70 70
71 if (!early_pci_allowed()) 71 if (!early_pci_allowed())
72 goto fail; 72 return;
73 73
74 found = 0; 74 found = 0;
75 for (i = 0; i < ARRAY_SIZE(pci_probes); i++) { 75 for (i = 0; i < ARRAY_SIZE(pci_probes); i++) {
@@ -91,7 +91,7 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
91 } 91 }
92 92
93 if (!found) 93 if (!found)
94 goto fail; 94 return;
95 95
96 /* SYS_CFG */ 96 /* SYS_CFG */
97 address = MSR_K8_SYSCFG; 97 address = MSR_K8_SYSCFG;
@@ -99,16 +99,16 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
99 99
100 /* TOP_MEM2 is not enabled? */ 100 /* TOP_MEM2 is not enabled? */
101 if (!(val & (1<<21))) { 101 if (!(val & (1<<21))) {
102 tom2 = 0; 102 tom2 = 1ULL << 32;
103 } else { 103 } else {
104 /* TOP_MEM2 */ 104 /* TOP_MEM2 */
105 address = MSR_K8_TOP_MEM2; 105 address = MSR_K8_TOP_MEM2;
106 rdmsrl(address, val); 106 rdmsrl(address, val);
107 tom2 = val & (0xffffULL<<32); 107 tom2 = max(val & 0xffffff800000ULL, 1ULL << 32);
108 } 108 }
109 109
110 if (base <= tom2) 110 if (base <= tom2)
111 base = tom2 + (1ULL<<32); 111 base = (tom2 + 2 * MMCONF_UNIT - 1) & MMCONF_MASK;
112 112
113 /* 113 /*
114 * need to check if the range is in the high mmio range that is 114 * need to check if the range is in the high mmio range that is
@@ -123,11 +123,11 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
123 if (!(reg & 3)) 123 if (!(reg & 3))
124 continue; 124 continue;
125 125
126 start = (((u64)reg) << 8) & (0xffULL << 32); /* 39:16 on 31:8*/ 126 start = (u64)(reg & 0xffffff00) << 8; /* 39:16 on 31:8*/
127 reg = read_pci_config(bus, slot, 1, 0x84 + (i << 3)); 127 reg = read_pci_config(bus, slot, 1, 0x84 + (i << 3));
128 end = (((u64)reg) << 8) & (0xffULL << 32); /* 39:16 on 31:8*/ 128 end = ((u64)(reg & 0xffffff00) << 8) | 0xffff; /* 39:16 on 31:8*/
129 129
130 if (!end) 130 if (end < tom2)
131 continue; 131 continue;
132 132
133 range[hi_mmio_num].start = start; 133 range[hi_mmio_num].start = start;
@@ -143,32 +143,27 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
143 143
144 if (range[hi_mmio_num - 1].end < base) 144 if (range[hi_mmio_num - 1].end < base)
145 goto out; 145 goto out;
146 if (range[0].start > base) 146 if (range[0].start > base + MMCONF_SIZE)
147 goto out; 147 goto out;
148 148
149 /* need to find one window */ 149 /* need to find one window */
150 base = range[0].start - (1ULL << 32); 150 base = (range[0].start & MMCONF_MASK) - MMCONF_UNIT;
151 if ((base > tom2) && BASE_VALID(base)) 151 if ((base > tom2) && BASE_VALID(base))
152 goto out; 152 goto out;
153 base = range[hi_mmio_num - 1].end + (1ULL << 32); 153 base = (range[hi_mmio_num - 1].end + MMCONF_UNIT) & MMCONF_MASK;
154 if ((base > tom2) && BASE_VALID(base)) 154 if (BASE_VALID(base))
155 goto out; 155 goto out;
156 /* need to find window between ranges */ 156 /* need to find window between ranges */
157 if (hi_mmio_num > 1) 157 for (i = 1; i < hi_mmio_num; i++) {
158 for (i = 0; i < hi_mmio_num - 1; i++) { 158 base = (range[i - 1].end + MMCONF_UNIT) & MMCONF_MASK;
159 if (range[i + 1].start > (range[i].end + (1ULL << 32))) { 159 val = range[i].start & MMCONF_MASK;
160 base = range[i].end + (1ULL << 32); 160 if (val >= base + MMCONF_SIZE && BASE_VALID(base))
161 if ((base > tom2) && BASE_VALID(base)) 161 goto out;
162 goto out;
163 }
164 } 162 }
165
166fail:
167 fam10h_pci_mmconf_base_status = -1;
168 return; 163 return;
164
169out: 165out:
170 fam10h_pci_mmconf_base = base; 166 fam10h_pci_mmconf_base = base;
171 fam10h_pci_mmconf_base_status = 1;
172} 167}
173 168
174void __cpuinit fam10h_check_enable_mmcfg(void) 169void __cpuinit fam10h_check_enable_mmcfg(void)
@@ -190,11 +185,10 @@ void __cpuinit fam10h_check_enable_mmcfg(void)
190 185
191 /* only trust the one handle 256 buses, if acpi=off */ 186 /* only trust the one handle 256 buses, if acpi=off */
192 if (!acpi_pci_disabled || busnbits >= 8) { 187 if (!acpi_pci_disabled || busnbits >= 8) {
193 u64 base; 188 u64 base = val & MMCONF_MASK;
194 base = val & (0xffffULL << 32); 189
195 if (fam10h_pci_mmconf_base_status <= 0) { 190 if (!fam10h_pci_mmconf_base) {
196 fam10h_pci_mmconf_base = base; 191 fam10h_pci_mmconf_base = base;
197 fam10h_pci_mmconf_base_status = 1;
198 return; 192 return;
199 } else if (fam10h_pci_mmconf_base == base) 193 } else if (fam10h_pci_mmconf_base == base)
200 return; 194 return;
@@ -206,8 +200,10 @@ void __cpuinit fam10h_check_enable_mmcfg(void)
206 * with 256 buses 200 * with 256 buses
207 */ 201 */
208 get_fam10h_pci_mmconf_base(); 202 get_fam10h_pci_mmconf_base();
209 if (fam10h_pci_mmconf_base_status <= 0) 203 if (!fam10h_pci_mmconf_base) {
204 pci_probe &= ~PCI_CHECK_ENABLE_AMD_MMCONF;
210 return; 205 return;
206 }
211 207
212 printk(KERN_INFO "Enable MMCONFIG on AMD Family 10h\n"); 208 printk(KERN_INFO "Enable MMCONFIG on AMD Family 10h\n");
213 val &= ~((FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT) | 209 val &= ~((FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT) |
@@ -217,13 +213,13 @@ void __cpuinit fam10h_check_enable_mmcfg(void)
217 wrmsrl(address, val); 213 wrmsrl(address, val);
218} 214}
219 215
220static int __devinit set_check_enable_amd_mmconf(const struct dmi_system_id *d) 216static int __init set_check_enable_amd_mmconf(const struct dmi_system_id *d)
221{ 217{
222 pci_probe |= PCI_CHECK_ENABLE_AMD_MMCONF; 218 pci_probe |= PCI_CHECK_ENABLE_AMD_MMCONF;
223 return 0; 219 return 0;
224} 220}
225 221
226static const struct dmi_system_id __cpuinitconst mmconf_dmi_table[] = { 222static const struct dmi_system_id __initconst mmconf_dmi_table[] = {
227 { 223 {
228 .callback = set_check_enable_amd_mmconf, 224 .callback = set_check_enable_amd_mmconf,
229 .ident = "Sun Microsystems Machine", 225 .ident = "Sun Microsystems Machine",
@@ -234,7 +230,8 @@ static const struct dmi_system_id __cpuinitconst mmconf_dmi_table[] = {
234 {} 230 {}
235}; 231};
236 232
237void __cpuinit check_enable_amd_mmconf_dmi(void) 233/* Called from a __cpuinit function, but only on the BSP. */
234void __ref check_enable_amd_mmconf_dmi(void)
238{ 235{
239 dmi_check_system(mmconf_dmi_table); 236 dmi_check_system(mmconf_dmi_table);
240} 237}
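
The rewritten base search above works in units of MMCONF_UNIT (1 << FAM10H_MMIO_CONF_BASE_SHIFT), masks candidates with MMCONF_MASK, and rounds a candidate past an occupied range with (end + 2 * MMCONF_UNIT - 1) & MMCONF_MASK so the 256-bus window clears it by at least one unit. A sketch of that rounding, assuming a shift of 20 (1 MiB per bus, hence the 256 MiB MMCONF_SIZE); the real shift comes from FAM10H_MMIO_CONF_BASE_SHIFT, which is not shown in this hunk:

#include <stdint.h>
#include <stdio.h>

/* Assumed shift of 20: 1 MiB per bus, so a 256 MiB window for 256 buses. */
#define MMIO_CONF_BASE_SHIFT 20
#define MMCONF_UNIT (1ULL << MMIO_CONF_BASE_SHIFT)
#define MMCONF_MASK (~(MMCONF_UNIT - 1))
#define MMCONF_SIZE (MMCONF_UNIT << 8)

/* Round a candidate base up past 'limit' to the next MMCONF_UNIT boundary. */
static uint64_t round_base_above(uint64_t limit)
{
        return (limit + 2 * MMCONF_UNIT - 1) & MMCONF_MASK;
}

int main(void)
{
        uint64_t tom2 = (4ULL << 30) + 0x1234;          /* some top-of-memory-2 */
        uint64_t base = round_base_above(tom2);

        printf("base = %#llx, window = [%#llx, %#llx)\n",
               (unsigned long long)base,
               (unsigned long long)base,
               (unsigned long long)(base + MMCONF_SIZE));
        return 0;
}
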
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 1c355c550960..52f256f2cc81 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -24,6 +24,7 @@
24#include <linux/bug.h> 24#include <linux/bug.h>
25#include <linux/mm.h> 25#include <linux/mm.h>
26#include <linux/gfp.h> 26#include <linux/gfp.h>
27#include <linux/jump_label.h>
27 28
28#include <asm/system.h> 29#include <asm/system.h>
29#include <asm/page.h> 30#include <asm/page.h>
@@ -37,20 +38,11 @@
37 38
38void *module_alloc(unsigned long size) 39void *module_alloc(unsigned long size)
39{ 40{
40 struct vm_struct *area; 41 if (PAGE_ALIGN(size) > MODULES_LEN)
41
42 if (!size)
43 return NULL;
44 size = PAGE_ALIGN(size);
45 if (size > MODULES_LEN)
46 return NULL;
47
48 area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END);
49 if (!area)
50 return NULL; 42 return NULL;
51 43 return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
52 return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM, 44 GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
53 PAGE_KERNEL_EXEC); 45 -1, __builtin_return_address(0));
54} 46}
55 47
56/* Free memory returned from module_alloc */ 48/* Free memory returned from module_alloc */
@@ -239,6 +231,9 @@ int module_finalize(const Elf_Ehdr *hdr,
239 apply_paravirt(pseg, pseg + para->sh_size); 231 apply_paravirt(pseg, pseg + para->sh_size);
240 } 232 }
241 233
234 /* make jump label nops */
235 jump_label_apply_nops(me);
236
242 return 0; 237 return 0;
243} 238}
244 239
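
module_alloc() above now rejects any request whose page-aligned size exceeds MODULES_LEN and delegates the rest to __vmalloc_node_range() over the module VA window. The rejection itself is a plain round-up-and-compare; a sketch with placeholder constants (the real PAGE_SIZE and MODULES_LEN come from the kernel headers, not from here):

#include <stddef.h>
#include <stdio.h>

#define DEMO_PAGE_SIZE 4096UL
#define DEMO_PAGE_ALIGN(x) (((x) + DEMO_PAGE_SIZE - 1) & ~(DEMO_PAGE_SIZE - 1))
#define DEMO_MODULES_LEN (1536UL * 1024 * 1024)         /* placeholder window size */

/* Mirror of the new bounds check: 0 means the allocation would be refused. */
static int size_ok(size_t size)
{
        return DEMO_PAGE_ALIGN(size) <= DEMO_MODULES_LEN;
}

int main(void)
{
        printf("%d %d\n", size_ok(10), size_ok(DEMO_MODULES_LEN + 1));  /* 1 0 */
        return 0;
}
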
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index d7b6f7fb4fec..9103b89c145a 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -11,6 +11,7 @@
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/delay.h> 12#include <linux/delay.h>
13#include <linux/bootmem.h> 13#include <linux/bootmem.h>
14#include <linux/memblock.h>
14#include <linux/kernel_stat.h> 15#include <linux/kernel_stat.h>
15#include <linux/mc146818rtc.h> 16#include <linux/mc146818rtc.h>
16#include <linux/bitops.h> 17#include <linux/bitops.h>
@@ -117,21 +118,8 @@ static void __init MP_bus_info(struct mpc_bus *m)
117 118
118static void __init MP_ioapic_info(struct mpc_ioapic *m) 119static void __init MP_ioapic_info(struct mpc_ioapic *m)
119{ 120{
120 if (!(m->flags & MPC_APIC_USABLE)) 121 if (m->flags & MPC_APIC_USABLE)
121 return; 122 mp_register_ioapic(m->apicid, m->apicaddr, gsi_top);
122
123 printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
124 m->apicid, m->apicver, m->apicaddr);
125
126 mp_register_ioapic(m->apicid, m->apicaddr, gsi_top);
127}
128
129static void print_MP_intsrc_info(struct mpc_intsrc *m)
130{
131 apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
132 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
133 m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus,
134 m->srcbusirq, m->dstapic, m->dstirq);
135} 123}
136 124
137static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq) 125static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq)
@@ -143,73 +131,11 @@ static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq)
143 mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq); 131 mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq);
144} 132}
145 133
146static void __init assign_to_mp_irq(struct mpc_intsrc *m,
147 struct mpc_intsrc *mp_irq)
148{
149 mp_irq->dstapic = m->dstapic;
150 mp_irq->type = m->type;
151 mp_irq->irqtype = m->irqtype;
152 mp_irq->irqflag = m->irqflag;
153 mp_irq->srcbus = m->srcbus;
154 mp_irq->srcbusirq = m->srcbusirq;
155 mp_irq->dstirq = m->dstirq;
156}
157
158static void __init assign_to_mpc_intsrc(struct mpc_intsrc *mp_irq,
159 struct mpc_intsrc *m)
160{
161 m->dstapic = mp_irq->dstapic;
162 m->type = mp_irq->type;
163 m->irqtype = mp_irq->irqtype;
164 m->irqflag = mp_irq->irqflag;
165 m->srcbus = mp_irq->srcbus;
166 m->srcbusirq = mp_irq->srcbusirq;
167 m->dstirq = mp_irq->dstirq;
168}
169
170static int __init mp_irq_mpc_intsrc_cmp(struct mpc_intsrc *mp_irq,
171 struct mpc_intsrc *m)
172{
173 if (mp_irq->dstapic != m->dstapic)
174 return 1;
175 if (mp_irq->type != m->type)
176 return 2;
177 if (mp_irq->irqtype != m->irqtype)
178 return 3;
179 if (mp_irq->irqflag != m->irqflag)
180 return 4;
181 if (mp_irq->srcbus != m->srcbus)
182 return 5;
183 if (mp_irq->srcbusirq != m->srcbusirq)
184 return 6;
185 if (mp_irq->dstirq != m->dstirq)
186 return 7;
187
188 return 0;
189}
190
191static void __init MP_intsrc_info(struct mpc_intsrc *m)
192{
193 int i;
194
195 print_MP_intsrc_info(m);
196
197 for (i = 0; i < mp_irq_entries; i++) {
198 if (!mp_irq_mpc_intsrc_cmp(&mp_irqs[i], m))
199 return;
200 }
201
202 assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
203 if (++mp_irq_entries == MAX_IRQ_SOURCES)
204 panic("Max # of irq sources exceeded!!\n");
205}
206#else /* CONFIG_X86_IO_APIC */ 134#else /* CONFIG_X86_IO_APIC */
207static inline void __init MP_bus_info(struct mpc_bus *m) {} 135static inline void __init MP_bus_info(struct mpc_bus *m) {}
208static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {} 136static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {}
209static inline void __init MP_intsrc_info(struct mpc_intsrc *m) {}
210#endif /* CONFIG_X86_IO_APIC */ 137#endif /* CONFIG_X86_IO_APIC */
211 138
212
213static void __init MP_lintsrc_info(struct mpc_lintsrc *m) 139static void __init MP_lintsrc_info(struct mpc_lintsrc *m)
214{ 140{
215 apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x," 141 apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x,"
@@ -221,7 +147,6 @@ static void __init MP_lintsrc_info(struct mpc_lintsrc *m)
221/* 147/*
222 * Read/parse the MPC 148 * Read/parse the MPC
223 */ 149 */
224
225static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str) 150static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str)
226{ 151{
227 152
@@ -274,18 +199,6 @@ static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt)
274 199
275void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { } 200void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { }
276 201
277static void __init smp_register_lapic_address(unsigned long address)
278{
279 mp_lapic_addr = address;
280
281 set_fixmap_nocache(FIX_APIC_BASE, address);
282 if (boot_cpu_physical_apicid == -1U) {
283 boot_cpu_physical_apicid = read_apic_id();
284 apic_version[boot_cpu_physical_apicid] =
285 GET_APIC_VERSION(apic_read(APIC_LVR));
286 }
287}
288
289static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) 202static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
290{ 203{
291 char str[16]; 204 char str[16];
@@ -300,17 +213,13 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
300#ifdef CONFIG_X86_32 213#ifdef CONFIG_X86_32
301 generic_mps_oem_check(mpc, oem, str); 214 generic_mps_oem_check(mpc, oem, str);
302#endif 215#endif
303 /* save the local APIC address, it might be non-default */ 216 /* Initialize the lapic mapping */
304 if (!acpi_lapic) 217 if (!acpi_lapic)
305 mp_lapic_addr = mpc->lapic; 218 register_lapic_address(mpc->lapic);
306 219
307 if (early) 220 if (early)
308 return 1; 221 return 1;
309 222
310 /* Initialize the lapic mapping */
311 if (!acpi_lapic)
312 smp_register_lapic_address(mpc->lapic);
313
314 if (mpc->oemptr) 223 if (mpc->oemptr)
315 x86_init.mpparse.smp_read_mpc_oem(mpc); 224 x86_init.mpparse.smp_read_mpc_oem(mpc);
316 225
@@ -336,7 +245,7 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
336 skip_entry(&mpt, &count, sizeof(struct mpc_ioapic)); 245 skip_entry(&mpt, &count, sizeof(struct mpc_ioapic));
337 break; 246 break;
338 case MP_INTSRC: 247 case MP_INTSRC:
339 MP_intsrc_info((struct mpc_intsrc *)mpt); 248 mp_save_irq((struct mpc_intsrc *)mpt);
340 skip_entry(&mpt, &count, sizeof(struct mpc_intsrc)); 249 skip_entry(&mpt, &count, sizeof(struct mpc_intsrc));
341 break; 250 break;
342 case MP_LINTSRC: 251 case MP_LINTSRC:
@@ -376,7 +285,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
376 intsrc.type = MP_INTSRC; 285 intsrc.type = MP_INTSRC;
377 intsrc.irqflag = 0; /* conforming */ 286 intsrc.irqflag = 0; /* conforming */
378 intsrc.srcbus = 0; 287 intsrc.srcbus = 0;
379 intsrc.dstapic = mp_ioapics[0].apicid; 288 intsrc.dstapic = mpc_ioapic_id(0);
380 289
381 intsrc.irqtype = mp_INT; 290 intsrc.irqtype = mp_INT;
382 291
@@ -428,13 +337,13 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
428 337
429 intsrc.srcbusirq = i; 338 intsrc.srcbusirq = i;
430 intsrc.dstirq = i ? i : 2; /* IRQ0 to INTIN2 */ 339 intsrc.dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
431 MP_intsrc_info(&intsrc); 340 mp_save_irq(&intsrc);
432 } 341 }
433 342
434 intsrc.irqtype = mp_ExtINT; 343 intsrc.irqtype = mp_ExtINT;
435 intsrc.srcbusirq = 0; 344 intsrc.srcbusirq = 0;
436 intsrc.dstirq = 0; /* 8259A to INTIN0 */ 345 intsrc.dstirq = 0; /* 8259A to INTIN0 */
437 MP_intsrc_info(&intsrc); 346 mp_save_irq(&intsrc);
438} 347}
439 348
440 349
@@ -657,7 +566,7 @@ static void __init smp_reserve_memory(struct mpf_intel *mpf)
657{ 566{
658 unsigned long size = get_mpc_size(mpf->physptr); 567 unsigned long size = get_mpc_size(mpf->physptr);
659 568
660 reserve_early_overlap_ok(mpf->physptr, mpf->physptr+size, "MP-table mpc"); 569 memblock_x86_reserve_range(mpf->physptr, mpf->physptr+size, "* MP-table mpc");
661} 570}
662 571
663static int __init smp_scan_config(unsigned long base, unsigned long length) 572static int __init smp_scan_config(unsigned long base, unsigned long length)
@@ -686,7 +595,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
686 mpf, (u64)virt_to_phys(mpf)); 595 mpf, (u64)virt_to_phys(mpf));
687 596
688 mem = virt_to_phys(mpf); 597 mem = virt_to_phys(mpf);
689 reserve_early_overlap_ok(mem, mem + sizeof(*mpf), "MP-table mpf"); 598 memblock_x86_reserve_range(mem, mem + sizeof(*mpf), "* MP-table mpf");
690 if (mpf->physptr) 599 if (mpf->physptr)
691 smp_reserve_memory(mpf); 600 smp_reserve_memory(mpf);
692 601
@@ -783,11 +692,11 @@ static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
783 int i; 692 int i;
784 693
785 apic_printk(APIC_VERBOSE, "OLD "); 694 apic_printk(APIC_VERBOSE, "OLD ");
786 print_MP_intsrc_info(m); 695 print_mp_irq_info(m);
787 696
788 i = get_MP_intsrc_index(m); 697 i = get_MP_intsrc_index(m);
789 if (i > 0) { 698 if (i > 0) {
790 assign_to_mpc_intsrc(&mp_irqs[i], m); 699 memcpy(m, &mp_irqs[i], sizeof(*m));
791 apic_printk(APIC_VERBOSE, "NEW "); 700 apic_printk(APIC_VERBOSE, "NEW ");
792 print_mp_irq_info(&mp_irqs[i]); 701 print_mp_irq_info(&mp_irqs[i]);
793 return; 702 return;
@@ -805,23 +714,21 @@ static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
805 *nr_m_spare += 1; 714 *nr_m_spare += 1;
806 } 715 }
807} 716}
808#else /* CONFIG_X86_IO_APIC */
809static
810inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
811#endif /* CONFIG_X86_IO_APIC */
812 717
813static int 718static int __init
814check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count) 719check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count)
815{ 720{
816 int ret = 0;
817
818 if (!mpc_new_phys || count <= mpc_new_length) { 721 if (!mpc_new_phys || count <= mpc_new_length) {
819 WARN(1, "update_mptable: No spare slots (length: %x)\n", count); 722 WARN(1, "update_mptable: No spare slots (length: %x)\n", count);
820 return -1; 723 return -1;
821 } 724 }
822 725
823 return ret; 726 return 0;
824} 727}
728#else /* CONFIG_X86_IO_APIC */
729static
730inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
731#endif /* CONFIG_X86_IO_APIC */
825 732
826static int __init replace_intsrc_all(struct mpc_table *mpc, 733static int __init replace_intsrc_all(struct mpc_table *mpc,
827 unsigned long mpc_new_phys, 734 unsigned long mpc_new_phys,
@@ -874,14 +781,14 @@ static int __init replace_intsrc_all(struct mpc_table *mpc,
874 if (nr_m_spare > 0) { 781 if (nr_m_spare > 0) {
875 apic_printk(APIC_VERBOSE, "*NEW* found\n"); 782 apic_printk(APIC_VERBOSE, "*NEW* found\n");
876 nr_m_spare--; 783 nr_m_spare--;
877 assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]); 784 memcpy(m_spare[nr_m_spare], &mp_irqs[i], sizeof(mp_irqs[i]));
878 m_spare[nr_m_spare] = NULL; 785 m_spare[nr_m_spare] = NULL;
879 } else { 786 } else {
880 struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; 787 struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
881 count += sizeof(struct mpc_intsrc); 788 count += sizeof(struct mpc_intsrc);
882 if (check_slot(mpc_new_phys, mpc_new_length, count) < 0) 789 if (check_slot(mpc_new_phys, mpc_new_length, count) < 0)
883 goto out; 790 goto out;
884 assign_to_mpc_intsrc(&mp_irqs[i], m); 791 memcpy(m, &mp_irqs[i], sizeof(*m));
885 mpc->length = count; 792 mpc->length = count;
886 mpt += sizeof(struct mpc_intsrc); 793 mpt += sizeof(struct mpc_intsrc);
887 } 794 }
@@ -974,7 +881,7 @@ static int __init update_mp_table(void)
974 881
975 if (!mpc_new_phys) { 882 if (!mpc_new_phys) {
976 unsigned char old, new; 883 unsigned char old, new;
977 /* check if we can change the postion */ 884 /* check if we can change the position */
978 mpc->checksum = 0; 885 mpc->checksum = 0;
979 old = mpf_checksum((unsigned char *)mpc, mpc->length); 886 old = mpf_checksum((unsigned char *)mpc, mpc->length);
980 mpc->checksum = 0xff; 887 mpc->checksum = 0xff;
@@ -983,7 +890,7 @@ static int __init update_mp_table(void)
983 printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n"); 890 printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n");
984 return 0; 891 return 0;
985 } 892 }
986 printk(KERN_INFO "use in-positon replacing\n"); 893 printk(KERN_INFO "use in-position replacing\n");
987 } else { 894 } else {
988 mpf->physptr = mpc_new_phys; 895 mpf->physptr = mpc_new_phys;
989 mpc_new = phys_to_virt(mpc_new_phys); 896 mpc_new = phys_to_virt(mpc_new_phys);
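
The in-position update path in the hunk above decides whether the MP config table is writable by storing two different values into its checksum byte and recomputing the sum; if the sums come out equal, the writes did not stick. A minimal userspace sketch of that probe, assuming mpf_checksum() is a plain byte sum — byte_checksum() and table_is_writable() below are illustrative names, not kernel APIs:

#include <stdio.h>
#include <string.h>

/* Plain byte-sum checksum, as assumed for mpf_checksum(). */
static unsigned char byte_checksum(const unsigned char *p, int len)
{
	unsigned int sum = 0;

	while (len--)
		sum += *p++;
	return sum & 0xff;
}

/*
 * Probe a table the way update_mp_table() does: write two different
 * values into the checksum slot and recompute.  If the two sums are
 * equal, the writes did not stick, i.e. the table is read-only and
 * only in-position replacement of existing entries is possible.
 */
static int table_is_writable(unsigned char *tbl, int len, int csum_off)
{
	unsigned char old, new;

	tbl[csum_off] = 0x00;
	old = byte_checksum(tbl, len);
	tbl[csum_off] = 0xff;
	new = byte_checksum(tbl, len);

	return old != new;
}

int main(void)
{
	unsigned char table[16];

	memset(table, 0x5a, sizeof(table));
	printf("writable: %d\n", table_is_writable(table, sizeof(table), 7));
	return 0;
}
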
diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c
deleted file mode 100644
index 79ae68154e87..000000000000
--- a/arch/x86/kernel/mrst.c
+++ /dev/null
@@ -1,311 +0,0 @@
1/*
2 * mrst.c: Intel Moorestown platform specific setup code
3 *
4 * (C) Copyright 2008 Intel Corporation
5 * Author: Jacob Pan (jacob.jun.pan@intel.com)
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; version 2
10 * of the License.
11 */
12#include <linux/init.h>
13#include <linux/kernel.h>
14#include <linux/sfi.h>
15#include <linux/irq.h>
16#include <linux/module.h>
17
18#include <asm/setup.h>
19#include <asm/mpspec_def.h>
20#include <asm/hw_irq.h>
21#include <asm/apic.h>
22#include <asm/io_apic.h>
23#include <asm/mrst.h>
24#include <asm/io.h>
25#include <asm/i8259.h>
26#include <asm/apb_timer.h>
27
28/*
29 * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock,
30 * cmdline option x86_mrst_timer can be used to override the configuration
31 * to prefer one or the other.
32 * at runtime, there are basically three timer configurations:
33 * 1. per cpu apbt clock only
34 * 2. per cpu always-on lapic clocks only, this is Penwell/Medfield only
35 * 3. per cpu lapic clock (C3STOP) and one apbt clock, with broadcast.
36 *
37 * by default (without cmdline option), platform code first detects cpu type
38 * to see if we are on lincroft or penwell, then set up both lapic or apbt
39 * clocks accordingly.
40 * i.e. by default, medfield uses configuration #2, moorestown uses #1.
41 * config #3 is supported but not recommended on medfield.
42 *
43 * rating and feature summary:
44 * lapic (with C3STOP) --------- 100
45 * apbt (always-on) ------------ 110
46 * lapic (always-on,ARAT) ------ 150
47 */
48
49__cpuinitdata enum mrst_timer_options mrst_timer_options;
50
51static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM];
52static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM];
53enum mrst_cpu_type __mrst_cpu_chip;
54EXPORT_SYMBOL_GPL(__mrst_cpu_chip);
55
56int sfi_mtimer_num;
57
58struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX];
59EXPORT_SYMBOL_GPL(sfi_mrtc_array);
60int sfi_mrtc_num;
61
62static inline void assign_to_mp_irq(struct mpc_intsrc *m,
63 struct mpc_intsrc *mp_irq)
64{
65 memcpy(mp_irq, m, sizeof(struct mpc_intsrc));
66}
67
68static inline int mp_irq_cmp(struct mpc_intsrc *mp_irq,
69 struct mpc_intsrc *m)
70{
71 return memcmp(mp_irq, m, sizeof(struct mpc_intsrc));
72}
73
74static void save_mp_irq(struct mpc_intsrc *m)
75{
76 int i;
77
78 for (i = 0; i < mp_irq_entries; i++) {
79 if (!mp_irq_cmp(&mp_irqs[i], m))
80 return;
81 }
82
83 assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
84 if (++mp_irq_entries == MAX_IRQ_SOURCES)
85 panic("Max # of irq sources exceeded!!\n");
86}
87
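
save_mp_irq() above (like mp_save_irq()/MP_intsrc_info() in mpparse.c) only appends an interrupt-source entry after scanning the existing array for a byte-identical duplicate, and treats overflow of the fixed-size table as fatal. A small self-contained sketch of that save pattern, using made-up names (struct irq_entry, save_irq_entry) and abort() in place of panic():

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_ENTRIES 8

struct irq_entry {
	int srcbus;
	int srcbusirq;
	int dstapic;
	int dstirq;
};

static struct irq_entry entries[MAX_ENTRIES];
static int nr_entries;

/* Append the entry unless an identical one is already recorded. */
static void save_irq_entry(const struct irq_entry *e)
{
	int i;

	for (i = 0; i < nr_entries; i++) {
		if (!memcmp(&entries[i], e, sizeof(*e)))
			return;		/* duplicate, nothing to do */
	}

	entries[nr_entries] = *e;
	if (++nr_entries == MAX_ENTRIES) {
		fprintf(stderr, "Max # of irq sources exceeded!\n");
		abort();		/* the kernel panics at this point */
	}
}

int main(void)
{
	struct irq_entry e = { .srcbus = 0, .srcbusirq = 3, .dstapic = 0xff, .dstirq = 3 };

	save_irq_entry(&e);
	save_irq_entry(&e);	/* ignored: already present */
	printf("entries recorded: %d\n", nr_entries);
	return 0;
}
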
88/* parse all the mtimer info to a static mtimer array */
89static int __init sfi_parse_mtmr(struct sfi_table_header *table)
90{
91 struct sfi_table_simple *sb;
92 struct sfi_timer_table_entry *pentry;
93 struct mpc_intsrc mp_irq;
94 int totallen;
95
96 sb = (struct sfi_table_simple *)table;
97 if (!sfi_mtimer_num) {
98 sfi_mtimer_num = SFI_GET_NUM_ENTRIES(sb,
99 struct sfi_timer_table_entry);
100 pentry = (struct sfi_timer_table_entry *) sb->pentry;
101 totallen = sfi_mtimer_num * sizeof(*pentry);
102 memcpy(sfi_mtimer_array, pentry, totallen);
103 }
104
105 printk(KERN_INFO "SFI: MTIMER info (num = %d):\n", sfi_mtimer_num);
106 pentry = sfi_mtimer_array;
107 for (totallen = 0; totallen < sfi_mtimer_num; totallen++, pentry++) {
108 printk(KERN_INFO "timer[%d]: paddr = 0x%08x, freq = %dHz,"
109 " irq = %d\n", totallen, (u32)pentry->phys_addr,
110 pentry->freq_hz, pentry->irq);
111 if (!pentry->irq)
112 continue;
113 mp_irq.type = MP_IOAPIC;
114 mp_irq.irqtype = mp_INT;
115/* triggering mode edge bit 2-3, active high polarity bit 0-1 */
116 mp_irq.irqflag = 5;
117 mp_irq.srcbus = 0;
118 mp_irq.srcbusirq = pentry->irq; /* IRQ */
119 mp_irq.dstapic = MP_APIC_ALL;
120 mp_irq.dstirq = pentry->irq;
121 save_mp_irq(&mp_irq);
122 }
123
124 return 0;
125}
126
127struct sfi_timer_table_entry *sfi_get_mtmr(int hint)
128{
129 int i;
130 if (hint < sfi_mtimer_num) {
131 if (!sfi_mtimer_usage[hint]) {
 132			pr_debug("hint taken for timer %d irq %d\n",
133 hint, sfi_mtimer_array[hint].irq);
134 sfi_mtimer_usage[hint] = 1;
135 return &sfi_mtimer_array[hint];
136 }
137 }
138 /* take the first timer available */
139 for (i = 0; i < sfi_mtimer_num;) {
140 if (!sfi_mtimer_usage[i]) {
141 sfi_mtimer_usage[i] = 1;
142 return &sfi_mtimer_array[i];
143 }
144 i++;
145 }
146 return NULL;
147}
148
149void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr)
150{
151 int i;
152 for (i = 0; i < sfi_mtimer_num;) {
153 if (mtmr->irq == sfi_mtimer_array[i].irq) {
154 sfi_mtimer_usage[i] = 0;
155 return;
156 }
157 i++;
158 }
159}
160
161/* parse all the mrtc info to a global mrtc array */
162int __init sfi_parse_mrtc(struct sfi_table_header *table)
163{
164 struct sfi_table_simple *sb;
165 struct sfi_rtc_table_entry *pentry;
166 struct mpc_intsrc mp_irq;
167
168 int totallen;
169
170 sb = (struct sfi_table_simple *)table;
171 if (!sfi_mrtc_num) {
172 sfi_mrtc_num = SFI_GET_NUM_ENTRIES(sb,
173 struct sfi_rtc_table_entry);
174 pentry = (struct sfi_rtc_table_entry *)sb->pentry;
175 totallen = sfi_mrtc_num * sizeof(*pentry);
176 memcpy(sfi_mrtc_array, pentry, totallen);
177 }
178
179 printk(KERN_INFO "SFI: RTC info (num = %d):\n", sfi_mrtc_num);
180 pentry = sfi_mrtc_array;
181 for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) {
182 printk(KERN_INFO "RTC[%d]: paddr = 0x%08x, irq = %d\n",
183 totallen, (u32)pentry->phys_addr, pentry->irq);
184 mp_irq.type = MP_IOAPIC;
185 mp_irq.irqtype = mp_INT;
186 mp_irq.irqflag = 0;
187 mp_irq.srcbus = 0;
188 mp_irq.srcbusirq = pentry->irq; /* IRQ */
189 mp_irq.dstapic = MP_APIC_ALL;
190 mp_irq.dstirq = pentry->irq;
191 save_mp_irq(&mp_irq);
192 }
193 return 0;
194}
195
196static unsigned long __init mrst_calibrate_tsc(void)
197{
198 unsigned long flags, fast_calibrate;
199
200 local_irq_save(flags);
201 fast_calibrate = apbt_quick_calibrate();
202 local_irq_restore(flags);
203
204 if (fast_calibrate)
205 return fast_calibrate;
206
207 return 0;
208}
209
210void __init mrst_time_init(void)
211{
212 switch (mrst_timer_options) {
213 case MRST_TIMER_APBT_ONLY:
214 break;
215 case MRST_TIMER_LAPIC_APBT:
216 x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock;
217 x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
218 break;
219 default:
220 if (!boot_cpu_has(X86_FEATURE_ARAT))
221 break;
222 x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock;
223 x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
224 return;
225 }
226 /* we need at least one APB timer */
227 sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
228 pre_init_apic_IRQ0();
229 apbt_time_init();
230}
231
232void __init mrst_rtc_init(void)
233{
234 sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
235}
236
237void __cpuinit mrst_arch_setup(void)
238{
239 if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27)
240 __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL;
241 else if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x26)
242 __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT;
243 else {
244 pr_err("Unknown Moorestown CPU (%d:%d), default to Lincroft\n",
245 boot_cpu_data.x86, boot_cpu_data.x86_model);
246 __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT;
247 }
248 pr_debug("Moorestown CPU %s identified\n",
249 (__mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) ?
250 "Lincroft" : "Penwell");
251}
252
253/* MID systems don't have i8042 controller */
254static int mrst_i8042_detect(void)
255{
256 return 0;
257}
258
259/*
260 * Moorestown specific x86_init function overrides and early setup
261 * calls.
262 */
263void __init x86_mrst_early_setup(void)
264{
265 x86_init.resources.probe_roms = x86_init_noop;
266 x86_init.resources.reserve_resources = x86_init_noop;
267
268 x86_init.timers.timer_init = mrst_time_init;
269 x86_init.timers.setup_percpu_clockev = x86_init_noop;
270
271 x86_init.irqs.pre_vector_init = x86_init_noop;
272
273 x86_init.oem.arch_setup = mrst_arch_setup;
274
275 x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock;
276
277 x86_platform.calibrate_tsc = mrst_calibrate_tsc;
278 x86_platform.i8042_detect = mrst_i8042_detect;
279 x86_init.pci.init = pci_mrst_init;
280 x86_init.pci.fixup_irqs = x86_init_noop;
281
282 legacy_pic = &null_legacy_pic;
283
284 /* Avoid searching for BIOS MP tables */
285 x86_init.mpparse.find_smp_config = x86_init_noop;
286 x86_init.mpparse.get_smp_config = x86_init_uint_noop;
287
288}
289
290/*
 291 * if the user does not want to use the per-CPU APB timer, just give it a lower rating
 292 * than the local APIC timer and skip the late per-CPU timer init.
293 */
294static inline int __init setup_x86_mrst_timer(char *arg)
295{
296 if (!arg)
297 return -EINVAL;
298
299 if (strcmp("apbt_only", arg) == 0)
300 mrst_timer_options = MRST_TIMER_APBT_ONLY;
301 else if (strcmp("lapic_and_apbt", arg) == 0)
302 mrst_timer_options = MRST_TIMER_LAPIC_APBT;
303 else {
304 pr_warning("X86 MRST timer option %s not recognised"
305 " use x86_mrst_timer=apbt_only or lapic_and_apbt\n",
306 arg);
307 return -EINVAL;
308 }
309 return 0;
310}
311__setup("x86_mrst_timer=", setup_x86_mrst_timer);
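
setup_x86_mrst_timer() above is a typical __setup() handler: it maps a kernel command-line argument onto an enum and rejects anything else. A hedged userspace equivalent of that dispatch — parse_timer_option() and enum timer_option are invented names standing in for the __setup() machinery:

#include <stdio.h>
#include <string.h>

enum timer_option {
	TIMER_DEFAULT,		/* pick based on CPU features */
	TIMER_APBT_ONLY,
	TIMER_LAPIC_APBT,
};

/* Returns 0 on success, -1 for a missing or unrecognised argument. */
static int parse_timer_option(const char *arg, enum timer_option *opt)
{
	if (!arg)
		return -1;

	if (strcmp(arg, "apbt_only") == 0)
		*opt = TIMER_APBT_ONLY;
	else if (strcmp(arg, "lapic_and_apbt") == 0)
		*opt = TIMER_LAPIC_APBT;
	else {
		fprintf(stderr, "timer option '%s' not recognised\n", arg);
		return -1;
	}
	return 0;
}

int main(void)
{
	enum timer_option opt = TIMER_DEFAULT;

	if (parse_timer_option("lapic_and_apbt", &opt) == 0)
		printf("selected option %d\n", opt);
	return 0;
}
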
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 7bf2dc4c8f70..12fcbe2c143e 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -30,7 +30,6 @@
30#include <linux/init.h> 30#include <linux/init.h>
31#include <linux/poll.h> 31#include <linux/poll.h>
32#include <linux/smp.h> 32#include <linux/smp.h>
33#include <linux/smp_lock.h>
34#include <linux/major.h> 33#include <linux/major.h>
35#include <linux/fs.h> 34#include <linux/fs.h>
36#include <linux/device.h> 35#include <linux/device.h>
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
deleted file mode 100644
index 0e0cdde519be..000000000000
--- a/arch/x86/kernel/olpc.c
+++ /dev/null
@@ -1,260 +0,0 @@
1/*
2 * Support for the OLPC DCON and OLPC EC access
3 *
4 * Copyright © 2006 Advanced Micro Devices, Inc.
5 * Copyright © 2007-2008 Andres Salomon <dilinger@debian.org>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 */
12
13#include <linux/kernel.h>
14#include <linux/init.h>
15#include <linux/module.h>
16#include <linux/delay.h>
17#include <linux/spinlock.h>
18#include <linux/io.h>
19#include <linux/string.h>
20
21#include <asm/geode.h>
22#include <asm/setup.h>
23#include <asm/olpc.h>
24#include <asm/olpc_ofw.h>
25
26struct olpc_platform_t olpc_platform_info;
27EXPORT_SYMBOL_GPL(olpc_platform_info);
28
29static DEFINE_SPINLOCK(ec_lock);
30
31/* what the timeout *should* be (in ms) */
32#define EC_BASE_TIMEOUT 20
33
34/* the timeout that bugs in the EC might force us to actually use */
35static int ec_timeout = EC_BASE_TIMEOUT;
36
37static int __init olpc_ec_timeout_set(char *str)
38{
39 if (get_option(&str, &ec_timeout) != 1) {
40 ec_timeout = EC_BASE_TIMEOUT;
41 printk(KERN_ERR "olpc-ec: invalid argument to "
42 "'olpc_ec_timeout=', ignoring!\n");
43 }
44 printk(KERN_DEBUG "olpc-ec: using %d ms delay for EC commands.\n",
45 ec_timeout);
46 return 1;
47}
48__setup("olpc_ec_timeout=", olpc_ec_timeout_set);
49
50/*
51 * These {i,o}bf_status functions return whether the buffers are full or not.
52 */
53
54static inline unsigned int ibf_status(unsigned int port)
55{
56 return !!(inb(port) & 0x02);
57}
58
59static inline unsigned int obf_status(unsigned int port)
60{
61 return inb(port) & 0x01;
62}
63
64#define wait_on_ibf(p, d) __wait_on_ibf(__LINE__, (p), (d))
65static int __wait_on_ibf(unsigned int line, unsigned int port, int desired)
66{
67 unsigned int timeo;
68 int state = ibf_status(port);
69
70 for (timeo = ec_timeout; state != desired && timeo; timeo--) {
71 mdelay(1);
72 state = ibf_status(port);
73 }
74
75 if ((state == desired) && (ec_timeout > EC_BASE_TIMEOUT) &&
76 timeo < (ec_timeout - EC_BASE_TIMEOUT)) {
77 printk(KERN_WARNING "olpc-ec: %d: waited %u ms for IBF!\n",
78 line, ec_timeout - timeo);
79 }
80
81 return !(state == desired);
82}
83
84#define wait_on_obf(p, d) __wait_on_obf(__LINE__, (p), (d))
85static int __wait_on_obf(unsigned int line, unsigned int port, int desired)
86{
87 unsigned int timeo;
88 int state = obf_status(port);
89
90 for (timeo = ec_timeout; state != desired && timeo; timeo--) {
91 mdelay(1);
92 state = obf_status(port);
93 }
94
95 if ((state == desired) && (ec_timeout > EC_BASE_TIMEOUT) &&
96 timeo < (ec_timeout - EC_BASE_TIMEOUT)) {
97 printk(KERN_WARNING "olpc-ec: %d: waited %u ms for OBF!\n",
98 line, ec_timeout - timeo);
99 }
100
101 return !(state == desired);
102}
103
104/*
105 * This allows the kernel to run Embedded Controller commands. The EC is
106 * documented at <http://wiki.laptop.org/go/Embedded_controller>, and the
107 * available EC commands are here:
108 * <http://wiki.laptop.org/go/Ec_specification>. Unfortunately, while
109 * OpenFirmware's source is available, the EC's is not.
110 */
111int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen,
112 unsigned char *outbuf, size_t outlen)
113{
114 unsigned long flags;
115 int ret = -EIO;
116 int i;
117
118 spin_lock_irqsave(&ec_lock, flags);
119
120 /* Clear OBF */
121 for (i = 0; i < 10 && (obf_status(0x6c) == 1); i++)
122 inb(0x68);
123 if (i == 10) {
124 printk(KERN_ERR "olpc-ec: timeout while attempting to "
125 "clear OBF flag!\n");
126 goto err;
127 }
128
129 if (wait_on_ibf(0x6c, 0)) {
130 printk(KERN_ERR "olpc-ec: timeout waiting for EC to "
131 "quiesce!\n");
132 goto err;
133 }
134
135restart:
136 /*
137 * Note that if we time out during any IBF checks, that's a failure;
138 * we have to return. There's no way for the kernel to clear that.
139 *
140 * If we time out during an OBF check, we can restart the command;
141 * reissuing it will clear the OBF flag, and we should be alright.
142 * The OBF flag will sometimes misbehave due to what we believe
143 * is a hardware quirk..
144 */
145 pr_devel("olpc-ec: running cmd 0x%x\n", cmd);
146 outb(cmd, 0x6c);
147
148 if (wait_on_ibf(0x6c, 0)) {
149 printk(KERN_ERR "olpc-ec: timeout waiting for EC to read "
150 "command!\n");
151 goto err;
152 }
153
154 if (inbuf && inlen) {
155 /* write data to EC */
156 for (i = 0; i < inlen; i++) {
157 if (wait_on_ibf(0x6c, 0)) {
158 printk(KERN_ERR "olpc-ec: timeout waiting for"
159 " EC accept data!\n");
160 goto err;
161 }
162 pr_devel("olpc-ec: sending cmd arg 0x%x\n", inbuf[i]);
163 outb(inbuf[i], 0x68);
164 }
165 }
166 if (outbuf && outlen) {
167 /* read data from EC */
168 for (i = 0; i < outlen; i++) {
169 if (wait_on_obf(0x6c, 1)) {
170 printk(KERN_ERR "olpc-ec: timeout waiting for"
171 " EC to provide data!\n");
172 goto restart;
173 }
174 outbuf[i] = inb(0x68);
175 pr_devel("olpc-ec: received 0x%x\n", outbuf[i]);
176 }
177 }
178
179 ret = 0;
180err:
181 spin_unlock_irqrestore(&ec_lock, flags);
182 return ret;
183}
184EXPORT_SYMBOL_GPL(olpc_ec_cmd);
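
The wait_on_ibf()/wait_on_obf() helpers above poll an EC status bit once per millisecond until it reaches the desired state or the timeout expires, and warn when a wait ran noticeably longer than the nominal timeout. A userspace sketch of that poll-with-deadline pattern, assuming a caller-supplied status callback; poll_status, wait_for_state() and counting_status() are illustrative names only:

#include <stdio.h>
#include <unistd.h>

#define BASE_TIMEOUT_MS 20

/* Stand-in for ibf_status()/obf_status(): returns the current state bit. */
typedef int (*poll_status)(void *ctx);

/*
 * Poll once per millisecond until the status matches 'desired' or the
 * timeout expires.  Returns 0 on success, non-zero on timeout, and warns
 * if the wait took noticeably longer than the base timeout.
 */
static int wait_for_state(poll_status status, void *ctx, int desired,
			  unsigned int timeout_ms)
{
	unsigned int timeo = timeout_ms;
	int state = status(ctx);

	while (state != desired && timeo) {
		usleep(1000);		/* ~mdelay(1) in the kernel code */
		state = status(ctx);
		timeo--;
	}

	if (state == desired && timeout_ms > BASE_TIMEOUT_MS &&
	    timeo < timeout_ms - BASE_TIMEOUT_MS)
		fprintf(stderr, "waited %u ms for status\n", timeout_ms - timeo);

	return state != desired;
}

static int counting_status(void *ctx)
{
	int *calls = ctx;

	return ++(*calls) >= 5;		/* becomes "ready" on the 5th poll */
}

int main(void)
{
	int calls = 0;

	printf("timed out: %d\n", wait_for_state(counting_status, &calls, 1, 50));
	return 0;
}
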
185
186#ifdef CONFIG_OLPC_OPENFIRMWARE
187static void __init platform_detect(void)
188{
189 size_t propsize;
190 __be32 rev;
191 const void *args[] = { NULL, "board-revision-int", &rev, (void *)4 };
192 void *res[] = { &propsize };
193
194 if (olpc_ofw("getprop", args, res) || propsize != 4) {
195 printk(KERN_ERR "ofw: getprop call failed!\n");
196 rev = cpu_to_be32(0);
197 }
198 olpc_platform_info.boardrev = be32_to_cpu(rev);
199}
200#else
201static void __init platform_detect(void)
202{
203 /* stopgap until OFW support is added to the kernel */
204 olpc_platform_info.boardrev = olpc_board(0xc2);
205}
206#endif
207
208static int __init olpc_init(void)
209{
210 unsigned char *romsig;
211
212 /* The ioremap check is dangerous; limit what we run it on */
213 if (!is_geode() || cs5535_has_vsa2())
214 return 0;
215
216 spin_lock_init(&ec_lock);
217
218 romsig = ioremap(0xffffffc0, 16);
219 if (!romsig)
220 return 0;
221
222 if (strncmp(romsig, "CL1 Q", 7))
223 goto unmap;
224 if (strncmp(romsig+6, romsig+13, 3)) {
225 printk(KERN_INFO "OLPC BIOS signature looks invalid. "
226 "Assuming not OLPC\n");
227 goto unmap;
228 }
229
230 printk(KERN_INFO "OLPC board with OpenFirmware %.16s\n", romsig);
231 olpc_platform_info.flags |= OLPC_F_PRESENT;
232
233 /* get the platform revision */
234 platform_detect();
235
236 /* assume B1 and above models always have a DCON */
237 if (olpc_board_at_least(olpc_board(0xb1)))
238 olpc_platform_info.flags |= OLPC_F_DCON;
239
240 /* get the EC revision */
241 olpc_ec_cmd(EC_FIRMWARE_REV, NULL, 0,
242 (unsigned char *) &olpc_platform_info.ecver, 1);
243
244#ifdef CONFIG_PCI_OLPC
245 /* If the VSA exists let it emulate PCI, if not emulate in kernel */
246 if (!cs5535_has_vsa2())
247 x86_init.pci.arch_init = pci_olpc_init;
248#endif
249
250 printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n",
251 ((olpc_platform_info.boardrev & 0xf) < 8) ? "pre" : "",
252 olpc_platform_info.boardrev >> 4,
253 olpc_platform_info.ecver);
254
255unmap:
256 iounmap(romsig);
257 return 0;
258}
259
260postcore_initcall(olpc_init);
diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/kernel/olpc_ofw.c
deleted file mode 100644
index 3218aa71ab5e..000000000000
--- a/arch/x86/kernel/olpc_ofw.c
+++ /dev/null
@@ -1,106 +0,0 @@
1#include <linux/kernel.h>
2#include <linux/module.h>
3#include <linux/init.h>
4#include <asm/page.h>
5#include <asm/setup.h>
6#include <asm/io.h>
7#include <asm/pgtable.h>
8#include <asm/olpc_ofw.h>
9
10/* address of OFW callback interface; will be NULL if OFW isn't found */
11static int (*olpc_ofw_cif)(int *);
12
13/* page dir entry containing OFW's pgdir table; filled in by head_32.S */
14u32 olpc_ofw_pgd __initdata;
15
16static DEFINE_SPINLOCK(ofw_lock);
17
18#define MAXARGS 10
19
20void __init setup_olpc_ofw_pgd(void)
21{
22 pgd_t *base, *ofw_pde;
23
24 if (!olpc_ofw_cif)
25 return;
26
27 /* fetch OFW's PDE */
28 base = early_ioremap(olpc_ofw_pgd, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD);
29 if (!base) {
30 printk(KERN_ERR "failed to remap OFW's pgd - disabling OFW!\n");
31 olpc_ofw_cif = NULL;
32 return;
33 }
34 ofw_pde = &base[OLPC_OFW_PDE_NR];
35
36 /* install OFW's PDE permanently into the kernel's pgtable */
37 set_pgd(&swapper_pg_dir[OLPC_OFW_PDE_NR], *ofw_pde);
38 /* implicit optimization barrier here due to uninline function return */
39
40 early_iounmap(base, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD);
41}
42
43int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res,
44 void **res)
45{
46 int ofw_args[MAXARGS + 3];
47 unsigned long flags;
48 int ret, i, *p;
49
50 BUG_ON(nr_args + nr_res > MAXARGS);
51
52 if (!olpc_ofw_cif)
53 return -EIO;
54
55 ofw_args[0] = (int)name;
56 ofw_args[1] = nr_args;
57 ofw_args[2] = nr_res;
58
59 p = &ofw_args[3];
60 for (i = 0; i < nr_args; i++, p++)
61 *p = (int)args[i];
62
63 /* call into ofw */
64 spin_lock_irqsave(&ofw_lock, flags);
65 ret = olpc_ofw_cif(ofw_args);
66 spin_unlock_irqrestore(&ofw_lock, flags);
67
68 if (!ret) {
69 for (i = 0; i < nr_res; i++, p++)
70 *((int *)res[i]) = *p;
71 }
72
73 return ret;
74}
75EXPORT_SYMBOL_GPL(__olpc_ofw);
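
__olpc_ofw() above marshals the service name, the argument and result counts, and then the arguments themselves into one flat array of cells, hands that array to the firmware's client-interface entry point under a lock, and copies the results (which follow the arguments in the array) back out. A simplified userspace sketch of the same marshalling, using intptr_t cells so it stays portable (the kernel version uses int because OFW runs 32-bit); fake_cif() and ofw_call() are stand-ins invented for the sketch:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define MAXARGS 10

/*
 * Fake client interface: it "implements" one service, "add", which sums
 * its two arguments and stores the result in the cell after the arguments.
 */
static int fake_cif(intptr_t *cells)
{
	const char *name = (const char *)cells[0];
	intptr_t nr_args = cells[1];

	if (strcmp(name, "add") != 0 || nr_args != 2)
		return -1;
	cells[3 + nr_args] = cells[3] + cells[4];	/* result after the args */
	return 0;
}

/* Marshal name/args into one flat cell array, call the cif, unpack results. */
static int ofw_call(const char *name, int nr_args, const intptr_t *args,
		    int nr_res, intptr_t **res)
{
	intptr_t cells[MAXARGS + 3];
	intptr_t *p;
	int ret, i;

	if (nr_args + nr_res > MAXARGS)
		return -1;

	cells[0] = (intptr_t)name;
	cells[1] = nr_args;
	cells[2] = nr_res;

	p = &cells[3];
	for (i = 0; i < nr_args; i++)
		*p++ = args[i];

	ret = fake_cif(cells);
	if (!ret) {
		for (i = 0; i < nr_res; i++)
			*res[i] = *p++;		/* results follow the arguments */
	}
	return ret;
}

int main(void)
{
	intptr_t args[] = { 2, 40 };
	intptr_t sum = 0;
	intptr_t *res[] = { &sum };

	if (!ofw_call("add", 2, args, 1, res))
		printf("add -> %ld\n", (long)sum);
	return 0;
}
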
76
77/* OFW cif _should_ be above this address */
78#define OFW_MIN 0xff000000
79
80/* OFW starts on a 1MB boundary */
81#define OFW_BOUND (1<<20)
82
83void __init olpc_ofw_detect(void)
84{
85 struct olpc_ofw_header *hdr = &boot_params.olpc_ofw_header;
86 unsigned long start;
87
88 /* ensure OFW booted us by checking for "OFW " string */
89 if (hdr->ofw_magic != OLPC_OFW_SIG)
90 return;
91
92 olpc_ofw_cif = (int (*)(int *))hdr->cif_handler;
93
94 if ((unsigned long)olpc_ofw_cif < OFW_MIN) {
95 printk(KERN_ERR "OFW detected, but cif has invalid address 0x%lx - disabling.\n",
96 (unsigned long)olpc_ofw_cif);
97 olpc_ofw_cif = NULL;
98 return;
99 }
100
101 /* determine where OFW starts in memory */
102 start = round_down((unsigned long)olpc_ofw_cif, OFW_BOUND);
103 printk(KERN_INFO "OFW detected in memory, cif @ 0x%lx (reserving top %ldMB)\n",
104 (unsigned long)olpc_ofw_cif, (-start) >> 20);
105 reserve_top_address(-start);
106}
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 1db183ed7c01..869e1aeeb71b 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -413,7 +413,6 @@ struct pv_mmu_ops pv_mmu_ops = {
413 413
414 .alloc_pte = paravirt_nop, 414 .alloc_pte = paravirt_nop,
415 .alloc_pmd = paravirt_nop, 415 .alloc_pmd = paravirt_nop,
416 .alloc_pmd_clone = paravirt_nop,
417 .alloc_pud = paravirt_nop, 416 .alloc_pud = paravirt_nop,
418 .release_pte = paravirt_nop, 417 .release_pte = paravirt_nop,
419 .release_pmd = paravirt_nop, 418 .release_pmd = paravirt_nop,
@@ -422,8 +421,11 @@ struct pv_mmu_ops pv_mmu_ops = {
422 .set_pte = native_set_pte, 421 .set_pte = native_set_pte,
423 .set_pte_at = native_set_pte_at, 422 .set_pte_at = native_set_pte_at,
424 .set_pmd = native_set_pmd, 423 .set_pmd = native_set_pmd,
424 .set_pmd_at = native_set_pmd_at,
425 .pte_update = paravirt_nop, 425 .pte_update = paravirt_nop,
426 .pte_update_defer = paravirt_nop, 426 .pte_update_defer = paravirt_nop,
427 .pmd_update = paravirt_nop,
428 .pmd_update_defer = paravirt_nop,
427 429
428 .ptep_modify_prot_start = __ptep_modify_prot_start, 430 .ptep_modify_prot_start = __ptep_modify_prot_start,
429 .ptep_modify_prot_commit = __ptep_modify_prot_commit, 431 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 078d4ec1a9d9..e8c33a302006 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -47,6 +47,7 @@
47#include <asm/rio.h> 47#include <asm/rio.h>
48#include <asm/bios_ebda.h> 48#include <asm/bios_ebda.h>
49#include <asm/x86_init.h> 49#include <asm/x86_init.h>
50#include <asm/iommu_table.h>
50 51
51#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT 52#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT
52int use_calgary __read_mostly = 1; 53int use_calgary __read_mostly = 1;
@@ -1278,7 +1279,7 @@ static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev)
1278 1279
1279 if (pci_dev == PCI_DEVICE_ID_IBM_CALIOC2) { 1280 if (pci_dev == PCI_DEVICE_ID_IBM_CALIOC2) {
1280 /* 1281 /*
1281 * FIXME: properly scan for devices accross the 1282 * FIXME: properly scan for devices across the
1282 * PCI-to-PCI bridge on every CalIOC2 port. 1283 * PCI-to-PCI bridge on every CalIOC2 port.
1283 */ 1284 */
1284 return 1; 1285 return 1;
@@ -1294,7 +1295,7 @@ static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev)
1294 1295
1295/* 1296/*
1296 * calgary_init_bitmap_from_tce_table(): 1297 * calgary_init_bitmap_from_tce_table():
1297 * Funtion for kdump case. In the second/kdump kernel initialize 1298 * Function for kdump case. In the second/kdump kernel initialize
1298 * the bitmap based on the tce table entries obtained from first kernel 1299 * the bitmap based on the tce table entries obtained from first kernel
1299 */ 1300 */
1300static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl) 1301static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl)
@@ -1364,7 +1365,7 @@ static int __init calgary_iommu_init(void)
1364 return 0; 1365 return 0;
1365} 1366}
1366 1367
1367void __init detect_calgary(void) 1368int __init detect_calgary(void)
1368{ 1369{
1369 int bus; 1370 int bus;
1370 void *tbl; 1371 void *tbl;
@@ -1378,13 +1379,13 @@ void __init detect_calgary(void)
1378 * another HW IOMMU already, bail out. 1379 * another HW IOMMU already, bail out.
1379 */ 1380 */
1380 if (no_iommu || iommu_detected) 1381 if (no_iommu || iommu_detected)
1381 return; 1382 return -ENODEV;
1382 1383
1383 if (!use_calgary) 1384 if (!use_calgary)
1384 return; 1385 return -ENODEV;
1385 1386
1386 if (!early_pci_allowed()) 1387 if (!early_pci_allowed())
1387 return; 1388 return -ENODEV;
1388 1389
1389 printk(KERN_DEBUG "Calgary: detecting Calgary via BIOS EBDA area\n"); 1390 printk(KERN_DEBUG "Calgary: detecting Calgary via BIOS EBDA area\n");
1390 1391
@@ -1410,13 +1411,13 @@ void __init detect_calgary(void)
1410 if (!rio_table_hdr) { 1411 if (!rio_table_hdr) {
1411 printk(KERN_DEBUG "Calgary: Unable to locate Rio Grande table " 1412 printk(KERN_DEBUG "Calgary: Unable to locate Rio Grande table "
1412 "in EBDA - bailing!\n"); 1413 "in EBDA - bailing!\n");
1413 return; 1414 return -ENODEV;
1414 } 1415 }
1415 1416
1416 ret = build_detail_arrays(); 1417 ret = build_detail_arrays();
1417 if (ret) { 1418 if (ret) {
1418 printk(KERN_DEBUG "Calgary: build_detail_arrays ret %d\n", ret); 1419 printk(KERN_DEBUG "Calgary: build_detail_arrays ret %d\n", ret);
1419 return; 1420 return -ENOMEM;
1420 } 1421 }
1421 1422
1422 specified_table_size = determine_tce_table_size((is_kdump_kernel() ? 1423 specified_table_size = determine_tce_table_size((is_kdump_kernel() ?
@@ -1464,7 +1465,7 @@ void __init detect_calgary(void)
1464 1465
1465 x86_init.iommu.iommu_init = calgary_iommu_init; 1466 x86_init.iommu.iommu_init = calgary_iommu_init;
1466 } 1467 }
1467 return; 1468 return calgary_found;
1468 1469
1469cleanup: 1470cleanup:
1470 for (--bus; bus >= 0; --bus) { 1471 for (--bus; bus >= 0; --bus) {
@@ -1473,6 +1474,7 @@ cleanup:
1473 if (info->tce_space) 1474 if (info->tce_space)
1474 free_tce_table(info->tce_space); 1475 free_tce_table(info->tce_space);
1475 } 1476 }
1477 return -ENOMEM;
1476} 1478}
1477 1479
1478static int __init calgary_parse_options(char *p) 1480static int __init calgary_parse_options(char *p)
@@ -1594,3 +1596,5 @@ static int __init calgary_fixup_tce_spaces(void)
1594 * and before device_initcall. 1596 * and before device_initcall.
1595 */ 1597 */
1596rootfs_initcall(calgary_fixup_tce_spaces); 1598rootfs_initcall(calgary_fixup_tce_spaces);
1599
1600IOMMU_INIT_POST(detect_calgary);
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 9f07cfcbd3a5..b49d00da2aed 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -11,9 +11,8 @@
11#include <asm/iommu.h> 11#include <asm/iommu.h>
12#include <asm/gart.h> 12#include <asm/gart.h>
13#include <asm/calgary.h> 13#include <asm/calgary.h>
14#include <asm/amd_iommu.h>
15#include <asm/x86_init.h> 14#include <asm/x86_init.h>
16#include <asm/xen/swiotlb-xen.h> 15#include <asm/iommu_table.h>
17 16
18static int forbid_dac __read_mostly; 17static int forbid_dac __read_mostly;
19 18
@@ -45,6 +44,8 @@ int iommu_detected __read_mostly = 0;
45 */ 44 */
46int iommu_pass_through __read_mostly; 45int iommu_pass_through __read_mostly;
47 46
47extern struct iommu_table_entry __iommu_table[], __iommu_table_end[];
48
48/* Dummy device used for NULL arguments (normally ISA). */ 49/* Dummy device used for NULL arguments (normally ISA). */
49struct device x86_dma_fallback_dev = { 50struct device x86_dma_fallback_dev = {
50 .init_name = "fallback device", 51 .init_name = "fallback device",
@@ -67,89 +68,23 @@ int dma_set_mask(struct device *dev, u64 mask)
67} 68}
68EXPORT_SYMBOL(dma_set_mask); 69EXPORT_SYMBOL(dma_set_mask);
69 70
70#if defined(CONFIG_X86_64) && !defined(CONFIG_NUMA)
71static __initdata void *dma32_bootmem_ptr;
72static unsigned long dma32_bootmem_size __initdata = (128ULL<<20);
73
74static int __init parse_dma32_size_opt(char *p)
75{
76 if (!p)
77 return -EINVAL;
78 dma32_bootmem_size = memparse(p, &p);
79 return 0;
80}
81early_param("dma32_size", parse_dma32_size_opt);
82
83void __init dma32_reserve_bootmem(void)
84{
85 unsigned long size, align;
86 if (max_pfn <= MAX_DMA32_PFN)
87 return;
88
89 /*
90 * check aperture_64.c allocate_aperture() for reason about
91 * using 512M as goal
92 */
93 align = 64ULL<<20;
94 size = roundup(dma32_bootmem_size, align);
95 dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
96 512ULL<<20);
97 /*
98 * Kmemleak should not scan this block as it may not be mapped via the
99 * kernel direct mapping.
100 */
101 kmemleak_ignore(dma32_bootmem_ptr);
102 if (dma32_bootmem_ptr)
103 dma32_bootmem_size = size;
104 else
105 dma32_bootmem_size = 0;
106}
107static void __init dma32_free_bootmem(void)
108{
109
110 if (max_pfn <= MAX_DMA32_PFN)
111 return;
112
113 if (!dma32_bootmem_ptr)
114 return;
115
116 free_bootmem(__pa(dma32_bootmem_ptr), dma32_bootmem_size);
117
118 dma32_bootmem_ptr = NULL;
119 dma32_bootmem_size = 0;
120}
121#else
122void __init dma32_reserve_bootmem(void)
123{
124}
125static void __init dma32_free_bootmem(void)
126{
127}
128
129#endif
130
131void __init pci_iommu_alloc(void) 71void __init pci_iommu_alloc(void)
132{ 72{
133 /* free the range so iommu could get some range less than 4G */ 73 struct iommu_table_entry *p;
134 dma32_free_bootmem(); 74
135 75 sort_iommu_table(__iommu_table, __iommu_table_end);
136 if (pci_xen_swiotlb_detect() || pci_swiotlb_detect()) 76 check_iommu_entries(__iommu_table, __iommu_table_end);
137 goto out; 77
138 78 for (p = __iommu_table; p < __iommu_table_end; p++) {
139 gart_iommu_hole_init(); 79 if (p && p->detect && p->detect() > 0) {
140 80 p->flags |= IOMMU_DETECTED;
141 detect_calgary(); 81 if (p->early_init)
142 82 p->early_init();
143 detect_intel_iommu(); 83 if (p->flags & IOMMU_FINISH_IF_DETECTED)
144 84 break;
145 /* needs to be called after gart_iommu_hole_init */ 85 }
146 amd_iommu_detect(); 86 }
147out:
148 pci_xen_swiotlb_init();
149
150 pci_swiotlb_init();
151} 87}
152
153void *dma_generic_alloc_coherent(struct device *dev, size_t size, 88void *dma_generic_alloc_coherent(struct device *dev, size_t size,
154 dma_addr_t *dma_addr, gfp_t flag) 89 dma_addr_t *dma_addr, gfp_t flag)
155{ 90{
@@ -292,6 +227,7 @@ EXPORT_SYMBOL(dma_supported);
292 227
293static int __init pci_iommu_init(void) 228static int __init pci_iommu_init(void)
294{ 229{
230 struct iommu_table_entry *p;
295 dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES); 231 dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
296 232
297#ifdef CONFIG_PCI 233#ifdef CONFIG_PCI
@@ -299,12 +235,10 @@ static int __init pci_iommu_init(void)
299#endif 235#endif
300 x86_init.iommu.iommu_init(); 236 x86_init.iommu.iommu_init();
301 237
302 if (swiotlb || xen_swiotlb) { 238 for (p = __iommu_table; p < __iommu_table_end; p++) {
303 printk(KERN_INFO "PCI-DMA: " 239 if (p && (p->flags & IOMMU_DETECTED) && p->late_init)
304 "Using software bounce buffering for IO (SWIOTLB)\n"); 240 p->late_init();
305 swiotlb_print_info(); 241 }
306 } else
307 swiotlb_free();
308 242
309 return 0; 243 return 0;
310} 244}
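
The rewritten pci_iommu_alloc()/pci_iommu_init() above replace the hard-coded chain of detect calls with a walk over a table of iommu_table_entry descriptors: each entry's detect() hook is called, a positive return marks the entry detected and runs its early_init(), a FINISH_IF_DETECTED flag stops the scan, and the late_init() hooks run later only for entries whose hardware was found. A toy self-contained version of that walk — struct detect_entry, the flag values and the two "drivers" below are invented for illustration:

#include <stdio.h>

#define DETECTED		0x1
#define FINISH_IF_DETECTED	0x2

struct detect_entry {
	const char *name;
	int (*detect)(void);		/* > 0 means "hardware found" */
	void (*early_init)(void);
	void (*late_init)(void);
	int flags;
};

static int detect_a(void) { return 0; }			/* not present */
static int detect_b(void) { return 1; }			/* present */
static void early_b(void) { printf("early init B\n"); }
static void late_b(void)  { printf("late init B\n"); }

static struct detect_entry table[] = {
	{ "A", detect_a, NULL, NULL, 0 },
	{ "B", detect_b, early_b, late_b, FINISH_IF_DETECTED },
};
#define TABLE_SIZE (int)(sizeof(table) / sizeof(table[0]))

static void run_detection(void)
{
	int i;

	for (i = 0; i < TABLE_SIZE; i++) {
		struct detect_entry *p = &table[i];

		if (p->detect && p->detect() > 0) {
			p->flags |= DETECTED;
			if (p->early_init)
				p->early_init();
			if (p->flags & FINISH_IF_DETECTED)
				break;		/* stop scanning the table */
		}
	}
}

static void run_late_init(void)
{
	int i;

	for (i = 0; i < TABLE_SIZE; i++)
		if ((table[i].flags & DETECTED) && table[i].late_init)
			table[i].late_init();
}

int main(void)
{
	run_detection();
	run_late_init();
	return 0;
}
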
diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c
new file mode 100644
index 000000000000..35ccf75696eb
--- /dev/null
+++ b/arch/x86/kernel/pci-iommu_table.c
@@ -0,0 +1,79 @@
1#include <linux/dma-mapping.h>
2#include <asm/iommu_table.h>
3#include <linux/string.h>
4#include <linux/kallsyms.h>
5
6
7#define DEBUG 1
8
9static struct iommu_table_entry * __init
10find_dependents_of(struct iommu_table_entry *start,
11 struct iommu_table_entry *finish,
12 struct iommu_table_entry *q)
13{
14 struct iommu_table_entry *p;
15
16 if (!q)
17 return NULL;
18
19 for (p = start; p < finish; p++)
20 if (p->detect == q->depend)
21 return p;
22
23 return NULL;
24}
25
26
27void __init sort_iommu_table(struct iommu_table_entry *start,
28 struct iommu_table_entry *finish) {
29
30 struct iommu_table_entry *p, *q, tmp;
31
32 for (p = start; p < finish; p++) {
33again:
34 q = find_dependents_of(start, finish, p);
 35	/* We are a bit sneaky here: we use the memory address to figure
 36	 * out whether the node we depend on is past our point; if so, swap.
37 */
38 if (q > p) {
39 tmp = *p;
40 memmove(p, q, sizeof(*p));
41 *q = tmp;
42 goto again;
43 }
44 }
45
46}
47
48#ifdef DEBUG
49void __init check_iommu_entries(struct iommu_table_entry *start,
50 struct iommu_table_entry *finish)
51{
52 struct iommu_table_entry *p, *q, *x;
53
54 /* Simple cyclic dependency checker. */
55 for (p = start; p < finish; p++) {
56 q = find_dependents_of(start, finish, p);
57 x = find_dependents_of(start, finish, q);
58 if (p == x) {
59 printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %pS depends on %pS and vice-versa. BREAKING IT.\n",
60 p->detect, q->detect);
61 /* Heavy handed way..*/
62 x->depend = 0;
63 }
64 }
65
66 for (p = start; p < finish; p++) {
67 q = find_dependents_of(p, finish, p);
68 if (q && q > p) {
69 printk(KERN_ERR "EXECUTION ORDER INVALID! %pS should be called before %pS!\n",
70 p->detect, q->detect);
71 }
72 }
73}
74#else
75inline void check_iommu_entries(struct iommu_table_entry *start,
76 struct iommu_table_entry *finish)
77{
78}
79#endif
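
sort_iommu_table() in the new file above orders the table so that every entry runs after the entry its ->depend pointer names: for each element it looks up the entry it depends on, and if that entry currently sits later in the array the two are swapped and the check is repeated. A standalone sketch of that address-comparison sort over a toy table — struct node, find_dependency(), sort_by_dependency() and the sample dependencies are all made up:

#include <stdio.h>

struct node {
	const char *name;
	int (*detect)(void);
	int (*depend)(void);	/* detect routine this node must run after */
};

static int det_swiotlb(void) { return 0; }
static int det_gart(void)    { return 0; }
static int det_amd(void)     { return 0; }

/* Return the entry in [start, finish) that 'q' depends on, if any. */
static struct node *find_dependency(struct node *start, struct node *finish,
				    struct node *q)
{
	struct node *p;

	if (!q || !q->depend)
		return NULL;
	for (p = start; p < finish; p++)
		if (p->detect == q->depend)
			return p;
	return NULL;
}

/* Swap an entry with its dependency whenever the dependency sits later. */
static void sort_by_dependency(struct node *start, struct node *finish)
{
	struct node *p, *q, tmp;

	for (p = start; p < finish; p++) {
again:
		q = find_dependency(start, finish, p);
		if (q && q > p) {
			tmp = *p;
			*p = *q;
			*q = tmp;
			goto again;
		}
	}
}

int main(void)
{
	/* amd depends on gart, gart depends on swiotlb; listed backwards. */
	struct node table[] = {
		{ "amd",     det_amd,     det_gart },
		{ "gart",    det_gart,    det_swiotlb },
		{ "swiotlb", det_swiotlb, NULL },
	};
	int i;

	sort_by_dependency(table, table + 3);
	for (i = 0; i < 3; i++)
		printf("%d: %s\n", i, table[i].name);
	return 0;
}
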
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index a5bc528d4328..8f972cbddef0 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -10,7 +10,8 @@
10#include <asm/iommu.h> 10#include <asm/iommu.h>
11#include <asm/swiotlb.h> 11#include <asm/swiotlb.h>
12#include <asm/dma.h> 12#include <asm/dma.h>
13 13#include <asm/xen/swiotlb-xen.h>
14#include <asm/iommu_table.h>
14int swiotlb __read_mostly; 15int swiotlb __read_mostly;
15 16
16static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, 17static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
@@ -41,25 +42,42 @@ static struct dma_map_ops swiotlb_dma_ops = {
41}; 42};
42 43
43/* 44/*
44 * pci_swiotlb_detect - set swiotlb to 1 if necessary 45 * pci_swiotlb_detect_override - set swiotlb to 1 if necessary
45 * 46 *
46 * This returns non-zero if we are forced to use swiotlb (by the boot 47 * This returns non-zero if we are forced to use swiotlb (by the boot
47 * option). 48 * option).
48 */ 49 */
49int __init pci_swiotlb_detect(void) 50int __init pci_swiotlb_detect_override(void)
50{ 51{
51 int use_swiotlb = swiotlb | swiotlb_force; 52 int use_swiotlb = swiotlb | swiotlb_force;
52 53
54 if (swiotlb_force)
55 swiotlb = 1;
56
57 return use_swiotlb;
58}
59IOMMU_INIT_FINISH(pci_swiotlb_detect_override,
60 pci_xen_swiotlb_detect,
61 pci_swiotlb_init,
62 pci_swiotlb_late_init);
63
64/*
65 * if 4GB or more detected (and iommu=off not set) return 1
66 * and set swiotlb to 1.
67 */
68int __init pci_swiotlb_detect_4gb(void)
69{
53 /* don't initialize swiotlb if iommu=off (no_iommu=1) */ 70 /* don't initialize swiotlb if iommu=off (no_iommu=1) */
54#ifdef CONFIG_X86_64 71#ifdef CONFIG_X86_64
55 if (!no_iommu && max_pfn > MAX_DMA32_PFN) 72 if (!no_iommu && max_pfn > MAX_DMA32_PFN)
56 swiotlb = 1; 73 swiotlb = 1;
57#endif 74#endif
58 if (swiotlb_force) 75 return swiotlb;
59 swiotlb = 1;
60
61 return use_swiotlb;
62} 76}
77IOMMU_INIT(pci_swiotlb_detect_4gb,
78 pci_swiotlb_detect_override,
79 pci_swiotlb_init,
80 pci_swiotlb_late_init);
63 81
64void __init pci_swiotlb_init(void) 82void __init pci_swiotlb_init(void)
65{ 83{
@@ -68,3 +86,15 @@ void __init pci_swiotlb_init(void)
68 dma_ops = &swiotlb_dma_ops; 86 dma_ops = &swiotlb_dma_ops;
69 } 87 }
70} 88}
89
90void __init pci_swiotlb_late_init(void)
91{
92 /* An IOMMU turned us off. */
93 if (!swiotlb)
94 swiotlb_free();
95 else {
96 printk(KERN_INFO "PCI-DMA: "
97 "Using software bounce buffering for IO (SWIOTLB)\n");
98 swiotlb_print_info();
99 }
100}
diff --git a/arch/x86/kernel/pmtimer_64.c b/arch/x86/kernel/pmtimer_64.c
deleted file mode 100644
index b112406f1996..000000000000
--- a/arch/x86/kernel/pmtimer_64.c
+++ /dev/null
@@ -1,69 +0,0 @@
1/* Ported over from i386 by AK, original copyright was:
2 *
3 * (C) Dominik Brodowski <linux@brodo.de> 2003
4 *
5 * Driver to use the Power Management Timer (PMTMR) available in some
6 * southbridges as primary timing source for the Linux kernel.
7 *
8 * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c,
9 * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4.
10 *
11 * This file is licensed under the GPL v2.
12 *
13 * Dropped all the hardware bug workarounds for now. Hopefully they
14 * are not needed on 64bit chipsets.
15 */
16
17#include <linux/jiffies.h>
18#include <linux/kernel.h>
19#include <linux/time.h>
20#include <linux/init.h>
21#include <linux/cpumask.h>
22#include <linux/acpi_pmtmr.h>
23
24#include <asm/io.h>
25#include <asm/proto.h>
26#include <asm/msr.h>
27#include <asm/vsyscall.h>
28
29static inline u32 cyc2us(u32 cycles)
30{
31 /* The Power Management Timer ticks at 3.579545 ticks per microsecond.
32 * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%]
33 *
34 * Even with HZ = 100, delta is at maximum 35796 ticks, so it can
35 * easily be multiplied with 286 (=0x11E) without having to fear
36 * u32 overflows.
37 */
38 cycles *= 286;
39 return (cycles >> 10);
40}
41
42static unsigned pmtimer_wait_tick(void)
43{
44 u32 a, b;
45 for (a = b = inl(pmtmr_ioport) & ACPI_PM_MASK;
46 a == b;
47 b = inl(pmtmr_ioport) & ACPI_PM_MASK)
48 cpu_relax();
49 return b;
50}
51
52/* note: wait time is rounded up to one tick */
53void pmtimer_wait(unsigned us)
54{
55 u32 a, b;
56 a = pmtimer_wait_tick();
57 do {
58 b = inl(pmtmr_ioport);
59 cpu_relax();
60 } while (cyc2us(b - a) < us);
61}
62
63static int __init nopmtimer_setup(char *s)
64{
65 pmtmr_ioport = 0;
66 return 1;
67}
68
69__setup("nopmtimer", nopmtimer_setup);
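
cyc2us() in the deleted pmtimer_64.c converts ACPI PM-timer ticks (3.579545 MHz) to microseconds with the fixed-point approximation 1/3.579545 ≈ 286/1024, which keeps the multiply inside 32 bits for the tick deltas involved. A quick sketch comparing the approximation with the exact division; the sample values are chosen only for illustration:

#include <stdio.h>
#include <stdint.h>

#define PMTMR_TICKS_PER_US 3.579545	/* ACPI PM timer frequency in MHz */

/* Fixed-point version used by the deleted cyc2us(): us ~= cycles * 286 / 1024 */
static uint32_t cyc2us_approx(uint32_t cycles)
{
	return (cycles * 286u) >> 10;
}

int main(void)
{
	uint32_t samples[] = { 100, 3580, 35796, 1000000 };
	int i;

	for (i = 0; i < 4; i++) {
		uint32_t c = samples[i];
		double exact = c / PMTMR_TICKS_PER_US;

		printf("%8u ticks: approx %8u us, exact %10.2f us\n",
		       c, cyc2us_approx(c), exact);
	}
	return 0;
}
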
diff --git a/arch/x86/kernel/probe_roms_32.c b/arch/x86/kernel/probe_roms.c
index 071e7fea42e5..ba0a4cce53be 100644
--- a/arch/x86/kernel/probe_roms_32.c
+++ b/arch/x86/kernel/probe_roms.c
@@ -73,6 +73,107 @@ static struct resource video_rom_resource = {
73 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM 73 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
74}; 74};
75 75
76/* does this oprom support the given pci device, or any of the devices
77 * that the driver supports?
78 */
79static bool match_id(struct pci_dev *pdev, unsigned short vendor, unsigned short device)
80{
81 struct pci_driver *drv = pdev->driver;
82 const struct pci_device_id *id;
83
84 if (pdev->vendor == vendor && pdev->device == device)
85 return true;
86
87 for (id = drv ? drv->id_table : NULL; id && id->vendor; id++)
88 if (id->vendor == vendor && id->device == device)
89 break;
90
91 return id && id->vendor;
92}
93
94static bool probe_list(struct pci_dev *pdev, unsigned short vendor,
95 const unsigned char *rom_list)
96{
97 unsigned short device;
98
99 do {
100 if (probe_kernel_address(rom_list, device) != 0)
101 device = 0;
102
103 if (device && match_id(pdev, vendor, device))
104 break;
105
106 rom_list += 2;
107 } while (device);
108
109 return !!device;
110}
111
112static struct resource *find_oprom(struct pci_dev *pdev)
113{
114 struct resource *oprom = NULL;
115 int i;
116
117 for (i = 0; i < ARRAY_SIZE(adapter_rom_resources); i++) {
118 struct resource *res = &adapter_rom_resources[i];
119 unsigned short offset, vendor, device, list, rev;
120 const unsigned char *rom;
121
122 if (res->end == 0)
123 break;
124
125 rom = isa_bus_to_virt(res->start);
126 if (probe_kernel_address(rom + 0x18, offset) != 0)
127 continue;
128
129 if (probe_kernel_address(rom + offset + 0x4, vendor) != 0)
130 continue;
131
132 if (probe_kernel_address(rom + offset + 0x6, device) != 0)
133 continue;
134
135 if (match_id(pdev, vendor, device)) {
136 oprom = res;
137 break;
138 }
139
140 if (probe_kernel_address(rom + offset + 0x8, list) == 0 &&
141 probe_kernel_address(rom + offset + 0xc, rev) == 0 &&
142 rev >= 3 && list &&
143 probe_list(pdev, vendor, rom + offset + list)) {
144 oprom = res;
145 break;
146 }
147 }
148
149 return oprom;
150}
151
152void *pci_map_biosrom(struct pci_dev *pdev)
153{
154 struct resource *oprom = find_oprom(pdev);
155
156 if (!oprom)
157 return NULL;
158
159 return ioremap(oprom->start, resource_size(oprom));
160}
161EXPORT_SYMBOL(pci_map_biosrom);
162
163void pci_unmap_biosrom(void __iomem *image)
164{
165 iounmap(image);
166}
167EXPORT_SYMBOL(pci_unmap_biosrom);
168
169size_t pci_biosrom_size(struct pci_dev *pdev)
170{
171 struct resource *oprom = find_oprom(pdev);
172
173 return oprom ? resource_size(oprom) : 0;
174}
175EXPORT_SYMBOL(pci_biosrom_size);
176
76#define ROMSIGNATURE 0xaa55 177#define ROMSIGNATURE 0xaa55
77 178
78static int __init romsignature(const unsigned char *rom) 179static int __init romsignature(const unsigned char *rom)
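
find_oprom() in the new probe_roms.c code above matches an option ROM to a PCI device by following the 16-bit word at offset 0x18 to the ROM's PCI data structure and comparing the vendor and device IDs at +0x4 and +0x6 (falling back to the device list when present). A userspace sketch of that field walk over a synthetic ROM image; read_le16() stands in for the probe_kernel_address() reads, and the buffer contents are invented:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Unaligned little-endian 16-bit read (probe_kernel_address() stand-in). */
static uint16_t read_le16(const uint8_t *p)
{
	return (uint16_t)(p[0] | (p[1] << 8));
}

/*
 * Walk the same fields find_oprom() reads: the word at 0x18 locates the
 * PCI data structure, whose vendor and device IDs sit at +0x4 and +0x6.
 */
static int rom_matches(const uint8_t *rom, uint16_t vendor, uint16_t device)
{
	uint16_t pcir = read_le16(rom + 0x18);

	return read_le16(rom + pcir + 0x4) == vendor &&
	       read_le16(rom + pcir + 0x6) == device;
}

int main(void)
{
	uint8_t rom[64];

	memset(rom, 0, sizeof(rom));
	rom[0] = 0x55; rom[1] = 0xaa;		/* ROM signature */
	rom[0x18] = 0x20;			/* PCI data structure at 0x20 */
	memcpy(&rom[0x20], "PCIR", 4);
	rom[0x24] = 0x86; rom[0x25] = 0x80;	/* vendor 0x8086 */
	rom[0x26] = 0x34; rom[0x27] = 0x12;	/* device 0x1234 */

	printf("match: %d\n", rom_matches(rom, 0x8086, 0x1234));
	return 0;
}
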
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 57d1868a86aa..e1ba8cb24e4e 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -14,6 +14,7 @@
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <trace/events/power.h> 15#include <trace/events/power.h>
16#include <linux/hw_breakpoint.h> 16#include <linux/hw_breakpoint.h>
17#include <asm/cpu.h>
17#include <asm/system.h> 18#include <asm/system.h>
18#include <asm/apic.h> 19#include <asm/apic.h>
19#include <asm/syscalls.h> 20#include <asm/syscalls.h>
@@ -22,11 +23,6 @@
22#include <asm/i387.h> 23#include <asm/i387.h>
23#include <asm/debugreg.h> 24#include <asm/debugreg.h>
24 25
25unsigned long idle_halt;
26EXPORT_SYMBOL(idle_halt);
27unsigned long idle_nomwait;
28EXPORT_SYMBOL(idle_nomwait);
29
30struct kmem_cache *task_xstate_cachep; 26struct kmem_cache *task_xstate_cachep;
31EXPORT_SYMBOL_GPL(task_xstate_cachep); 27EXPORT_SYMBOL_GPL(task_xstate_cachep);
32 28
@@ -91,27 +87,33 @@ void exit_thread(void)
91void show_regs(struct pt_regs *regs) 87void show_regs(struct pt_regs *regs)
92{ 88{
93 show_registers(regs); 89 show_registers(regs);
94 show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs), 90 show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs), 0);
95 regs->bp);
96} 91}
97 92
98void show_regs_common(void) 93void show_regs_common(void)
99{ 94{
100 const char *board, *product; 95 const char *vendor, *product, *board;
101 96
102 board = dmi_get_system_info(DMI_BOARD_NAME); 97 vendor = dmi_get_system_info(DMI_SYS_VENDOR);
103 if (!board) 98 if (!vendor)
104 board = ""; 99 vendor = "";
105 product = dmi_get_system_info(DMI_PRODUCT_NAME); 100 product = dmi_get_system_info(DMI_PRODUCT_NAME);
106 if (!product) 101 if (!product)
107 product = ""; 102 product = "";
108 103
104 /* Board Name is optional */
105 board = dmi_get_system_info(DMI_BOARD_NAME);
106
109 printk(KERN_CONT "\n"); 107 printk(KERN_CONT "\n");
110 printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s %s/%s\n", 108 printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s",
111 current->pid, current->comm, print_tainted(), 109 current->pid, current->comm, print_tainted(),
112 init_utsname()->release, 110 init_utsname()->release,
113 (int)strcspn(init_utsname()->version, " "), 111 (int)strcspn(init_utsname()->version, " "),
114 init_utsname()->version, board, product); 112 init_utsname()->version);
113 printk(KERN_CONT " %s %s", vendor, product);
114 if (board)
115 printk(KERN_CONT "/%s", board);
116 printk(KERN_CONT "\n");
115} 117}
116 118
117void flush_thread(void) 119void flush_thread(void)
@@ -328,14 +330,16 @@ long sys_execve(const char __user *name,
328/* 330/*
329 * Idle related variables and functions 331 * Idle related variables and functions
330 */ 332 */
331unsigned long boot_option_idle_override = 0; 333unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
332EXPORT_SYMBOL(boot_option_idle_override); 334EXPORT_SYMBOL(boot_option_idle_override);
333 335
334/* 336/*
335 * Powermanagement idle function, if any.. 337 * Powermanagement idle function, if any..
336 */ 338 */
337void (*pm_idle)(void); 339void (*pm_idle)(void);
340#ifdef CONFIG_APM_MODULE
338EXPORT_SYMBOL(pm_idle); 341EXPORT_SYMBOL(pm_idle);
342#endif
339 343
340#ifdef CONFIG_X86_32 344#ifdef CONFIG_X86_32
341/* 345/*
@@ -374,6 +378,7 @@ void default_idle(void)
374{ 378{
375 if (hlt_use_halt()) { 379 if (hlt_use_halt()) {
376 trace_power_start(POWER_CSTATE, 1, smp_processor_id()); 380 trace_power_start(POWER_CSTATE, 1, smp_processor_id());
381 trace_cpu_idle(1, smp_processor_id());
377 current_thread_info()->status &= ~TS_POLLING; 382 current_thread_info()->status &= ~TS_POLLING;
378 /* 383 /*
379 * TS_POLLING-cleared state must be visible before we 384 * TS_POLLING-cleared state must be visible before we
@@ -386,6 +391,8 @@ void default_idle(void)
386 else 391 else
387 local_irq_enable(); 392 local_irq_enable();
388 current_thread_info()->status |= TS_POLLING; 393 current_thread_info()->status |= TS_POLLING;
394 trace_power_end(smp_processor_id());
395 trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
389 } else { 396 } else {
390 local_irq_enable(); 397 local_irq_enable();
391 /* loop is done by the caller */ 398 /* loop is done by the caller */
@@ -443,9 +450,8 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
443 */ 450 */
444void mwait_idle_with_hints(unsigned long ax, unsigned long cx) 451void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
445{ 452{
446 trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id());
447 if (!need_resched()) { 453 if (!need_resched()) {
448 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) 454 if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
449 clflush((void *)&current_thread_info()->flags); 455 clflush((void *)&current_thread_info()->flags);
450 456
451 __monitor((void *)&current_thread_info()->flags, 0, 0); 457 __monitor((void *)&current_thread_info()->flags, 0, 0);
@@ -460,7 +466,8 @@ static void mwait_idle(void)
460{ 466{
461 if (!need_resched()) { 467 if (!need_resched()) {
462 trace_power_start(POWER_CSTATE, 1, smp_processor_id()); 468 trace_power_start(POWER_CSTATE, 1, smp_processor_id());
463 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) 469 trace_cpu_idle(1, smp_processor_id());
470 if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
464 clflush((void *)&current_thread_info()->flags); 471 clflush((void *)&current_thread_info()->flags);
465 472
466 __monitor((void *)&current_thread_info()->flags, 0, 0); 473 __monitor((void *)&current_thread_info()->flags, 0, 0);
@@ -469,6 +476,8 @@ static void mwait_idle(void)
469 __sti_mwait(0, 0); 476 __sti_mwait(0, 0);
470 else 477 else
471 local_irq_enable(); 478 local_irq_enable();
479 trace_power_end(smp_processor_id());
480 trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
472 } else 481 } else
473 local_irq_enable(); 482 local_irq_enable();
474} 483}
@@ -481,10 +490,12 @@ static void mwait_idle(void)
481static void poll_idle(void) 490static void poll_idle(void)
482{ 491{
483 trace_power_start(POWER_CSTATE, 0, smp_processor_id()); 492 trace_power_start(POWER_CSTATE, 0, smp_processor_id());
493 trace_cpu_idle(0, smp_processor_id());
484 local_irq_enable(); 494 local_irq_enable();
485 while (!need_resched()) 495 while (!need_resched())
486 cpu_relax(); 496 cpu_relax();
487 trace_power_end(0); 497 trace_power_end(smp_processor_id());
498 trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
488} 499}
489 500
490/* 501/*
@@ -499,17 +510,16 @@ static void poll_idle(void)
499 * 510 *
500 * idle=mwait overrides this decision and forces the usage of mwait. 511 * idle=mwait overrides this decision and forces the usage of mwait.
501 */ 512 */
502static int __cpuinitdata force_mwait;
503 513
504#define MWAIT_INFO 0x05 514#define MWAIT_INFO 0x05
505#define MWAIT_ECX_EXTENDED_INFO 0x01 515#define MWAIT_ECX_EXTENDED_INFO 0x01
506#define MWAIT_EDX_C1 0xf0 516#define MWAIT_EDX_C1 0xf0
507 517
508static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) 518int mwait_usable(const struct cpuinfo_x86 *c)
509{ 519{
510 u32 eax, ebx, ecx, edx; 520 u32 eax, ebx, ecx, edx;
511 521
512 if (force_mwait) 522 if (boot_option_idle_override == IDLE_FORCE_MWAIT)
513 return 1; 523 return 1;
514 524
515 if (c->cpuid_level < MWAIT_INFO) 525 if (c->cpuid_level < MWAIT_INFO)
@@ -527,45 +537,45 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
527 return (edx & MWAIT_EDX_C1); 537 return (edx & MWAIT_EDX_C1);
528} 538}
529 539
530bool c1e_detected; 540bool amd_e400_c1e_detected;
531EXPORT_SYMBOL(c1e_detected); 541EXPORT_SYMBOL(amd_e400_c1e_detected);
532 542
533static cpumask_var_t c1e_mask; 543static cpumask_var_t amd_e400_c1e_mask;
534 544
535void c1e_remove_cpu(int cpu) 545void amd_e400_remove_cpu(int cpu)
536{ 546{
537 if (c1e_mask != NULL) 547 if (amd_e400_c1e_mask != NULL)
538 cpumask_clear_cpu(cpu, c1e_mask); 548 cpumask_clear_cpu(cpu, amd_e400_c1e_mask);
539} 549}
540 550
541/* 551/*
542 * C1E aware idle routine. We check for C1E active in the interrupt 552 * AMD Erratum 400 aware idle routine. We check for C1E active in the interrupt
543 * pending message MSR. If we detect C1E, then we handle it the same 553 * pending message MSR. If we detect C1E, then we handle it the same
544 * way as C3 power states (local apic timer and TSC stop) 554 * way as C3 power states (local apic timer and TSC stop)
545 */ 555 */
546static void c1e_idle(void) 556static void amd_e400_idle(void)
547{ 557{
548 if (need_resched()) 558 if (need_resched())
549 return; 559 return;
550 560
551 if (!c1e_detected) { 561 if (!amd_e400_c1e_detected) {
552 u32 lo, hi; 562 u32 lo, hi;
553 563
554 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); 564 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
555 565
556 if (lo & K8_INTP_C1E_ACTIVE_MASK) { 566 if (lo & K8_INTP_C1E_ACTIVE_MASK) {
557 c1e_detected = true; 567 amd_e400_c1e_detected = true;
558 if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) 568 if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
559 mark_tsc_unstable("TSC halt in AMD C1E"); 569 mark_tsc_unstable("TSC halt in AMD C1E");
560 printk(KERN_INFO "System has AMD C1E enabled\n"); 570 printk(KERN_INFO "System has AMD C1E enabled\n");
561 } 571 }
562 } 572 }
563 573
564 if (c1e_detected) { 574 if (amd_e400_c1e_detected) {
565 int cpu = smp_processor_id(); 575 int cpu = smp_processor_id();
566 576
567 if (!cpumask_test_cpu(cpu, c1e_mask)) { 577 if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) {
568 cpumask_set_cpu(cpu, c1e_mask); 578 cpumask_set_cpu(cpu, amd_e400_c1e_mask);
569 /* 579 /*
570 * Force broadcast so ACPI can not interfere. 580 * Force broadcast so ACPI can not interfere.
571 */ 581 */
@@ -608,17 +618,17 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
608 pm_idle = mwait_idle; 618 pm_idle = mwait_idle;
609 } else if (cpu_has_amd_erratum(amd_erratum_400)) { 619 } else if (cpu_has_amd_erratum(amd_erratum_400)) {
610 /* E400: APIC timer interrupt does not wake up CPU from C1e */ 620 /* E400: APIC timer interrupt does not wake up CPU from C1e */
611 printk(KERN_INFO "using C1E aware idle routine\n"); 621 printk(KERN_INFO "using AMD E400 aware idle routine\n");
612 pm_idle = c1e_idle; 622 pm_idle = amd_e400_idle;
613 } else 623 } else
614 pm_idle = default_idle; 624 pm_idle = default_idle;
615} 625}
616 626
617void __init init_c1e_mask(void) 627void __init init_amd_e400_c1e_mask(void)
618{ 628{
619 /* If we're using c1e_idle, we need to allocate c1e_mask. */ 629 /* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */
620 if (pm_idle == c1e_idle) 630 if (pm_idle == amd_e400_idle)
621 zalloc_cpumask_var(&c1e_mask, GFP_KERNEL); 631 zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL);
622} 632}
623 633
624static int __init idle_setup(char *str) 634static int __init idle_setup(char *str)
@@ -629,9 +639,11 @@ static int __init idle_setup(char *str)
629 if (!strcmp(str, "poll")) { 639 if (!strcmp(str, "poll")) {
630 printk("using polling idle threads.\n"); 640 printk("using polling idle threads.\n");
631 pm_idle = poll_idle; 641 pm_idle = poll_idle;
632 } else if (!strcmp(str, "mwait")) 642 boot_option_idle_override = IDLE_POLL;
633 force_mwait = 1; 643 } else if (!strcmp(str, "mwait")) {
634 else if (!strcmp(str, "halt")) { 644 boot_option_idle_override = IDLE_FORCE_MWAIT;
645 WARN_ONCE(1, "\"idle=mwait\" will be removed in 2012\n");
646 } else if (!strcmp(str, "halt")) {
635 /* 647 /*
636 * When the boot option of idle=halt is added, halt is 648 * When the boot option of idle=halt is added, halt is
637 * forced to be used for CPU idle. In such case CPU C2/C3 649 * forced to be used for CPU idle. In such case CPU C2/C3
@@ -640,8 +652,7 @@ static int __init idle_setup(char *str)
640 * the boot_option_idle_override. 652 * the boot_option_idle_override.
641 */ 653 */
642 pm_idle = default_idle; 654 pm_idle = default_idle;
643 idle_halt = 1; 655 boot_option_idle_override = IDLE_HALT;
644 return 0;
645 } else if (!strcmp(str, "nomwait")) { 656 } else if (!strcmp(str, "nomwait")) {
646 /* 657 /*
647 * If the boot option of "idle=nomwait" is added, 658 * If the boot option of "idle=nomwait" is added,
@@ -649,12 +660,10 @@ static int __init idle_setup(char *str)
649 * states. In such case it won't touch the variable 660 * states. In such case it won't touch the variable
650 * of boot_option_idle_override. 661 * of boot_option_idle_override.
651 */ 662 */
652 idle_nomwait = 1; 663 boot_option_idle_override = IDLE_NOMWAIT;
653 return 0;
654 } else 664 } else
655 return -1; 665 return -1;
656 666
657 boot_option_idle_override = 1;
658 return 0; 667 return 0;
659} 668}
660early_param("idle", idle_setup); 669early_param("idle", idle_setup);
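A small userspace sketch of the consolidation this hunk performs: the separate idle_halt/idle_nomwait/force_mwait flags collapse into a single boot_option_idle_override enum set by the idle= parser. The enum constant names are taken from the diff; the surrounding scaffolding is illustrative, not the kernel code.

#include <stdio.h>
#include <string.h>

enum idle_boot_override { IDLE_NO_OVERRIDE = 0, IDLE_HALT, IDLE_NOMWAIT,
                          IDLE_POLL, IDLE_FORCE_MWAIT };

static enum idle_boot_override boot_option_idle_override = IDLE_NO_OVERRIDE;

static int idle_setup(const char *str)
{
        if (!str)
                return -1;
        if (!strcmp(str, "poll"))
                boot_option_idle_override = IDLE_POLL;
        else if (!strcmp(str, "mwait"))
                boot_option_idle_override = IDLE_FORCE_MWAIT;   /* deprecated */
        else if (!strcmp(str, "halt"))
                boot_option_idle_override = IDLE_HALT;
        else if (!strcmp(str, "nomwait"))
                boot_option_idle_override = IDLE_NOMWAIT;
        else
                return -1;
        return 0;
}

int main(void)
{
        const char *opts[] = { "poll", "mwait", "halt", "nomwait", "bogus" };

        for (unsigned int i = 0; i < sizeof(opts) / sizeof(opts[0]); i++) {
                int rc = idle_setup(opts[i]);
                printf("idle=%-7s -> rc=%d override=%d\n",
                       opts[i], rc, (int)boot_option_idle_override);
        }
        return 0;
}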
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 96586c3cbbbf..a3d0dc59067b 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -57,8 +57,6 @@
57#include <asm/syscalls.h> 57#include <asm/syscalls.h>
58#include <asm/debugreg.h> 58#include <asm/debugreg.h>
59 59
60#include <trace/events/power.h>
61
62asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 60asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
63 61
64/* 62/*
@@ -113,8 +111,6 @@ void cpu_idle(void)
113 stop_critical_timings(); 111 stop_critical_timings();
114 pm_idle(); 112 pm_idle();
115 start_critical_timings(); 113 start_critical_timings();
116
117 trace_power_end(smp_processor_id());
118 } 114 }
119 tick_nohz_restart_sched_tick(); 115 tick_nohz_restart_sched_tick();
120 preempt_enable_no_resched(); 116 preempt_enable_no_resched();
@@ -249,7 +245,6 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
249{ 245{
250 set_user_gs(regs, 0); 246 set_user_gs(regs, 0);
251 regs->fs = 0; 247 regs->fs = 0;
252 set_fs(USER_DS);
253 regs->ds = __USER_DS; 248 regs->ds = __USER_DS;
254 regs->es = __USER_DS; 249 regs->es = __USER_DS;
255 regs->ss = __USER_DS; 250 regs->ss = __USER_DS;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3d9ea531ddd1..ca6f7ab8df33 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -51,8 +51,6 @@
51#include <asm/syscalls.h> 51#include <asm/syscalls.h>
52#include <asm/debugreg.h> 52#include <asm/debugreg.h>
53 53
54#include <trace/events/power.h>
55
56asmlinkage extern void ret_from_fork(void); 54asmlinkage extern void ret_from_fork(void);
57 55
58DEFINE_PER_CPU(unsigned long, old_rsp); 56DEFINE_PER_CPU(unsigned long, old_rsp);
@@ -141,8 +139,6 @@ void cpu_idle(void)
141 pm_idle(); 139 pm_idle();
142 start_critical_timings(); 140 start_critical_timings();
143 141
144 trace_power_end(smp_processor_id());
145
146 /* In many cases the interrupt that ended idle 142 /* In many cases the interrupt that ended idle
147 has already called exit_idle. But some idle 143 has already called exit_idle. But some idle
148 loops can be woken up without interrupt. */ 144 loops can be woken up without interrupt. */
@@ -342,7 +338,6 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
342 regs->cs = _cs; 338 regs->cs = _cs;
343 regs->ss = _ss; 339 regs->ss = _ss;
344 regs->flags = X86_EFLAGS_IF; 340 regs->flags = X86_EFLAGS_IF;
345 set_fs(USER_DS);
346 /* 341 /*
347 * Free the old FP and other extended state 342 * Free the old FP and other extended state
348 */ 343 */
@@ -424,7 +419,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
424 load_TLS(next, cpu); 419 load_TLS(next, cpu);
425 420
426 /* Must be after DS reload */ 421 /* Must be after DS reload */
427 unlazy_fpu(prev_p); 422 __unlazy_fpu(prev_p);
428 423
429 /* Make sure cpu is ready for new context */ 424 /* Make sure cpu is ready for new context */
430 if (preload_fpu) 425 if (preload_fpu)
@@ -505,6 +500,10 @@ void set_personality_64bit(void)
505 /* Make sure to be in 64bit mode */ 500 /* Make sure to be in 64bit mode */
506 clear_thread_flag(TIF_IA32); 501 clear_thread_flag(TIF_IA32);
507 502
503 /* Ensure the corresponding mm is not marked. */
504 if (current->mm)
505 current->mm->context.ia32_compat = 0;
506
508 /* TBD: overwrites user setup. Should have two bits. 507 /* TBD: overwrites user setup. Should have two bits.
509 But 64bit processes have always behaved this way, 508 But 64bit processes have always behaved this way,
510 so it's not too bad. The main problem is just that 509 so it's not too bad. The main problem is just that
@@ -520,6 +519,10 @@ void set_personality_ia32(void)
520 set_thread_flag(TIF_IA32); 519 set_thread_flag(TIF_IA32);
521 current->personality |= force_personality32; 520 current->personality |= force_personality32;
522 521
522 /* Mark the associated mm as containing 32-bit tasks. */
523 if (current->mm)
524 current->mm->context.ia32_compat = 1;
525
523 /* Prepare the first "return" to user space */ 526 /* Prepare the first "return" to user space */
524 current_thread_info()->status |= TS_COMPAT; 527 current_thread_info()->status |= TS_COMPAT;
525} 528}
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 70c4872cd8aa..807c2a2b80f1 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -608,6 +608,9 @@ static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
608 unsigned len, type; 608 unsigned len, type;
609 struct perf_event *bp; 609 struct perf_event *bp;
610 610
611 if (ptrace_get_breakpoints(tsk) < 0)
612 return -ESRCH;
613
611 data &= ~DR_CONTROL_RESERVED; 614 data &= ~DR_CONTROL_RESERVED;
612 old_dr7 = ptrace_get_dr7(thread->ptrace_bps); 615 old_dr7 = ptrace_get_dr7(thread->ptrace_bps);
613restore: 616restore:
@@ -655,6 +658,9 @@ restore:
655 } 658 }
656 goto restore; 659 goto restore;
657 } 660 }
661
662 ptrace_put_breakpoints(tsk);
663
658 return ((orig_ret < 0) ? orig_ret : rc); 664 return ((orig_ret < 0) ? orig_ret : rc);
659} 665}
660 666
@@ -668,10 +674,17 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
668 674
669 if (n < HBP_NUM) { 675 if (n < HBP_NUM) {
670 struct perf_event *bp; 676 struct perf_event *bp;
677
678 if (ptrace_get_breakpoints(tsk) < 0)
679 return -ESRCH;
680
671 bp = thread->ptrace_bps[n]; 681 bp = thread->ptrace_bps[n];
672 if (!bp) 682 if (!bp)
673 return 0; 683 val = 0;
674 val = bp->hw.info.address; 684 else
685 val = bp->hw.info.address;
686
687 ptrace_put_breakpoints(tsk);
675 } else if (n == 6) { 688 } else if (n == 6) {
676 val = thread->debugreg6; 689 val = thread->debugreg6;
677 } else if (n == 7) { 690 } else if (n == 7) {
@@ -686,6 +699,10 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
686 struct perf_event *bp; 699 struct perf_event *bp;
687 struct thread_struct *t = &tsk->thread; 700 struct thread_struct *t = &tsk->thread;
688 struct perf_event_attr attr; 701 struct perf_event_attr attr;
702 int err = 0;
703
704 if (ptrace_get_breakpoints(tsk) < 0)
705 return -ESRCH;
689 706
690 if (!t->ptrace_bps[nr]) { 707 if (!t->ptrace_bps[nr]) {
691 ptrace_breakpoint_init(&attr); 708 ptrace_breakpoint_init(&attr);
@@ -709,24 +726,23 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
709 * writing for the user. And anyway this is the previous 726 * writing for the user. And anyway this is the previous
710 * behaviour. 727 * behaviour.
711 */ 728 */
712 if (IS_ERR(bp)) 729 if (IS_ERR(bp)) {
713 return PTR_ERR(bp); 730 err = PTR_ERR(bp);
731 goto put;
732 }
714 733
715 t->ptrace_bps[nr] = bp; 734 t->ptrace_bps[nr] = bp;
716 } else { 735 } else {
717 int err;
718
719 bp = t->ptrace_bps[nr]; 736 bp = t->ptrace_bps[nr];
720 737
721 attr = bp->attr; 738 attr = bp->attr;
722 attr.bp_addr = addr; 739 attr.bp_addr = addr;
723 err = modify_user_hw_breakpoint(bp, &attr); 740 err = modify_user_hw_breakpoint(bp, &attr);
724 if (err)
725 return err;
726 } 741 }
727 742
728 743put:
729 return 0; 744 ptrace_put_breakpoints(tsk);
745 return err;
730} 746}
731 747
732/* 748/*
@@ -801,7 +817,8 @@ void ptrace_disable(struct task_struct *child)
801static const struct user_regset_view user_x86_32_view; /* Initialized below. */ 817static const struct user_regset_view user_x86_32_view; /* Initialized below. */
802#endif 818#endif
803 819
804long arch_ptrace(struct task_struct *child, long request, long addr, long data) 820long arch_ptrace(struct task_struct *child, long request,
821 unsigned long addr, unsigned long data)
805{ 822{
806 int ret; 823 int ret;
807 unsigned long __user *datap = (unsigned long __user *)data; 824 unsigned long __user *datap = (unsigned long __user *)data;
@@ -812,8 +829,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
812 unsigned long tmp; 829 unsigned long tmp;
813 830
814 ret = -EIO; 831 ret = -EIO;
815 if ((addr & (sizeof(data) - 1)) || addr < 0 || 832 if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user))
816 addr >= sizeof(struct user))
817 break; 833 break;
818 834
819 tmp = 0; /* Default return condition */ 835 tmp = 0; /* Default return condition */
@@ -830,8 +846,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
830 846
831 case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ 847 case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
832 ret = -EIO; 848 ret = -EIO;
833 if ((addr & (sizeof(data) - 1)) || addr < 0 || 849 if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user))
834 addr >= sizeof(struct user))
835 break; 850 break;
836 851
837 if (addr < sizeof(struct user_regs_struct)) 852 if (addr < sizeof(struct user_regs_struct))
@@ -888,17 +903,17 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
888 903
889#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 904#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
890 case PTRACE_GET_THREAD_AREA: 905 case PTRACE_GET_THREAD_AREA:
891 if (addr < 0) 906 if ((int) addr < 0)
892 return -EIO; 907 return -EIO;
893 ret = do_get_thread_area(child, addr, 908 ret = do_get_thread_area(child, addr,
894 (struct user_desc __user *) data); 909 (struct user_desc __user *)data);
895 break; 910 break;
896 911
897 case PTRACE_SET_THREAD_AREA: 912 case PTRACE_SET_THREAD_AREA:
898 if (addr < 0) 913 if ((int) addr < 0)
899 return -EIO; 914 return -EIO;
900 ret = do_set_thread_area(child, addr, 915 ret = do_set_thread_area(child, addr,
901 (struct user_desc __user *) data, 0); 916 (struct user_desc __user *)data, 0);
902 break; 917 break;
903#endif 918#endif
904 919
@@ -1348,7 +1363,7 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
1348 * We must return the syscall number to actually look up in the table. 1363 * We must return the syscall number to actually look up in the table.
1349 * This can be -1L to skip running any syscall at all. 1364 * This can be -1L to skip running any syscall at all.
1350 */ 1365 */
1351asmregparm long syscall_trace_enter(struct pt_regs *regs) 1366long syscall_trace_enter(struct pt_regs *regs)
1352{ 1367{
1353 long ret = 0; 1368 long ret = 0;
1354 1369
@@ -1393,7 +1408,7 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs)
1393 return ret ?: regs->orig_ax; 1408 return ret ?: regs->orig_ax;
1394} 1409}
1395 1410
1396asmregparm void syscall_trace_leave(struct pt_regs *regs) 1411void syscall_trace_leave(struct pt_regs *regs)
1397{ 1412{
1398 bool step; 1413 bool step;
1399 1414
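The ptrace hunks above bracket every access to thread->ptrace_bps with ptrace_get_breakpoints()/ptrace_put_breakpoints() and funnel all exits through one release point. The sketch below only models that acquire/release-with-single-exit pattern; the function names and error values are stand-ins, not the kernel API.

#include <stdio.h>

static int refcount;

static int get_breakpoints(void)  { refcount++; return 0; }
static void put_breakpoints(void) { refcount--; }

static int set_breakpoint_addr(int nr, unsigned long addr)
{
        int err = 0;

        if (get_breakpoints() < 0)
                return -3;                      /* -ESRCH in the kernel */

        if (nr < 0 || nr > 3) {
                err = -22;                      /* -EINVAL */
                goto put;
        }
        printf("bp%d -> %#lx\n", nr, addr);
put:
        put_breakpoints();                      /* single release point */
        return err;
}

int main(void)
{
        set_breakpoint_addr(1, 0x1000);
        set_breakpoint_addr(9, 0x2000);
        printf("refcount balanced: %d\n", refcount);
        return 0;
}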
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 239427ca02af..42eb3300dfc6 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -41,48 +41,11 @@ void pvclock_set_flags(u8 flags)
41 valid_flags = flags; 41 valid_flags = flags;
42} 42}
43 43
44/*
45 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
46 * yielding a 64-bit result.
47 */
48static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
49{
50 u64 product;
51#ifdef __i386__
52 u32 tmp1, tmp2;
53#endif
54
55 if (shift < 0)
56 delta >>= -shift;
57 else
58 delta <<= shift;
59
60#ifdef __i386__
61 __asm__ (
62 "mul %5 ; "
63 "mov %4,%%eax ; "
64 "mov %%edx,%4 ; "
65 "mul %5 ; "
66 "xor %5,%5 ; "
67 "add %4,%%eax ; "
68 "adc %5,%%edx ; "
69 : "=A" (product), "=r" (tmp1), "=r" (tmp2)
70 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
71#elif defined(__x86_64__)
72 __asm__ (
73 "mul %%rdx ; shrd $32,%%rdx,%%rax"
74 : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
75#else
76#error implement me!
77#endif
78
79 return product;
80}
81
82static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow) 44static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
83{ 45{
84 u64 delta = native_read_tsc() - shadow->tsc_timestamp; 46 u64 delta = native_read_tsc() - shadow->tsc_timestamp;
85 return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); 47 return pvclock_scale_delta(delta, shadow->tsc_to_nsec_mul,
48 shadow->tsc_shift);
86} 49}
87 50
88/* 51/*
@@ -120,6 +83,11 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
120 83
121static atomic64_t last_value = ATOMIC64_INIT(0); 84static atomic64_t last_value = ATOMIC64_INIT(0);
122 85
86void pvclock_resume(void)
87{
88 atomic64_set(&last_value, 0);
89}
90
123cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) 91cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
124{ 92{
125 struct pvclock_shadow_time shadow; 93 struct pvclock_shadow_time shadow;
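The removed scale_delta() (now the shared pvclock_scale_delta() helper) shifts the TSC delta, multiplies by a 32.32 fixed-point fraction, and keeps the upper 64 bits of the 96-bit product. The portable sketch below reproduces that arithmetic with unsigned __int128 (a GCC/Clang extension) instead of the inline assembly shown above.

#include <stdio.h>
#include <stdint.h>

static uint64_t scale_delta(uint64_t delta, uint32_t mul_frac, int shift)
{
        if (shift < 0)
                delta >>= -shift;
        else
                delta <<= shift;
        /* 64x32 -> 96-bit multiply, keep bits 32..95 */
        return (uint64_t)(((unsigned __int128)delta * mul_frac) >> 32);
}

int main(void)
{
        /* e.g. a 3 GHz TSC: ~1/3 ns per cycle as a 32.32 fraction */
        uint32_t mul_frac = (uint32_t)((1ULL << 32) / 3);

        printf("%llu ns\n",
               (unsigned long long)scale_delta(3000000000ULL, mul_frac, 0));
        return 0;
}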
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 939b9e98245f..8bbe8c56916d 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -344,6 +344,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235,
344 vt8237_force_enable_hpet); 344 vt8237_force_enable_hpet);
345DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237, 345DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237,
346 vt8237_force_enable_hpet); 346 vt8237_force_enable_hpet);
347DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_CX700,
348 vt8237_force_enable_hpet);
347 349
348static void ati_force_hpet_resume(void) 350static void ati_force_hpet_resume(void)
349{ 351{
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index e3af342fe83a..9242436e9937 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -6,6 +6,7 @@
6#include <linux/dmi.h> 6#include <linux/dmi.h>
7#include <linux/sched.h> 7#include <linux/sched.h>
8#include <linux/tboot.h> 8#include <linux/tboot.h>
9#include <linux/delay.h>
9#include <acpi/reboot.h> 10#include <acpi/reboot.h>
10#include <asm/io.h> 11#include <asm/io.h>
11#include <asm/apic.h> 12#include <asm/apic.h>
@@ -18,6 +19,7 @@
18#include <asm/pci_x86.h> 19#include <asm/pci_x86.h>
19#include <asm/virtext.h> 20#include <asm/virtext.h>
20#include <asm/cpu.h> 21#include <asm/cpu.h>
22#include <asm/nmi.h>
21 23
22#ifdef CONFIG_X86_32 24#ifdef CONFIG_X86_32
23# include <linux/ctype.h> 25# include <linux/ctype.h>
@@ -34,7 +36,7 @@ EXPORT_SYMBOL(pm_power_off);
34 36
35static const struct desc_ptr no_idt = {}; 37static const struct desc_ptr no_idt = {};
36static int reboot_mode; 38static int reboot_mode;
37enum reboot_type reboot_type = BOOT_KBD; 39enum reboot_type reboot_type = BOOT_ACPI;
38int reboot_force; 40int reboot_force;
39 41
40#if defined(CONFIG_X86_32) && defined(CONFIG_SMP) 42#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
@@ -84,7 +86,7 @@ static int __init reboot_setup(char *str)
84 } 86 }
85 /* we will leave sorting out the final value 87 /* we will leave sorting out the final value
86 when we are ready to reboot, since we might not 88 when we are ready to reboot, since we might not
87 have set up boot_cpu_id or smp_num_cpu */ 89 have detected BSP APIC ID or smp_num_cpu */
88 break; 90 break;
89#endif /* CONFIG_SMP */ 91#endif /* CONFIG_SMP */
90 92
@@ -284,6 +286,22 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
284 DMI_MATCH(DMI_BOARD_NAME, "P4S800"), 286 DMI_MATCH(DMI_BOARD_NAME, "P4S800"),
285 }, 287 },
286 }, 288 },
289 { /* Handle problems with rebooting on VersaLogic Menlow boards */
290 .callback = set_bios_reboot,
291 .ident = "VersaLogic Menlow based board",
292 .matches = {
293 DMI_MATCH(DMI_BOARD_VENDOR, "VersaLogic Corporation"),
294 DMI_MATCH(DMI_BOARD_NAME, "VersaLogic Menlow board"),
295 },
296 },
297 { /* Handle reboot issue on Acer Aspire one */
298 .callback = set_bios_reboot,
299 .ident = "Acer Aspire One A110",
300 .matches = {
301 DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
302 DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"),
303 },
304 },
287 { } 305 { }
288}; 306};
289 307
@@ -294,68 +312,16 @@ static int __init reboot_init(void)
294} 312}
295core_initcall(reboot_init); 313core_initcall(reboot_init);
296 314
297/* The following code and data reboots the machine by switching to real 315extern const unsigned char machine_real_restart_asm[];
298 mode and jumping to the BIOS reset entry point, as if the CPU has 316extern const u64 machine_real_restart_gdt[3];
299 really been reset. The previous version asked the keyboard
300 controller to pulse the CPU reset line, which is more thorough, but
301 doesn't work with at least one type of 486 motherboard. It is easy
302 to stop this code working; hence the copious comments. */
303static const unsigned long long
304real_mode_gdt_entries [3] =
305{
306 0x0000000000000000ULL, /* Null descriptor */
307 0x00009b000000ffffULL, /* 16-bit real-mode 64k code at 0x00000000 */
308 0x000093000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */
309};
310 317
311static const struct desc_ptr 318void machine_real_restart(unsigned int type)
312real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries },
313real_mode_idt = { 0x3ff, 0 };
314
315/* This is 16-bit protected mode code to disable paging and the cache,
316 switch to real mode and jump to the BIOS reset code.
317
318 The instruction that switches to real mode by writing to CR0 must be
 319 followed immediately by a far jump instruction, which sets CS to a
320 valid value for real mode, and flushes the prefetch queue to avoid
321 running instructions that have already been decoded in protected
322 mode.
323
324 Clears all the flags except ET, especially PG (paging), PE
325 (protected-mode enable) and TS (task switch for coprocessor state
326 save). Flushes the TLB after paging has been disabled. Sets CD and
327 NW, to disable the cache on a 486, and invalidates the cache. This
328 is more like the state of a 486 after reset. I don't know if
329 something else should be done for other chips.
330
331 More could be done here to set up the registers as if a CPU reset had
332 occurred; hopefully real BIOSs don't assume much. */
333static const unsigned char real_mode_switch [] =
334{ 319{
335 0x66, 0x0f, 0x20, 0xc0, /* movl %cr0,%eax */ 320 void *restart_va;
336 0x66, 0x83, 0xe0, 0x11, /* andl $0x00000011,%eax */ 321 unsigned long restart_pa;
337 0x66, 0x0d, 0x00, 0x00, 0x00, 0x60, /* orl $0x60000000,%eax */ 322 void (*restart_lowmem)(unsigned int);
338 0x66, 0x0f, 0x22, 0xc0, /* movl %eax,%cr0 */ 323 u64 *lowmem_gdt;
339 0x66, 0x0f, 0x22, 0xd8, /* movl %eax,%cr3 */
340 0x66, 0x0f, 0x20, 0xc3, /* movl %cr0,%ebx */
341 0x66, 0x81, 0xe3, 0x00, 0x00, 0x00, 0x60, /* andl $0x60000000,%ebx */
342 0x74, 0x02, /* jz f */
343 0x0f, 0x09, /* wbinvd */
344 0x24, 0x10, /* f: andb $0x10,al */
345 0x66, 0x0f, 0x22, 0xc0 /* movl %eax,%cr0 */
346};
347static const unsigned char jump_to_bios [] =
348{
349 0xea, 0x00, 0x00, 0xff, 0xff /* ljmp $0xffff,$0x0000 */
350};
351 324
352/*
353 * Switch to real mode and then execute the code
354 * specified by the code and length parameters.
 355 * We assume that length will always be less than 100!
356 */
357void machine_real_restart(const unsigned char *code, int length)
358{
359 local_irq_disable(); 325 local_irq_disable();
360 326
361 /* Write zero to CMOS register number 0x0f, which the BIOS POST 327 /* Write zero to CMOS register number 0x0f, which the BIOS POST
@@ -371,16 +337,10 @@ void machine_real_restart(const unsigned char *code, int length)
371 CMOS_WRITE(0x00, 0x8f); 337 CMOS_WRITE(0x00, 0x8f);
372 spin_unlock(&rtc_lock); 338 spin_unlock(&rtc_lock);
373 339
374 /* Remap the kernel at virtual address zero, as well as offset zero
375 from the kernel segment. This assumes the kernel segment starts at
376 virtual address PAGE_OFFSET. */
377 memcpy(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY,
378 sizeof(swapper_pg_dir [0]) * KERNEL_PGD_PTRS);
379
380 /* 340 /*
381 * Use `swapper_pg_dir' as our page directory. 341 * Switch back to the initial page table.
382 */ 342 */
383 load_cr3(swapper_pg_dir); 343 load_cr3(initial_page_table);
384 344
385 /* Write 0x1234 to absolute memory location 0x472. The BIOS reads 345 /* Write 0x1234 to absolute memory location 0x472. The BIOS reads
386 this on booting to tell it to "Bypass memory test (also warm 346 this on booting to tell it to "Bypass memory test (also warm
@@ -389,41 +349,23 @@ void machine_real_restart(const unsigned char *code, int length)
389 too. */ 349 too. */
390 *((unsigned short *)0x472) = reboot_mode; 350 *((unsigned short *)0x472) = reboot_mode;
391 351
392 /* For the switch to real mode, copy some code to low memory. It has 352 /* Patch the GDT in the low memory trampoline */
393 to be in the first 64k because it is running in 16-bit mode, and it 353 lowmem_gdt = TRAMPOLINE_SYM(machine_real_restart_gdt);
394 has to have the same physical and virtual address, because it turns 354
395 off paging. Copy it near the end of the first page, out of the way 355 restart_va = TRAMPOLINE_SYM(machine_real_restart_asm);
396 of BIOS variables. */ 356 restart_pa = virt_to_phys(restart_va);
397 memcpy((void *)(0x1000 - sizeof(real_mode_switch) - 100), 357 restart_lowmem = (void (*)(unsigned int))restart_pa;
398 real_mode_switch, sizeof (real_mode_switch)); 358
399 memcpy((void *)(0x1000 - 100), code, length); 359 /* GDT[0]: GDT self-pointer */
400 360 lowmem_gdt[0] =
401 /* Set up the IDT for real mode. */ 361 (u64)(sizeof(machine_real_restart_gdt) - 1) +
402 load_idt(&real_mode_idt); 362 ((u64)virt_to_phys(lowmem_gdt) << 16);
403 363 /* GDT[1]: 64K real mode code segment */
404 /* Set up a GDT from which we can load segment descriptors for real 364 lowmem_gdt[1] =
405 mode. The GDT is not used in real mode; it is just needed here to 365 GDT_ENTRY(0x009b, restart_pa, 0xffff);
406 prepare the descriptors. */ 366
407 load_gdt(&real_mode_gdt); 367 /* Jump to the identity-mapped low memory code */
408 368 restart_lowmem(type);
409 /* Load the data segment registers, and thus the descriptors ready for
410 real mode. The base address of each segment is 0x100, 16 times the
411 selector value being loaded here. This is so that the segment
412 registers don't have to be reloaded after switching to real mode:
413 the values are consistent for real mode operation already. */
414 __asm__ __volatile__ ("movl $0x0010,%%eax\n"
415 "\tmovl %%eax,%%ds\n"
416 "\tmovl %%eax,%%es\n"
417 "\tmovl %%eax,%%fs\n"
418 "\tmovl %%eax,%%gs\n"
419 "\tmovl %%eax,%%ss" : : : "eax");
420
421 /* Jump to the 16-bit code that we copied earlier. It disables paging
422 and the cache, switches to real mode, and jumps to the BIOS reset
423 entry point. */
424 __asm__ __volatile__ ("ljmp $0x0008,%0"
425 :
426 : "i" ((void *)(0x1000 - sizeof (real_mode_switch) - 100)));
427} 369}
428#ifdef CONFIG_APM_MODULE 370#ifdef CONFIG_APM_MODULE
429EXPORT_SYMBOL(machine_real_restart); 371EXPORT_SYMBOL(machine_real_restart);
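The rewritten machine_real_restart() above patches two descriptors in the low-memory trampoline GDT, building the 16-bit code segment with GDT_ENTRY(0x009b, restart_pa, 0xffff). The sketch below shows how such a macro scatters flags, base and limit into the descriptor fields; it is meant to illustrate the bit layout under that assumption, and the 0x98000 base is a hypothetical trampoline address.

#include <stdio.h>
#include <stdint.h>

#define GDT_ENTRY(flags, base, limit)                          \
        ((((uint64_t)(base)  & 0xff000000ULL) << 32) |         \
         (((uint64_t)(flags) & 0x0000f0ffULL) << 40) |         \
         (((uint64_t)(limit) & 0x000f0000ULL) << 32) |         \
         (((uint64_t)(base)  & 0x00ffffffULL) << 16) |         \
         (((uint64_t)(limit) & 0x0000ffffULL)))

int main(void)
{
        /* 64K 16-bit code segment based at a hypothetical trampoline address */
        uint64_t code16 = GDT_ENTRY(0x009b, 0x98000, 0xffff);

        printf("descriptor = %#018llx\n", (unsigned long long)code16);
        return 0;
}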
@@ -477,6 +419,30 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
477 DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"), 419 DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"),
478 }, 420 },
479 }, 421 },
422 { /* Handle problems with rebooting on the Latitude E6320. */
423 .callback = set_pci_reboot,
424 .ident = "Dell Latitude E6320",
425 .matches = {
426 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
427 DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6320"),
428 },
429 },
430 { /* Handle problems with rebooting on the Latitude E5420. */
431 .callback = set_pci_reboot,
432 .ident = "Dell Latitude E5420",
433 .matches = {
434 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
435 DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5420"),
436 },
437 },
438 { /* Handle problems with rebooting on the Latitude E6420. */
439 .callback = set_pci_reboot,
440 .ident = "Dell Latitude E6420",
441 .matches = {
442 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
443 DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"),
444 },
445 },
480 { } 446 { }
481}; 447};
482 448
@@ -544,9 +510,24 @@ void __attribute__((weak)) mach_reboot_fixups(void)
544{ 510{
545} 511}
546 512
513/*
514 * Windows compatible x86 hardware expects the following on reboot:
515 *
516 * 1) If the FADT has the ACPI reboot register flag set, try it
517 * 2) If still alive, write to the keyboard controller
518 * 3) If still alive, write to the ACPI reboot register again
519 * 4) If still alive, write to the keyboard controller again
520 *
521 * If the machine is still alive at this stage, it gives up. We default to
522 * following the same pattern, except that if we're still alive after (4) we'll
523 * try to force a triple fault and then cycle between hitting the keyboard
 524 * controller and forcing the triple fault.
525 */
547static void native_machine_emergency_restart(void) 526static void native_machine_emergency_restart(void)
548{ 527{
549 int i; 528 int i;
529 int attempt = 0;
530 int orig_reboot_type = reboot_type;
550 531
551 if (reboot_emergency) 532 if (reboot_emergency)
552 emergency_vmx_disable_all(); 533 emergency_vmx_disable_all();
@@ -568,6 +549,13 @@ static void native_machine_emergency_restart(void)
568 outb(0xfe, 0x64); /* pulse reset low */ 549 outb(0xfe, 0x64); /* pulse reset low */
569 udelay(50); 550 udelay(50);
570 } 551 }
552 if (attempt == 0 && orig_reboot_type == BOOT_ACPI) {
553 attempt = 1;
554 reboot_type = BOOT_ACPI;
555 } else {
556 reboot_type = BOOT_TRIPLE;
557 }
558 break;
571 559
572 case BOOT_TRIPLE: 560 case BOOT_TRIPLE:
573 load_idt(&no_idt); 561 load_idt(&no_idt);
@@ -578,7 +566,7 @@ static void native_machine_emergency_restart(void)
578 566
579#ifdef CONFIG_X86_32 567#ifdef CONFIG_X86_32
580 case BOOT_BIOS: 568 case BOOT_BIOS:
581 machine_real_restart(jump_to_bios, sizeof(jump_to_bios)); 569 machine_real_restart(MRR_BIOS);
582 570
583 reboot_type = BOOT_KBD; 571 reboot_type = BOOT_KBD;
584 break; 572 break;
@@ -641,7 +629,7 @@ void native_machine_shutdown(void)
641 /* O.K Now that I'm on the appropriate processor, 629 /* O.K Now that I'm on the appropriate processor,
642 * stop all of the others. 630 * stop all of the others.
643 */ 631 */
644 smp_send_stop(); 632 stop_other_cpus();
645#endif 633#endif
646 634
647 lapic_shutdown(); 635 lapic_shutdown();
@@ -753,7 +741,7 @@ static int crash_nmi_callback(struct notifier_block *self,
753{ 741{
754 int cpu; 742 int cpu;
755 743
756 if (val != DIE_NMI_IPI) 744 if (val != DIE_NMI)
757 return NOTIFY_OK; 745 return NOTIFY_OK;
758 746
759 cpu = raw_smp_processor_id(); 747 cpu = raw_smp_processor_id();
@@ -784,6 +772,8 @@ static void smp_send_nmi_allbutself(void)
784 772
785static struct notifier_block crash_nmi_nb = { 773static struct notifier_block crash_nmi_nb = {
786 .notifier_call = crash_nmi_callback, 774 .notifier_call = crash_nmi_callback,
775 /* we want to be the first one called */
776 .priority = NMI_LOCAL_HIGH_PRIOR+1,
787}; 777};
788 778
789/* Halt all other CPUs, calling the specified function on each of them 779/* Halt all other CPUs, calling the specified function on each of them
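The new comment in native_machine_emergency_restart() describes the Windows-compatible ordering: ACPI reset register, keyboard controller, ACPI again, keyboard controller again, then a triple fault. The loop below only models that method-selection sequence; the real code performs the hardware writes and the retry is conditional on the original reboot_type being BOOT_ACPI.

#include <stdio.h>

enum boot_type { BOOT_ACPI, BOOT_KBD, BOOT_TRIPLE };

int main(void)
{
        enum boot_type type = BOOT_ACPI;
        int attempt = 0;

        for (int step = 1; step <= 5; step++) {
                switch (type) {
                case BOOT_ACPI:
                        printf("step %d: ACPI reset register\n", step);
                        type = BOOT_KBD;
                        break;
                case BOOT_KBD:
                        printf("step %d: keyboard controller pulse\n", step);
                        if (attempt++ == 0)
                                type = BOOT_ACPI;       /* one retry via ACPI */
                        else
                                type = BOOT_TRIPLE;
                        break;
                case BOOT_TRIPLE:
                        printf("step %d: force triple fault\n", step);
                        type = BOOT_KBD;                /* keep cycling */
                        break;
                }
        }
        return 0;
}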
diff --git a/arch/x86/kernel/reboot_32.S b/arch/x86/kernel/reboot_32.S
new file mode 100644
index 000000000000..1d5c46df0d78
--- /dev/null
+++ b/arch/x86/kernel/reboot_32.S
@@ -0,0 +1,135 @@
1#include <linux/linkage.h>
2#include <linux/init.h>
3#include <asm/segment.h>
4#include <asm/page_types.h>
5
6/*
7 * The following code and data reboots the machine by switching to real
8 * mode and jumping to the BIOS reset entry point, as if the CPU has
9 * really been reset. The previous version asked the keyboard
10 * controller to pulse the CPU reset line, which is more thorough, but
11 * doesn't work with at least one type of 486 motherboard. It is easy
12 * to stop this code working; hence the copious comments.
13 *
14 * This code is called with the restart type (0 = BIOS, 1 = APM) in %eax.
15 */
16 .section ".x86_trampoline","a"
17 .balign 16
18 .code32
19ENTRY(machine_real_restart_asm)
20r_base = .
21 /* Get our own relocated address */
22 call 1f
231: popl %ebx
24 subl $(1b - r_base), %ebx
25
26 /* Compute the equivalent real-mode segment */
27 movl %ebx, %ecx
28 shrl $4, %ecx
29
30 /* Patch post-real-mode segment jump */
31 movw (dispatch_table - r_base)(%ebx,%eax,2),%ax
32 movw %ax, (101f - r_base)(%ebx)
33 movw %cx, (102f - r_base)(%ebx)
34
35 /* Set up the IDT for real mode. */
36 lidtl (machine_real_restart_idt - r_base)(%ebx)
37
38 /*
39 * Set up a GDT from which we can load segment descriptors for real
40 * mode. The GDT is not used in real mode; it is just needed here to
41 * prepare the descriptors.
42 */
43 lgdtl (machine_real_restart_gdt - r_base)(%ebx)
44
45 /*
46 * Load the data segment registers with 16-bit compatible values
47 */
48 movl $16, %ecx
49 movl %ecx, %ds
50 movl %ecx, %es
51 movl %ecx, %fs
52 movl %ecx, %gs
53 movl %ecx, %ss
54 ljmpl $8, $1f - r_base
55
56/*
57 * This is 16-bit protected mode code to disable paging and the cache,
58 * switch to real mode and jump to the BIOS reset code.
59 *
60 * The instruction that switches to real mode by writing to CR0 must be
 61 * followed immediately by a far jump instruction, which sets CS to a
62 * valid value for real mode, and flushes the prefetch queue to avoid
63 * running instructions that have already been decoded in protected
64 * mode.
65 *
66 * Clears all the flags except ET, especially PG (paging), PE
67 * (protected-mode enable) and TS (task switch for coprocessor state
68 * save). Flushes the TLB after paging has been disabled. Sets CD and
69 * NW, to disable the cache on a 486, and invalidates the cache. This
70 * is more like the state of a 486 after reset. I don't know if
71 * something else should be done for other chips.
72 *
73 * More could be done here to set up the registers as if a CPU reset had
74 * occurred; hopefully real BIOSs don't assume much. This is not the
75 * actual BIOS entry point, anyway (that is at 0xfffffff0).
76 *
77 * Most of this work is probably excessive, but it is what is tested.
78 */
79 .code16
801:
81 xorl %ecx, %ecx
82 movl %cr0, %eax
83 andl $0x00000011, %eax
84 orl $0x60000000, %eax
85 movl %eax, %cr0
86 movl %ecx, %cr3
87 movl %cr0, %edx
88 andl $0x60000000, %edx /* If no cache bits -> no wbinvd */
89 jz 2f
90 wbinvd
912:
92 andb $0x10, %al
93 movl %eax, %cr0
94 .byte 0xea /* ljmpw */
95101: .word 0 /* Offset */
96102: .word 0 /* Segment */
97
98bios:
99 ljmpw $0xf000, $0xfff0
100
101apm:
102 movw $0x1000, %ax
103 movw %ax, %ss
104 movw $0xf000, %sp
105 movw $0x5307, %ax
106 movw $0x0001, %bx
107 movw $0x0003, %cx
108 int $0x15
109
110END(machine_real_restart_asm)
111
112 .balign 16
 113 /* These must match <asm/reboot.h> */
114dispatch_table:
115 .word bios - r_base
116 .word apm - r_base
117END(dispatch_table)
118
119 .balign 16
120machine_real_restart_idt:
121 .word 0xffff /* Length - real mode default value */
122 .long 0 /* Base - real mode default value */
123END(machine_real_restart_idt)
124
125 .balign 16
126ENTRY(machine_real_restart_gdt)
127 .quad 0 /* Self-pointer, filled in by PM code */
128 .quad 0 /* 16-bit code segment, filled in by PM code */
129 /*
130 * 16-bit data segment with the selector value 16 = 0x10 and
131 * base value 0x100; since this is consistent with real mode
132 * semantics we don't have to reload the segments once CR0.PE = 0.
133 */
134 .quad GDT_ENTRY(0x0093, 0x100, 0xffff)
135END(machine_real_restart_gdt)
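In machine_real_restart_asm above, the restart type passed in %eax indexes a table of 16-bit offsets (hence the ",2" scale in the movw) that gets patched into the far-jump target. Modeled here as an ordinary C array lookup; the MRR_BIOS/MRR_APM values mirror the "0 = BIOS, 1 = APM" convention stated in the file's comment.

#include <stdio.h>

enum { MRR_BIOS = 0, MRR_APM = 1 };

static void bios_reset(void) { puts("ljmpw $0xf000,$0xfff0 (BIOS reset vector)"); }
static void apm_reset(void)  { puts("int $0x15 APM power-off/restart call"); }

/* stands in for dispatch_table: type selects the real-mode code to run */
static void (*const dispatch_table[])(void) = { bios_reset, apm_reset };

int main(void)
{
        unsigned int type = MRR_BIOS;   /* as passed by machine_real_restart() */

        dispatch_table[type]();
        return 0;
}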
diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c
index fda313ebbb03..c8e41e90f59c 100644
--- a/arch/x86/kernel/reboot_fixups_32.c
+++ b/arch/x86/kernel/reboot_fixups_32.c
@@ -43,17 +43,33 @@ static void rdc321x_reset(struct pci_dev *dev)
43 outb(1, 0x92); 43 outb(1, 0x92);
44} 44}
45 45
46static void ce4100_reset(struct pci_dev *dev)
47{
48 int i;
49
50 for (i = 0; i < 10; i++) {
51 outb(0x2, 0xcf9);
52 udelay(50);
53 }
54}
55
46struct device_fixup { 56struct device_fixup {
47 unsigned int vendor; 57 unsigned int vendor;
48 unsigned int device; 58 unsigned int device;
49 void (*reboot_fixup)(struct pci_dev *); 59 void (*reboot_fixup)(struct pci_dev *);
50}; 60};
51 61
62/*
63 * PCI ids solely used for fixups_table go here
64 */
65#define PCI_DEVICE_ID_INTEL_CE4100 0x0708
66
52static const struct device_fixup fixups_table[] = { 67static const struct device_fixup fixups_table[] = {
53{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset }, 68{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset },
54{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset }, 69{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset },
55{ PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset }, 70{ PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset },
56{ PCI_VENDOR_ID_RDC, PCI_DEVICE_ID_RDC_R6030, rdc321x_reset }, 71{ PCI_VENDOR_ID_RDC, PCI_DEVICE_ID_RDC_R6030, rdc321x_reset },
72{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CE4100, ce4100_reset },
57}; 73};
58 74
59/* 75/*
diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c
new file mode 100644
index 000000000000..2a26819bb6a8
--- /dev/null
+++ b/arch/x86/kernel/resource.c
@@ -0,0 +1,48 @@
1#include <linux/ioport.h>
2#include <asm/e820.h>
3
4static void resource_clip(struct resource *res, resource_size_t start,
5 resource_size_t end)
6{
7 resource_size_t low = 0, high = 0;
8
9 if (res->end < start || res->start > end)
10 return; /* no conflict */
11
12 if (res->start < start)
13 low = start - res->start;
14
15 if (res->end > end)
16 high = res->end - end;
17
18 /* Keep the area above or below the conflict, whichever is larger */
19 if (low > high)
20 res->end = start - 1;
21 else
22 res->start = end + 1;
23}
24
25static void remove_e820_regions(struct resource *avail)
26{
27 int i;
28 struct e820entry *entry;
29
30 for (i = 0; i < e820.nr_map; i++) {
31 entry = &e820.map[i];
32
33 resource_clip(avail, entry->addr,
34 entry->addr + entry->size - 1);
35 }
36}
37
38void arch_remove_reservations(struct resource *avail)
39{
40 /* Trim out BIOS areas (low 1MB and high 2MB) and E820 regions */
41 if (avail->flags & IORESOURCE_MEM) {
42 if (avail->start < BIOS_END)
43 avail->start = BIOS_END;
44 resource_clip(avail, BIOS_ROM_BASE, BIOS_ROM_END);
45
46 remove_e820_regions(avail);
47 }
48}
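A standalone rendering of resource_clip() from the new resource.c above: when an available window overlaps a reserved region, keep whichever leftover piece (below or above the conflict) is larger. Types are simplified to plain integers for the sketch.

#include <stdio.h>
#include <stdint.h>

struct res { uint64_t start, end; };

static void resource_clip(struct res *r, uint64_t start, uint64_t end)
{
        uint64_t low = 0, high = 0;

        if (r->end < start || r->start > end)
                return;                         /* no conflict */
        if (r->start < start)
                low = start - r->start;         /* room left below */
        if (r->end > end)
                high = r->end - end;            /* room left above */
        if (low > high)
                r->end = start - 1;             /* keep the lower piece */
        else
                r->start = end + 1;             /* keep the upper piece */
}

int main(void)
{
        struct res avail = { 0x100000, 0x400000 };

        resource_clip(&avail, 0x300000, 0x500000);      /* clip a reserved region */
        printf("available: %#llx-%#llx\n",
               (unsigned long long)avail.start, (unsigned long long)avail.end);
        return 0;
}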
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 1cfbbfc3ae26..3f2ad2640d85 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -6,6 +6,7 @@
6#include <linux/acpi.h> 6#include <linux/acpi.h>
7#include <linux/bcd.h> 7#include <linux/bcd.h>
8#include <linux/pnp.h> 8#include <linux/pnp.h>
9#include <linux/of.h>
9 10
10#include <asm/vsyscall.h> 11#include <asm/vsyscall.h>
11#include <asm/x86_init.h> 12#include <asm/x86_init.h>
@@ -76,7 +77,7 @@ int mach_set_rtc_mmss(unsigned long nowtime)
76 CMOS_WRITE(real_seconds, RTC_SECONDS); 77 CMOS_WRITE(real_seconds, RTC_SECONDS);
77 CMOS_WRITE(real_minutes, RTC_MINUTES); 78 CMOS_WRITE(real_minutes, RTC_MINUTES);
78 } else { 79 } else {
79 printk(KERN_WARNING 80 printk_once(KERN_NOTICE
80 "set_rtc_mmss: can't update from %d to %d\n", 81 "set_rtc_mmss: can't update from %d to %d\n",
81 cmos_minutes, real_minutes); 82 cmos_minutes, real_minutes);
82 retval = -1; 83 retval = -1;
@@ -236,6 +237,8 @@ static __init int add_rtc_cmos(void)
236 } 237 }
237 } 238 }
238#endif 239#endif
240 if (of_have_populated_dt())
241 return 0;
239 242
240 platform_device_register(&rtc_device); 243 platform_device_register(&rtc_device);
241 dev_info(&rtc_device.dev, 244 dev_info(&rtc_device.dev,
diff --git a/arch/x86/kernel/scx200_32.c b/arch/x86/kernel/scx200_32.c
deleted file mode 100644
index 7e004acbe526..000000000000
--- a/arch/x86/kernel/scx200_32.c
+++ /dev/null
@@ -1,131 +0,0 @@
1/*
2 * Copyright (c) 2001,2002 Christer Weinigel <wingel@nano-system.com>
3 *
4 * National Semiconductor SCx200 support.
5 */
6
7#include <linux/module.h>
8#include <linux/errno.h>
9#include <linux/kernel.h>
10#include <linux/init.h>
11#include <linux/mutex.h>
12#include <linux/pci.h>
13
14#include <linux/scx200.h>
15#include <linux/scx200_gpio.h>
16
17/* Verify that the configuration block really is there */
18#define scx200_cb_probe(base) (inw((base) + SCx200_CBA) == (base))
19
20#define NAME "scx200"
21
22MODULE_AUTHOR("Christer Weinigel <wingel@nano-system.com>");
23MODULE_DESCRIPTION("NatSemi SCx200 Driver");
24MODULE_LICENSE("GPL");
25
26unsigned scx200_gpio_base = 0;
27unsigned long scx200_gpio_shadow[2];
28
29unsigned scx200_cb_base = 0;
30
31static struct pci_device_id scx200_tbl[] = {
32 { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SCx200_BRIDGE) },
33 { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE) },
34 { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SCx200_XBUS) },
35 { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_XBUS) },
36 { },
37};
38MODULE_DEVICE_TABLE(pci,scx200_tbl);
39
40static int __devinit scx200_probe(struct pci_dev *, const struct pci_device_id *);
41
42static struct pci_driver scx200_pci_driver = {
43 .name = "scx200",
44 .id_table = scx200_tbl,
45 .probe = scx200_probe,
46};
47
48static DEFINE_MUTEX(scx200_gpio_config_lock);
49
50static void __devinit scx200_init_shadow(void)
51{
52 int bank;
53
54 /* read the current values driven on the GPIO signals */
55 for (bank = 0; bank < 2; ++bank)
56 scx200_gpio_shadow[bank] = inl(scx200_gpio_base + 0x10 * bank);
57}
58
59static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
60{
61 unsigned base;
62
63 if (pdev->device == PCI_DEVICE_ID_NS_SCx200_BRIDGE ||
64 pdev->device == PCI_DEVICE_ID_NS_SC1100_BRIDGE) {
65 base = pci_resource_start(pdev, 0);
66 printk(KERN_INFO NAME ": GPIO base 0x%x\n", base);
67
68 if (!request_region(base, SCx200_GPIO_SIZE, "NatSemi SCx200 GPIO")) {
69 printk(KERN_ERR NAME ": can't allocate I/O for GPIOs\n");
70 return -EBUSY;
71 }
72
73 scx200_gpio_base = base;
74 scx200_init_shadow();
75
76 } else {
77 /* find the base of the Configuration Block */
78 if (scx200_cb_probe(SCx200_CB_BASE_FIXED)) {
79 scx200_cb_base = SCx200_CB_BASE_FIXED;
80 } else {
81 pci_read_config_dword(pdev, SCx200_CBA_SCRATCH, &base);
82 if (scx200_cb_probe(base)) {
83 scx200_cb_base = base;
84 } else {
85 printk(KERN_WARNING NAME ": Configuration Block not found\n");
86 return -ENODEV;
87 }
88 }
89 printk(KERN_INFO NAME ": Configuration Block base 0x%x\n", scx200_cb_base);
90 }
91
92 return 0;
93}
94
95u32 scx200_gpio_configure(unsigned index, u32 mask, u32 bits)
96{
97 u32 config, new_config;
98
99 mutex_lock(&scx200_gpio_config_lock);
100
101 outl(index, scx200_gpio_base + 0x20);
102 config = inl(scx200_gpio_base + 0x24);
103
104 new_config = (config & mask) | bits;
105 outl(new_config, scx200_gpio_base + 0x24);
106
107 mutex_unlock(&scx200_gpio_config_lock);
108
109 return config;
110}
111
112static int __init scx200_init(void)
113{
114 printk(KERN_INFO NAME ": NatSemi SCx200 Driver\n");
115
116 return pci_register_driver(&scx200_pci_driver);
117}
118
119static void __exit scx200_cleanup(void)
120{
121 pci_unregister_driver(&scx200_pci_driver);
122 release_region(scx200_gpio_base, SCx200_GPIO_SIZE);
123}
124
125module_init(scx200_init);
126module_exit(scx200_cleanup);
127
128EXPORT_SYMBOL(scx200_gpio_base);
129EXPORT_SYMBOL(scx200_gpio_shadow);
130EXPORT_SYMBOL(scx200_gpio_configure);
131EXPORT_SYMBOL(scx200_cb_base);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index c3a4fbb2b996..afaf38447ef5 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -31,6 +31,7 @@
31#include <linux/apm_bios.h> 31#include <linux/apm_bios.h>
32#include <linux/initrd.h> 32#include <linux/initrd.h>
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/memblock.h>
34#include <linux/seq_file.h> 35#include <linux/seq_file.h>
35#include <linux/console.h> 36#include <linux/console.h>
36#include <linux/mca.h> 37#include <linux/mca.h>
@@ -83,7 +84,6 @@
83#include <asm/dmi.h> 84#include <asm/dmi.h>
84#include <asm/io_apic.h> 85#include <asm/io_apic.h>
85#include <asm/ist.h> 86#include <asm/ist.h>
86#include <asm/vmi.h>
87#include <asm/setup_arch.h> 87#include <asm/setup_arch.h>
88#include <asm/bios_ebda.h> 88#include <asm/bios_ebda.h>
89#include <asm/cacheflush.h> 89#include <asm/cacheflush.h>
@@ -107,11 +107,13 @@
107#include <asm/percpu.h> 107#include <asm/percpu.h>
108#include <asm/topology.h> 108#include <asm/topology.h>
109#include <asm/apicdef.h> 109#include <asm/apicdef.h>
110#include <asm/k8.h> 110#include <asm/amd_nb.h>
111#ifdef CONFIG_X86_64 111#ifdef CONFIG_X86_64
112#include <asm/numa_64.h> 112#include <asm/numa_64.h>
113#endif 113#endif
114#include <asm/mce.h> 114#include <asm/mce.h>
115#include <asm/alternative.h>
116#include <asm/prom.h>
115 117
116/* 118/*
117 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. 119 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -125,7 +127,6 @@ unsigned long max_pfn_mapped;
125RESERVE_BRK(dmi_alloc, 65536); 127RESERVE_BRK(dmi_alloc, 65536);
126#endif 128#endif
127 129
128unsigned int boot_cpu_id __read_mostly;
129 130
130static __initdata unsigned long _brk_start = (unsigned long)__brk_base; 131static __initdata unsigned long _brk_start = (unsigned long)__brk_base;
131unsigned long _brk_end = (unsigned long)__brk_base; 132unsigned long _brk_end = (unsigned long)__brk_base;
@@ -297,12 +298,15 @@ static void __init init_gbpages(void)
297static inline void init_gbpages(void) 298static inline void init_gbpages(void)
298{ 299{
299} 300}
301static void __init cleanup_highmap(void)
302{
303}
300#endif 304#endif
301 305
302static void __init reserve_brk(void) 306static void __init reserve_brk(void)
303{ 307{
304 if (_brk_end > _brk_start) 308 if (_brk_end > _brk_start)
305 reserve_early(__pa(_brk_start), __pa(_brk_end), "BRK"); 309 memblock_x86_reserve_range(__pa(_brk_start), __pa(_brk_end), "BRK");
306 310
307 /* Mark brk area as locked down and no longer taking any 311 /* Mark brk area as locked down and no longer taking any
308 new allocations */ 312 new allocations */
@@ -324,17 +328,16 @@ static void __init relocate_initrd(void)
324 char *p, *q; 328 char *p, *q;
325 329
326 /* We need to move the initrd down into lowmem */ 330 /* We need to move the initrd down into lowmem */
327 ramdisk_here = find_e820_area(0, end_of_lowmem, area_size, 331 ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size,
328 PAGE_SIZE); 332 PAGE_SIZE);
329 333
330 if (ramdisk_here == -1ULL) 334 if (ramdisk_here == MEMBLOCK_ERROR)
331 panic("Cannot find place for new RAMDISK of size %lld\n", 335 panic("Cannot find place for new RAMDISK of size %lld\n",
332 ramdisk_size); 336 ramdisk_size);
333 337
334 /* Note: this includes all the lowmem currently occupied by 338 /* Note: this includes all the lowmem currently occupied by
335 the initrd, we rely on that fact to keep the data intact. */ 339 the initrd, we rely on that fact to keep the data intact. */
336 reserve_early(ramdisk_here, ramdisk_here + area_size, 340 memblock_x86_reserve_range(ramdisk_here, ramdisk_here + area_size, "NEW RAMDISK");
337 "NEW RAMDISK");
338 initrd_start = ramdisk_here + PAGE_OFFSET; 341 initrd_start = ramdisk_here + PAGE_OFFSET;
339 initrd_end = initrd_start + ramdisk_size; 342 initrd_end = initrd_start + ramdisk_size;
340 printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n", 343 printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
@@ -390,7 +393,7 @@ static void __init reserve_initrd(void)
390 initrd_start = 0; 393 initrd_start = 0;
391 394
392 if (ramdisk_size >= (end_of_lowmem>>1)) { 395 if (ramdisk_size >= (end_of_lowmem>>1)) {
393 free_early(ramdisk_image, ramdisk_end); 396 memblock_x86_free_range(ramdisk_image, ramdisk_end);
394 printk(KERN_ERR "initrd too large to handle, " 397 printk(KERN_ERR "initrd too large to handle, "
395 "disabling initrd\n"); 398 "disabling initrd\n");
396 return; 399 return;
@@ -413,7 +416,7 @@ static void __init reserve_initrd(void)
413 416
414 relocate_initrd(); 417 relocate_initrd();
415 418
416 free_early(ramdisk_image, ramdisk_end); 419 memblock_x86_free_range(ramdisk_image, ramdisk_end);
417} 420}
418#else 421#else
419static void __init reserve_initrd(void) 422static void __init reserve_initrd(void)
@@ -430,16 +433,30 @@ static void __init parse_setup_data(void)
430 return; 433 return;
431 pa_data = boot_params.hdr.setup_data; 434 pa_data = boot_params.hdr.setup_data;
432 while (pa_data) { 435 while (pa_data) {
433 data = early_memremap(pa_data, PAGE_SIZE); 436 u32 data_len, map_len;
437
438 map_len = max(PAGE_SIZE - (pa_data & ~PAGE_MASK),
439 (u64)sizeof(struct setup_data));
440 data = early_memremap(pa_data, map_len);
441 data_len = data->len + sizeof(struct setup_data);
442 if (data_len > map_len) {
443 early_iounmap(data, map_len);
444 data = early_memremap(pa_data, data_len);
445 map_len = data_len;
446 }
447
434 switch (data->type) { 448 switch (data->type) {
435 case SETUP_E820_EXT: 449 case SETUP_E820_EXT:
436 parse_e820_ext(data, pa_data); 450 parse_e820_ext(data);
451 break;
452 case SETUP_DTB:
453 add_dtb(pa_data);
437 break; 454 break;
438 default: 455 default:
439 break; 456 break;
440 } 457 }
441 pa_data = data->next; 458 pa_data = data->next;
442 early_iounmap(data, PAGE_SIZE); 459 early_iounmap(data, map_len);
443 } 460 }
444} 461}
445 462
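The parse_setup_data() hunk above switches to a two-pass mapping: map just enough for the setup_data header, read the real length, and remap with the full size if the payload extends past the first window. The sketch below models only that pattern; map_window()/unmap_window() are stand-ins for early_memremap()/early_iounmap(), and the backing array fakes physical memory.

#include <stdio.h>
#include <stddef.h>

struct setup_data { unsigned int type; unsigned int len; /* payload follows */ };

static unsigned char backing[8192];             /* pretend physical memory */

static void *map_window(size_t off, size_t len)
{
        printf("map   %zu bytes at offset %zu\n", len, off);
        return backing + off;
}

static void unmap_window(void *p, size_t len)
{
        (void)p;
        printf("unmap %zu bytes\n", len);
}

int main(void)
{
        struct setup_data *hdr = (struct setup_data *)backing;
        hdr->len = 4096;                        /* payload larger than the header window */

        size_t map_len = sizeof(struct setup_data);
        struct setup_data *data = map_window(0, map_len);
        size_t data_len = data->len + sizeof(struct setup_data);

        if (data_len > map_len) {               /* header window too small: remap */
                unmap_window(data, map_len);
                data = map_window(0, data_len);
                map_len = data_len;
        }
        printf("processing type=%u len=%u\n", data->type, data->len);
        unmap_window(data, map_len);
        return 0;
}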
@@ -469,7 +486,7 @@ static void __init e820_reserve_setup_data(void)
469 e820_print_map("reserve setup_data"); 486 e820_print_map("reserve setup_data");
470} 487}
471 488
472static void __init reserve_early_setup_data(void) 489static void __init memblock_x86_reserve_range_setup_data(void)
473{ 490{
474 struct setup_data *data; 491 struct setup_data *data;
475 u64 pa_data; 492 u64 pa_data;
@@ -481,7 +498,7 @@ static void __init reserve_early_setup_data(void)
481 while (pa_data) { 498 while (pa_data) {
482 data = early_memremap(pa_data, sizeof(*data)); 499 data = early_memremap(pa_data, sizeof(*data));
483 sprintf(buf, "setup data %x", data->type); 500 sprintf(buf, "setup data %x", data->type);
484 reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf); 501 memblock_x86_reserve_range(pa_data, pa_data+sizeof(*data)+data->len, buf);
485 pa_data = data->next; 502 pa_data = data->next;
486 early_iounmap(data, sizeof(*data)); 503 early_iounmap(data, sizeof(*data));
487 } 504 }
@@ -502,6 +519,18 @@ static inline unsigned long long get_total_mem(void)
502 return total << PAGE_SHIFT; 519 return total << PAGE_SHIFT;
503} 520}
504 521
522/*
523 * Keep the crash kernel below this limit. On 32 bits earlier kernels
524 * would limit the kernel to the low 512 MiB due to mapping restrictions.
525 * On 64 bits, kexec-tools currently limits us to 896 MiB; increase this
526 * limit once kexec-tools are fixed.
527 */
528#ifdef CONFIG_X86_32
529# define CRASH_KERNEL_ADDR_MAX (512 << 20)
530#else
531# define CRASH_KERNEL_ADDR_MAX (896 << 20)
532#endif
533
505static void __init reserve_crashkernel(void) 534static void __init reserve_crashkernel(void)
506{ 535{
507 unsigned long long total_mem; 536 unsigned long long total_mem;
@@ -519,23 +548,27 @@ static void __init reserve_crashkernel(void)
519 if (crash_base <= 0) { 548 if (crash_base <= 0) {
520 const unsigned long long alignment = 16<<20; /* 16M */ 549 const unsigned long long alignment = 16<<20; /* 16M */
521 550
522 crash_base = find_e820_area(alignment, ULONG_MAX, crash_size, 551 /*
 523 alignment); 552 * kexec wants the bzImage to be below CRASH_KERNEL_ADDR_MAX
524 if (crash_base == -1ULL) { 553 */
554 crash_base = memblock_find_in_range(alignment,
555 CRASH_KERNEL_ADDR_MAX, crash_size, alignment);
556
557 if (crash_base == MEMBLOCK_ERROR) {
525 pr_info("crashkernel reservation failed - No suitable area found.\n"); 558 pr_info("crashkernel reservation failed - No suitable area found.\n");
526 return; 559 return;
527 } 560 }
528 } else { 561 } else {
529 unsigned long long start; 562 unsigned long long start;
530 563
531 start = find_e820_area(crash_base, ULONG_MAX, crash_size, 564 start = memblock_find_in_range(crash_base,
532 1<<20); 565 crash_base + crash_size, crash_size, 1<<20);
533 if (start != crash_base) { 566 if (start != crash_base) {
534 pr_info("crashkernel reservation failed - memory is in use.\n"); 567 pr_info("crashkernel reservation failed - memory is in use.\n");
535 return; 568 return;
536 } 569 }
537 } 570 }
538 reserve_early(crash_base, crash_base + crash_size, "CRASH KERNEL"); 571 memblock_x86_reserve_range(crash_base, crash_base + crash_size, "CRASH KERNEL");
539 572
540 printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " 573 printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
541 "for crashkernel (System RAM: %ldMB)\n", 574 "for crashkernel (System RAM: %ldMB)\n",
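The crashkernel hunk above constrains an automatically chosen region to be 16 MiB aligned and to sit entirely below CRASH_KERNEL_ADDR_MAX (512 MiB on 32-bit, 896 MiB on 64-bit, per the new #define). The sketch below only illustrates that placement constraint; find_in_range() is a toy stand-in for memblock_find_in_range(), not its real policy.

#include <stdio.h>
#include <stdint.h>

#define ALIGNMENT       (16ULL << 20)
#define ADDR_MAX        (896ULL << 20)

static uint64_t find_in_range(uint64_t start, uint64_t end, uint64_t size,
                              uint64_t align)
{
        /* toy policy: place the region at the lowest aligned address that fits */
        uint64_t base = (start + align - 1) & ~(align - 1);

        return (base + size <= end) ? base : UINT64_MAX;
}

int main(void)
{
        uint64_t crash_size = 128ULL << 20;     /* e.g. crashkernel=128M */
        uint64_t base = find_in_range(ALIGNMENT, ADDR_MAX, crash_size, ALIGNMENT);

        if (base == UINT64_MAX)
                puts("crashkernel reservation failed - no suitable area");
        else
                printf("crashkernel: %lluMB at %lluMB\n",
                       (unsigned long long)(crash_size >> 20),
                       (unsigned long long)(base >> 20));
        return 0;
}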
@@ -586,28 +619,6 @@ void __init reserve_standard_io_resources(void)
586 619
587} 620}
588 621
589/*
590 * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
591 * is_kdump_kernel() to determine if we are booting after a panic. Hence
592 * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
593 */
594
595#ifdef CONFIG_CRASH_DUMP
596/* elfcorehdr= specifies the location of elf core header
597 * stored by the crashed kernel. This option will be passed
598 * by kexec loader to the capture kernel.
599 */
600static int __init setup_elfcorehdr(char *arg)
601{
602 char *end;
603 if (!arg)
604 return -EINVAL;
605 elfcorehdr_addr = memparse(arg, &end);
606 return end > arg ? 0 : -EINVAL;
607}
608early_param("elfcorehdr", setup_elfcorehdr);
609#endif
610
611static __init void reserve_ibft_region(void) 622static __init void reserve_ibft_region(void)
612{ 623{
613 unsigned long addr, size = 0; 624 unsigned long addr, size = 0;
@@ -615,82 +626,10 @@ static __init void reserve_ibft_region(void)
615 addr = find_ibft_region(&size); 626 addr = find_ibft_region(&size);
616 627
617 if (size) 628 if (size)
618 reserve_early_overlap_ok(addr, addr + size, "ibft"); 629 memblock_x86_reserve_range(addr, addr + size, "* ibft");
619} 630}
620 631
621#ifdef CONFIG_X86_RESERVE_LOW_64K 632static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
622static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
623{
624 printk(KERN_NOTICE
625 "%s detected: BIOS may corrupt low RAM, working around it.\n",
626 d->ident);
627
628 e820_update_range(0, 0x10000, E820_RAM, E820_RESERVED);
629 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
630
631 return 0;
632}
633#endif
634
635/* List of systems that have known low memory corruption BIOS problems */
636static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
637#ifdef CONFIG_X86_RESERVE_LOW_64K
638 {
639 .callback = dmi_low_memory_corruption,
640 .ident = "AMI BIOS",
641 .matches = {
642 DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
643 },
644 },
645 {
646 .callback = dmi_low_memory_corruption,
647 .ident = "Phoenix BIOS",
648 .matches = {
649 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"),
650 },
651 },
652 {
653 .callback = dmi_low_memory_corruption,
654 .ident = "Phoenix/MSC BIOS",
655 .matches = {
656 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"),
657 },
658 },
659 /*
660 * AMI BIOS with low memory corruption was found on Intel DG45ID and
661 * DG45FC boards.
662 * It has a different DMI_BIOS_VENDOR = "Intel Corp.", for now we will
663 * match only DMI_BOARD_NAME and see if there is more bad products
664 * with this vendor.
665 */
666 {
667 .callback = dmi_low_memory_corruption,
668 .ident = "AMI BIOS",
669 .matches = {
670 DMI_MATCH(DMI_BOARD_NAME, "DG45ID"),
671 },
672 },
673 {
674 .callback = dmi_low_memory_corruption,
675 .ident = "AMI BIOS",
676 .matches = {
677 DMI_MATCH(DMI_BOARD_NAME, "DG45FC"),
678 },
679 },
680 /*
681 * The Dell Inspiron Mini 1012 has DMI_BIOS_VENDOR = "Dell Inc.", so
682 * match on the product name.
683 */
684 {
685 .callback = dmi_low_memory_corruption,
686 .ident = "Phoenix BIOS",
687 .matches = {
688 DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 1012"),
689 },
690 },
691#endif
692 {}
693};
694 633
695static void __init trim_bios_range(void) 634static void __init trim_bios_range(void)
696{ 635{
@@ -698,8 +637,14 @@ static void __init trim_bios_range(void)
698 * A special case is the first 4Kb of memory; 637 * A special case is the first 4Kb of memory;
699 * This is a BIOS owned area, not kernel ram, but generally 638 * This is a BIOS owned area, not kernel ram, but generally
700 * not listed as such in the E820 table. 639 * not listed as such in the E820 table.
640 *
641 * This typically reserves additional memory (64KiB by default)
642 * since some BIOSes are known to corrupt low memory. See the
643 * Kconfig help text for X86_RESERVE_LOW.
701 */ 644 */
702 e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED); 645 e820_update_range(0, ALIGN(reserve_low, PAGE_SIZE),
646 E820_RAM, E820_RESERVED);
647
703 /* 648 /*
704 * special case: Some BIOSen report the PC BIOS 649 * special case: Some BIOSen report the PC BIOS
705 * area (640->1Mb) as ram even though it is not. 650 * area (640->1Mb) as ram even though it is not.
@@ -709,6 +654,28 @@ static void __init trim_bios_range(void)
709 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 654 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
710} 655}
711 656
657static int __init parse_reservelow(char *p)
658{
659 unsigned long long size;
660
661 if (!p)
662 return -EINVAL;
663
664 size = memparse(p, &p);
665
666 if (size < 4096)
667 size = 4096;
668
669 if (size > 640*1024)
670 size = 640*1024;
671
672 reserve_low = size;
673
674 return 0;
675}
676
677early_param("reservelow", parse_reservelow);
678
712/* 679/*
713 * Determine if we were loaded by an EFI loader. If so, then we have also been 680 * Determine if we were loaded by an EFI loader. If so, then we have also been
714 * passed the efi memmap, systab, etc., so we should use these data structures 681 * passed the efi memmap, systab, etc., so we should use these data structures
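The parse_reservelow() addition above parses the reservelow= amount with memparse()-style suffixes and clamps it to the 4 KiB .. 640 KiB range before trim_bios_range() reserves it. The sketch below mirrors that clamping; parse_size() is a simplified stand-in for memparse().

#include <stdio.h>
#include <stdlib.h>

static unsigned long parse_size(const char *p)
{
        char *end;
        unsigned long v = strtoul(p, &end, 0);

        if (*end == 'K' || *end == 'k')
                v <<= 10;
        else if (*end == 'M' || *end == 'm')
                v <<= 20;
        return v;
}

static unsigned long clamp_reservelow(const char *arg)
{
        unsigned long size = parse_size(arg);

        if (size < 4096)
                size = 4096;                    /* never less than one page */
        if (size > 640 * 1024)
                size = 640 * 1024;              /* never more than low memory */
        return size;
}

int main(void)
{
        const char *args[] = { "64K", "1M", "512" };

        for (unsigned int i = 0; i < 3; i++)
                printf("reservelow=%-4s -> %lu bytes\n",
                       args[i], clamp_reservelow(args[i]));
        return 0;
}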
@@ -724,20 +691,28 @@ static void __init trim_bios_range(void)
724 691
725void __init setup_arch(char **cmdline_p) 692void __init setup_arch(char **cmdline_p)
726{ 693{
727 int acpi = 0;
728 int k8 = 0;
729
730#ifdef CONFIG_X86_32 694#ifdef CONFIG_X86_32
731 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); 695 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
732 visws_early_detect(); 696 visws_early_detect();
697
698 /*
699 * copy kernel address range established so far and switch
700 * to the proper swapper page table
701 */
702 clone_pgd_range(swapper_pg_dir + KERNEL_PGD_BOUNDARY,
703 initial_page_table + KERNEL_PGD_BOUNDARY,
704 KERNEL_PGD_PTRS);
705
706 load_cr3(swapper_pg_dir);
707 __flush_tlb_all();
733#else 708#else
734 printk(KERN_INFO "Command line: %s\n", boot_command_line); 709 printk(KERN_INFO "Command line: %s\n", boot_command_line);
735#endif 710#endif
736 711
737 /* VMI may relocate the fixmap; do this before touching ioremap area */ 712 /*
738 vmi_init(); 713 * If we have OLPC OFW, we might end up relocating the fixmap due to
739 714 * reserve_top(), so do this before touching the ioremap area.
740 /* OFW also may relocate the fixmap */ 715 */
741 olpc_ofw_detect(); 716 olpc_ofw_detect();
742 717
743 early_trap_init(); 718 early_trap_init();
@@ -782,12 +757,13 @@ void __init setup_arch(char **cmdline_p)
782#endif 757#endif
783 4)) { 758 4)) {
784 efi_enabled = 1; 759 efi_enabled = 1;
785 efi_reserve_early(); 760 efi_memblock_x86_reserve_range();
786 } 761 }
787#endif 762#endif
788 763
789 x86_init.oem.arch_setup(); 764 x86_init.oem.arch_setup();
790 765
766 iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1;
791 setup_memory_map(); 767 setup_memory_map();
792 parse_setup_data(); 768 parse_setup_data();
793 /* update the e820_saved too */ 769 /* update the e820_saved too */
@@ -838,11 +814,8 @@ void __init setup_arch(char **cmdline_p)
838 814
839 x86_report_nx(); 815 x86_report_nx();
840 816
841 /* Must be before kernel pagetables are setup */
842 vmi_activate();
843
844 /* after early param, so could get panic from serial */ 817 /* after early param, so could get panic from serial */
845 reserve_early_setup_data(); 818 memblock_x86_reserve_range_setup_data();
846 819
847 if (acpi_mps_check()) { 820 if (acpi_mps_check()) {
848#ifdef CONFIG_X86_LOCAL_APIC 821#ifdef CONFIG_X86_LOCAL_APIC
@@ -863,8 +836,6 @@ void __init setup_arch(char **cmdline_p)
863 836
864 dmi_scan_machine(); 837 dmi_scan_machine();
865 838
866 dmi_check_system(bad_bios_dmi_table);
867
868 /* 839 /*
869 * VMware detection requires dmi to be available, so this 840 * VMware detection requires dmi to be available, so this
870 * needs to be done after dmi_scan_machine, for the BP. 841 * needs to be done after dmi_scan_machine, for the BP.
@@ -897,8 +868,6 @@ void __init setup_arch(char **cmdline_p)
897 */ 868 */
898 max_pfn = e820_end_of_ram_pfn(); 869 max_pfn = e820_end_of_ram_pfn();
899 870
900 /* preallocate 4k for mptable mpc */
901 early_reserve_e820_mpc_new();
902 /* update e820 for memory not covered by WB MTRRs */ 871 /* update e820 for memory not covered by WB MTRRs */
903 mtrr_bp_init(); 872 mtrr_bp_init();
904 if (mtrr_trim_uncached_memory(max_pfn)) 873 if (mtrr_trim_uncached_memory(max_pfn))
@@ -920,18 +889,8 @@ void __init setup_arch(char **cmdline_p)
920 max_low_pfn = max_pfn; 889 max_low_pfn = max_pfn;
921 890
922 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; 891 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
923 max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
924#endif 892#endif
925 893
926#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
927 setup_bios_corruption_check();
928#endif
929
930 printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
931 max_pfn_mapped<<PAGE_SHIFT);
932
933 reserve_brk();
934
935 /* 894 /*
936 * Find and reserve possible boot-time SMP configuration: 895 * Find and reserve possible boot-time SMP configuration:
937 */ 896 */
@@ -939,15 +898,37 @@ void __init setup_arch(char **cmdline_p)
939 898
940 reserve_ibft_region(); 899 reserve_ibft_region();
941 900
942 reserve_trampoline_memory(); 901 /*
 902	 * Need to conclude brk before memblock_x86_fill();
 903	 * it could use memblock_find_in_range(), which could
 904	 * overlap with the brk area.
905 */
906 reserve_brk();
907
908 cleanup_highmap();
909
910 memblock.current_limit = get_max_mapped();
911 memblock_x86_fill();
943 912
944#ifdef CONFIG_ACPI_SLEEP
945 /* 913 /*
946 * Reserve low memory region for sleep support. 914 * The EFI specification says that boot service code won't be called
947 * even before init_memory_mapping 915 * after ExitBootServices(). This is, in fact, a lie.
948 */ 916 */
949 acpi_reserve_wakeup_memory(); 917 if (efi_enabled)
918 efi_reserve_boot_services();
919
920 /* preallocate 4k for mptable mpc */
921 early_reserve_e820_mpc_new();
922
923#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
924 setup_bios_corruption_check();
950#endif 925#endif
926
927 printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
928 max_pfn_mapped<<PAGE_SHIFT);
929
930 setup_trampolines();
931
951 init_gbpages(); 932 init_gbpages();
952 933
953 /* max_pfn_mapped is updated here */ 934 /* max_pfn_mapped is updated here */
@@ -962,6 +943,7 @@ void __init setup_arch(char **cmdline_p)
962 max_low_pfn = max_pfn; 943 max_low_pfn = max_pfn;
963 } 944 }
964#endif 945#endif
946 memblock.current_limit = get_max_mapped();
965 947
966 /* 948 /*
967 * NOTE: On x86-32, only from this point on, fixmaps are ready for use. 949 * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
@@ -971,6 +953,8 @@ void __init setup_arch(char **cmdline_p)
971 if (init_ohci1394_dma_early) 953 if (init_ohci1394_dma_early)
972 init_ohci1394_dma_on_all_controllers(); 954 init_ohci1394_dma_on_all_controllers();
973#endif 955#endif
956 /* Allocate bigger log buffer */
957 setup_log_buf(1);
974 958
975 reserve_initrd(); 959 reserve_initrd();
976 960
@@ -987,24 +971,8 @@ void __init setup_arch(char **cmdline_p)
987 971
988 early_acpi_boot_init(); 972 early_acpi_boot_init();
989 973
990#ifdef CONFIG_ACPI_NUMA 974 initmem_init();
991 /* 975 memblock_find_dma_reserve();
992 * Parse SRAT to discover nodes.
993 */
994 acpi = acpi_numa_init();
995#endif
996
997#ifdef CONFIG_K8_NUMA
998 if (!acpi)
999 k8 = !k8_numa_init(0, max_pfn);
1000#endif
1001
1002 initmem_init(0, max_pfn, acpi, k8);
1003#ifndef CONFIG_NO_BOOTMEM
1004 early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
1005#endif
1006
1007 dma32_reserve_bootmem();
1008 976
1009#ifdef CONFIG_KVM_CLOCK 977#ifdef CONFIG_KVM_CLOCK
1010 kvmclock_init(); 978 kvmclock_init();
@@ -1014,7 +982,17 @@ void __init setup_arch(char **cmdline_p)
1014 paging_init(); 982 paging_init();
1015 x86_init.paging.pagetable_setup_done(swapper_pg_dir); 983 x86_init.paging.pagetable_setup_done(swapper_pg_dir);
1016 984
1017 setup_trampoline_page_table(); 985 if (boot_cpu_data.cpuid_level >= 0) {
986 /* A CPU has %cr4 if and only if it has CPUID */
987 mmu_cr4_features = read_cr4();
988 }
989
990#ifdef CONFIG_X86_32
991 /* sync back kernel address range */
992 clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY,
993 swapper_pg_dir + KERNEL_PGD_BOUNDARY,
994 KERNEL_PGD_PTRS);
995#endif
1018 996
1019 tboot_probe(); 997 tboot_probe();
1020 998
@@ -1030,8 +1008,8 @@ void __init setup_arch(char **cmdline_p)
1030 * Read APIC and some other early information from ACPI tables. 1008 * Read APIC and some other early information from ACPI tables.
1031 */ 1009 */
1032 acpi_boot_init(); 1010 acpi_boot_init();
1033
1034 sfi_init(); 1011 sfi_init();
1012 x86_dtb_init();
1035 1013
1036 /* 1014 /*
1037 * get boot-time SMP configuration: 1015 * get boot-time SMP configuration:
@@ -1041,15 +1019,10 @@ void __init setup_arch(char **cmdline_p)
1041 1019
1042 prefill_possible_map(); 1020 prefill_possible_map();
1043 1021
1044#ifdef CONFIG_X86_64
1045 init_cpu_to_node(); 1022 init_cpu_to_node();
1046#endif
1047 1023
1048 init_apic_mappings(); 1024 init_apic_mappings();
1049 ioapic_init_mappings(); 1025 ioapic_and_gsi_init();
1050
1051 /* need to wait for io_apic is mapped */
1052 probe_nr_irqs_gsi();
1053 1026
1054 kvm_guest_init(); 1027 kvm_guest_init();
1055 1028
@@ -1070,7 +1043,11 @@ void __init setup_arch(char **cmdline_p)
1070#endif 1043#endif
1071 x86_init.oem.banner(); 1044 x86_init.oem.banner();
1072 1045
1046 x86_init.timers.wallclock_init();
1047
1073 mcheck_init(); 1048 mcheck_init();
1049
1050 arch_init_ideal_nops();
1074} 1051}
1075 1052
1076#ifdef CONFIG_X86_32 1053#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index a60df9ae6454..71f4727da373 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -131,13 +131,7 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
131 131
132static void __init pcpu_fc_free(void *ptr, size_t size) 132static void __init pcpu_fc_free(void *ptr, size_t size)
133{ 133{
134#ifdef CONFIG_NO_BOOTMEM
135 u64 start = __pa(ptr);
136 u64 end = start + size;
137 free_early_partial(start, end);
138#else
139 free_bootmem(__pa(ptr), size); 134 free_bootmem(__pa(ptr), size);
140#endif
141} 135}
142 136
143static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) 137static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
@@ -231,10 +225,15 @@ void __init setup_per_cpu_areas(void)
231 per_cpu(x86_bios_cpu_apicid, cpu) = 225 per_cpu(x86_bios_cpu_apicid, cpu) =
232 early_per_cpu_map(x86_bios_cpu_apicid, cpu); 226 early_per_cpu_map(x86_bios_cpu_apicid, cpu);
233#endif 227#endif
228#ifdef CONFIG_X86_32
229 per_cpu(x86_cpu_to_logical_apicid, cpu) =
230 early_per_cpu_map(x86_cpu_to_logical_apicid, cpu);
231#endif
234#ifdef CONFIG_X86_64 232#ifdef CONFIG_X86_64
235 per_cpu(irq_stack_ptr, cpu) = 233 per_cpu(irq_stack_ptr, cpu) =
236 per_cpu(irq_stack_union.irq_stack, cpu) + 234 per_cpu(irq_stack_union.irq_stack, cpu) +
237 IRQ_STACK_SIZE - 64; 235 IRQ_STACK_SIZE - 64;
236#endif
238#ifdef CONFIG_NUMA 237#ifdef CONFIG_NUMA
239 per_cpu(x86_cpu_to_node_map, cpu) = 238 per_cpu(x86_cpu_to_node_map, cpu) =
240 early_per_cpu_map(x86_cpu_to_node_map, cpu); 239 early_per_cpu_map(x86_cpu_to_node_map, cpu);
@@ -248,12 +247,11 @@ void __init setup_per_cpu_areas(void)
248 */ 247 */
249 set_cpu_numa_node(cpu, early_cpu_to_node(cpu)); 248 set_cpu_numa_node(cpu, early_cpu_to_node(cpu));
250#endif 249#endif
251#endif
252 /* 250 /*
253 * Up to this point, the boot CPU has been using .init.data 251 * Up to this point, the boot CPU has been using .init.data
254 * area. Reload any changed state for the boot CPU. 252 * area. Reload any changed state for the boot CPU.
255 */ 253 */
256 if (cpu == boot_cpu_id) 254 if (!cpu)
257 switch_to_new_gdt(cpu); 255 switch_to_new_gdt(cpu);
258 } 256 }
259 257
@@ -262,7 +260,10 @@ void __init setup_per_cpu_areas(void)
262 early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; 260 early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
263 early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; 261 early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
264#endif 262#endif
265#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA) 263#ifdef CONFIG_X86_32
264 early_per_cpu_ptr(x86_cpu_to_logical_apicid) = NULL;
265#endif
266#ifdef CONFIG_NUMA
266 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; 267 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
267#endif 268#endif
268 269
diff --git a/arch/x86/kernel/sfi.c b/arch/x86/kernel/sfi.c
deleted file mode 100644
index cb22acf3ed09..000000000000
--- a/arch/x86/kernel/sfi.c
+++ /dev/null
@@ -1,120 +0,0 @@
1/*
2 * sfi.c - x86 architecture SFI support.
3 *
4 * Copyright (c) 2009, Intel Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along with
16 * this program; if not, write to the Free Software Foundation, Inc.,
17 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 */
20
21#define KMSG_COMPONENT "SFI"
22#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
23
24#include <linux/acpi.h>
25#include <linux/init.h>
26#include <linux/sfi.h>
27#include <linux/io.h>
28
29#include <asm/io_apic.h>
30#include <asm/mpspec.h>
31#include <asm/setup.h>
32#include <asm/apic.h>
33
34#ifdef CONFIG_X86_LOCAL_APIC
35static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
36
37void __init mp_sfi_register_lapic_address(unsigned long address)
38{
39 mp_lapic_addr = address;
40
41 set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
42 if (boot_cpu_physical_apicid == -1U)
43 boot_cpu_physical_apicid = read_apic_id();
44
45 pr_info("Boot CPU = %d\n", boot_cpu_physical_apicid);
46}
47
48/* All CPUs enumerated by SFI must be present and enabled */
49void __cpuinit mp_sfi_register_lapic(u8 id)
50{
51 if (MAX_APICS - id <= 0) {
52 pr_warning("Processor #%d invalid (max %d)\n",
53 id, MAX_APICS);
54 return;
55 }
56
57 pr_info("registering lapic[%d]\n", id);
58
59 generic_processor_info(id, GET_APIC_VERSION(apic_read(APIC_LVR)));
60}
61
62static int __init sfi_parse_cpus(struct sfi_table_header *table)
63{
64 struct sfi_table_simple *sb;
65 struct sfi_cpu_table_entry *pentry;
66 int i;
67 int cpu_num;
68
69 sb = (struct sfi_table_simple *)table;
70 cpu_num = SFI_GET_NUM_ENTRIES(sb, struct sfi_cpu_table_entry);
71 pentry = (struct sfi_cpu_table_entry *)sb->pentry;
72
73 for (i = 0; i < cpu_num; i++) {
74 mp_sfi_register_lapic(pentry->apic_id);
75 pentry++;
76 }
77
78 smp_found_config = 1;
79 return 0;
80}
81#endif /* CONFIG_X86_LOCAL_APIC */
82
83#ifdef CONFIG_X86_IO_APIC
84
85static int __init sfi_parse_ioapic(struct sfi_table_header *table)
86{
87 struct sfi_table_simple *sb;
88 struct sfi_apic_table_entry *pentry;
89 int i, num;
90
91 sb = (struct sfi_table_simple *)table;
92 num = SFI_GET_NUM_ENTRIES(sb, struct sfi_apic_table_entry);
93 pentry = (struct sfi_apic_table_entry *)sb->pentry;
94
95 for (i = 0; i < num; i++) {
96 mp_register_ioapic(i, pentry->phys_addr, gsi_top);
97 pentry++;
98 }
99
100 WARN(pic_mode, KERN_WARNING
101 "SFI: pic_mod shouldn't be 1 when IOAPIC table is present\n");
102 pic_mode = 0;
103 return 0;
104}
105#endif /* CONFIG_X86_IO_APIC */
106
107/*
108 * sfi_platform_init(): register lapics & io-apics
109 */
110int __init sfi_platform_init(void)
111{
112#ifdef CONFIG_X86_LOCAL_APIC
113 mp_sfi_register_lapic_address(sfi_lapic_addr);
114 sfi_table_parse(SFI_SIG_CPUS, NULL, NULL, sfi_parse_cpus);
115#endif
116#ifdef CONFIG_X86_IO_APIC
117 sfi_table_parse(SFI_SIG_APIC, NULL, NULL, sfi_parse_ioapic);
118#endif
119 return 0;
120}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 4fd173cd8e57..40a24932a8a1 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -601,10 +601,7 @@ long sys_rt_sigreturn(struct pt_regs *regs)
601 goto badframe; 601 goto badframe;
602 602
603 sigdelsetmask(&set, ~_BLOCKABLE); 603 sigdelsetmask(&set, ~_BLOCKABLE);
604 spin_lock_irq(&current->sighand->siglock); 604 set_current_blocked(&set);
605 current->blocked = set;
606 recalc_sigpending();
607 spin_unlock_irq(&current->sighand->siglock);
608 605
609 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) 606 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
610 goto badframe; 607 goto badframe;
@@ -682,6 +679,7 @@ static int
682handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, 679handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
683 sigset_t *oldset, struct pt_regs *regs) 680 sigset_t *oldset, struct pt_regs *regs)
684{ 681{
682 sigset_t blocked;
685 int ret; 683 int ret;
686 684
687 /* Are we from a system call? */ 685 /* Are we from a system call? */
@@ -741,12 +739,10 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
741 */ 739 */
742 regs->flags &= ~X86_EFLAGS_TF; 740 regs->flags &= ~X86_EFLAGS_TF;
743 741
744 spin_lock_irq(&current->sighand->siglock); 742 sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
745 sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
746 if (!(ka->sa.sa_flags & SA_NODEFER)) 743 if (!(ka->sa.sa_flags & SA_NODEFER))
747 sigaddset(&current->blocked, sig); 744 sigaddset(&blocked, sig);
748 recalc_sigpending(); 745 set_current_blocked(&blocked);
749 spin_unlock_irq(&current->sighand->siglock);
750 746
751 tracehook_signal_handler(sig, info, ka, regs, 747 tracehook_signal_handler(sig, info, ka, regs,
752 test_thread_flag(TIF_SINGLESTEP)); 748 test_thread_flag(TIF_SINGLESTEP));
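Both hunks above replace the open-coded siglock/recalc_sigpending() sequence with building the new mask in a local sigset_t and installing it atomically via set_current_blocked(). A rough user-space analogue of the same pattern, with sigprocmask() playing the role of set_current_blocked(); SIGUSR1 stands for the delivered signal and SIGUSR2 for the handler's sa_mask (purely illustrative):

#include <signal.h>
#include <stdio.h>

int main(void)
{
	sigset_t cur, blocked;

	sigprocmask(SIG_SETMASK, NULL, &cur);	/* current->blocked */
	blocked = cur;

	/* OR in the handler's sa_mask (here just SIGUSR2) ... */
	sigaddset(&blocked, SIGUSR2);
	/* ... plus the signal being delivered, since SA_NODEFER is not set */
	sigaddset(&blocked, SIGUSR1);

	/* one atomic update, analogous to set_current_blocked(&blocked) */
	sigprocmask(SIG_SETMASK, &blocked, NULL);

	printf("SIGUSR1 blocked: %d\n", sigismember(&blocked, SIGUSR1));
	printf("SIGUSR2 blocked: %d\n", sigismember(&blocked, SIGUSR2));
	return 0;
}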
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 74cca6014c0e..ed4c4f54e2ae 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -174,10 +174,10 @@ asmlinkage void smp_reboot_interrupt(void)
174 irq_exit(); 174 irq_exit();
175} 175}
176 176
177static void native_smp_send_stop(void) 177static void native_stop_other_cpus(int wait)
178{ 178{
179 unsigned long flags; 179 unsigned long flags;
180 unsigned long wait; 180 unsigned long timeout;
181 181
182 if (reboot_force) 182 if (reboot_force)
183 return; 183 return;
@@ -194,9 +194,12 @@ static void native_smp_send_stop(void)
194 if (num_online_cpus() > 1) { 194 if (num_online_cpus() > 1) {
195 apic->send_IPI_allbutself(REBOOT_VECTOR); 195 apic->send_IPI_allbutself(REBOOT_VECTOR);
196 196
197 /* Don't wait longer than a second */ 197 /*
198 wait = USEC_PER_SEC; 198 * Don't wait longer than a second if the caller
199 while (num_online_cpus() > 1 && wait--) 199 * didn't ask us to wait.
200 */
201 timeout = USEC_PER_SEC;
202 while (num_online_cpus() > 1 && (wait || timeout--))
200 udelay(1); 203 udelay(1);
201 } 204 }
202 205
@@ -206,9 +209,7 @@ static void native_smp_send_stop(void)
206} 209}
207 210
208/* 211/*
209 * Reschedule call back. Nothing to do, 212 * Reschedule call back.
210 * all the work is done automatically when
211 * we return from the interrupt.
212 */ 213 */
213void smp_reschedule_interrupt(struct pt_regs *regs) 214void smp_reschedule_interrupt(struct pt_regs *regs)
214{ 215{
@@ -216,6 +217,11 @@ void smp_reschedule_interrupt(struct pt_regs *regs)
216 /* LITMUS^RT: this IPI might need to trigger the sched state machine. */ 217 /* LITMUS^RT: this IPI might need to trigger the sched state machine. */
217 sched_state_ipi(); 218 sched_state_ipi();
218 inc_irq_stat(irq_resched_count); 219 inc_irq_stat(irq_resched_count);
220 /*
 221	 * LITMUS^RT: starting from 3.0, scheduler_ipi() actually does something.
222 * This may increase IPI latencies compared with previous versions.
223 */
224 scheduler_ipi();
219 TS_SEND_RESCHED_END; 225 TS_SEND_RESCHED_END;
220 /* 226 /*
221 * KVM uses this interrupt to force a cpu out of guest mode 227 * KVM uses this interrupt to force a cpu out of guest mode
@@ -254,7 +260,7 @@ struct smp_ops smp_ops = {
254 .smp_prepare_cpus = native_smp_prepare_cpus, 260 .smp_prepare_cpus = native_smp_prepare_cpus,
255 .smp_cpus_done = native_smp_cpus_done, 261 .smp_cpus_done = native_smp_cpus_done,
256 262
257 .smp_send_stop = native_smp_send_stop, 263 .stop_other_cpus = native_stop_other_cpus,
258 .smp_send_reschedule = native_smp_send_reschedule, 264 .smp_send_reschedule = native_smp_send_reschedule,
259 265
260 .cpu_up = native_cpu_up, 266 .cpu_up = native_cpu_up,
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 8b3bfc4dd708..9fd3137230d4 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -62,8 +62,9 @@
62#include <asm/pgtable.h> 62#include <asm/pgtable.h>
63#include <asm/tlbflush.h> 63#include <asm/tlbflush.h>
64#include <asm/mtrr.h> 64#include <asm/mtrr.h>
65#include <asm/vmi.h> 65#include <asm/mwait.h>
66#include <asm/apic.h> 66#include <asm/apic.h>
67#include <asm/io_apic.h>
67#include <asm/setup.h> 68#include <asm/setup.h>
68#include <asm/uv/uv.h> 69#include <asm/uv/uv.h>
69#include <linux/mc146818rtc.h> 70#include <linux/mc146818rtc.h>
@@ -71,10 +72,6 @@
71#include <asm/smpboot_hooks.h> 72#include <asm/smpboot_hooks.h>
72#include <asm/i8259.h> 73#include <asm/i8259.h>
73 74
74#ifdef CONFIG_X86_32
75u8 apicid_2_node[MAX_APICID];
76#endif
77
78/* State of each CPU */ 75/* State of each CPU */
79DEFINE_PER_CPU(int, cpu_state) = { 0 }; 76DEFINE_PER_CPU(int, cpu_state) = { 0 };
80 77
@@ -97,12 +94,12 @@ static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
97 */ 94 */
98static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex); 95static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex);
99 96
100void cpu_hotplug_driver_lock() 97void cpu_hotplug_driver_lock(void)
101{ 98{
102 mutex_lock(&x86_cpu_hotplug_driver_mutex); 99 mutex_lock(&x86_cpu_hotplug_driver_mutex);
103} 100}
104 101
105void cpu_hotplug_driver_unlock() 102void cpu_hotplug_driver_unlock(void)
106{ 103{
107 mutex_unlock(&x86_cpu_hotplug_driver_mutex); 104 mutex_unlock(&x86_cpu_hotplug_driver_mutex);
108} 105}
@@ -130,68 +127,14 @@ EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
130DEFINE_PER_CPU(cpumask_var_t, cpu_core_map); 127DEFINE_PER_CPU(cpumask_var_t, cpu_core_map);
131EXPORT_PER_CPU_SYMBOL(cpu_core_map); 128EXPORT_PER_CPU_SYMBOL(cpu_core_map);
132 129
130DEFINE_PER_CPU(cpumask_var_t, cpu_llc_shared_map);
131
133/* Per CPU bogomips and other parameters */ 132/* Per CPU bogomips and other parameters */
134DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); 133DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
135EXPORT_PER_CPU_SYMBOL(cpu_info); 134EXPORT_PER_CPU_SYMBOL(cpu_info);
136 135
137atomic_t init_deasserted; 136atomic_t init_deasserted;
138 137
139#if defined(CONFIG_NUMA) && defined(CONFIG_X86_32)
140/* which node each logical CPU is on */
141int cpu_to_node_map[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 };
142EXPORT_SYMBOL(cpu_to_node_map);
143
144/* set up a mapping between cpu and node. */
145static void map_cpu_to_node(int cpu, int node)
146{
147 printk(KERN_INFO "Mapping cpu %d to node %d\n", cpu, node);
148 cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
149 cpu_to_node_map[cpu] = node;
150}
151
152/* undo a mapping between cpu and node. */
153static void unmap_cpu_to_node(int cpu)
154{
155 int node;
156
157 printk(KERN_INFO "Unmapping cpu %d from all nodes\n", cpu);
158 for (node = 0; node < MAX_NUMNODES; node++)
159 cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
160 cpu_to_node_map[cpu] = 0;
161}
162#else /* !(CONFIG_NUMA && CONFIG_X86_32) */
163#define map_cpu_to_node(cpu, node) ({})
164#define unmap_cpu_to_node(cpu) ({})
165#endif
166
167#ifdef CONFIG_X86_32
168static int boot_cpu_logical_apicid;
169
170u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly =
171 { [0 ... NR_CPUS-1] = BAD_APICID };
172
173static void map_cpu_to_logical_apicid(void)
174{
175 int cpu = smp_processor_id();
176 int apicid = logical_smp_processor_id();
177 int node = apic->apicid_to_node(apicid);
178
179 if (!node_online(node))
180 node = first_online_node;
181
182 cpu_2_logical_apicid[cpu] = apicid;
183 map_cpu_to_node(cpu, node);
184}
185
186void numa_remove_cpu(int cpu)
187{
188 cpu_2_logical_apicid[cpu] = BAD_APICID;
189 unmap_cpu_to_node(cpu);
190}
191#else
192#define map_cpu_to_logical_apicid() do {} while (0)
193#endif
194
195/* 138/*
196 * Report back to the Boot Processor. 139 * Report back to the Boot Processor.
197 * Running on AP. 140 * Running on AP.
@@ -259,7 +202,6 @@ static void __cpuinit smp_callin(void)
259 apic->smp_callin_clear_local_apic(); 202 apic->smp_callin_clear_local_apic();
260 setup_local_APIC(); 203 setup_local_APIC();
261 end_local_APIC_setup(); 204 end_local_APIC_setup();
262 map_cpu_to_logical_apicid();
263 205
264 /* 206 /*
265 * Need to setup vector mappings before we enable interrupts. 207 * Need to setup vector mappings before we enable interrupts.
@@ -281,6 +223,13 @@ static void __cpuinit smp_callin(void)
281 */ 223 */
282 smp_store_cpu_info(cpuid); 224 smp_store_cpu_info(cpuid);
283 225
226 /*
227 * This must be done before setting cpu_online_mask
228 * or calling notify_cpu_starting.
229 */
230 set_cpu_sibling_map(raw_smp_processor_id());
231 wmb();
232
284 notify_cpu_starting(cpuid); 233 notify_cpu_starting(cpuid);
285 234
286 /* 235 /*
@@ -299,23 +248,16 @@ notrace static void __cpuinit start_secondary(void *unused)
299 * fragile that we want to limit the things done here to the 248 * fragile that we want to limit the things done here to the
300 * most necessary things. 249 * most necessary things.
301 */ 250 */
251 cpu_init();
252 preempt_disable();
253 smp_callin();
302 254
303#ifdef CONFIG_X86_32 255#ifdef CONFIG_X86_32
304 /* 256 /* switch away from the initial page table */
305 * Switch away from the trampoline page-table
306 *
307 * Do this before cpu_init() because it needs to access per-cpu
308 * data which may not be mapped in the trampoline page-table.
309 */
310 load_cr3(swapper_pg_dir); 257 load_cr3(swapper_pg_dir);
311 __flush_tlb_all(); 258 __flush_tlb_all();
312#endif 259#endif
313 260
314 vmi_bringup();
315 cpu_init();
316 preempt_disable();
317 smp_callin();
318
319 /* otherwise gcc will move up smp_processor_id before the cpu_init */ 261 /* otherwise gcc will move up smp_processor_id before the cpu_init */
320 barrier(); 262 barrier();
321 /* 263 /*
@@ -323,16 +265,6 @@ notrace static void __cpuinit start_secondary(void *unused)
323 */ 265 */
324 check_tsc_sync_target(); 266 check_tsc_sync_target();
325 267
326 if (nmi_watchdog == NMI_IO_APIC) {
327 legacy_pic->chip->mask(0);
328 enable_NMI_through_LVT0();
329 legacy_pic->chip->unmask(0);
330 }
331
332 /* This must be done before setting cpu_online_mask */
333 set_cpu_sibling_map(raw_smp_processor_id());
334 wmb();
335
336 /* 268 /*
337 * We need to hold call_lock, so there is no inconsistency 269 * We need to hold call_lock, so there is no inconsistency
338 * between the time smp_call_function() determines number of 270 * between the time smp_call_function() determines number of
@@ -353,6 +285,19 @@ notrace static void __cpuinit start_secondary(void *unused)
353 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; 285 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
354 x86_platform.nmi_init(); 286 x86_platform.nmi_init();
355 287
288 /*
289 * Wait until the cpu which brought this one up marked it
290 * online before enabling interrupts. If we don't do that then
291 * we can end up waking up the softirq thread before this cpu
292 * reached the active state, which makes the scheduler unhappy
293 * and schedule the softirq thread on the wrong cpu. This is
294 * only observable with forced threaded interrupts, but in
295 * theory it could also happen w/o them. It's just way harder
296 * to achieve.
297 */
298 while (!cpumask_test_cpu(smp_processor_id(), cpu_active_mask))
299 cpu_relax();
300
356 /* enable local interrupts */ 301 /* enable local interrupts */
357 local_irq_enable(); 302 local_irq_enable();
358 303
@@ -365,23 +310,6 @@ notrace static void __cpuinit start_secondary(void *unused)
365 cpu_idle(); 310 cpu_idle();
366} 311}
367 312
368#ifdef CONFIG_CPUMASK_OFFSTACK
369/* In this case, llc_shared_map is a pointer to a cpumask. */
370static inline void copy_cpuinfo_x86(struct cpuinfo_x86 *dst,
371 const struct cpuinfo_x86 *src)
372{
373 struct cpumask *llc = dst->llc_shared_map;
374 *dst = *src;
375 dst->llc_shared_map = llc;
376}
377#else
378static inline void copy_cpuinfo_x86(struct cpuinfo_x86 *dst,
379 const struct cpuinfo_x86 *src)
380{
381 *dst = *src;
382}
383#endif /* CONFIG_CPUMASK_OFFSTACK */
384
385/* 313/*
386 * The bootstrap kernel entry code has set these up. Save them for 314 * The bootstrap kernel entry code has set these up. Save them for
387 * a given CPU 315 * a given CPU
@@ -391,12 +319,22 @@ void __cpuinit smp_store_cpu_info(int id)
391{ 319{
392 struct cpuinfo_x86 *c = &cpu_data(id); 320 struct cpuinfo_x86 *c = &cpu_data(id);
393 321
394 copy_cpuinfo_x86(c, &boot_cpu_data); 322 *c = boot_cpu_data;
395 c->cpu_index = id; 323 c->cpu_index = id;
396 if (id != 0) 324 if (id != 0)
397 identify_secondary_cpu(c); 325 identify_secondary_cpu(c);
398} 326}
399 327
328static void __cpuinit link_thread_siblings(int cpu1, int cpu2)
329{
330 cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2));
331 cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1));
332 cpumask_set_cpu(cpu1, cpu_core_mask(cpu2));
333 cpumask_set_cpu(cpu2, cpu_core_mask(cpu1));
334 cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2));
335 cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1));
336}
337
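link_thread_siblings() above records two CPUs as SMT siblings, core siblings and last-level-cache sharers in one place; set_cpu_sibling_map() below decides when to call it from the package, core and (with X86_FEATURE_TOPOEXT) compute-unit ids. A toy user-space model of that bookkeeping, with each cpumask shrunk to a single 64-bit word and CPUs described only by a (package, core) pair; every name here is local to the sketch:

#include <stdio.h>
#include <stdint.h>

#define NR_CPUS	4

struct cpu_topo { int pkg, core; };

static uint64_t sibling_mask[NR_CPUS], core_mask[NR_CPUS];

static void link_siblings(int c1, int c2)
{
	sibling_mask[c1] |= 1ULL << c2;
	sibling_mask[c2] |= 1ULL << c1;
	core_mask[c1]    |= 1ULL << c2;
	core_mask[c2]    |= 1ULL << c1;
}

int main(void)
{
	/* one package, two cores, two hyperthreads per core */
	struct cpu_topo topo[NR_CPUS] = { {0, 0}, {0, 0}, {0, 1}, {0, 1} };
	int cpu, i;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		sibling_mask[cpu] |= 1ULL << cpu;	/* a CPU is its own sibling */
		core_mask[cpu]    |= 1ULL << cpu;
		for (i = 0; i < cpu; i++) {
			if (topo[cpu].pkg == topo[i].pkg &&
			    topo[cpu].core == topo[i].core)
				link_siblings(cpu, i);	/* same core: full SMT link */
			if (topo[cpu].pkg == topo[i].pkg) {
				core_mask[cpu] |= 1ULL << i;	/* same package */
				core_mask[i]   |= 1ULL << cpu;
			}
		}
	}

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		printf("cpu%d: siblings=%#llx cores=%#llx\n", cpu,
		       (unsigned long long)sibling_mask[cpu],
		       (unsigned long long)core_mask[cpu]);
	return 0;
}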
400 338
401void __cpuinit set_cpu_sibling_map(int cpu) 339void __cpuinit set_cpu_sibling_map(int cpu)
402{ 340{
@@ -409,23 +347,23 @@ void __cpuinit set_cpu_sibling_map(int cpu)
409 for_each_cpu(i, cpu_sibling_setup_mask) { 347 for_each_cpu(i, cpu_sibling_setup_mask) {
410 struct cpuinfo_x86 *o = &cpu_data(i); 348 struct cpuinfo_x86 *o = &cpu_data(i);
411 349
412 if (c->phys_proc_id == o->phys_proc_id && 350 if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
413 c->cpu_core_id == o->cpu_core_id) { 351 if (c->phys_proc_id == o->phys_proc_id &&
414 cpumask_set_cpu(i, cpu_sibling_mask(cpu)); 352 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i) &&
415 cpumask_set_cpu(cpu, cpu_sibling_mask(i)); 353 c->compute_unit_id == o->compute_unit_id)
416 cpumask_set_cpu(i, cpu_core_mask(cpu)); 354 link_thread_siblings(cpu, i);
417 cpumask_set_cpu(cpu, cpu_core_mask(i)); 355 } else if (c->phys_proc_id == o->phys_proc_id &&
418 cpumask_set_cpu(i, c->llc_shared_map); 356 c->cpu_core_id == o->cpu_core_id) {
419 cpumask_set_cpu(cpu, o->llc_shared_map); 357 link_thread_siblings(cpu, i);
420 } 358 }
421 } 359 }
422 } else { 360 } else {
423 cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); 361 cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
424 } 362 }
425 363
426 cpumask_set_cpu(cpu, c->llc_shared_map); 364 cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
427 365
428 if (current_cpu_data.x86_max_cores == 1) { 366 if (__this_cpu_read(cpu_info.x86_max_cores) == 1) {
429 cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu)); 367 cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu));
430 c->booted_cores = 1; 368 c->booted_cores = 1;
431 return; 369 return;
@@ -434,8 +372,8 @@ void __cpuinit set_cpu_sibling_map(int cpu)
434 for_each_cpu(i, cpu_sibling_setup_mask) { 372 for_each_cpu(i, cpu_sibling_setup_mask) {
435 if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && 373 if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
436 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { 374 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
437 cpumask_set_cpu(i, c->llc_shared_map); 375 cpumask_set_cpu(i, cpu_llc_shared_mask(cpu));
438 cpumask_set_cpu(cpu, cpu_data(i).llc_shared_map); 376 cpumask_set_cpu(cpu, cpu_llc_shared_mask(i));
439 } 377 }
440 if (c->phys_proc_id == cpu_data(i).phys_proc_id) { 378 if (c->phys_proc_id == cpu_data(i).phys_proc_id) {
441 cpumask_set_cpu(i, cpu_core_mask(cpu)); 379 cpumask_set_cpu(i, cpu_core_mask(cpu));
@@ -474,7 +412,7 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
474 !(cpu_has(c, X86_FEATURE_AMD_DCM))) 412 !(cpu_has(c, X86_FEATURE_AMD_DCM)))
475 return cpu_core_mask(cpu); 413 return cpu_core_mask(cpu);
476 else 414 else
477 return c->llc_shared_map; 415 return cpu_llc_shared_mask(cpu);
478} 416}
479 417
480static void impress_friends(void) 418static void impress_friends(void)
@@ -636,7 +574,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
636 * target processor state. 574 * target processor state.
637 */ 575 */
638 startup_ipi_hook(phys_apicid, (unsigned long) start_secondary, 576 startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
639 (unsigned long)stack_start.sp); 577 stack_start);
640 578
641 /* 579 /*
642 * Run STARTUP IPI loop. 580 * Run STARTUP IPI loop.
@@ -742,7 +680,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
742 .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), 680 .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
743 }; 681 };
744 682
745 INIT_WORK_ON_STACK(&c_idle.work, do_fork_idle); 683 INIT_WORK_ONSTACK(&c_idle.work, do_fork_idle);
746 684
747 alternatives_smp_switch(1); 685 alternatives_smp_switch(1);
748 686
@@ -774,7 +712,6 @@ do_rest:
774#ifdef CONFIG_X86_32 712#ifdef CONFIG_X86_32
775 /* Stack for startup_32 can be just as for start_secondary onwards */ 713 /* Stack for startup_32 can be just as for start_secondary onwards */
776 irq_ctx_init(cpu); 714 irq_ctx_init(cpu);
777 initial_page_table = __pa(&trampoline_pg_dir);
778#else 715#else
779 clear_tsk_thread_flag(c_idle.idle, TIF_FORK); 716 clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
780 initial_gs = per_cpu_offset(cpu); 717 initial_gs = per_cpu_offset(cpu);
@@ -784,10 +721,10 @@ do_rest:
784#endif 721#endif
785 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); 722 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
786 initial_code = (unsigned long)start_secondary; 723 initial_code = (unsigned long)start_secondary;
787 stack_start.sp = (void *) c_idle.idle->thread.sp; 724 stack_start = c_idle.idle->thread.sp;
788 725
789 /* start_ip had better be page-aligned! */ 726 /* start_ip had better be page-aligned! */
790 start_ip = setup_trampoline(); 727 start_ip = trampoline_address();
791 728
792 /* So we see what's up */ 729 /* So we see what's up */
793 announce_cpu(cpu, apicid); 730 announce_cpu(cpu, apicid);
@@ -797,6 +734,8 @@ do_rest:
797 * the targeted processor. 734 * the targeted processor.
798 */ 735 */
799 736
737 printk(KERN_DEBUG "smpboot cpu %d: start_ip = %lx\n", cpu, start_ip);
738
800 atomic_set(&init_deasserted, 0); 739 atomic_set(&init_deasserted, 0);
801 740
802 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { 741 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
@@ -850,8 +789,8 @@ do_rest:
850 pr_debug("CPU%d: has booted.\n", cpu); 789 pr_debug("CPU%d: has booted.\n", cpu);
851 else { 790 else {
852 boot_error = 1; 791 boot_error = 1;
853 if (*((volatile unsigned char *)trampoline_base) 792 if (*(volatile u32 *)TRAMPOLINE_SYM(trampoline_status)
854 == 0xA5) 793 == 0xA5A5A5A5)
855 /* trampoline started but...? */ 794 /* trampoline started but...? */
856 pr_err("CPU%d: Stuck ??\n", cpu); 795 pr_err("CPU%d: Stuck ??\n", cpu);
857 else 796 else
@@ -877,7 +816,7 @@ do_rest:
877 } 816 }
878 817
879 /* mark "stuck" area as not stuck */ 818 /* mark "stuck" area as not stuck */
880 *((volatile unsigned long *)trampoline_base) = 0; 819 *(volatile u32 *)TRAMPOLINE_SYM(trampoline_status) = 0;
881 820
882 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { 821 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
883 /* 822 /*
@@ -923,7 +862,6 @@ int __cpuinit native_cpu_up(unsigned int cpu)
923 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; 862 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
924 863
925 err = do_boot_cpu(apicid, cpu); 864 err = do_boot_cpu(apicid, cpu);
926
927 if (err) { 865 if (err) {
928 pr_debug("do_boot_cpu failed %d\n", err); 866 pr_debug("do_boot_cpu failed %d\n", err);
929 return -EIO; 867 return -EIO;
@@ -945,6 +883,14 @@ int __cpuinit native_cpu_up(unsigned int cpu)
945 return 0; 883 return 0;
946} 884}
947 885
886/**
887 * arch_disable_smp_support() - disables SMP support for x86 at runtime
888 */
889void arch_disable_smp_support(void)
890{
891 disable_ioapic_support();
892}
893
948/* 894/*
949 * Fall back to non SMP mode after errors. 895 * Fall back to non SMP mode after errors.
950 * 896 *
@@ -960,7 +906,6 @@ static __init void disable_smp(void)
960 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); 906 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
961 else 907 else
962 physid_set_mask_of_physid(0, &phys_cpu_present_map); 908 physid_set_mask_of_physid(0, &phys_cpu_present_map);
963 map_cpu_to_logical_apicid();
964 cpumask_set_cpu(0, cpu_sibling_mask(0)); 909 cpumask_set_cpu(0, cpu_sibling_mask(0));
965 cpumask_set_cpu(0, cpu_core_mask(0)); 910 cpumask_set_cpu(0, cpu_core_mask(0));
966} 911}
@@ -1045,7 +990,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
1045 "(tell your hw vendor)\n"); 990 "(tell your hw vendor)\n");
1046 } 991 }
1047 smpboot_clear_io_apic(); 992 smpboot_clear_io_apic();
1048 arch_disable_smp_support(); 993 disable_ioapic_support();
1049 return -1; 994 return -1;
1050 } 995 }
1051 996
@@ -1058,11 +1003,9 @@ static int __init smp_sanity_check(unsigned max_cpus)
1058 printk(KERN_INFO "SMP mode deactivated.\n"); 1003 printk(KERN_INFO "SMP mode deactivated.\n");
1059 smpboot_clear_io_apic(); 1004 smpboot_clear_io_apic();
1060 1005
1061 localise_nmi_watchdog();
1062
1063 connect_bsp_APIC(); 1006 connect_bsp_APIC();
1064 setup_local_APIC(); 1007 setup_local_APIC();
1065 end_local_APIC_setup(); 1008 bsp_end_local_APIC_setup();
1066 return -1; 1009 return -1;
1067 } 1010 }
1068 1011
@@ -1091,26 +1034,22 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1091 1034
1092 preempt_disable(); 1035 preempt_disable();
1093 smp_cpu_index_default(); 1036 smp_cpu_index_default();
1094 current_cpu_data = boot_cpu_data; 1037
1095 cpumask_copy(cpu_callin_mask, cpumask_of(0));
1096 mb();
1097 /* 1038 /*
1098 * Setup boot CPU information 1039 * Setup boot CPU information
1099 */ 1040 */
1100 smp_store_cpu_info(0); /* Final full version of the data */ 1041 smp_store_cpu_info(0); /* Final full version of the data */
1101#ifdef CONFIG_X86_32 1042 cpumask_copy(cpu_callin_mask, cpumask_of(0));
1102 boot_cpu_logical_apicid = logical_smp_processor_id(); 1043 mb();
1103#endif 1044
1104 current_thread_info()->cpu = 0; /* needed? */ 1045 current_thread_info()->cpu = 0; /* needed? */
1105 for_each_possible_cpu(i) { 1046 for_each_possible_cpu(i) {
1106 zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); 1047 zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
1107 zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); 1048 zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
1108 zalloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL); 1049 zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
1109 } 1050 }
1110 set_cpu_sibling_map(0); 1051 set_cpu_sibling_map(0);
1111 1052
1112 enable_IR_x2apic();
1113 default_setup_apic_routing();
1114 1053
1115 if (smp_sanity_check(max_cpus) < 0) { 1054 if (smp_sanity_check(max_cpus) < 0) {
1116 printk(KERN_INFO "SMP disabled\n"); 1055 printk(KERN_INFO "SMP disabled\n");
@@ -1118,6 +1057,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1118 goto out; 1057 goto out;
1119 } 1058 }
1120 1059
1060 default_setup_apic_routing();
1061
1121 preempt_disable(); 1062 preempt_disable();
1122 if (read_apic_id() != boot_cpu_physical_apicid) { 1063 if (read_apic_id() != boot_cpu_physical_apicid) {
1123 panic("Boot APIC ID in local APIC unexpected (%d vs %d)", 1064 panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
@@ -1139,9 +1080,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1139 if (!skip_ioapic_setup && nr_ioapics) 1080 if (!skip_ioapic_setup && nr_ioapics)
1140 enable_IO_APIC(); 1081 enable_IO_APIC();
1141 1082
1142 end_local_APIC_setup(); 1083 bsp_end_local_APIC_setup();
1143
1144 map_cpu_to_logical_apicid();
1145 1084
1146 if (apic->setup_portio_remap) 1085 if (apic->setup_portio_remap)
1147 apic->setup_portio_remap(); 1086 apic->setup_portio_remap();
@@ -1163,6 +1102,20 @@ out:
1163 preempt_enable(); 1102 preempt_enable();
1164} 1103}
1165 1104
1105void arch_disable_nonboot_cpus_begin(void)
1106{
1107 /*
1108 * Avoid the smp alternatives switch during the disable_nonboot_cpus().
1109 * In the suspend path, we will be back in the SMP mode shortly anyways.
1110 */
1111 skip_smp_alternatives = true;
1112}
1113
1114void arch_disable_nonboot_cpus_end(void)
1115{
1116 skip_smp_alternatives = false;
1117}
1118
1166void arch_enable_nonboot_cpus_begin(void) 1119void arch_enable_nonboot_cpus_begin(void)
1167{ 1120{
1168 set_mtrr_aps_delayed_init(); 1121 set_mtrr_aps_delayed_init();
@@ -1193,7 +1146,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
1193#ifdef CONFIG_X86_IO_APIC 1146#ifdef CONFIG_X86_IO_APIC
1194 setup_ioapic_dest(); 1147 setup_ioapic_dest();
1195#endif 1148#endif
1196 check_nmi_watchdog();
1197 mtrr_aps_init(); 1149 mtrr_aps_init();
1198} 1150}
1199 1151
@@ -1338,8 +1290,6 @@ int native_cpu_disable(void)
1338 if (cpu == 0) 1290 if (cpu == 0)
1339 return -EBUSY; 1291 return -EBUSY;
1340 1292
1341 if (nmi_watchdog == NMI_LOCAL_APIC)
1342 stop_apic_nmi_watchdog(NULL);
1343 clear_local_APIC(); 1293 clear_local_APIC();
1344 1294
1345 cpu_disable_common(); 1295 cpu_disable_common();
@@ -1370,12 +1320,11 @@ void play_dead_common(void)
1370{ 1320{
1371 idle_task_exit(); 1321 idle_task_exit();
1372 reset_lazy_tlbstate(); 1322 reset_lazy_tlbstate();
1373 irq_ctx_exit(raw_smp_processor_id()); 1323 amd_e400_remove_cpu(raw_smp_processor_id());
1374 c1e_remove_cpu(raw_smp_processor_id());
1375 1324
1376 mb(); 1325 mb();
1377 /* Ack it */ 1326 /* Ack it */
1378 __get_cpu_var(cpu_state) = CPU_DEAD; 1327 __this_cpu_write(cpu_state, CPU_DEAD);
1379 1328
1380 /* 1329 /*
1381 * With physical CPU hotplug, we should halt the cpu 1330 * With physical CPU hotplug, we should halt the cpu
@@ -1383,11 +1332,89 @@ void play_dead_common(void)
1383 local_irq_disable(); 1332 local_irq_disable();
1384} 1333}
1385 1334
1335/*
1336 * We need to flush the caches before going to sleep, lest we have
1337 * dirty data in our caches when we come back up.
1338 */
1339static inline void mwait_play_dead(void)
1340{
1341 unsigned int eax, ebx, ecx, edx;
1342 unsigned int highest_cstate = 0;
1343 unsigned int highest_subcstate = 0;
1344 int i;
1345 void *mwait_ptr;
1346 struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info);
1347
1348 if (!(this_cpu_has(X86_FEATURE_MWAIT) && mwait_usable(c)))
1349 return;
1350 if (!this_cpu_has(X86_FEATURE_CLFLSH))
1351 return;
1352 if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
1353 return;
1354
1355 eax = CPUID_MWAIT_LEAF;
1356 ecx = 0;
1357 native_cpuid(&eax, &ebx, &ecx, &edx);
1358
1359 /*
1360 * eax will be 0 if EDX enumeration is not valid.
1361 * Initialized below to cstate, sub_cstate value when EDX is valid.
1362 */
1363 if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) {
1364 eax = 0;
1365 } else {
1366 edx >>= MWAIT_SUBSTATE_SIZE;
1367 for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
1368 if (edx & MWAIT_SUBSTATE_MASK) {
1369 highest_cstate = i;
1370 highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
1371 }
1372 }
1373 eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
1374 (highest_subcstate - 1);
1375 }
1376
1377 /*
1378 * This should be a memory location in a cache line which is
1379 * unlikely to be touched by other processors. The actual
1380 * content is immaterial as it is not actually modified in any way.
1381 */
1382 mwait_ptr = &current_thread_info()->flags;
1383
1384 wbinvd();
1385
1386 while (1) {
1387 /*
1388 * The CLFLUSH is a workaround for erratum AAI65 for
1389 * the Xeon 7400 series. It's not clear it is actually
1390 * needed, but it should be harmless in either case.
1391 * The WBINVD is insufficient due to the spurious-wakeup
1392 * case where we return around the loop.
1393 */
1394 clflush(mwait_ptr);
1395 __monitor(mwait_ptr, 0, 0);
1396 mb();
1397 __mwait(eax, 0);
1398 }
1399}
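mwait_play_dead() above walks CPUID leaf 0x5 to find the deepest C-state the hardware advertises and encodes it as the MWAIT hint (cstate << 4) | (sub-state count - 1). The same enumeration can be reproduced from user space; the sketch below relies on GCC's <cpuid.h> helper and the conventional 4-bit sub-state fields, with the kernel's constants redefined locally since its headers are not usable here:

#include <stdio.h>
#include <cpuid.h>

#define MWAIT_SUBSTATE_SIZE	4
#define MWAIT_SUBSTATE_MASK	0xf
#define CPUID5_ECX_EXTENSIONS_SUPPORTED	0x1

int main(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int highest_cstate = 0, highest_subcstate = 0, hint = 0;
	int i;

	if (!__get_cpuid(5, &eax, &ebx, &ecx, &edx))
		return 1;	/* MONITOR/MWAIT leaf not reported */

	if (ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) {
		edx >>= MWAIT_SUBSTATE_SIZE;	/* skip the C0 field */
		for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
			if (edx & MWAIT_SUBSTATE_MASK) {
				highest_cstate = i;
				highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
			}
		}
		hint = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
		       (highest_subcstate - 1);
	}

	printf("deepest advertised MWAIT hint: 0x%x\n", hint);
	return 0;
}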
1400
1401static inline void hlt_play_dead(void)
1402{
1403 if (__this_cpu_read(cpu_info.x86) >= 4)
1404 wbinvd();
1405
1406 while (1) {
1407 native_halt();
1408 }
1409}
1410
1386void native_play_dead(void) 1411void native_play_dead(void)
1387{ 1412{
1388 play_dead_common(); 1413 play_dead_common();
1389 tboot_shutdown(TB_SHUTDOWN_WFS); 1414 tboot_shutdown(TB_SHUTDOWN_WFS);
1390 wbinvd_halt(); 1415
1416 mwait_play_dead(); /* Only returns on failure */
1417 hlt_play_dead();
1391} 1418}
1392 1419
1393#else /* ... !CONFIG_HOTPLUG_CPU */ 1420#else /* ... !CONFIG_HOTPLUG_CPU */
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index b53c525368a7..55d9bc03f696 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -9,15 +9,6 @@
9#include <linux/uaccess.h> 9#include <linux/uaccess.h>
10#include <asm/stacktrace.h> 10#include <asm/stacktrace.h>
11 11
12static void save_stack_warning(void *data, char *msg)
13{
14}
15
16static void
17save_stack_warning_symbol(void *data, char *msg, unsigned long symbol)
18{
19}
20
21static int save_stack_stack(void *data, char *name) 12static int save_stack_stack(void *data, char *name)
22{ 13{
23 return 0; 14 return 0;
@@ -53,16 +44,12 @@ save_stack_address_nosched(void *data, unsigned long addr, int reliable)
53} 44}
54 45
55static const struct stacktrace_ops save_stack_ops = { 46static const struct stacktrace_ops save_stack_ops = {
56 .warning = save_stack_warning,
57 .warning_symbol = save_stack_warning_symbol,
58 .stack = save_stack_stack, 47 .stack = save_stack_stack,
59 .address = save_stack_address, 48 .address = save_stack_address,
60 .walk_stack = print_context_stack, 49 .walk_stack = print_context_stack,
61}; 50};
62 51
63static const struct stacktrace_ops save_stack_ops_nosched = { 52static const struct stacktrace_ops save_stack_ops_nosched = {
64 .warning = save_stack_warning,
65 .warning_symbol = save_stack_warning_symbol,
66 .stack = save_stack_stack, 53 .stack = save_stack_stack,
67 .address = save_stack_address_nosched, 54 .address = save_stack_address_nosched,
68 .walk_stack = print_context_stack, 55 .walk_stack = print_context_stack,
@@ -79,9 +66,9 @@ void save_stack_trace(struct stack_trace *trace)
79} 66}
80EXPORT_SYMBOL_GPL(save_stack_trace); 67EXPORT_SYMBOL_GPL(save_stack_trace);
81 68
82void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp) 69void save_stack_trace_regs(struct stack_trace *trace, struct pt_regs *regs)
83{ 70{
84 dump_trace(current, NULL, NULL, bp, &save_stack_ops, trace); 71 dump_trace(current, regs, NULL, 0, &save_stack_ops, trace);
85 if (trace->nr_entries < trace->max_entries) 72 if (trace->nr_entries < trace->max_entries)
86 trace->entries[trace->nr_entries++] = ULONG_MAX; 73 trace->entries[trace->nr_entries++] = ULONG_MAX;
87} 74}
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 58de45ee08b6..7977f0cfe339 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -166,7 +166,7 @@ static void enable_step(struct task_struct *child, bool block)
166 * Make sure block stepping (BTF) is not enabled unless it should be. 166 * Make sure block stepping (BTF) is not enabled unless it should be.
167 * Note that we don't try to worry about any is_setting_trap_flag() 167 * Note that we don't try to worry about any is_setting_trap_flag()
168 * instructions after the first when using block stepping. 168 * instructions after the first when using block stepping.
169 * So noone should try to use debugger block stepping in a program 169 * So no one should try to use debugger block stepping in a program
170 * that uses user-mode single stepping itself. 170 * that uses user-mode single stepping itself.
171 */ 171 */
172 if (enable_single_step(child) && block) { 172 if (enable_single_step(child) && block) {
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
index d5e06624e34a..0b0cb5fede19 100644
--- a/arch/x86/kernel/sys_i386_32.c
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -33,8 +33,8 @@ int kernel_execve(const char *filename,
33 const char *const envp[]) 33 const char *const envp[])
34{ 34{
35 long __res; 35 long __res;
36 asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx" 36 asm volatile ("int $0x80"
37 : "=a" (__res) 37 : "=a" (__res)
38 : "0" (__NR_execve), "ri" (filename), "c" (argv), "d" (envp) : "memory"); 38 : "0" (__NR_execve), "b" (filename), "c" (argv), "d" (envp) : "memory");
39 return __res; 39 return __res;
40} 40}
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 37702905f658..d0126222b394 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -340,15 +340,21 @@ ENTRY(sys_call_table)
340 .long sys_fanotify_init 340 .long sys_fanotify_init
341 .long sys_fanotify_mark 341 .long sys_fanotify_mark
342 .long sys_prlimit64 /* 340 */ 342 .long sys_prlimit64 /* 340 */
343 .long sys_set_rt_task_param /* LITMUS^RT 341 */ 343 .long sys_name_to_handle_at
344 .long sys_open_by_handle_at
345 .long sys_clock_adjtime
346 .long sys_syncfs
347 .long sys_sendmmsg /* 345 */
348 .long sys_setns
349 .long sys_set_rt_task_param /* LITMUS^RT 347 */
344 .long sys_get_rt_task_param 350 .long sys_get_rt_task_param
345 .long sys_complete_job 351 .long sys_complete_job
346 .long sys_od_open 352 .long sys_od_open
347 .long sys_od_close 353 .long sys_od_close
348 .long sys_litmus_lock 354 .long sys_litmus_lock /* +5 */
349 .long sys_litmus_unlock 355 .long sys_litmus_unlock
350 .long sys_query_job_no 356 .long sys_query_job_no
351 .long sys_wait_for_job_release 357 .long sys_wait_for_job_release
352 .long sys_wait_for_ts_release 358 .long sys_wait_for_ts_release
353 .long sys_release_ts 359 .long sys_release_ts /* +10 */
354 .long sys_null_call 360 .long sys_null_call
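Because six new upstream entries (sys_name_to_handle_at through sys_setns) are inserted ahead of them, the LITMUS^RT system calls on 32-bit x86 move from a base of 341 to 347; the /* +5 */ and /* +10 */ markers give the new offsets, which puts sys_null_call at 358. User space that issues these calls by raw number must be rebuilt against matching headers. A hypothetical sketch (real programs would normally go through liblitmus; the single pointer argument of sys_null_call, used to return a timestamp, is an assumption here):

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>

/* 347 = sys_set_rt_task_param per the table above; sys_null_call is
 * eleven entries later */
#define __NR_null_call	358

int main(void)
{
	unsigned long long ts = 0;	/* assumed out-parameter */
	long ret = syscall(__NR_null_call, &ts);

	/* returns -1/ENOSYS on a kernel without LITMUS^RT */
	printf("sys_null_call: ret=%ld ts=%llu\n", ret, ts);
	return 0;
}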
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index c2f1b26141e2..30ac65df7d4e 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -110,7 +110,6 @@ static struct mm_struct tboot_mm = {
110 .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), 110 .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
111 .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), 111 .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
112 .mmlist = LIST_HEAD_INIT(init_mm.mmlist), 112 .mmlist = LIST_HEAD_INIT(init_mm.mmlist),
113 .cpu_vm_mask = CPU_MASK_ALL,
114}; 113};
115 114
116static inline void switch_to_tboot_pt(void) 115static inline void switch_to_tboot_pt(void)
@@ -133,7 +132,7 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
133 pmd = pmd_alloc(&tboot_mm, pud, vaddr); 132 pmd = pmd_alloc(&tboot_mm, pud, vaddr);
134 if (!pmd) 133 if (!pmd)
135 return -1; 134 return -1;
136 pte = pte_alloc_map(&tboot_mm, pmd, vaddr); 135 pte = pte_alloc_map(&tboot_mm, NULL, pmd, vaddr);
137 if (!pte) 136 if (!pte)
138 return -1; 137 return -1;
139 set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); 138 set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));
diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c
index 787a5e499dd1..3f92ce07e525 100644
--- a/arch/x86/kernel/test_nx.c
+++ b/arch/x86/kernel/test_nx.c
@@ -161,7 +161,7 @@ static int test_NX(void)
161 } 161 }
162 162
163#endif 163#endif
164 return 0; 164 return ret;
165} 165}
166 166
167static void test_exit(void) 167static void test_exit(void)
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index fb5cc5e14cfa..00cbb272627f 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -22,12 +22,8 @@
22#include <asm/hpet.h> 22#include <asm/hpet.h>
23#include <asm/time.h> 23#include <asm/time.h>
24 24
25#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC)
26int timer_ack;
27#endif
28
29#ifdef CONFIG_X86_64 25#ifdef CONFIG_X86_64
30volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; 26DEFINE_VVAR(volatile unsigned long, jiffies) = INITIAL_JIFFIES;
31#endif 27#endif
32 28
33unsigned long profile_pc(struct pt_regs *regs) 29unsigned long profile_pc(struct pt_regs *regs)
@@ -63,20 +59,6 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id)
63 /* Keep nmi watchdog up to date */ 59 /* Keep nmi watchdog up to date */
64 inc_irq_stat(irq0_irqs); 60 inc_irq_stat(irq0_irqs);
65 61
66 /* Optimized out for !IO_APIC and x86_64 */
67 if (timer_ack) {
68 /*
69 * Subtle, when I/O APICs are used we have to ack timer IRQ
70 * manually to deassert NMI lines for the watchdog if run
71 * on an 82489DX-based system.
72 */
73 raw_spin_lock(&i8259A_lock);
74 outb(0x0c, PIC_MASTER_OCW3);
75 /* Ack the IRQ; AEOI will end it automatically. */
76 inb(PIC_MASTER_POLL);
77 raw_spin_unlock(&i8259A_lock);
78 }
79
80 global_clock_event->event_handler(global_clock_event); 62 global_clock_event->event_handler(global_clock_event);
81 63
82 /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ 64 /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
deleted file mode 100644
index 312ef0292815..000000000000
--- a/arch/x86/kernel/tlb_uv.c
+++ /dev/null
@@ -1,1655 +0,0 @@
1/*
2 * SGI UltraViolet TLB flush routines.
3 *
4 * (c) 2008-2010 Cliff Wickman <cpw@sgi.com>, SGI.
5 *
6 * This code is released under the GNU General Public License version 2 or
7 * later.
8 */
9#include <linux/seq_file.h>
10#include <linux/proc_fs.h>
11#include <linux/debugfs.h>
12#include <linux/kernel.h>
13#include <linux/slab.h>
14
15#include <asm/mmu_context.h>
16#include <asm/uv/uv.h>
17#include <asm/uv/uv_mmrs.h>
18#include <asm/uv/uv_hub.h>
19#include <asm/uv/uv_bau.h>
20#include <asm/apic.h>
21#include <asm/idle.h>
22#include <asm/tsc.h>
23#include <asm/irq_vectors.h>
24#include <asm/timer.h>
25
26/* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */
27static int timeout_base_ns[] = {
28 20,
29 160,
30 1280,
31 10240,
32 81920,
33 655360,
34 5242880,
35 167772160
36};
37static int timeout_us;
38static int nobau;
39static int baudisabled;
40static spinlock_t disable_lock;
41static cycles_t congested_cycles;
42
43/* tunables: */
44static int max_bau_concurrent = MAX_BAU_CONCURRENT;
45static int max_bau_concurrent_constant = MAX_BAU_CONCURRENT;
46static int plugged_delay = PLUGGED_DELAY;
47static int plugsb4reset = PLUGSB4RESET;
48static int timeoutsb4reset = TIMEOUTSB4RESET;
49static int ipi_reset_limit = IPI_RESET_LIMIT;
50static int complete_threshold = COMPLETE_THRESHOLD;
51static int congested_response_us = CONGESTED_RESPONSE_US;
52static int congested_reps = CONGESTED_REPS;
53static int congested_period = CONGESTED_PERIOD;
54static struct dentry *tunables_dir;
55static struct dentry *tunables_file;
56
57static int __init setup_nobau(char *arg)
58{
59 nobau = 1;
60 return 0;
61}
62early_param("nobau", setup_nobau);
63
64/* base pnode in this partition */
65static int uv_partition_base_pnode __read_mostly;
66/* position of pnode (which is nasid>>1): */
67static int uv_nshift __read_mostly;
68static unsigned long uv_mmask __read_mostly;
69
70static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
71static DEFINE_PER_CPU(struct bau_control, bau_control);
72static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
73
74/*
75 * Determine the first node on a uvhub. 'Nodes' are used for kernel
76 * memory allocation.
77 */
78static int __init uvhub_to_first_node(int uvhub)
79{
80 int node, b;
81
82 for_each_online_node(node) {
83 b = uv_node_to_blade_id(node);
84 if (uvhub == b)
85 return node;
86 }
87 return -1;
88}
89
90/*
91 * Determine the apicid of the first cpu on a uvhub.
92 */
93static int __init uvhub_to_first_apicid(int uvhub)
94{
95 int cpu;
96
97 for_each_present_cpu(cpu)
98 if (uvhub == uv_cpu_to_blade_id(cpu))
99 return per_cpu(x86_cpu_to_apicid, cpu);
100 return -1;
101}
102
103/*
104 * Free a software acknowledge hardware resource by clearing its Pending
105 * bit. This will return a reply to the sender.
106 * If the message has timed out, a reply has already been sent by the
107 * hardware but the resource has not been released. In that case our
108 * clear of the Timeout bit (as well) will free the resource. No reply will
109 * be sent (the hardware will only do one reply per message).
110 */
111static inline void uv_reply_to_message(struct msg_desc *mdp,
112 struct bau_control *bcp)
113{
114 unsigned long dw;
115 struct bau_payload_queue_entry *msg;
116
117 msg = mdp->msg;
118 if (!msg->canceled) {
119 dw = (msg->sw_ack_vector << UV_SW_ACK_NPENDING) |
120 msg->sw_ack_vector;
121 uv_write_local_mmr(
122 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw);
123 }
124 msg->replied_to = 1;
125 msg->sw_ack_vector = 0;
126}
127
128/*
129 * Process the receipt of a RETRY message
130 */
131static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
132 struct bau_control *bcp)
133{
134 int i;
135 int cancel_count = 0;
136 int slot2;
137 unsigned long msg_res;
138 unsigned long mmr = 0;
139 struct bau_payload_queue_entry *msg;
140 struct bau_payload_queue_entry *msg2;
141 struct ptc_stats *stat;
142
143 msg = mdp->msg;
144 stat = bcp->statp;
145 stat->d_retries++;
146 /*
147 * cancel any message from msg+1 to the retry itself
148 */
149 for (msg2 = msg+1, i = 0; i < DEST_Q_SIZE; msg2++, i++) {
150 if (msg2 > mdp->va_queue_last)
151 msg2 = mdp->va_queue_first;
152 if (msg2 == msg)
153 break;
154
155 /* same conditions for cancellation as uv_do_reset */
156 if ((msg2->replied_to == 0) && (msg2->canceled == 0) &&
157 (msg2->sw_ack_vector) && ((msg2->sw_ack_vector &
158 msg->sw_ack_vector) == 0) &&
159 (msg2->sending_cpu == msg->sending_cpu) &&
160 (msg2->msg_type != MSG_NOOP)) {
161 slot2 = msg2 - mdp->va_queue_first;
162 mmr = uv_read_local_mmr
163 (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
164 msg_res = msg2->sw_ack_vector;
165 /*
166 * This is a message retry; clear the resources held
167 * by the previous message only if they timed out.
168 * If it has not timed out we have an unexpected
169 * situation to report.
170 */
171 if (mmr & (msg_res << UV_SW_ACK_NPENDING)) {
172 /*
173 * is the resource timed out?
174 * make everyone ignore the cancelled message.
175 */
176 msg2->canceled = 1;
177 stat->d_canceled++;
178 cancel_count++;
179 uv_write_local_mmr(
180 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
181 (msg_res << UV_SW_ACK_NPENDING) |
182 msg_res);
183 }
184 }
185 }
186 if (!cancel_count)
187 stat->d_nocanceled++;
188}
189
190/*
191 * Do all the things a cpu should do for a TLB shootdown message.
192 * Other cpu's may come here at the same time for this message.
193 */
194static void uv_bau_process_message(struct msg_desc *mdp,
195 struct bau_control *bcp)
196{
197 int msg_ack_count;
198 short socket_ack_count = 0;
199 struct ptc_stats *stat;
200 struct bau_payload_queue_entry *msg;
201 struct bau_control *smaster = bcp->socket_master;
202
203 /*
204 * This must be a normal message, or retry of a normal message
205 */
206 msg = mdp->msg;
207 stat = bcp->statp;
208 if (msg->address == TLB_FLUSH_ALL) {
209 local_flush_tlb();
210 stat->d_alltlb++;
211 } else {
212 __flush_tlb_one(msg->address);
213 stat->d_onetlb++;
214 }
215 stat->d_requestee++;
216
217 /*
218 * One cpu on each uvhub has the additional job on a RETRY
219 * of releasing the resource held by the message that is
220 * being retried. That message is identified by sending
221 * cpu number.
222 */
223 if (msg->msg_type == MSG_RETRY && bcp == bcp->uvhub_master)
224 uv_bau_process_retry_msg(mdp, bcp);
225
226 /*
227 * This is a sw_ack message, so we have to reply to it.
228 * Count each responding cpu on the socket. This avoids
229 * pinging the count's cache line back and forth between
230 * the sockets.
231 */
232 socket_ack_count = atomic_add_short_return(1, (struct atomic_short *)
233 &smaster->socket_acknowledge_count[mdp->msg_slot]);
234 if (socket_ack_count == bcp->cpus_in_socket) {
235 /*
236 * Both sockets dump their completed count total into
237 * the message's count.
238 */
239 smaster->socket_acknowledge_count[mdp->msg_slot] = 0;
240 msg_ack_count = atomic_add_short_return(socket_ack_count,
241 (struct atomic_short *)&msg->acknowledge_count);
242
243 if (msg_ack_count == bcp->cpus_in_uvhub) {
244 /*
245 * All cpus in uvhub saw it; reply
246 */
247 uv_reply_to_message(mdp, bcp);
248 }
249 }
250
251 return;
252}
253
254/*
255 * Determine the first cpu on a uvhub.
256 */
257static int uvhub_to_first_cpu(int uvhub)
258{
259 int cpu;
260 for_each_present_cpu(cpu)
261 if (uvhub == uv_cpu_to_blade_id(cpu))
262 return cpu;
263 return -1;
264}
265
266/*
267 * Last resort when we get a large number of destination timeouts is
268 * to clear resources held by a given cpu.
269 * Do this with IPI so that all messages in the BAU message queue
270 * can be identified by their nonzero sw_ack_vector field.
271 *
272 * This is entered for a single cpu on the uvhub.
273 * The sender wants this uvhub to free a specific message's
274 * sw_ack resources.
275 */
276static void
277uv_do_reset(void *ptr)
278{
279 int i;
280 int slot;
281 int count = 0;
282 unsigned long mmr;
283 unsigned long msg_res;
284 struct bau_control *bcp;
285 struct reset_args *rap;
286 struct bau_payload_queue_entry *msg;
287 struct ptc_stats *stat;
288
289 bcp = &per_cpu(bau_control, smp_processor_id());
290 rap = (struct reset_args *)ptr;
291 stat = bcp->statp;
292 stat->d_resets++;
293
294 /*
295 * We're looking for the given sender, and
296 * will free its sw_ack resource.
297 * If all cpu's finally responded after the timeout, its
298 * message 'replied_to' was set.
299 */
300 for (msg = bcp->va_queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) {
301 /* uv_do_reset: same conditions for cancellation as
302 uv_bau_process_retry_msg() */
303 if ((msg->replied_to == 0) &&
304 (msg->canceled == 0) &&
305 (msg->sending_cpu == rap->sender) &&
306 (msg->sw_ack_vector) &&
307 (msg->msg_type != MSG_NOOP)) {
308 /*
309 * make everyone else ignore this message
310 */
311 msg->canceled = 1;
312 slot = msg - bcp->va_queue_first;
313 count++;
314 /*
315 * only reset the resource if it is still pending
316 */
317 mmr = uv_read_local_mmr
318 (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
319 msg_res = msg->sw_ack_vector;
320 if (mmr & msg_res) {
321 stat->d_rcanceled++;
322 uv_write_local_mmr(
323 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
324 (msg_res << UV_SW_ACK_NPENDING) |
325 msg_res);
326 }
327 }
328 }
329 return;
330}
331
332/*
333 * Use IPI to get all target uvhubs to release resources held by
334 * a given sending cpu number.
335 */
336static void uv_reset_with_ipi(struct bau_target_uvhubmask *distribution,
337 int sender)
338{
339 int uvhub;
340 int cpu;
341 cpumask_t mask;
342 struct reset_args reset_args;
343
344 reset_args.sender = sender;
345
346 cpus_clear(mask);
347 /* find a single cpu for each uvhub in this distribution mask */
348 for (uvhub = 0;
349 uvhub < sizeof(struct bau_target_uvhubmask) * BITSPERBYTE;
350 uvhub++) {
351 if (!bau_uvhub_isset(uvhub, distribution))
352 continue;
353 /* find a cpu for this uvhub */
354 cpu = uvhub_to_first_cpu(uvhub);
355 cpu_set(cpu, mask);
356 }
357 /* IPI all cpus; Preemption is already disabled */
358 smp_call_function_many(&mask, uv_do_reset, (void *)&reset_args, 1);
359 return;
360}
361
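/*
 * Convert tsc cycles to microseconds using this cpu's cached cyc2ns
 * scaling factor: ns = (cyc * cyc2ns) >> CYC2NS_SCALE_FACTOR, then
 * us = ns / 1000.  As a rough example, on a 2GHz tsc (0.5 ns/cycle)
 * 2,000,000 cycles is about 1000 us.
 */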
362static inline unsigned long
363cycles_2_us(unsigned long long cyc)
364{
365 unsigned long long ns;
366 unsigned long us;
367 ns = (cyc * per_cpu(cyc2ns, smp_processor_id()))
368 >> CYC2NS_SCALE_FACTOR;
369 us = ns / 1000;
370 return us;
371}
372
373/*
374 * wait for all cpus on this hub to finish their sends and go quiet
375 * leaves uvhub_quiesce set so that no new broadcasts are started by
376 * bau_flush_send_and_wait()
377 */
378static inline void
379quiesce_local_uvhub(struct bau_control *hmaster)
380{
381 atomic_add_short_return(1, (struct atomic_short *)
382 &hmaster->uvhub_quiesce);
383}
384
385/*
386 * mark this quiet-requestor as done
387 */
388static inline void
389end_uvhub_quiesce(struct bau_control *hmaster)
390{
391 atomic_add_short_return(-1, (struct atomic_short *)
392 &hmaster->uvhub_quiesce);
393}
394
395/*
396 * Wait for completion of a broadcast software ack message
397 * return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP
398 */
399static int uv_wait_completion(struct bau_desc *bau_desc,
400 unsigned long mmr_offset, int right_shift, int this_cpu,
401 struct bau_control *bcp, struct bau_control *smaster, long try)
402{
403 unsigned long descriptor_status;
404 cycles_t ttime;
405 struct ptc_stats *stat = bcp->statp;
406 struct bau_control *hmaster;
407
408 hmaster = bcp->uvhub_master;
409
410 /* spin on the status MMR, waiting for it to go idle */
411 while ((descriptor_status = (((unsigned long)
412 uv_read_local_mmr(mmr_offset) >>
413 right_shift) & UV_ACT_STATUS_MASK)) !=
414 DESC_STATUS_IDLE) {
415 /*
416 * Our software ack messages may be blocked because there are
417 * no swack resources available. As long as none of them
418 * has timed out hardware will NACK our message and its
419 * state will stay IDLE.
420 */
421 if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) {
422 stat->s_stimeout++;
423 return FLUSH_GIVEUP;
424 } else if (descriptor_status ==
425 DESC_STATUS_DESTINATION_TIMEOUT) {
426 stat->s_dtimeout++;
427 ttime = get_cycles();
428
429 /*
430 * Our retries may be blocked by all destination
431 * swack resources being consumed, and a timeout
432 * pending. In that case hardware returns the
433 * ERROR that looks like a destination timeout.
434 */
435 if (cycles_2_us(ttime - bcp->send_message) <
436 timeout_us) {
437 bcp->conseccompletes = 0;
438 return FLUSH_RETRY_PLUGGED;
439 }
440
441 bcp->conseccompletes = 0;
442 return FLUSH_RETRY_TIMEOUT;
443 } else {
444 /*
445 * descriptor_status is still BUSY
446 */
447 cpu_relax();
448 }
449 }
450 bcp->conseccompletes++;
451 return FLUSH_COMPLETE;
452}
453
454static inline cycles_t
455sec_2_cycles(unsigned long sec)
456{
457 unsigned long ns;
458 cycles_t cyc;
459
460 ns = sec * 1000000000;
461 cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
462 return cyc;
463}
464
465/*
466 * conditionally add 1 to *v, unless *v is >= u
467 * return 0 if we cannot add 1 to *v because it is >= u
468 * return 1 if we can add 1 to *v because it is < u
469 * the add is atomic
470 *
471 * This is close to atomic_add_unless(), but this allows the 'u' value
472 * to be lowered below the current 'v'. atomic_add_unless can only stop
473 * on equal.
474 */
475static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
476{
477 spin_lock(lock);
478 if (atomic_read(v) >= u) {
479 spin_unlock(lock);
480 return 0;
481 }
482 atomic_inc(v);
483 spin_unlock(lock);
484 return 1;
485}
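/*
 * Typical use, from uv_flush_send_and_wait(): throttle the number of
 * outstanding descriptors on this uvhub, e.g.
 *
 *	while (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
 *			&hmaster->active_descriptor_count,
 *			hmaster->max_bau_concurrent))
 *		cpu_relax();
 */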
486
487/*
488 * Our retries are blocked by all destination swack resources being
489 * in use, and a timeout is pending. In that case hardware immediately
490 * returns the ERROR that looks like a destination timeout.
491 */
492static void
493destination_plugged(struct bau_desc *bau_desc, struct bau_control *bcp,
494 struct bau_control *hmaster, struct ptc_stats *stat)
495{
496 udelay(bcp->plugged_delay);
497 bcp->plugged_tries++;
498 if (bcp->plugged_tries >= bcp->plugsb4reset) {
499 bcp->plugged_tries = 0;
500 quiesce_local_uvhub(hmaster);
501 spin_lock(&hmaster->queue_lock);
502 uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu);
503 spin_unlock(&hmaster->queue_lock);
504 end_uvhub_quiesce(hmaster);
505 bcp->ipi_attempts++;
506 stat->s_resets_plug++;
507 }
508}
509
510static void
511destination_timeout(struct bau_desc *bau_desc, struct bau_control *bcp,
512 struct bau_control *hmaster, struct ptc_stats *stat)
513{
514 hmaster->max_bau_concurrent = 1;
515 bcp->timeout_tries++;
516 if (bcp->timeout_tries >= bcp->timeoutsb4reset) {
517 bcp->timeout_tries = 0;
518 quiesce_local_uvhub(hmaster);
519 spin_lock(&hmaster->queue_lock);
520 uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu);
521 spin_unlock(&hmaster->queue_lock);
522 end_uvhub_quiesce(hmaster);
523 bcp->ipi_attempts++;
524 stat->s_resets_timeout++;
525 }
526}
527
528/*
529 * Completions are taking a very long time due to a congested numalink
530 * network.
531 */
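/*
 * If the average cycles per broadcast over the sampling window exceed
 * congested_cycles, every cpu's baudisabled flag is set and this cpu
 * takes on the job of turning the BAU back on after congested_period
 * seconds (the re-enable check is made in uv_flush_tlb_others()).
 */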
532static void
533disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat)
534{
535 int tcpu;
536 struct bau_control *tbcp;
537
538 /* let only one cpu do this disabling */
539 spin_lock(&disable_lock);
540 if (!baudisabled && bcp->period_requests &&
541 ((bcp->period_time / bcp->period_requests) > congested_cycles)) {
542 /* it becomes this cpu's job to turn on the use of the
543 BAU again */
544 baudisabled = 1;
545 bcp->set_bau_off = 1;
546 bcp->set_bau_on_time = get_cycles() +
547 sec_2_cycles(bcp->congested_period);
548 stat->s_bau_disabled++;
549 for_each_present_cpu(tcpu) {
550 tbcp = &per_cpu(bau_control, tcpu);
551 tbcp->baudisabled = 1;
552 }
553 }
554 spin_unlock(&disable_lock);
555}
556
557/**
558 * uv_flush_send_and_wait
559 *
560 * Send a broadcast and wait for it to complete.
561 *
562 * The flush_mask contains the cpus the broadcast is to be sent to including
563 * cpus that are on the local uvhub.
564 *
565 * Returns 0 if all flushing represented in the mask was done.
566 * Returns 1 if it gives up entirely and the original cpu mask is to be
567 * returned to the kernel.
568 */
569int uv_flush_send_and_wait(struct bau_desc *bau_desc,
570 struct cpumask *flush_mask, struct bau_control *bcp)
571{
572 int right_shift;
573 int completion_status = 0;
574 int seq_number = 0;
575 long try = 0;
576 int cpu = bcp->uvhub_cpu;
577 int this_cpu = bcp->cpu;
578 unsigned long mmr_offset;
579 unsigned long index;
580 cycles_t time1;
581 cycles_t time2;
582 cycles_t elapsed;
583 struct ptc_stats *stat = bcp->statp;
584 struct bau_control *smaster = bcp->socket_master;
585 struct bau_control *hmaster = bcp->uvhub_master;
586
587 if (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
588 &hmaster->active_descriptor_count,
589 hmaster->max_bau_concurrent)) {
590 stat->s_throttles++;
591 do {
592 cpu_relax();
593 } while (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
594 &hmaster->active_descriptor_count,
595 hmaster->max_bau_concurrent));
596 }
597 while (hmaster->uvhub_quiesce)
598 cpu_relax();
599
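	/*
	 * Each hub cpu's descriptor status is a UV_ACT_STATUS_SIZE-bit
	 * field in one of the two ACTIVATION_STATUS MMRs;
	 * UV_CPUS_PER_ACT_STATUS fields fit per MMR, so select the MMR
	 * and the bit offset (right_shift) for this cpu.
	 */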
600 if (cpu < UV_CPUS_PER_ACT_STATUS) {
601 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
602 right_shift = cpu * UV_ACT_STATUS_SIZE;
603 } else {
604 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
605 right_shift =
606 ((cpu - UV_CPUS_PER_ACT_STATUS) * UV_ACT_STATUS_SIZE);
607 }
608 time1 = get_cycles();
609 do {
610 if (try == 0) {
611 bau_desc->header.msg_type = MSG_REGULAR;
612 seq_number = bcp->message_number++;
613 } else {
614 bau_desc->header.msg_type = MSG_RETRY;
615 stat->s_retry_messages++;
616 }
617 bau_desc->header.sequence = seq_number;
618 index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) |
619 bcp->uvhub_cpu;
620 bcp->send_message = get_cycles();
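		/*
		 * Writing the push bit plus this cpu's descriptor index
		 * to ACTIVATION_CONTROL is what launches the broadcast
		 * of the descriptor prepared above.
		 */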
621 uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
622 try++;
623 completion_status = uv_wait_completion(bau_desc, mmr_offset,
624 right_shift, this_cpu, bcp, smaster, try);
625
626 if (completion_status == FLUSH_RETRY_PLUGGED) {
627 destination_plugged(bau_desc, bcp, hmaster, stat);
628 } else if (completion_status == FLUSH_RETRY_TIMEOUT) {
629 destination_timeout(bau_desc, bcp, hmaster, stat);
630 }
631 if (bcp->ipi_attempts >= bcp->ipi_reset_limit) {
632 bcp->ipi_attempts = 0;
633 completion_status = FLUSH_GIVEUP;
634 break;
635 }
636 cpu_relax();
637 } while ((completion_status == FLUSH_RETRY_PLUGGED) ||
638 (completion_status == FLUSH_RETRY_TIMEOUT));
639 time2 = get_cycles();
640 bcp->plugged_tries = 0;
641 bcp->timeout_tries = 0;
642 if ((completion_status == FLUSH_COMPLETE) &&
643 (bcp->conseccompletes > bcp->complete_threshold) &&
644 (hmaster->max_bau_concurrent <
645 hmaster->max_bau_concurrent_constant))
646 hmaster->max_bau_concurrent++;
647 while (hmaster->uvhub_quiesce)
648 cpu_relax();
649 atomic_dec(&hmaster->active_descriptor_count);
650 if (time2 > time1) {
651 elapsed = time2 - time1;
652 stat->s_time += elapsed;
653 if ((completion_status == FLUSH_COMPLETE) && (try == 1)) {
654 bcp->period_requests++;
655 bcp->period_time += elapsed;
656 if ((elapsed > congested_cycles) &&
657 (bcp->period_requests > bcp->congested_reps)) {
658 disable_for_congestion(bcp, stat);
659 }
660 }
661 } else
662 stat->s_requestor--;
663 if (completion_status == FLUSH_COMPLETE && try > 1)
664 stat->s_retriesok++;
665 else if (completion_status == FLUSH_GIVEUP) {
666 stat->s_giveup++;
667 return 1;
668 }
669 return 0;
670}
671
672/**
673 * uv_flush_tlb_others - globally purge translation cache of a virtual
674 * address or all TLB's
675 * @cpumask: mask of all cpu's in which the address is to be removed
676 * @mm: mm_struct containing virtual address range
677 * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu)
678 * @cpu: the current cpu
679 *
680 * This is the entry point for initiating any UV global TLB shootdown.
681 *
682 * Purges the translation caches of all specified processors of the given
683 * virtual address, or purges all TLB's on specified processors.
684 *
685 * The caller has derived the cpumask from the mm_struct. This function
686 * is called only if there are bits set in the mask. (e.g. flush_tlb_page())
687 *
688 * The cpumask is converted into a uvhubmask of the uvhubs containing
689 * those cpus.
690 *
691 * Note that this function should be called with preemption disabled.
692 *
693 * Returns NULL if all remote flushing was done.
694 * Returns pointer to cpumask if some remote flushing remains to be
695 * done. The returned pointer is valid till preemption is re-enabled.
696 */
697const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
698 struct mm_struct *mm,
699 unsigned long va, unsigned int cpu)
700{
701 int tcpu;
702 int uvhub;
703 int locals = 0;
704 int remotes = 0;
705 int hubs = 0;
706 struct bau_desc *bau_desc;
707 struct cpumask *flush_mask;
708 struct ptc_stats *stat;
709 struct bau_control *bcp;
710 struct bau_control *tbcp;
711
712 /* kernel was booted 'nobau' */
713 if (nobau)
714 return cpumask;
715
716 bcp = &per_cpu(bau_control, cpu);
717 stat = bcp->statp;
718
719 /* bau was disabled due to slow response */
720 if (bcp->baudisabled) {
721 /* the cpu that disabled it must re-enable it */
722 if (bcp->set_bau_off) {
723 if (get_cycles() >= bcp->set_bau_on_time) {
724 stat->s_bau_reenabled++;
725 baudisabled = 0;
726 for_each_present_cpu(tcpu) {
727 tbcp = &per_cpu(bau_control, tcpu);
728 tbcp->baudisabled = 0;
729 tbcp->period_requests = 0;
730 tbcp->period_time = 0;
731 }
732 }
733 }
734 return cpumask;
735 }
736
737 /*
738 * Each sending cpu has a per-cpu mask which it fills from the caller's
739 * cpu mask. All cpus are converted to uvhubs and copied to the
740 * activation descriptor.
741 */
742 flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu);
743 /* don't actually do a shootdown of the local cpu */
744 cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
745 if (cpu_isset(cpu, *cpumask))
746 stat->s_ntargself++;
747
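	/*
	 * Each cpu owns a block of UV_ITEMS_PER_DESCRIPTOR descriptors
	 * in the hub's descriptor page, and only the first one of the
	 * block is ever used (see uv_activation_descriptor_init()).
	 */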
748 bau_desc = bcp->descriptor_base;
749 bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu;
750 bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
751
752 /* cpu statistics */
753 for_each_cpu(tcpu, flush_mask) {
754 uvhub = uv_cpu_to_blade_id(tcpu);
755 bau_uvhub_set(uvhub, &bau_desc->distribution);
756 if (uvhub == bcp->uvhub)
757 locals++;
758 else
759 remotes++;
760 }
761 if ((locals + remotes) == 0)
762 return NULL;
763 stat->s_requestor++;
764 stat->s_ntargcpu += remotes + locals;
765 stat->s_ntargremotes += remotes;
766 stat->s_ntarglocals += locals;
767 remotes = bau_uvhub_weight(&bau_desc->distribution);
768
769 /* uvhub statistics */
770 hubs = bau_uvhub_weight(&bau_desc->distribution);
771 if (locals) {
772 stat->s_ntarglocaluvhub++;
773 stat->s_ntargremoteuvhub += (hubs - 1);
774 } else
775 stat->s_ntargremoteuvhub += hubs;
776 stat->s_ntarguvhub += hubs;
777 if (hubs >= 16)
778 stat->s_ntarguvhub16++;
779 else if (hubs >= 8)
780 stat->s_ntarguvhub8++;
781 else if (hubs >= 4)
782 stat->s_ntarguvhub4++;
783 else if (hubs >= 2)
784 stat->s_ntarguvhub2++;
785 else
786 stat->s_ntarguvhub1++;
787
788 bau_desc->payload.address = va;
789 bau_desc->payload.sending_cpu = cpu;
790
791 /*
792 * uv_flush_send_and_wait returns 0 if all cpu's were messaged,
793 * or 1 if it gave up and the original cpumask should be returned.
794 */
795 if (!uv_flush_send_and_wait(bau_desc, flush_mask, bcp))
796 return NULL;
797 else
798 return cpumask;
799}
800
801/*
802 * The BAU message interrupt comes here. (registered by set_intr_gate)
803 * See entry_64.S
804 *
805 * We received a broadcast assist message.
806 *
807 * Interrupts are disabled; this interrupt could represent
808 * the receipt of several messages.
809 *
810 * All cores/threads on this hub get this interrupt.
811 * The last one to see it does the software ack.
812 * (the resource will not be freed until noninterruptible cpus see this
813 * interrupt; hardware may timeout the s/w ack and reply ERROR)
814 */
815void uv_bau_message_interrupt(struct pt_regs *regs)
816{
817 int count = 0;
818 cycles_t time_start;
819 struct bau_payload_queue_entry *msg;
820 struct bau_control *bcp;
821 struct ptc_stats *stat;
822 struct msg_desc msgdesc;
823
824 time_start = get_cycles();
825 bcp = &per_cpu(bau_control, smp_processor_id());
826 stat = bcp->statp;
827 msgdesc.va_queue_first = bcp->va_queue_first;
828 msgdesc.va_queue_last = bcp->va_queue_last;
829 msg = bcp->bau_msg_head;
830 while (msg->sw_ack_vector) {
831 count++;
832 msgdesc.msg_slot = msg - msgdesc.va_queue_first;
833 msgdesc.sw_ack_slot = ffs(msg->sw_ack_vector) - 1;
834 msgdesc.msg = msg;
835 uv_bau_process_message(&msgdesc, bcp);
836 msg++;
837 if (msg > msgdesc.va_queue_last)
838 msg = msgdesc.va_queue_first;
839 bcp->bau_msg_head = msg;
840 }
841 stat->d_time += (get_cycles() - time_start);
842 if (!count)
843 stat->d_nomsg++;
844 else if (count > 1)
845 stat->d_multmsg++;
846 ack_APIC_irq();
847}
848
849/*
850 * uv_enable_timeouts
851 *
852 * Each target uvhub (i.e. a uvhub that has cpu's) needs to have
853 * shootdown message timeouts enabled. The timeout does not cause
854 * an interrupt, but causes an error message to be returned to
855 * the sender.
856 */
857static void uv_enable_timeouts(void)
858{
859 int uvhub;
860 int nuvhubs;
861 int pnode;
862 unsigned long mmr_image;
863
864 nuvhubs = uv_num_possible_blades();
865
866 for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
867 if (!uv_blade_nr_possible_cpus(uvhub))
868 continue;
869
870 pnode = uv_blade_to_pnode(uvhub);
871 mmr_image =
872 uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL);
873 /*
874 * Set the timeout period and then lock it in; this takes three
875 * MMR writes, the last of which captures and locks in the period.
876 *
877 * To program the period, the SOFT_ACK_MODE must be off.
878 */
879 mmr_image &= ~((unsigned long)1 <<
880 UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT);
881 uv_write_global_mmr64
882 (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
883 /*
884 * Set the 4-bit period.
885 */
886 mmr_image &= ~((unsigned long)0xf <<
887 UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT);
888 mmr_image |= (UV_INTD_SOFT_ACK_TIMEOUT_PERIOD <<
889 UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT);
890 uv_write_global_mmr64
891 (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
892 /*
893 * Subsequent reversals of the timebase bit (3) cause an
894 * immediate timeout of one or all INTD resources as
895 * indicated in bits 2:0 (7 causes all of them to timeout).
896 */
897 mmr_image |= ((unsigned long)1 <<
898 UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT);
899 uv_write_global_mmr64
900 (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
901 }
902}
903
904static void *uv_ptc_seq_start(struct seq_file *file, loff_t *offset)
905{
906 if (*offset < num_possible_cpus())
907 return offset;
908 return NULL;
909}
910
911static void *uv_ptc_seq_next(struct seq_file *file, void *data, loff_t *offset)
912{
913 (*offset)++;
914 if (*offset < num_possible_cpus())
915 return offset;
916 return NULL;
917}
918
919static void uv_ptc_seq_stop(struct seq_file *file, void *data)
920{
921}
922
923static inline unsigned long long
924microsec_2_cycles(unsigned long microsec)
925{
926 unsigned long ns;
927 unsigned long long cyc;
928
929 ns = microsec * 1000;
930 cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
931 return cyc;
932}
933
934/*
935 * Display the statistics thru /proc.
936 * 'data' points to the cpu number
937 */
938static int uv_ptc_seq_show(struct seq_file *file, void *data)
939{
940 struct ptc_stats *stat;
941 int cpu;
942
943 cpu = *(loff_t *)data;
944
945 if (!cpu) {
946 seq_printf(file,
947 "# cpu sent stime self locals remotes ncpus localhub ");
948 seq_printf(file,
949 "remotehub numuvhubs numuvhubs16 numuvhubs8 ");
950 seq_printf(file,
951 "numuvhubs4 numuvhubs2 numuvhubs1 dto ");
952 seq_printf(file,
953 "retries rok resetp resett giveup sto bz throt ");
954 seq_printf(file,
955 "sw_ack recv rtime all ");
956 seq_printf(file,
957 "one mult none retry canc nocan reset rcan ");
958 seq_printf(file,
959 "disable enable\n");
960 }
961 if (cpu < num_possible_cpus() && cpu_online(cpu)) {
962 stat = &per_cpu(ptcstats, cpu);
963 /* source side statistics */
964 seq_printf(file,
965 "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
966 cpu, stat->s_requestor, cycles_2_us(stat->s_time),
967 stat->s_ntargself, stat->s_ntarglocals,
968 stat->s_ntargremotes, stat->s_ntargcpu,
969 stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub,
970 stat->s_ntarguvhub, stat->s_ntarguvhub16);
971 seq_printf(file, "%ld %ld %ld %ld %ld ",
972 stat->s_ntarguvhub8, stat->s_ntarguvhub4,
973 stat->s_ntarguvhub2, stat->s_ntarguvhub1,
974 stat->s_dtimeout);
975 seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ",
976 stat->s_retry_messages, stat->s_retriesok,
977 stat->s_resets_plug, stat->s_resets_timeout,
978 stat->s_giveup, stat->s_stimeout,
979 stat->s_busy, stat->s_throttles);
980
981 /* destination side statistics */
982 seq_printf(file,
983 "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
984 uv_read_global_mmr64(uv_cpu_to_pnode(cpu),
985 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE),
986 stat->d_requestee, cycles_2_us(stat->d_time),
987 stat->d_alltlb, stat->d_onetlb, stat->d_multmsg,
988 stat->d_nomsg, stat->d_retries, stat->d_canceled,
989 stat->d_nocanceled, stat->d_resets,
990 stat->d_rcanceled);
991 seq_printf(file, "%ld %ld\n",
992 stat->s_bau_disabled, stat->s_bau_reenabled);
993 }
994
995 return 0;
996}
997
998/*
999 * Display the tunables thru debugfs
1000 */
1001static ssize_t tunables_read(struct file *file, char __user *userbuf,
1002 size_t count, loff_t *ppos)
1003{
1004 char buf[300];
1005 int ret;
1006
1007 ret = snprintf(buf, 300, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n",
1008 "max_bau_concurrent plugged_delay plugsb4reset",
1009 "timeoutsb4reset ipi_reset_limit complete_threshold",
1010 "congested_response_us congested_reps congested_period",
1011 max_bau_concurrent, plugged_delay, plugsb4reset,
1012 timeoutsb4reset, ipi_reset_limit, complete_threshold,
1013 congested_response_us, congested_reps, congested_period);
1014
1015 return simple_read_from_buffer(userbuf, count, ppos, buf, ret);
1016}
1017
1018/*
1019 * -1: reset the statistics
1020 * 0: display meaning of the statistics
1021 */
1022static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
1023 size_t count, loff_t *data)
1024{
1025 int cpu;
1026 long input_arg;
1027 char optstr[64];
1028 struct ptc_stats *stat;
1029
1030 if (count == 0 || count > sizeof(optstr))
1031 return -EINVAL;
1032 if (copy_from_user(optstr, user, count))
1033 return -EFAULT;
1034 optstr[count - 1] = '\0';
1035 if (strict_strtol(optstr, 10, &input_arg) < 0) {
1036 printk(KERN_DEBUG "%s is invalid\n", optstr);
1037 return -EINVAL;
1038 }
1039
1040 if (input_arg == 0) {
1041 printk(KERN_DEBUG "# cpu: cpu number\n");
1042 printk(KERN_DEBUG "Sender statistics:\n");
1043 printk(KERN_DEBUG
1044 "sent: number of shootdown messages sent\n");
1045 printk(KERN_DEBUG
1046 "stime: time spent sending messages\n");
1047 printk(KERN_DEBUG
1048 "numuvhubs: number of hubs targeted with shootdown\n");
1049 printk(KERN_DEBUG
1050 "numuvhubs16: number times 16 or more hubs targeted\n");
1051 printk(KERN_DEBUG
1052 "numuvhubs8: number times 8 or more hubs targeted\n");
1053 printk(KERN_DEBUG
1054 "numuvhubs4: number times 4 or more hubs targeted\n");
1055 printk(KERN_DEBUG
1056 "numuvhubs2: number times 2 or more hubs targeted\n");
1057 printk(KERN_DEBUG
1058 "numuvhubs1: number times 1 hub targeted\n");
1059 printk(KERN_DEBUG
1060 "numcpus: number of cpus targeted with shootdown\n");
1061 printk(KERN_DEBUG
1062 "dto: number of destination timeouts\n");
1063 printk(KERN_DEBUG
1064 "retries: destination timeout retries sent\n");
1065 printk(KERN_DEBUG
1066 "rok: destination timeouts successfully retried\n");
1067 printk(KERN_DEBUG
1068 "resetp: ipi-style resource resets for plugs\n");
1069 printk(KERN_DEBUG
1070 "resett: ipi-style resource resets for timeouts\n");
1071 printk(KERN_DEBUG
1072 "giveup: fall-backs to ipi-style shootdowns\n");
1073 printk(KERN_DEBUG
1074 "sto: number of source timeouts\n");
1075 printk(KERN_DEBUG
1076 "bz: number of stay-busy's\n");
1077 printk(KERN_DEBUG
1078 "throt: number times spun in throttle\n");
1079 printk(KERN_DEBUG "Destination side statistics:\n");
1080 printk(KERN_DEBUG
1081 "sw_ack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n");
1082 printk(KERN_DEBUG
1083 "recv: shootdown messages received\n");
1084 printk(KERN_DEBUG
1085 "rtime: time spent processing messages\n");
1086 printk(KERN_DEBUG
1087 "all: shootdown all-tlb messages\n");
1088 printk(KERN_DEBUG
1089 "one: shootdown one-tlb messages\n");
1090 printk(KERN_DEBUG
1091 "mult: interrupts that found multiple messages\n");
1092 printk(KERN_DEBUG
1093 "none: interrupts that found no messages\n");
1094 printk(KERN_DEBUG
1095 "retry: number of retry messages processed\n");
1096 printk(KERN_DEBUG
1097 "canc: number messages canceled by retries\n");
1098 printk(KERN_DEBUG
1099 "nocan: number retries that found nothing to cancel\n");
1100 printk(KERN_DEBUG
1101 "reset: number of ipi-style reset requests processed\n");
1102 printk(KERN_DEBUG
1103 "rcan: number messages canceled by reset requests\n");
1104 printk(KERN_DEBUG
1105 "disable: number times use of the BAU was disabled\n");
1106 printk(KERN_DEBUG
1107 "enable: number times use of the BAU was re-enabled\n");
1108 } else if (input_arg == -1) {
1109 for_each_present_cpu(cpu) {
1110 stat = &per_cpu(ptcstats, cpu);
1111 memset(stat, 0, sizeof(struct ptc_stats));
1112 }
1113 }
1114
1115 return count;
1116}
1117
1118static int local_atoi(const char *name)
1119{
1120 int val = 0;
1121
1122 for (;; name++) {
1123 switch (*name) {
1124 case '0' ... '9':
1125 val = 10*val+(*name-'0');
1126 break;
1127 default:
1128 return val;
1129 }
1130 }
1131}
1132
1133/*
1134 * set the tunables
1135 * 0 values reset them to defaults
1136 */
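/*
 * The nine whitespace-separated values are expected in the same order
 * that tunables_read() reports them:
 *	max_bau_concurrent plugged_delay plugsb4reset timeoutsb4reset
 *	ipi_reset_limit complete_threshold congested_response_us
 *	congested_reps congested_period
 * For example, writing "0 0 0 0 0 0 0 0 0" to the debugfs file
 * (UV_BAU_TUNABLES_DIR/UV_BAU_TUNABLES_FILE) restores every tunable
 * to its compile-time default.
 */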
1137static ssize_t tunables_write(struct file *file, const char __user *user,
1138 size_t count, loff_t *data)
1139{
1140 int cpu;
1141 int cnt = 0;
1142 int val;
1143 char *p;
1144 char *q;
1145 char instr[64];
1146 struct bau_control *bcp;
1147
1148 if (count == 0 || count > sizeof(instr)-1)
1149 return -EINVAL;
1150 if (copy_from_user(instr, user, count))
1151 return -EFAULT;
1152
1153 instr[count] = '\0';
1154 /* count the fields */
1155 p = instr + strspn(instr, WHITESPACE);
1156 q = p;
1157 for (; *p; p = q + strspn(q, WHITESPACE)) {
1158 q = p + strcspn(p, WHITESPACE);
1159 cnt++;
1160 if (q == p)
1161 break;
1162 }
1163 if (cnt != 9) {
1164 printk(KERN_INFO "bau tunable error: should be 9 numbers\n");
1165 return -EINVAL;
1166 }
1167
1168 p = instr + strspn(instr, WHITESPACE);
1169 q = p;
1170 for (cnt = 0; *p; p = q + strspn(q, WHITESPACE), cnt++) {
1171 q = p + strcspn(p, WHITESPACE);
1172 val = local_atoi(p);
1173 switch (cnt) {
1174 case 0:
1175 if (val == 0) {
1176 max_bau_concurrent = MAX_BAU_CONCURRENT;
1177 max_bau_concurrent_constant =
1178 MAX_BAU_CONCURRENT;
1179 continue;
1180 }
1181 bcp = &per_cpu(bau_control, smp_processor_id());
1182 if (val < 1 || val > bcp->cpus_in_uvhub) {
1183 printk(KERN_DEBUG
1184 "Error: BAU max concurrent %d is invalid\n",
1185 val);
1186 return -EINVAL;
1187 }
1188 max_bau_concurrent = val;
1189 max_bau_concurrent_constant = val;
1190 continue;
1191 case 1:
1192 if (val == 0)
1193 plugged_delay = PLUGGED_DELAY;
1194 else
1195 plugged_delay = val;
1196 continue;
1197 case 2:
1198 if (val == 0)
1199 plugsb4reset = PLUGSB4RESET;
1200 else
1201 plugsb4reset = val;
1202 continue;
1203 case 3:
1204 if (val == 0)
1205 timeoutsb4reset = TIMEOUTSB4RESET;
1206 else
1207 timeoutsb4reset = val;
1208 continue;
1209 case 4:
1210 if (val == 0)
1211 ipi_reset_limit = IPI_RESET_LIMIT;
1212 else
1213 ipi_reset_limit = val;
1214 continue;
1215 case 5:
1216 if (val == 0)
1217 complete_threshold = COMPLETE_THRESHOLD;
1218 else
1219 complete_threshold = val;
1220 continue;
1221 case 6:
1222 if (val == 0)
1223 congested_response_us = CONGESTED_RESPONSE_US;
1224 else
1225 congested_response_us = val;
1226 continue;
1227 case 7:
1228 if (val == 0)
1229 congested_reps = CONGESTED_REPS;
1230 else
1231 congested_reps = val;
1232 continue;
1233 case 8:
1234 if (val == 0)
1235 congested_period = CONGESTED_PERIOD;
1236 else
1237 congested_period = val;
1238 continue;
1239 }
1240 if (q == p)
1241 break;
1242 }
1243 for_each_present_cpu(cpu) {
1244 bcp = &per_cpu(bau_control, cpu);
1245 bcp->max_bau_concurrent = max_bau_concurrent;
1246 bcp->max_bau_concurrent_constant = max_bau_concurrent;
1247 bcp->plugged_delay = plugged_delay;
1248 bcp->plugsb4reset = plugsb4reset;
1249 bcp->timeoutsb4reset = timeoutsb4reset;
1250 bcp->ipi_reset_limit = ipi_reset_limit;
1251 bcp->complete_threshold = complete_threshold;
1252 bcp->congested_response_us = congested_response_us;
1253 bcp->congested_reps = congested_reps;
1254 bcp->congested_period = congested_period;
1255 }
1256 return count;
1257}
1258
1259static const struct seq_operations uv_ptc_seq_ops = {
1260 .start = uv_ptc_seq_start,
1261 .next = uv_ptc_seq_next,
1262 .stop = uv_ptc_seq_stop,
1263 .show = uv_ptc_seq_show
1264};
1265
1266static int uv_ptc_proc_open(struct inode *inode, struct file *file)
1267{
1268 return seq_open(file, &uv_ptc_seq_ops);
1269}
1270
1271static int tunables_open(struct inode *inode, struct file *file)
1272{
1273 return 0;
1274}
1275
1276static const struct file_operations proc_uv_ptc_operations = {
1277 .open = uv_ptc_proc_open,
1278 .read = seq_read,
1279 .write = uv_ptc_proc_write,
1280 .llseek = seq_lseek,
1281 .release = seq_release,
1282};
1283
1284static const struct file_operations tunables_fops = {
1285 .open = tunables_open,
1286 .read = tunables_read,
1287 .write = tunables_write,
1288};
1289
1290static int __init uv_ptc_init(void)
1291{
1292 struct proc_dir_entry *proc_uv_ptc;
1293
1294 if (!is_uv_system())
1295 return 0;
1296
1297 proc_uv_ptc = proc_create(UV_PTC_BASENAME, 0444, NULL,
1298 &proc_uv_ptc_operations);
1299 if (!proc_uv_ptc) {
1300 printk(KERN_ERR "unable to create %s proc entry\n",
1301 UV_PTC_BASENAME);
1302 return -EINVAL;
1303 }
1304
1305 tunables_dir = debugfs_create_dir(UV_BAU_TUNABLES_DIR, NULL);
1306 if (!tunables_dir) {
1307 printk(KERN_ERR "unable to create debugfs directory %s\n",
1308 UV_BAU_TUNABLES_DIR);
1309 return -EINVAL;
1310 }
1311 tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600,
1312 tunables_dir, NULL, &tunables_fops);
1313 if (!tunables_file) {
1314 printk(KERN_ERR "unable to create debugfs file %s\n",
1315 UV_BAU_TUNABLES_FILE);
1316 return -EINVAL;
1317 }
1318 return 0;
1319}
1320
1321/*
1322 * initialize the sending side's sending buffers
1323 */
1324static void
1325uv_activation_descriptor_init(int node, int pnode)
1326{
1327 int i;
1328 int cpu;
1329 unsigned long pa;
1330 unsigned long m;
1331 unsigned long n;
1332 struct bau_desc *bau_desc;
1333 struct bau_desc *bd2;
1334 struct bau_control *bcp;
1335
1336 /*
1337 * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR)
1338 * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per uvhub
1339 */
1340 bau_desc = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)*
1341 UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
1342 BUG_ON(!bau_desc);
1343
1344 pa = uv_gpa(bau_desc); /* need the real nasid*/
1345 n = pa >> uv_nshift;
1346 m = pa & uv_mmask;
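	/*
	 * Split the global address into node number (n, the bits above
	 * uv_nshift) and node offset (m); the DESCRIPTOR_BASE MMR below
	 * is programmed with (n << UV_DESC_BASE_PNODE_SHIFT) | m.
	 */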
1347
1348 uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE,
1349 (n << UV_DESC_BASE_PNODE_SHIFT | m));
1350
1351 /*
1352 * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
1353 * cpu even though we only use the first one; one descriptor can
1354 * describe a broadcast to 256 uv hubs.
1355 */
1356 for (i = 0, bd2 = bau_desc; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR);
1357 i++, bd2++) {
1358 memset(bd2, 0, sizeof(struct bau_desc));
1359 bd2->header.sw_ack_flag = 1;
1360 /*
1361 * base_dest_nodeid is the nasid (pnode<<1) of the first uvhub
1362 * in the partition. The bit map will indicate uvhub numbers,
1363 * which are 0-N in a partition. Pnodes are unique system-wide.
1364 */
1365 bd2->header.base_dest_nodeid = uv_partition_base_pnode << 1;
1366 bd2->header.dest_subnodeid = 0x10; /* the LB */
1367 bd2->header.command = UV_NET_ENDPOINT_INTD;
1368 bd2->header.int_both = 1;
1369 /*
1370 * all others need to be set to zero:
1371 * fairness chaining multilevel count replied_to
1372 */
1373 }
1374 for_each_present_cpu(cpu) {
1375 if (pnode != uv_blade_to_pnode(uv_cpu_to_blade_id(cpu)))
1376 continue;
1377 bcp = &per_cpu(bau_control, cpu);
1378 bcp->descriptor_base = bau_desc;
1379 }
1380}
1381
1382/*
1383 * initialize the destination side's receiving buffers
1384 * entered for each uvhub in the partition
1385 * - node is first node (kernel memory notion) on the uvhub
1386 * - pnode is the uvhub's physical identifier
1387 */
1388static void
1389uv_payload_queue_init(int node, int pnode)
1390{
1391 int pn;
1392 int cpu;
1393 char *cp;
1394 unsigned long pa;
1395 struct bau_payload_queue_entry *pqp;
1396 struct bau_payload_queue_entry *pqp_malloc;
1397 struct bau_control *bcp;
1398
1399 pqp = (struct bau_payload_queue_entry *) kmalloc_node(
1400 (DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry),
1401 GFP_KERNEL, node);
1402 BUG_ON(!pqp);
1403 pqp_malloc = pqp;
1404
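	/*
	 * Round the queue start up to a 32-byte boundary (add 31, then
	 * clear the low five address bits); the extra entry allocated
	 * above leaves room for the adjustment.
	 */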
1405 cp = (char *)pqp + 31;
1406 pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5);
1407
1408 for_each_present_cpu(cpu) {
1409 if (pnode != uv_cpu_to_pnode(cpu))
1410 continue;
1411 /* for every cpu on this pnode: */
1412 bcp = &per_cpu(bau_control, cpu);
1413 bcp->va_queue_first = pqp;
1414 bcp->bau_msg_head = pqp;
1415 bcp->va_queue_last = pqp + (DEST_Q_SIZE - 1);
1416 }
1417 /*
1418 * need the pnode of where the memory was really allocated
1419 */
1420 pa = uv_gpa(pqp);
1421 pn = pa >> uv_nshift;
1422 uv_write_global_mmr64(pnode,
1423 UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST,
1424 ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) |
1425 uv_physnodeaddr(pqp));
1426 uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL,
1427 uv_physnodeaddr(pqp));
1428 uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST,
1429 (unsigned long)
1430 uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1)));
1431 /* in effect, all msg_type's are set to MSG_NOOP */
1432 memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE);
1433}
1434
1435/*
1436 * Initialization of each UV hub's structures
1437 */
1438static void __init uv_init_uvhub(int uvhub, int vector)
1439{
1440 int node;
1441 int pnode;
1442 unsigned long apicid;
1443
1444 node = uvhub_to_first_node(uvhub);
1445 pnode = uv_blade_to_pnode(uvhub);
1446 uv_activation_descriptor_init(node, pnode);
1447 uv_payload_queue_init(node, pnode);
1448 /*
1449 * the below initialization can't be in firmware because the
1450 * messaging IRQ will be determined by the OS
1451 */
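	/*
	 * Aim this hub's BAU interrupt at its first cpu: the
	 * BAU_DATA_CONFIG MMR takes the destination apicid in its upper
	 * 32 bits and the interrupt vector in its low bits.
	 */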
1452 apicid = uvhub_to_first_apicid(uvhub);
1453 uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
1454 ((apicid << 32) | vector));
1455}
1456
1457/*
1458 * We will set BAU_MISC_CONTROL with a timeout period.
1459 * But the BIOS has set UVH_AGING_PRESCALE_SEL and UVH_TRANSACTION_TIMEOUT.
1460 * So the destination timeout period has to be calculated from them.
1461 */
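/*
 * Rough sketch of the calculation: the soft-ack period field (mult1)
 * is scaled by the BIOS-selected base period (timeout_base_ns[],
 * indexed from UVH_AGING_PRESCALE_SEL) and by the multiplier taken
 * from UVH_TRANSACTION_TIMEOUT (mult2), giving nanoseconds; the
 * result is returned in microseconds.
 */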
1462static int
1463calculate_destination_timeout(void)
1464{
1465 unsigned long mmr_image;
1466 int mult1;
1467 int mult2;
1468 int index;
1469 int base;
1470 int ret;
1471 unsigned long ts_ns;
1472
1473 mult1 = UV_INTD_SOFT_ACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK;
1474 mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL);
1475 index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK;
1476 mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT);
1477 mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK;
1478 base = timeout_base_ns[index];
1479 ts_ns = base * mult1 * mult2;
1480 ret = ts_ns / 1000;
1481 return ret;
1482}
1483
1484/*
1485 * initialize the bau_control structure for each cpu
1486 */
1487static void __init uv_init_per_cpu(int nuvhubs)
1488{
1489 int i;
1490 int cpu;
1491 int pnode;
1492 int uvhub;
1493 int have_hmaster;
1494 short socket = 0;
1495 unsigned short socket_mask;
1496 unsigned char *uvhub_mask;
1497 struct bau_control *bcp;
1498 struct uvhub_desc *bdp;
1499 struct socket_desc *sdp;
1500 struct bau_control *hmaster = NULL;
1501 struct bau_control *smaster = NULL;
1502 struct socket_desc {
1503 short num_cpus;
1504 short cpu_number[16];
1505 };
1506 struct uvhub_desc {
1507 unsigned short socket_mask;
1508 short num_cpus;
1509 short uvhub;
1510 short pnode;
1511 struct socket_desc socket[2];
1512 };
1513 struct uvhub_desc *uvhub_descs;
1514
1515 timeout_us = calculate_destination_timeout();
1516
1517 uvhub_descs = (struct uvhub_desc *)
1518 kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
1519 memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
1520 uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL);
1521 for_each_present_cpu(cpu) {
1522 bcp = &per_cpu(bau_control, cpu);
1523 memset(bcp, 0, sizeof(struct bau_control));
1524 pnode = uv_cpu_hub_info(cpu)->pnode;
1525 uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
1526 *(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8));
1527 bdp = &uvhub_descs[uvhub];
1528 bdp->num_cpus++;
1529 bdp->uvhub = uvhub;
1530 bdp->pnode = pnode;
1531 /* kludge: 'assuming' one node per socket, and assuming that
1532 disabling a socket just leaves a gap in node numbers */
1533 socket = (cpu_to_node(cpu) & 1);
1534 bdp->socket_mask |= (1 << socket);
1535 sdp = &bdp->socket[socket];
1536 sdp->cpu_number[sdp->num_cpus] = cpu;
1537 sdp->num_cpus++;
1538 }
1539 for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
1540 if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8))))
1541 continue;
1542 have_hmaster = 0;
1543 bdp = &uvhub_descs[uvhub];
1544 socket_mask = bdp->socket_mask;
1545 socket = 0;
1546 while (socket_mask) {
1547 if (!(socket_mask & 1))
1548 goto nextsocket;
1549 sdp = &bdp->socket[socket];
1550 for (i = 0; i < sdp->num_cpus; i++) {
1551 cpu = sdp->cpu_number[i];
1552 bcp = &per_cpu(bau_control, cpu);
1553 bcp->cpu = cpu;
1554 if (i == 0) {
1555 smaster = bcp;
1556 if (!have_hmaster) {
1557 have_hmaster++;
1558 hmaster = bcp;
1559 }
1560 }
1561 bcp->cpus_in_uvhub = bdp->num_cpus;
1562 bcp->cpus_in_socket = sdp->num_cpus;
1563 bcp->socket_master = smaster;
1564 bcp->uvhub = bdp->uvhub;
1565 bcp->uvhub_master = hmaster;
1566 bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->
1567 blade_processor_id;
1568 }
1569nextsocket:
1570 socket++;
1571 socket_mask = (socket_mask >> 1);
1572 }
1573 }
1574 kfree(uvhub_descs);
1575 kfree(uvhub_mask);
1576 for_each_present_cpu(cpu) {
1577 bcp = &per_cpu(bau_control, cpu);
1578 bcp->baudisabled = 0;
1579 bcp->statp = &per_cpu(ptcstats, cpu);
1580 /* time interval to catch a hardware stay-busy bug */
1581 bcp->timeout_interval = microsec_2_cycles(2*timeout_us);
1582 bcp->max_bau_concurrent = max_bau_concurrent;
1583 bcp->max_bau_concurrent_constant = max_bau_concurrent;
1584 bcp->plugged_delay = plugged_delay;
1585 bcp->plugsb4reset = plugsb4reset;
1586 bcp->timeoutsb4reset = timeoutsb4reset;
1587 bcp->ipi_reset_limit = ipi_reset_limit;
1588 bcp->complete_threshold = complete_threshold;
1589 bcp->congested_response_us = congested_response_us;
1590 bcp->congested_reps = congested_reps;
1591 bcp->congested_period = congested_period;
1592 }
1593}
1594
1595/*
1596 * Initialization of BAU-related structures
1597 */
1598static int __init uv_bau_init(void)
1599{
1600 int uvhub;
1601 int pnode;
1602 int nuvhubs;
1603 int cur_cpu;
1604 int vector;
1605 unsigned long mmr;
1606
1607 if (!is_uv_system())
1608 return 0;
1609
1610 if (nobau)
1611 return 0;
1612
1613 for_each_possible_cpu(cur_cpu)
1614 zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
1615 GFP_KERNEL, cpu_to_node(cur_cpu));
1616
1617 uv_nshift = uv_hub_info->m_val;
1618 uv_mmask = (1UL << uv_hub_info->m_val) - 1;
1619 nuvhubs = uv_num_possible_blades();
1620 spin_lock_init(&disable_lock);
1621 congested_cycles = microsec_2_cycles(congested_response_us);
1622
1623 uv_init_per_cpu(nuvhubs);
1624
1625 uv_partition_base_pnode = 0x7fffffff;
1626 for (uvhub = 0; uvhub < nuvhubs; uvhub++)
1627 if (uv_blade_nr_possible_cpus(uvhub) &&
1628 (uv_blade_to_pnode(uvhub) < uv_partition_base_pnode))
1629 uv_partition_base_pnode = uv_blade_to_pnode(uvhub);
1630
1631 vector = UV_BAU_MESSAGE;
1632 for_each_possible_blade(uvhub)
1633 if (uv_blade_nr_possible_cpus(uvhub))
1634 uv_init_uvhub(uvhub, vector);
1635
1636 uv_enable_timeouts();
1637 alloc_intr_gate(vector, uv_bau_message_intr1);
1638
1639 for_each_possible_blade(uvhub) {
1640 if (uv_blade_nr_possible_cpus(uvhub)) {
1641 pnode = uv_blade_to_pnode(uvhub);
1642 /* INIT the bau */
1643 uv_write_global_mmr64(pnode,
1644 UVH_LB_BAU_SB_ACTIVATION_CONTROL,
1645 ((unsigned long)1 << 63));
1646 mmr = 1; /* should be 1 to broadcast to both sockets */
1647 uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST,
1648 mmr);
1649 }
1650 }
1651
1652 return 0;
1653}
1654core_initcall(uv_bau_init);
1655fs_initcall(uv_ptc_init);
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index 7e4515957a1c..8927486a4649 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -39,7 +39,7 @@ int __ref arch_register_cpu(int num)
39 /* 39 /*
40 * CPU0 cannot be offlined due to several 40 * CPU0 cannot be offlined due to several
41 * restrictions and assumptions in kernel. This basically 41 * restrictions and assumptions in kernel. This basically
42 * doesnt add a control file, one cannot attempt to offline 42 * doesn't add a control file, one cannot attempt to offline
43 * BSP. 43 * BSP.
44 * 44 *
45 * Also certain PCI quirks require not to enable hotplug control 45 * Also certain PCI quirks require not to enable hotplug control
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index e2a595257390..a91ae7709b49 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -1,56 +1,42 @@
1#include <linux/io.h> 1#include <linux/io.h>
2#include <linux/memblock.h>
2 3
3#include <asm/trampoline.h> 4#include <asm/trampoline.h>
5#include <asm/cacheflush.h>
4#include <asm/pgtable.h> 6#include <asm/pgtable.h>
5#include <asm/e820.h>
6 7
7#if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP) 8unsigned char *x86_trampoline_base;
8#define __trampinit
9#define __trampinitdata
10#else
11#define __trampinit __cpuinit
12#define __trampinitdata __cpuinitdata
13#endif
14 9
15/* ready for x86_64 and x86 */ 10void __init setup_trampolines(void)
16unsigned char *__trampinitdata trampoline_base;
17
18void __init reserve_trampoline_memory(void)
19{ 11{
20 unsigned long mem; 12 phys_addr_t mem;
13 size_t size = PAGE_ALIGN(x86_trampoline_end - x86_trampoline_start);
21 14
22 /* Has to be in very low memory so we can execute real-mode AP code. */ 15 /* Has to be in very low memory so we can execute real-mode AP code. */
23 mem = find_e820_area(0, 1<<20, TRAMPOLINE_SIZE, PAGE_SIZE); 16 mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
24 if (mem == -1L) 17 if (mem == MEMBLOCK_ERROR)
25 panic("Cannot allocate trampoline\n"); 18 panic("Cannot allocate trampoline\n");
26 19
27 trampoline_base = __va(mem); 20 x86_trampoline_base = __va(mem);
28 reserve_early(mem, mem + TRAMPOLINE_SIZE, "TRAMPOLINE"); 21 memblock_x86_reserve_range(mem, mem + size, "TRAMPOLINE");
22
23 printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
24 x86_trampoline_base, (unsigned long long)mem, size);
25
26 memcpy(x86_trampoline_base, x86_trampoline_start, size);
29} 27}
30 28
31/* 29/*
32 * Currently trivial. Write the real->protected mode 30 * setup_trampolines() gets called very early, to guarantee the
33 * bootstrap into the page concerned. The caller 31 * availability of low memory. This is before the proper kernel page
34 * has made sure it's suitably aligned. 32 * tables are set up, so we cannot set page permissions in that
33 * function. Thus, we use an arch_initcall instead.
35 */ 34 */
36unsigned long __trampinit setup_trampoline(void) 35static int __init configure_trampolines(void)
37{ 36{
38 memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); 37 size_t size = PAGE_ALIGN(x86_trampoline_end - x86_trampoline_start);
39 return virt_to_phys(trampoline_base);
40}
41 38
42void __init setup_trampoline_page_table(void) 39 set_memory_x((unsigned long)x86_trampoline_base, size >> PAGE_SHIFT);
43{ 40 return 0;
44#ifdef CONFIG_X86_32
45 /* Copy kernel address range */
46 clone_pgd_range(trampoline_pg_dir + KERNEL_PGD_BOUNDARY,
47 swapper_pg_dir + KERNEL_PGD_BOUNDARY,
48 KERNEL_PGD_PTRS);
49
50 /* Initialize low mappings */
51 clone_pgd_range(trampoline_pg_dir,
52 swapper_pg_dir + KERNEL_PGD_BOUNDARY,
53 min_t(unsigned long, KERNEL_PGD_PTRS,
54 KERNEL_PGD_BOUNDARY));
55#endif
56} 41}
42arch_initcall(configure_trampolines);
diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S
index 8508237e8e43..451c0a7ef7fd 100644
--- a/arch/x86/kernel/trampoline_32.S
+++ b/arch/x86/kernel/trampoline_32.S
@@ -32,9 +32,11 @@
32#include <asm/segment.h> 32#include <asm/segment.h>
33#include <asm/page_types.h> 33#include <asm/page_types.h>
34 34
35/* We can free up trampoline after bootup if cpu hotplug is not supported. */ 35#ifdef CONFIG_SMP
36__CPUINITRODATA 36
37.code16 37 .section ".x86_trampoline","a"
38 .balign PAGE_SIZE
39 .code16
38 40
39ENTRY(trampoline_data) 41ENTRY(trampoline_data)
40r_base = . 42r_base = .
@@ -44,7 +46,7 @@ r_base = .
44 46
45 cli # We should be safe anyway 47 cli # We should be safe anyway
46 48
47 movl $0xA5A5A5A5, trampoline_data - r_base 49 movl $0xA5A5A5A5, trampoline_status - r_base
48 # write marker for master knows we're running 50 # write marker for master knows we're running
49 51
50 /* GDT tables in non default location kernel can be beyond 16MB and 52 /* GDT tables in non default location kernel can be beyond 16MB and
@@ -72,5 +74,10 @@ boot_idt_descr:
72 .word 0 # idt limit = 0 74 .word 0 # idt limit = 0
73 .long 0 # idt base = 0L 75 .long 0 # idt base = 0L
74 76
77ENTRY(trampoline_status)
78 .long 0
79
75.globl trampoline_end 80.globl trampoline_end
76trampoline_end: 81trampoline_end:
82
83#endif /* CONFIG_SMP */
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S
index 3af2dff58b21..09ff51799e96 100644
--- a/arch/x86/kernel/trampoline_64.S
+++ b/arch/x86/kernel/trampoline_64.S
@@ -32,13 +32,9 @@
32#include <asm/segment.h> 32#include <asm/segment.h>
33#include <asm/processor-flags.h> 33#include <asm/processor-flags.h>
34 34
35#ifdef CONFIG_ACPI_SLEEP 35 .section ".x86_trampoline","a"
36.section .rodata, "a", @progbits 36 .balign PAGE_SIZE
37#else 37 .code16
38/* We can free up the trampoline after bootup if cpu hotplug is not supported. */
39__CPUINITRODATA
40#endif
41.code16
42 38
43ENTRY(trampoline_data) 39ENTRY(trampoline_data)
44r_base = . 40r_base = .
@@ -50,7 +46,7 @@ r_base = .
50 mov %ax, %ss 46 mov %ax, %ss
51 47
52 48
53 movl $0xA5A5A5A5, trampoline_data - r_base 49 movl $0xA5A5A5A5, trampoline_status - r_base
54 # write marker for master knows we're running 50 # write marker for master knows we're running
55 51
56 # Setup stack 52 # Setup stack
@@ -64,10 +60,13 @@ r_base = .
64 movzx %ax, %esi # Find the 32bit trampoline location 60 movzx %ax, %esi # Find the 32bit trampoline location
65 shll $4, %esi 61 shll $4, %esi
66 62
67 # Fixup the vectors 63 # Fixup the absolute vectors
68 addl %esi, startup_32_vector - r_base 64 leal (startup_32 - r_base)(%esi), %eax
69 addl %esi, startup_64_vector - r_base 65 movl %eax, startup_32_vector - r_base
70 addl %esi, tgdt + 2 - r_base # Fixup the gdt pointer 66 leal (startup_64 - r_base)(%esi), %eax
67 movl %eax, startup_64_vector - r_base
68 leal (tgdt - r_base)(%esi), %eax
69 movl %eax, (tgdt + 2 - r_base)
71 70
72 /* 71 /*
73 * GDT tables in non default location kernel can be beyond 16MB and 72 * GDT tables in non default location kernel can be beyond 16MB and
@@ -127,8 +126,9 @@ startup_64:
127no_longmode: 126no_longmode:
128 hlt 127 hlt
129 jmp no_longmode 128 jmp no_longmode
130#include "verify_cpu_64.S" 129#include "verify_cpu.S"
131 130
131 .balign 4
132 # Careful these need to be in the same 64K segment as the above; 132 # Careful these need to be in the same 64K segment as the above;
133tidt: 133tidt:
134 .word 0 # idt limit = 0 134 .word 0 # idt limit = 0
@@ -156,6 +156,10 @@ startup_64_vector:
156 .long startup_64 - r_base 156 .long startup_64 - r_base
157 .word __KERNEL_CS, 0 157 .word __KERNEL_CS, 0
158 158
159 .balign 4
160ENTRY(trampoline_status)
161 .long 0
162
159trampoline_stack: 163trampoline_stack:
160 .org 0x1000 164 .org 0x1000
161trampoline_stack_end: 165trampoline_stack_end:
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 60788dee0f8a..b9b67166f9de 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -83,6 +83,13 @@ EXPORT_SYMBOL_GPL(used_vectors);
83 83
84static int ignore_nmis; 84static int ignore_nmis;
85 85
86int unknown_nmi_panic;
87/*
88 * Prevent NMI reason port (0x61) being accessed simultaneously, can
89 * only be used in NMI handler.
90 */
91static DEFINE_RAW_SPINLOCK(nmi_reason_lock);
92
86static inline void conditional_sti(struct pt_regs *regs) 93static inline void conditional_sti(struct pt_regs *regs)
87{ 94{
88 if (regs->flags & X86_EFLAGS_IF) 95 if (regs->flags & X86_EFLAGS_IF)
@@ -300,16 +307,23 @@ gp_in_kernel:
300 die("general protection fault", regs, error_code); 307 die("general protection fault", regs, error_code);
301} 308}
302 309
303static notrace __kprobes void 310static int __init setup_unknown_nmi_panic(char *str)
304mem_parity_error(unsigned char reason, struct pt_regs *regs)
305{ 311{
306 printk(KERN_EMERG 312 unknown_nmi_panic = 1;
307 "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", 313 return 1;
308 reason, smp_processor_id()); 314}
315__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
309 316
310 printk(KERN_EMERG 317static notrace __kprobes void
311 "You have some hardware problem, likely on the PCI bus.\n"); 318pci_serr_error(unsigned char reason, struct pt_regs *regs)
319{
320 pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
321 reason, smp_processor_id());
312 322
323 /*
324 * On some machines, PCI SERR line is used to report memory
325 * errors. EDAC makes use of it.
326 */
313#if defined(CONFIG_EDAC) 327#if defined(CONFIG_EDAC)
314 if (edac_handler_set()) { 328 if (edac_handler_set()) {
315 edac_atomic_assert_error(); 329 edac_atomic_assert_error();
@@ -320,11 +334,11 @@ mem_parity_error(unsigned char reason, struct pt_regs *regs)
320 if (panic_on_unrecovered_nmi) 334 if (panic_on_unrecovered_nmi)
321 panic("NMI: Not continuing"); 335 panic("NMI: Not continuing");
322 336
323 printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); 337 pr_emerg("Dazed and confused, but trying to continue\n");
324 338
325 /* Clear and disable the memory parity error line. */ 339 /* Clear and disable the PCI SERR error line. */
326 reason = (reason & 0xf) | 4; 340 reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR;
327 outb(reason, 0x61); 341 outb(reason, NMI_REASON_PORT);
328} 342}
329 343
330static notrace __kprobes void 344static notrace __kprobes void
@@ -332,22 +346,26 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
332{ 346{
333 unsigned long i; 347 unsigned long i;
334 348
335 printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n"); 349 pr_emerg(
350 "NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n",
351 reason, smp_processor_id());
336 show_registers(regs); 352 show_registers(regs);
337 353
338 if (panic_on_io_nmi) 354 if (panic_on_io_nmi)
339 panic("NMI IOCK error: Not continuing"); 355 panic("NMI IOCK error: Not continuing");
340 356
341 /* Re-enable the IOCK line, wait for a few seconds */ 357 /* Re-enable the IOCK line, wait for a few seconds */
342 reason = (reason & 0xf) | 8; 358 reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
343 outb(reason, 0x61); 359 outb(reason, NMI_REASON_PORT);
344 360
345 i = 2000; 361 i = 20000;
346 while (--i) 362 while (--i) {
347 udelay(1000); 363 touch_nmi_watchdog();
364 udelay(100);
365 }
348 366
349 reason &= ~8; 367 reason &= ~NMI_REASON_CLEAR_IOCHK;
350 outb(reason, 0x61); 368 outb(reason, NMI_REASON_PORT);
351} 369}
352 370
353static notrace __kprobes void 371static notrace __kprobes void
@@ -366,69 +384,50 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
366 return; 384 return;
367 } 385 }
368#endif 386#endif
369 printk(KERN_EMERG 387 pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
370 "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", 388 reason, smp_processor_id());
371 reason, smp_processor_id());
372 389
373 printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); 390 pr_emerg("Do you have a strange power saving mode enabled?\n");
374 if (panic_on_unrecovered_nmi) 391 if (unknown_nmi_panic || panic_on_unrecovered_nmi)
375 panic("NMI: Not continuing"); 392 panic("NMI: Not continuing");
376 393
377 printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); 394 pr_emerg("Dazed and confused, but trying to continue\n");
378} 395}
379 396
380static notrace __kprobes void default_do_nmi(struct pt_regs *regs) 397static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
381{ 398{
382 unsigned char reason = 0; 399 unsigned char reason = 0;
383 int cpu;
384 400
385 cpu = smp_processor_id(); 401 /*
386 402 * CPU-specific NMI must be processed before non-CPU-specific
387 /* Only the BSP gets external NMIs from the system. */ 403 * NMI, otherwise we may lose it, because the CPU-specific
388 if (!cpu) 404 * NMI can not be detected/processed on other CPUs.
389 reason = get_nmi_reason(); 405 */
390 406 if (notify_die(DIE_NMI, "nmi", regs, 0, 2, SIGINT) == NOTIFY_STOP)
391 if (!(reason & 0xc0)) { 407 return;
392 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
393 == NOTIFY_STOP)
394 return;
395 408
396#ifdef CONFIG_X86_LOCAL_APIC 409 /* Non-CPU-specific NMI: NMI sources can be processed on any CPU */
397 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) 410 raw_spin_lock(&nmi_reason_lock);
398 == NOTIFY_STOP) 411 reason = get_nmi_reason();
399 return;
400 412
401#ifndef CONFIG_LOCKUP_DETECTOR 413 if (reason & NMI_REASON_MASK) {
414 if (reason & NMI_REASON_SERR)
415 pci_serr_error(reason, regs);
416 else if (reason & NMI_REASON_IOCHK)
417 io_check_error(reason, regs);
418#ifdef CONFIG_X86_32
402 /* 419 /*
403 * Ok, so this is none of the documented NMI sources, 420 * Reassert NMI in case it became active
404 * so it must be the NMI watchdog. 421 * meanwhile as it's edge-triggered:
405 */ 422 */
406 if (nmi_watchdog_tick(regs, reason)) 423 reassert_nmi();
407 return;
408 if (!do_nmi_callback(regs, cpu))
409#endif /* !CONFIG_LOCKUP_DETECTOR */
410 unknown_nmi_error(reason, regs);
411#else
412 unknown_nmi_error(reason, regs);
413#endif 424#endif
414 425 raw_spin_unlock(&nmi_reason_lock);
415 return; 426 return;
416 } 427 }
417 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) 428 raw_spin_unlock(&nmi_reason_lock);
418 return;
419 429
420 /* AK: following checks seem to be broken on modern chipsets. FIXME */ 430 unknown_nmi_error(reason, regs);
421 if (reason & 0x80)
422 mem_parity_error(reason, regs);
423 if (reason & 0x40)
424 io_check_error(reason, regs);
425#ifdef CONFIG_X86_32
426 /*
427 * Reassert NMI in case it became active meanwhile
428 * as it's edge-triggered:
429 */
430 reassert_nmi();
431#endif
432} 431}
433 432
434dotraplinkage notrace __kprobes void 433dotraplinkage notrace __kprobes void
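
The hunks above replace the open-coded NMI status port and bit values with symbolic names. For reference, a hedged reconstruction of those constants, inferred only from the literal values they replace (the real definitions live in the arch headers and are not part of this diff):

/* Inferred from the magic numbers replaced above; illustrative only. */
#define NMI_REASON_PORT		0x61	/* legacy NMI status/control port */
#define NMI_REASON_SERR		0x80	/* PCI SERR# asserted (was "reason & 0x80") */
#define NMI_REASON_IOCHK	0x40	/* IOCHK# asserted    (was "reason & 0x40") */
#define NMI_REASON_MASK		(NMI_REASON_SERR | NMI_REASON_IOCHK)	/* 0xc0 */
#define NMI_REASON_CLEAR_SERR	0x04	/* was "(reason & 0xf) | 4" */
#define NMI_REASON_CLEAR_IOCHK	0x08	/* was "(reason & 0xf) | 8" */
#define NMI_REASON_CLEAR_MASK	0x0f	/* only the low nibble is written back */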
@@ -446,14 +445,12 @@ do_nmi(struct pt_regs *regs, long error_code)
446 445
447void stop_nmi(void) 446void stop_nmi(void)
448{ 447{
449 acpi_nmi_disable();
450 ignore_nmis++; 448 ignore_nmis++;
451} 449}
452 450
453void restart_nmi(void) 451void restart_nmi(void)
454{ 452{
455 ignore_nmis--; 453 ignore_nmis--;
456 acpi_nmi_enable();
457} 454}
458 455
459/* May run on IST stack. */ 456/* May run on IST stack. */
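
default_do_nmi() above now hands every NMI to the die-notifier chain (DIE_NMI) before touching the shared reason port, and only unclaimed NMIs fall through to unknown_nmi_error(). A minimal, hypothetical sketch of how a consumer would hook that notification; the names are illustrative, only register_die_notifier()/DIE_NMI/NOTIFY_* come from the kernel API assumed here:

#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/kdebug.h>

static int example_nmi_notify(struct notifier_block *nb,
			      unsigned long cmd, void *data)
{
	if (cmd != DIE_NMI)
		return NOTIFY_DONE;
	/*
	 * A real consumer would check its own hardware status here and
	 * return NOTIFY_STOP only for NMIs it actually generated, which
	 * is what keeps the NMI out of unknown_nmi_error().
	 */
	return NOTIFY_DONE;
}

static struct notifier_block example_nmi_nb = {
	.notifier_call	= example_nmi_notify,
};

static int __init example_init(void)
{
	return register_die_notifier(&example_nmi_nb);
}

static void __exit example_exit(void)
{
	unregister_die_notifier(&example_nmi_nb);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");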
@@ -575,6 +572,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
575 if (regs->flags & X86_VM_MASK) { 572 if (regs->flags & X86_VM_MASK) {
576 handle_vm86_trap((struct kernel_vm86_regs *) regs, 573 handle_vm86_trap((struct kernel_vm86_regs *) regs,
577 error_code, 1); 574 error_code, 1);
575 preempt_conditional_cli(regs);
578 return; 576 return;
579 } 577 }
580 578
@@ -776,21 +774,10 @@ asmlinkage void math_state_restore(void)
776} 774}
777EXPORT_SYMBOL_GPL(math_state_restore); 775EXPORT_SYMBOL_GPL(math_state_restore);
778 776
779#ifndef CONFIG_MATH_EMULATION
780void math_emulate(struct math_emu_info *info)
781{
782 printk(KERN_EMERG
783 "math-emulation not enabled and no coprocessor found.\n");
784 printk(KERN_EMERG "killing %s.\n", current->comm);
785 force_sig(SIGFPE, current);
786 schedule();
787}
788#endif /* CONFIG_MATH_EMULATION */
789
790dotraplinkage void __kprobes 777dotraplinkage void __kprobes
791do_device_not_available(struct pt_regs *regs, long error_code) 778do_device_not_available(struct pt_regs *regs, long error_code)
792{ 779{
793#ifdef CONFIG_X86_32 780#ifdef CONFIG_MATH_EMULATION
794 if (read_cr0() & X86_CR0_EM) { 781 if (read_cr0() & X86_CR0_EM) {
795 struct math_emu_info info = { }; 782 struct math_emu_info info = { };
796 783
@@ -798,12 +785,12 @@ do_device_not_available(struct pt_regs *regs, long error_code)
798 785
799 info.regs = regs; 786 info.regs = regs;
800 math_emulate(&info); 787 math_emulate(&info);
801 } else { 788 return;
802 math_state_restore(); /* interrupts still off */
803 conditional_sti(regs);
804 } 789 }
805#else 790#endif
806 math_state_restore(); 791 math_state_restore(); /* interrupts still off */
792#ifdef CONFIG_X86_32
793 conditional_sti(regs);
807#endif 794#endif
808} 795}
809 796
@@ -881,18 +868,6 @@ void __init trap_init(void)
881#endif 868#endif
882 869
883#ifdef CONFIG_X86_32 870#ifdef CONFIG_X86_32
884 if (cpu_has_fxsr) {
885 printk(KERN_INFO "Enabling fast FPU save and restore... ");
886 set_in_cr4(X86_CR4_OSFXSR);
887 printk("done.\n");
888 }
889 if (cpu_has_xmm) {
890 printk(KERN_INFO
891 "Enabling unmasked SIMD FPU exception support... ");
892 set_in_cr4(X86_CR4_OSXMMEXCPT);
893 printk("done.\n");
894 }
895
896 set_system_trap_gate(SYSCALL_VECTOR, &system_call); 871 set_system_trap_gate(SYSCALL_VECTOR, &system_call);
897 set_bit(SYSCALL_VECTOR, used_vectors); 872 set_bit(SYSCALL_VECTOR, used_vectors);
898#endif 873#endif
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 26a863a9c2a8..6cc6922262af 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -104,10 +104,14 @@ int __init notsc_setup(char *str)
104 104
105__setup("notsc", notsc_setup); 105__setup("notsc", notsc_setup);
106 106
107static int no_sched_irq_time;
108
107static int __init tsc_setup(char *str) 109static int __init tsc_setup(char *str)
108{ 110{
109 if (!strcmp(str, "reliable")) 111 if (!strcmp(str, "reliable"))
110 tsc_clocksource_reliable = 1; 112 tsc_clocksource_reliable = 1;
113 if (!strncmp(str, "noirqtime", 9))
114 no_sched_irq_time = 1;
111 return 1; 115 return 1;
112} 116}
113 117
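
The hunk above adds a second keyword to the existing "tsc=" boot option: booting with tsc=noirqtime sets no_sched_irq_time, which later in this diff suppresses the enable_sched_clock_irqtime() call in tsc_init(). A small stand-alone illustration of the matching semantics assumed here (strncmp() with length 9 is a prefix match, while "reliable" keeps its exact strcmp() match):

#include <assert.h>
#include <string.h>

int main(void)
{
	/* "tsc=noirqtime" reaches tsc_setup() with str == "noirqtime". */
	assert(strncmp("noirqtime", "noirqtime", 9) == 0);
	/* Being a prefix match, longer strings starting with it match too. */
	assert(strncmp("noirqtimer", "noirqtime", 9) == 0);
	/* "reliable" is still compared exactly. */
	assert(strcmp("reliable", "reliable") == 0);
	return 0;
}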
@@ -423,7 +427,7 @@ unsigned long native_calibrate_tsc(void)
423 * the delta to the previous read. We keep track of the min 427 * the delta to the previous read. We keep track of the min
424 * and max values of that delta. The delta is mostly defined 428 * and max values of that delta. The delta is mostly defined
425 * by the IO time of the PIT access, so we can detect when a 429 * by the IO time of the PIT access, so we can detect when a
426 * SMI/SMM disturbance happend between the two reads. If the 430 * SMI/SMM disturbance happened between the two reads. If the
427 * maximum time is significantly larger than the minimum time, 431 * maximum time is significantly larger than the minimum time,
428 * then we discard the result and have another try. 432 * then we discard the result and have another try.
429 * 433 *
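
The comment above describes the SMI/SMM detection idea used by the calibration loop: time each PIT access, track the minimum and maximum per-read delta, and throw the run away if the maximum is much larger than the minimum. A simplified, self-contained illustration of that idea (not the kernel's code; tsc_read(), pit_read() and the factor of 10 are hypothetical stand-ins):

#include <stdint.h>

extern uint64_t tsc_read(void);	/* hypothetical timestamp source */
extern uint8_t  pit_read(void);	/* hypothetical timed I/O access */

static int sample_undisturbed(int loops)
{
	uint64_t prev = tsc_read(), now, delta;
	uint64_t dmin = UINT64_MAX, dmax = 0;
	int i;

	for (i = 0; i < loops; i++) {
		(void)pit_read();	/* the access whose cost we measure */
		now = tsc_read();
		delta = now - prev;
		prev = now;
		if (delta < dmin)
			dmin = delta;
		if (delta > dmax)
			dmax = delta;
	}
	/* An SMI stretches one iteration far beyond the normal I/O cost. */
	return dmax <= 10 * dmin;
}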
@@ -460,7 +464,7 @@ unsigned long native_calibrate_tsc(void)
460 tsc_pit_min = min(tsc_pit_min, tsc_pit_khz); 464 tsc_pit_min = min(tsc_pit_min, tsc_pit_khz);
461 465
462 /* hpet or pmtimer available ? */ 466 /* hpet or pmtimer available ? */
463 if (!hpet && !ref1 && !ref2) 467 if (ref1 == ref2)
464 continue; 468 continue;
465 469
466 /* Check, whether the sampling was disturbed by an SMI */ 470 /* Check, whether the sampling was disturbed by an SMI */
@@ -655,7 +659,7 @@ void restore_sched_clock_state(void)
655 659
656 local_irq_save(flags); 660 local_irq_save(flags);
657 661
658 __get_cpu_var(cyc2ns_offset) = 0; 662 __this_cpu_write(cyc2ns_offset, 0);
659 offset = cyc2ns_suspend - sched_clock(); 663 offset = cyc2ns_suspend - sched_clock();
660 664
661 for_each_possible_cpu(cpu) 665 for_each_possible_cpu(cpu)
@@ -759,25 +763,6 @@ static cycle_t read_tsc(struct clocksource *cs)
759 ret : clocksource_tsc.cycle_last; 763 ret : clocksource_tsc.cycle_last;
760} 764}
761 765
762#ifdef CONFIG_X86_64
763static cycle_t __vsyscall_fn vread_tsc(void)
764{
765 cycle_t ret;
766
767 /*
768 * Surround the RDTSC by barriers, to make sure it's not
769 * speculated to outside the seqlock critical section and
770 * does not cause time warps:
771 */
772 rdtsc_barrier();
773 ret = (cycle_t)vget_cycles();
774 rdtsc_barrier();
775
776 return ret >= __vsyscall_gtod_data.clock.cycle_last ?
777 ret : __vsyscall_gtod_data.clock.cycle_last;
778}
779#endif
780
781static void resume_tsc(struct clocksource *cs) 766static void resume_tsc(struct clocksource *cs)
782{ 767{
783 clocksource_tsc.cycle_last = 0; 768 clocksource_tsc.cycle_last = 0;
@@ -801,6 +786,7 @@ void mark_tsc_unstable(char *reason)
801 if (!tsc_unstable) { 786 if (!tsc_unstable) {
802 tsc_unstable = 1; 787 tsc_unstable = 1;
803 sched_clock_stable = 0; 788 sched_clock_stable = 0;
789 disable_sched_clock_irqtime();
804 printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); 790 printk(KERN_INFO "Marking TSC unstable due to %s\n", reason);
805 /* Change only the rating, when not registered */ 791 /* Change only the rating, when not registered */
806 if (clocksource_tsc.mult) 792 if (clocksource_tsc.mult)
@@ -867,6 +853,9 @@ __cpuinit int unsynchronized_tsc(void)
867 853
868 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) 854 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
869 return 0; 855 return 0;
856
857 if (tsc_clocksource_reliable)
858 return 0;
870 /* 859 /*
871 * Intel systems are normally all synchronized. 860 * Intel systems are normally all synchronized.
872 * Exceptions must mark TSC as unstable: 861 * Exceptions must mark TSC as unstable:
@@ -874,14 +863,92 @@ __cpuinit int unsynchronized_tsc(void)
874 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { 863 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
875 /* assume multi socket systems are not synchronized: */ 864 /* assume multi socket systems are not synchronized: */
876 if (num_possible_cpus() > 1) 865 if (num_possible_cpus() > 1)
877 tsc_unstable = 1; 866 return 1;
878 } 867 }
879 868
880 return tsc_unstable; 869 return 0;
881} 870}
882 871
883static void __init init_tsc_clocksource(void) 872
873static void tsc_refine_calibration_work(struct work_struct *work);
874static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work);
875/**
876 * tsc_refine_calibration_work - Further refine tsc freq calibration
877 * @work - ignored.
878 *
879 * This function uses delayed work over a period of a
880 * second to further refine the TSC freq value. Since this is
881 * timer based, instead of loop based, we don't block the boot
882 * process while this longer calibration is done.
883 *
884 * If there are any calibration anomalies (too many SMIs, etc),
885 * or the refined calibration is off by more than 1% from the fast early
886 * calibration, we throw out the new calibration and use the
887 * early calibration.
888 */
889static void tsc_refine_calibration_work(struct work_struct *work)
884{ 890{
891 static u64 tsc_start = -1, ref_start;
892 static int hpet;
893 u64 tsc_stop, ref_stop, delta;
894 unsigned long freq;
895
896 /* Don't bother refining TSC on unstable systems */
897 if (check_tsc_unstable())
898 goto out;
899
900 /*
901 * Since the work is started early in boot, we may be
902 * delayed the first time we expire. So set the workqueue
903 * again once we know timers are working.
904 */
905 if (tsc_start == -1) {
906 /*
907 * Only set hpet once, to avoid mixing hardware
908 * if the hpet becomes enabled later.
909 */
910 hpet = is_hpet_enabled();
911 schedule_delayed_work(&tsc_irqwork, HZ);
912 tsc_start = tsc_read_refs(&ref_start, hpet);
913 return;
914 }
915
916 tsc_stop = tsc_read_refs(&ref_stop, hpet);
917
918 /* hpet or pmtimer available ? */
919 if (ref_start == ref_stop)
920 goto out;
921
922 /* Check, whether the sampling was disturbed by an SMI */
923 if (tsc_start == ULLONG_MAX || tsc_stop == ULLONG_MAX)
924 goto out;
925
926 delta = tsc_stop - tsc_start;
927 delta *= 1000000LL;
928 if (hpet)
929 freq = calc_hpet_ref(delta, ref_start, ref_stop);
930 else
931 freq = calc_pmtimer_ref(delta, ref_start, ref_stop);
932
933 /* Make sure we're within 1% */
934 if (abs(tsc_khz - freq) > tsc_khz/100)
935 goto out;
936
937 tsc_khz = freq;
938 printk(KERN_INFO "Refined TSC clocksource calibration: "
939 "%lu.%03lu MHz.\n", (unsigned long)tsc_khz / 1000,
940 (unsigned long)tsc_khz % 1000);
941
942out:
943 clocksource_register_khz(&clocksource_tsc, tsc_khz);
944}
945
946
947static int __init init_tsc_clocksource(void)
948{
949 if (!cpu_has_tsc || tsc_disabled > 0 || !tsc_khz)
950 return 0;
951
885 if (tsc_clocksource_reliable) 952 if (tsc_clocksource_reliable)
886 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; 953 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
887 /* lower the rating if we already know its unstable: */ 954 /* lower the rating if we already know its unstable: */
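
tsc_refine_calibration_work(), added above, keeps the refined frequency only if it agrees with the fast early calibration to within 1%. A quick worked example of that acceptance test and of the MHz formatting used in its printk (the values are hypothetical):

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	unsigned long tsc_khz = 2400000;	/* early calibration: 2.4 GHz  */
	unsigned long freq    = 2399870;	/* hypothetical refined value  */

	/* Reject only if off by more than 1% (here: more than 24,000 kHz). */
	if (labs((long)(tsc_khz - freq)) > (long)(tsc_khz / 100))
		printf("refined value rejected, keeping %lu kHz\n", tsc_khz);
	else
		printf("Refined TSC clocksource calibration: %lu.%03lu MHz.\n",
		       freq / 1000, freq % 1000);	/* -> 2399.870 MHz */
	return 0;
}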
@@ -889,62 +956,14 @@ static void __init init_tsc_clocksource(void)
889 clocksource_tsc.rating = 0; 956 clocksource_tsc.rating = 0;
890 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; 957 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
891 } 958 }
892 clocksource_register_khz(&clocksource_tsc, tsc_khz); 959 schedule_delayed_work(&tsc_irqwork, 0);
960 return 0;
893} 961}
894
895#ifdef CONFIG_X86_64
896/* 962/*
897 * calibrate_cpu is used on systems with fixed rate TSCs to determine 963 * We use device_initcall here, to ensure we run after the hpet
898 * processor frequency 964 * is fully initialized, which may occur at fs_initcall time.
899 */ 965 */
900#define TICK_COUNT 100000000 966device_initcall(init_tsc_clocksource);
901static unsigned long __init calibrate_cpu(void)
902{
903 int tsc_start, tsc_now;
904 int i, no_ctr_free;
905 unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0;
906 unsigned long flags;
907
908 for (i = 0; i < 4; i++)
909 if (avail_to_resrv_perfctr_nmi_bit(i))
910 break;
911 no_ctr_free = (i == 4);
912 if (no_ctr_free) {
913 WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... "
914 "cpu_khz value may be incorrect.\n");
915 i = 3;
916 rdmsrl(MSR_K7_EVNTSEL3, evntsel3);
917 wrmsrl(MSR_K7_EVNTSEL3, 0);
918 rdmsrl(MSR_K7_PERFCTR3, pmc3);
919 } else {
920 reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i);
921 reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
922 }
923 local_irq_save(flags);
924 /* start measuring cycles, incrementing from 0 */
925 wrmsrl(MSR_K7_PERFCTR0 + i, 0);
926 wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76);
927 rdtscl(tsc_start);
928 do {
929 rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now);
930 tsc_now = get_cycles();
931 } while ((tsc_now - tsc_start) < TICK_COUNT);
932
933 local_irq_restore(flags);
934 if (no_ctr_free) {
935 wrmsrl(MSR_K7_EVNTSEL3, 0);
936 wrmsrl(MSR_K7_PERFCTR3, pmc3);
937 wrmsrl(MSR_K7_EVNTSEL3, evntsel3);
938 } else {
939 release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
940 release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
941 }
942
943 return pmc_now * tsc_khz / (tsc_now - tsc_start);
944}
945#else
946static inline unsigned long calibrate_cpu(void) { return cpu_khz; }
947#endif
948 967
949void __init tsc_init(void) 968void __init tsc_init(void)
950{ 969{
@@ -964,10 +983,6 @@ void __init tsc_init(void)
964 return; 983 return;
965 } 984 }
966 985
967 if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
968 (boot_cpu_data.x86_vendor == X86_VENDOR_AMD))
969 cpu_khz = calibrate_cpu();
970
971 printk("Detected %lu.%03lu MHz processor.\n", 986 printk("Detected %lu.%03lu MHz processor.\n",
972 (unsigned long)cpu_khz / 1000, 987 (unsigned long)cpu_khz / 1000,
973 (unsigned long)cpu_khz % 1000); 988 (unsigned long)cpu_khz % 1000);
@@ -987,6 +1002,9 @@ void __init tsc_init(void)
987 /* now allow native_sched_clock() to use rdtsc */ 1002 /* now allow native_sched_clock() to use rdtsc */
988 tsc_disabled = 0; 1003 tsc_disabled = 0;
989 1004
1005 if (!no_sched_irq_time)
1006 enable_sched_clock_irqtime();
1007
990 lpj = ((u64)tsc_khz * 1000); 1008 lpj = ((u64)tsc_khz * 1000);
991 do_div(lpj, HZ); 1009 do_div(lpj, HZ);
992 lpj_fine = lpj; 1010 lpj_fine = lpj;
@@ -999,6 +1017,5 @@ void __init tsc_init(void)
999 mark_tsc_unstable("TSCs unsynchronized"); 1017 mark_tsc_unstable("TSCs unsynchronized");
1000 1018
1001 check_system_tsc_reliable(); 1019 check_system_tsc_reliable();
1002 init_tsc_clocksource();
1003} 1020}
1004 1021
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c
deleted file mode 100644
index 1132129db792..000000000000
--- a/arch/x86/kernel/uv_irq.c
+++ /dev/null
@@ -1,302 +0,0 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * SGI UV IRQ functions
7 *
8 * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved.
9 */
10
11#include <linux/module.h>
12#include <linux/rbtree.h>
13#include <linux/slab.h>
14#include <linux/irq.h>
15
16#include <asm/apic.h>
17#include <asm/uv/uv_irq.h>
18#include <asm/uv/uv_hub.h>
19
20/* MMR offset and pnode of hub sourcing interrupts for a given irq */
21struct uv_irq_2_mmr_pnode {
22 struct rb_node list;
23 unsigned long offset;
24 int pnode;
25 int irq;
26};
27
28static spinlock_t uv_irq_lock;
29static struct rb_root uv_irq_root;
30
31static int uv_set_irq_affinity(unsigned int, const struct cpumask *);
32
33static void uv_noop(unsigned int irq)
34{
35}
36
37static unsigned int uv_noop_ret(unsigned int irq)
38{
39 return 0;
40}
41
42static void uv_ack_apic(unsigned int irq)
43{
44 ack_APIC_irq();
45}
46
47static struct irq_chip uv_irq_chip = {
48 .name = "UV-CORE",
49 .startup = uv_noop_ret,
50 .shutdown = uv_noop,
51 .enable = uv_noop,
52 .disable = uv_noop,
53 .ack = uv_noop,
54 .mask = uv_noop,
55 .unmask = uv_noop,
56 .eoi = uv_ack_apic,
57 .end = uv_noop,
58 .set_affinity = uv_set_irq_affinity,
59};
60
61/*
62 * Add offset and pnode information of the hub sourcing interrupts to the
63 * rb tree for a specific irq.
64 */
65static int uv_set_irq_2_mmr_info(int irq, unsigned long offset, unsigned blade)
66{
67 struct rb_node **link = &uv_irq_root.rb_node;
68 struct rb_node *parent = NULL;
69 struct uv_irq_2_mmr_pnode *n;
70 struct uv_irq_2_mmr_pnode *e;
71 unsigned long irqflags;
72
73 n = kmalloc_node(sizeof(struct uv_irq_2_mmr_pnode), GFP_KERNEL,
74 uv_blade_to_memory_nid(blade));
75 if (!n)
76 return -ENOMEM;
77
78 n->irq = irq;
79 n->offset = offset;
80 n->pnode = uv_blade_to_pnode(blade);
81 spin_lock_irqsave(&uv_irq_lock, irqflags);
82 /* Find the right place in the rbtree: */
83 while (*link) {
84 parent = *link;
85 e = rb_entry(parent, struct uv_irq_2_mmr_pnode, list);
86
87 if (unlikely(irq == e->irq)) {
88 /* irq entry exists */
89 e->pnode = uv_blade_to_pnode(blade);
90 e->offset = offset;
91 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
92 kfree(n);
93 return 0;
94 }
95
96 if (irq < e->irq)
97 link = &(*link)->rb_left;
98 else
99 link = &(*link)->rb_right;
100 }
101
102 /* Insert the node into the rbtree. */
103 rb_link_node(&n->list, parent, link);
104 rb_insert_color(&n->list, &uv_irq_root);
105
106 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
107 return 0;
108}
109
110/* Retrieve offset and pnode information from the rb tree for a specific irq */
111int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode)
112{
113 struct uv_irq_2_mmr_pnode *e;
114 struct rb_node *n;
115 unsigned long irqflags;
116
117 spin_lock_irqsave(&uv_irq_lock, irqflags);
118 n = uv_irq_root.rb_node;
119 while (n) {
120 e = rb_entry(n, struct uv_irq_2_mmr_pnode, list);
121
122 if (e->irq == irq) {
123 *offset = e->offset;
124 *pnode = e->pnode;
125 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
126 return 0;
127 }
128
129 if (irq < e->irq)
130 n = n->rb_left;
131 else
132 n = n->rb_right;
133 }
134 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
135 return -1;
136}
137
138/*
139 * Re-target the irq to the specified CPU and enable the specified MMR located
140 * on the specified blade to allow the sending of MSIs to the specified CPU.
141 */
142static int
143arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
144 unsigned long mmr_offset, int limit)
145{
146 const struct cpumask *eligible_cpu = cpumask_of(cpu);
147 struct irq_desc *desc = irq_to_desc(irq);
148 struct irq_cfg *cfg;
149 int mmr_pnode;
150 unsigned long mmr_value;
151 struct uv_IO_APIC_route_entry *entry;
152 int err;
153
154 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
155 sizeof(unsigned long));
156
157 cfg = irq_cfg(irq);
158
159 err = assign_irq_vector(irq, cfg, eligible_cpu);
160 if (err != 0)
161 return err;
162
163 if (limit == UV_AFFINITY_CPU)
164 desc->status |= IRQ_NO_BALANCING;
165 else
166 desc->status |= IRQ_MOVE_PCNTXT;
167
168 set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
169 irq_name);
170
171 mmr_value = 0;
172 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
173 entry->vector = cfg->vector;
174 entry->delivery_mode = apic->irq_delivery_mode;
175 entry->dest_mode = apic->irq_dest_mode;
176 entry->polarity = 0;
177 entry->trigger = 0;
178 entry->mask = 0;
179 entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
180
181 mmr_pnode = uv_blade_to_pnode(mmr_blade);
182 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
183
184 if (cfg->move_in_progress)
185 send_cleanup_vector(cfg);
186
187 return irq;
188}
189
190/*
191 * Disable the specified MMR located on the specified blade so that MSIs are
192 * no longer allowed to be sent.
193 */
194static void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset)
195{
196 unsigned long mmr_value;
197 struct uv_IO_APIC_route_entry *entry;
198
199 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
200 sizeof(unsigned long));
201
202 mmr_value = 0;
203 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
204 entry->mask = 1;
205
206 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
207}
208
209static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask)
210{
211 struct irq_desc *desc = irq_to_desc(irq);
212 struct irq_cfg *cfg = desc->chip_data;
213 unsigned int dest;
214 unsigned long mmr_value;
215 struct uv_IO_APIC_route_entry *entry;
216 unsigned long mmr_offset;
217 int mmr_pnode;
218
219 if (set_desc_affinity(desc, mask, &dest))
220 return -1;
221
222 mmr_value = 0;
223 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
224
225 entry->vector = cfg->vector;
226 entry->delivery_mode = apic->irq_delivery_mode;
227 entry->dest_mode = apic->irq_dest_mode;
228 entry->polarity = 0;
229 entry->trigger = 0;
230 entry->mask = 0;
231 entry->dest = dest;
232
233 /* Get previously stored MMR and pnode of hub sourcing interrupts */
234 if (uv_irq_2_mmr_info(irq, &mmr_offset, &mmr_pnode))
235 return -1;
236
237 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
238
239 if (cfg->move_in_progress)
240 send_cleanup_vector(cfg);
241
242 return 0;
243}
244
245/*
246 * Set up a mapping of an available irq and vector, and enable the specified
247 * MMR that defines the MSI that is to be sent to the specified CPU when an
248 * interrupt is raised.
249 */
250int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
251 unsigned long mmr_offset, int limit)
252{
253 int irq, ret;
254
255 irq = create_irq_nr(NR_IRQS_LEGACY, uv_blade_to_memory_nid(mmr_blade));
256
257 if (irq <= 0)
258 return -EBUSY;
259
260 ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset,
261 limit);
262 if (ret == irq)
263 uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade);
264 else
265 destroy_irq(irq);
266
267 return ret;
268}
269EXPORT_SYMBOL_GPL(uv_setup_irq);
270
271/*
272 * Tear down a mapping of an irq and vector, and disable the specified MMR that
273 * defined the MSI that was to be sent to the specified CPU when an interrupt
274 * was raised.
275 *
276 * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq().
277 */
278void uv_teardown_irq(unsigned int irq)
279{
280 struct uv_irq_2_mmr_pnode *e;
281 struct rb_node *n;
282 unsigned long irqflags;
283
284 spin_lock_irqsave(&uv_irq_lock, irqflags);
285 n = uv_irq_root.rb_node;
286 while (n) {
287 e = rb_entry(n, struct uv_irq_2_mmr_pnode, list);
288 if (e->irq == irq) {
289 arch_disable_uv_irq(e->pnode, e->offset);
290 rb_erase(n, &uv_irq_root);
291 kfree(e);
292 break;
293 }
294 if (irq < e->irq)
295 n = n->rb_left;
296 else
297 n = n->rb_right;
298 }
299 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
300 destroy_irq(irq);
301}
302EXPORT_SYMBOL_GPL(uv_teardown_irq);
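
For context on the exported API removed with this file, a hypothetical caller sketch of uv_setup_irq() and uv_teardown_irq() as declared above; the handler, device name and MMR parameters are illustrative only (UV_AFFINITY_CPU is the limit value tested in arch_enable_uv_irq() above):

#include <linux/interrupt.h>
#include <linux/errno.h>
#include <asm/uv/uv_irq.h>

static irqreturn_t example_handler(int irq, void *dev_id)
{
	return IRQ_HANDLED;
}

static int example_attach(int cpu, int blade, unsigned long mmr_offset)
{
	int irq = uv_setup_irq("example", cpu, blade, mmr_offset,
			       UV_AFFINITY_CPU);

	if (irq <= 0)
		return irq ? irq : -EBUSY;

	if (request_irq(irq, example_handler, 0, "example", NULL)) {
		uv_teardown_irq(irq);	/* also releases the vector mapping */
		return -EBUSY;
	}
	return irq;
}

static void example_detach(int irq)
{
	free_irq(irq, NULL);
	uv_teardown_irq(irq);
}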
diff --git a/arch/x86/kernel/uv_sysfs.c b/arch/x86/kernel/uv_sysfs.c
deleted file mode 100644
index 309c70fb7759..000000000000
--- a/arch/x86/kernel/uv_sysfs.c
+++ /dev/null
@@ -1,76 +0,0 @@
1/*
2 * This file supports the /sys/firmware/sgi_uv interfaces for SGI UV.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
19 * Copyright (c) Russ Anderson
20 */
21
22#include <linux/sysdev.h>
23#include <asm/uv/bios.h>
24#include <asm/uv/uv.h>
25
26struct kobject *sgi_uv_kobj;
27
28static ssize_t partition_id_show(struct kobject *kobj,
29 struct kobj_attribute *attr, char *buf)
30{
31 return snprintf(buf, PAGE_SIZE, "%ld\n", sn_partition_id);
32}
33
34static ssize_t coherence_id_show(struct kobject *kobj,
35 struct kobj_attribute *attr, char *buf)
36{
37 return snprintf(buf, PAGE_SIZE, "%ld\n", partition_coherence_id());
38}
39
40static struct kobj_attribute partition_id_attr =
41 __ATTR(partition_id, S_IRUGO, partition_id_show, NULL);
42
43static struct kobj_attribute coherence_id_attr =
44 __ATTR(coherence_id, S_IRUGO, coherence_id_show, NULL);
45
46
47static int __init sgi_uv_sysfs_init(void)
48{
49 unsigned long ret;
50
51 if (!is_uv_system())
52 return -ENODEV;
53
54 if (!sgi_uv_kobj)
55 sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj);
56 if (!sgi_uv_kobj) {
57 printk(KERN_WARNING "kobject_create_and_add sgi_uv failed\n");
58 return -EINVAL;
59 }
60
61 ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr);
62 if (ret) {
63 printk(KERN_WARNING "sysfs_create_file partition_id failed\n");
64 return ret;
65 }
66
67 ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr);
68 if (ret) {
69 printk(KERN_WARNING "sysfs_create_file coherence_id failed\n");
70 return ret;
71 }
72
73 return 0;
74}
75
76device_initcall(sgi_uv_sysfs_init);
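
The file above created two read-only attributes under /sys/firmware/sgi_uv. A small, hypothetical userspace reader for the partition_id attribute it exposed:

#include <stdio.h>

int main(void)
{
	char buf[64];
	FILE *f = fopen("/sys/firmware/sgi_uv/partition_id", "r");

	if (!f) {
		perror("partition_id");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("SGI UV partition id: %s", buf);
	fclose(f);
	return 0;
}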
diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c
deleted file mode 100644
index 56e421bc379b..000000000000
--- a/arch/x86/kernel/uv_time.c
+++ /dev/null
@@ -1,423 +0,0 @@
1/*
2 * SGI RTC clock/timer routines.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Copyright (c) 2009 Silicon Graphics, Inc. All Rights Reserved.
19 * Copyright (c) Dimitri Sivanich
20 */
21#include <linux/clockchips.h>
22#include <linux/slab.h>
23
24#include <asm/uv/uv_mmrs.h>
25#include <asm/uv/uv_hub.h>
26#include <asm/uv/bios.h>
27#include <asm/uv/uv.h>
28#include <asm/apic.h>
29#include <asm/cpu.h>
30
31#define RTC_NAME "sgi_rtc"
32
33static cycle_t uv_read_rtc(struct clocksource *cs);
34static int uv_rtc_next_event(unsigned long, struct clock_event_device *);
35static void uv_rtc_timer_setup(enum clock_event_mode,
36 struct clock_event_device *);
37
38static struct clocksource clocksource_uv = {
39 .name = RTC_NAME,
40 .rating = 400,
41 .read = uv_read_rtc,
42 .mask = (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK,
43 .shift = 10,
44 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
45};
46
47static struct clock_event_device clock_event_device_uv = {
48 .name = RTC_NAME,
49 .features = CLOCK_EVT_FEAT_ONESHOT,
50 .shift = 20,
51 .rating = 400,
52 .irq = -1,
53 .set_next_event = uv_rtc_next_event,
54 .set_mode = uv_rtc_timer_setup,
55 .event_handler = NULL,
56};
57
58static DEFINE_PER_CPU(struct clock_event_device, cpu_ced);
59
60/* There is one of these allocated per node */
61struct uv_rtc_timer_head {
62 spinlock_t lock;
63 /* next cpu waiting for timer, local node relative: */
64 int next_cpu;
65 /* number of cpus on this node: */
66 int ncpus;
67 struct {
68 int lcpu; /* systemwide logical cpu number */
69 u64 expires; /* next timer expiration for this cpu */
70 } cpu[1];
71};
72
73/*
74 * Access to uv_rtc_timer_head via blade id.
75 */
76static struct uv_rtc_timer_head **blade_info __read_mostly;
77
78static int uv_rtc_evt_enable;
79
80/*
81 * Hardware interface routines
82 */
83
84/* Send IPIs to another node */
85static void uv_rtc_send_IPI(int cpu)
86{
87 unsigned long apicid, val;
88 int pnode;
89
90 apicid = cpu_physical_id(cpu);
91 pnode = uv_apicid_to_pnode(apicid);
92 val = (1UL << UVH_IPI_INT_SEND_SHFT) |
93 (apicid << UVH_IPI_INT_APIC_ID_SHFT) |
94 (X86_PLATFORM_IPI_VECTOR << UVH_IPI_INT_VECTOR_SHFT);
95
96 uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
97}
98
99/* Check for an RTC interrupt pending */
100static int uv_intr_pending(int pnode)
101{
102 return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) &
103 UVH_EVENT_OCCURRED0_RTC1_MASK;
104}
105
106/* Setup interrupt and return non-zero if early expiration occurred. */
107static int uv_setup_intr(int cpu, u64 expires)
108{
109 u64 val;
110 int pnode = uv_cpu_to_pnode(cpu);
111
112 uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG,
113 UVH_RTC1_INT_CONFIG_M_MASK);
114 uv_write_global_mmr64(pnode, UVH_INT_CMPB, -1L);
115
116 uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS,
117 UVH_EVENT_OCCURRED0_RTC1_MASK);
118
119 val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) |
120 ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT);
121
122 /* Set configuration */
123 uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, val);
124 /* Initialize comparator value */
125 uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires);
126
127 if (uv_read_rtc(NULL) <= expires)
128 return 0;
129
130 return !uv_intr_pending(pnode);
131}
132
133/*
134 * Per-cpu timer tracking routines
135 */
136
137static __init void uv_rtc_deallocate_timers(void)
138{
139 int bid;
140
141 for_each_possible_blade(bid) {
142 kfree(blade_info[bid]);
143 }
144 kfree(blade_info);
145}
146
147/* Allocate per-node list of cpu timer expiration times. */
148static __init int uv_rtc_allocate_timers(void)
149{
150 int cpu;
151
152 blade_info = kmalloc(uv_possible_blades * sizeof(void *), GFP_KERNEL);
153 if (!blade_info)
154 return -ENOMEM;
155 memset(blade_info, 0, uv_possible_blades * sizeof(void *));
156
157 for_each_present_cpu(cpu) {
158 int nid = cpu_to_node(cpu);
159 int bid = uv_cpu_to_blade_id(cpu);
160 int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
161 struct uv_rtc_timer_head *head = blade_info[bid];
162
163 if (!head) {
164 head = kmalloc_node(sizeof(struct uv_rtc_timer_head) +
165 (uv_blade_nr_possible_cpus(bid) *
166 2 * sizeof(u64)),
167 GFP_KERNEL, nid);
168 if (!head) {
169 uv_rtc_deallocate_timers();
170 return -ENOMEM;
171 }
172 spin_lock_init(&head->lock);
173 head->ncpus = uv_blade_nr_possible_cpus(bid);
174 head->next_cpu = -1;
175 blade_info[bid] = head;
176 }
177
178 head->cpu[bcpu].lcpu = cpu;
179 head->cpu[bcpu].expires = ULLONG_MAX;
180 }
181
182 return 0;
183}
184
185/* Find and set the next expiring timer. */
186static void uv_rtc_find_next_timer(struct uv_rtc_timer_head *head, int pnode)
187{
188 u64 lowest = ULLONG_MAX;
189 int c, bcpu = -1;
190
191 head->next_cpu = -1;
192 for (c = 0; c < head->ncpus; c++) {
193 u64 exp = head->cpu[c].expires;
194 if (exp < lowest) {
195 bcpu = c;
196 lowest = exp;
197 }
198 }
199 if (bcpu >= 0) {
200 head->next_cpu = bcpu;
201 c = head->cpu[bcpu].lcpu;
202 if (uv_setup_intr(c, lowest))
203 /* If we didn't set it up in time, trigger */
204 uv_rtc_send_IPI(c);
205 } else {
206 uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG,
207 UVH_RTC1_INT_CONFIG_M_MASK);
208 }
209}
210
211/*
212 * Set expiration time for current cpu.
213 *
214 * Returns 1 if we missed the expiration time.
215 */
216static int uv_rtc_set_timer(int cpu, u64 expires)
217{
218 int pnode = uv_cpu_to_pnode(cpu);
219 int bid = uv_cpu_to_blade_id(cpu);
220 struct uv_rtc_timer_head *head = blade_info[bid];
221 int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
222 u64 *t = &head->cpu[bcpu].expires;
223 unsigned long flags;
224 int next_cpu;
225
226 spin_lock_irqsave(&head->lock, flags);
227
228 next_cpu = head->next_cpu;
229 *t = expires;
230
231 /* Will this one be next to go off? */
232 if (next_cpu < 0 || bcpu == next_cpu ||
233 expires < head->cpu[next_cpu].expires) {
234 head->next_cpu = bcpu;
235 if (uv_setup_intr(cpu, expires)) {
236 *t = ULLONG_MAX;
237 uv_rtc_find_next_timer(head, pnode);
238 spin_unlock_irqrestore(&head->lock, flags);
239 return -ETIME;
240 }
241 }
242
243 spin_unlock_irqrestore(&head->lock, flags);
244 return 0;
245}
246
247/*
248 * Unset expiration time for current cpu.
249 *
250 * Returns 1 if this timer was pending.
251 */
252static int uv_rtc_unset_timer(int cpu, int force)
253{
254 int pnode = uv_cpu_to_pnode(cpu);
255 int bid = uv_cpu_to_blade_id(cpu);
256 struct uv_rtc_timer_head *head = blade_info[bid];
257 int bcpu = uv_cpu_hub_info(cpu)->blade_processor_id;
258 u64 *t = &head->cpu[bcpu].expires;
259 unsigned long flags;
260 int rc = 0;
261
262 spin_lock_irqsave(&head->lock, flags);
263
264 if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force)
265 rc = 1;
266
267 if (rc) {
268 *t = ULLONG_MAX;
269 /* Was the hardware setup for this timer? */
270 if (head->next_cpu == bcpu)
271 uv_rtc_find_next_timer(head, pnode);
272 }
273
274 spin_unlock_irqrestore(&head->lock, flags);
275
276 return rc;
277}
278
279
280/*
281 * Kernel interface routines.
282 */
283
284/*
285 * Read the RTC.
286 *
287 * Starting with HUB rev 2.0, the UV RTC register is replicated across all
288 * cachelines of its own page. This allows faster simultaneous reads
289 * from a given socket.
290 */
291static cycle_t uv_read_rtc(struct clocksource *cs)
292{
293 unsigned long offset;
294
295 if (uv_get_min_hub_revision_id() == 1)
296 offset = 0;
297 else
298 offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE;
299
300 return (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
301}
302
303/*
304 * Program the next event, relative to now
305 */
306static int uv_rtc_next_event(unsigned long delta,
307 struct clock_event_device *ced)
308{
309 int ced_cpu = cpumask_first(ced->cpumask);
310
311 return uv_rtc_set_timer(ced_cpu, delta + uv_read_rtc(NULL));
312}
313
314/*
315 * Setup the RTC timer in oneshot mode
316 */
317static void uv_rtc_timer_setup(enum clock_event_mode mode,
318 struct clock_event_device *evt)
319{
320 int ced_cpu = cpumask_first(evt->cpumask);
321
322 switch (mode) {
323 case CLOCK_EVT_MODE_PERIODIC:
324 case CLOCK_EVT_MODE_ONESHOT:
325 case CLOCK_EVT_MODE_RESUME:
326 /* Nothing to do here yet */
327 break;
328 case CLOCK_EVT_MODE_UNUSED:
329 case CLOCK_EVT_MODE_SHUTDOWN:
330 uv_rtc_unset_timer(ced_cpu, 1);
331 break;
332 }
333}
334
335static void uv_rtc_interrupt(void)
336{
337 int cpu = smp_processor_id();
338 struct clock_event_device *ced = &per_cpu(cpu_ced, cpu);
339
340 if (!ced || !ced->event_handler)
341 return;
342
343 if (uv_rtc_unset_timer(cpu, 0) != 1)
344 return;
345
346 ced->event_handler(ced);
347}
348
349static int __init uv_enable_evt_rtc(char *str)
350{
351 uv_rtc_evt_enable = 1;
352
353 return 1;
354}
355__setup("uvrtcevt", uv_enable_evt_rtc);
356
357static __init void uv_rtc_register_clockevents(struct work_struct *dummy)
358{
359 struct clock_event_device *ced = &__get_cpu_var(cpu_ced);
360
361 *ced = clock_event_device_uv;
362 ced->cpumask = cpumask_of(smp_processor_id());
363 clockevents_register_device(ced);
364}
365
366static __init int uv_rtc_setup_clock(void)
367{
368 int rc;
369
370 if (!is_uv_system())
371 return -ENODEV;
372
373 clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second,
374 clocksource_uv.shift);
375
376 /* If single blade, prefer tsc */
377 if (uv_num_possible_blades() == 1)
378 clocksource_uv.rating = 250;
379
380 rc = clocksource_register(&clocksource_uv);
381 if (rc)
382 printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc);
383 else
384 printk(KERN_INFO "UV RTC clocksource registered freq %lu MHz\n",
385 sn_rtc_cycles_per_second/(unsigned long)1E6);
386
387 if (rc || !uv_rtc_evt_enable || x86_platform_ipi_callback)
388 return rc;
389
390 /* Setup and register clockevents */
391 rc = uv_rtc_allocate_timers();
392 if (rc)
393 goto error;
394
395 x86_platform_ipi_callback = uv_rtc_interrupt;
396
397 clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second,
398 NSEC_PER_SEC, clock_event_device_uv.shift);
399
400 clock_event_device_uv.min_delta_ns = NSEC_PER_SEC /
401 sn_rtc_cycles_per_second;
402
403 clock_event_device_uv.max_delta_ns = clocksource_uv.mask *
404 (NSEC_PER_SEC / sn_rtc_cycles_per_second);
405
406 rc = schedule_on_each_cpu(uv_rtc_register_clockevents);
407 if (rc) {
408 x86_platform_ipi_callback = NULL;
409 uv_rtc_deallocate_timers();
410 goto error;
411 }
412
413 printk(KERN_INFO "UV RTC clockevents registered\n");
414
415 return 0;
416
417error:
418 clocksource_unregister(&clocksource_uv);
419 printk(KERN_INFO "UV RTC clockevents failed rc %d\n", rc);
420
421 return rc;
422}
423arch_initcall(uv_rtc_setup_clock);
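
uv_rtc_setup_clock() above derives the clockevent parameters from the BIOS-reported RTC rate. Purely illustrative arithmetic, assuming a hypothetical rate of 5 MHz (the real sn_rtc_cycles_per_second value is not visible in this diff):

/*
 * Assuming sn_rtc_cycles_per_second == 5,000,000 (illustrative only):
 *
 *   clock_event_device_uv.mult = div_sc(5,000,000, NSEC_PER_SEC, 20)
 *                              = (5,000,000 << 20) / 1,000,000,000
 *                              ~= 5242   (so cycles = ns * mult >> 20)
 *
 *   min_delta_ns = NSEC_PER_SEC / 5,000,000 = 200 ns, i.e. one RTC tick.
 */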
diff --git a/arch/x86/kernel/verify_cpu_64.S b/arch/x86/kernel/verify_cpu.S
index 56a8c2a867d9..b9242bacbe59 100644
--- a/arch/x86/kernel/verify_cpu_64.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -7,6 +7,7 @@
7 * Copyright (c) 2007 Andi Kleen (ak@suse.de) 7 * Copyright (c) 2007 Andi Kleen (ak@suse.de)
8 * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com) 8 * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com)
9 * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com) 9 * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com)
10 * Copyright (c) 2010 Kees Cook (kees.cook@canonical.com)
10 * 11 *
11 * This source code is licensed under the GNU General Public License, 12 * This source code is licensed under the GNU General Public License,
12 * Version 2. See the file COPYING for more details. 13 * Version 2. See the file COPYING for more details.
@@ -14,18 +15,17 @@
14 * This is a common code for verification whether CPU supports 15 * This is a common code for verification whether CPU supports
15 * long mode and SSE or not. It is not called directly instead this 16 * long mode and SSE or not. It is not called directly instead this
16 * file is included at various places and compiled in that context. 17 * file is included at various places and compiled in that context.
17 * Following are the current usage. 18 * This file is expected to run in 32bit code. Currently:
18 * 19 *
19 * This file is included by both 16bit and 32bit code. 20 * arch/x86/boot/compressed/head_64.S: Boot cpu verification
21 * arch/x86/kernel/trampoline_64.S: secondary processor verification
22 * arch/x86/kernel/head_32.S: processor startup
20 * 23 *
21 * arch/x86_64/boot/setup.S : Boot cpu verification (16bit) 24 * verify_cpu, returns the status of longmode and SSE in register %eax.
22 * arch/x86_64/boot/compressed/head.S: Boot cpu verification (32bit)
23 * arch/x86_64/kernel/trampoline.S: secondary processor verification (16bit)
24 * arch/x86_64/kernel/acpi/wakeup.S: Verification at resume (16bit)
25 *
26 * verify_cpu, returns the status of cpu check in register %eax.
27 * 0: Success 1: Failure 25 * 0: Success 1: Failure
28 * 26 *
27 * On Intel, the XD_DISABLE flag will be cleared as a side-effect.
28 *
29 * The caller needs to check for the error code and take the action 29 * The caller needs to check for the error code and take the action
30 * appropriately. Either display a message or halt. 30 * appropriately. Either display a message or halt.
31 */ 31 */
@@ -62,8 +62,41 @@ verify_cpu:
62 cmpl $0x444d4163,%ecx 62 cmpl $0x444d4163,%ecx
63 jnz verify_cpu_noamd 63 jnz verify_cpu_noamd
64 mov $1,%di # cpu is from AMD 64 mov $1,%di # cpu is from AMD
65 jmp verify_cpu_check
65 66
66verify_cpu_noamd: 67verify_cpu_noamd:
68 cmpl $0x756e6547,%ebx # GenuineIntel?
69 jnz verify_cpu_check
70 cmpl $0x49656e69,%edx
71 jnz verify_cpu_check
72 cmpl $0x6c65746e,%ecx
73 jnz verify_cpu_check
74
75 # only call IA32_MISC_ENABLE when:
76 # family > 6 || (family == 6 && model >= 0xd)
77 movl $0x1, %eax # check CPU family and model
78 cpuid
79 movl %eax, %ecx
80
81 andl $0x0ff00f00, %eax # mask family and extended family
82 shrl $8, %eax
83 cmpl $6, %eax
84 ja verify_cpu_clear_xd # family > 6, ok
85 jb verify_cpu_check # family < 6, skip
86
87 andl $0x000f00f0, %ecx # mask model and extended model
88 shrl $4, %ecx
89 cmpl $0xd, %ecx
90 jb verify_cpu_check # family == 6, model < 0xd, skip
91
92verify_cpu_clear_xd:
93 movl $MSR_IA32_MISC_ENABLE, %ecx
94 rdmsr
95 btrl $2, %edx # clear MSR_IA32_MISC_ENABLE_XD_DISABLE
96 jnc verify_cpu_check # only write MSR if bit was changed
97 wrmsr
98
99verify_cpu_check:
67 movl $0x1,%eax # Does the cpu have what it takes 100 movl $0x1,%eax # Does the cpu have what it takes
68 cpuid 101 cpuid
69 andl $REQUIRED_MASK0,%edx 102 andl $REQUIRED_MASK0,%edx
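
The new Intel branch above only clears XD_DISABLE when "family > 6 || (family == 6 && model >= 0xd)", computed from masked CPUID.1:EAX fields. A small C rendering of the same masking, with a hypothetical family-6/model-0x2a CPUID value, to show why both compares pass:

#include <stdio.h>

int main(void)
{
	unsigned int eax = 0x000206a7u;	/* hypothetical: family 6, ext. model 2, model 0xa */

	unsigned int fam = (eax & 0x0ff00f00u) >> 8;	/* ext. family + family -> 0x6    */
	unsigned int mod = (eax & 0x000f00f0u) >> 4;	/* ext. model  + model  -> 0x200a */

	/* Mirrors the ja/jb on 6 and the jb on 0xd in the assembly above. */
	int clear_xd = (fam > 6) || (fam == 6 && mod >= 0xd);

	printf("fam=%#x mod=%#x -> %s MSR_IA32_MISC_ENABLE_XD_DISABLE\n",
	       fam, mod, clear_xd ? "clear" : "leave");
	return 0;
}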
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
deleted file mode 100644
index e680ea52db9b..000000000000
--- a/arch/x86/kernel/visws_quirks.c
+++ /dev/null
@@ -1,666 +0,0 @@
1/*
2 * SGI Visual Workstation support and quirks, unmaintained.
3 *
4 * Split out from setup.c by davej@suse.de
5 *
6 * Copyright (C) 1999 Bent Hagemark, Ingo Molnar
7 *
8 * SGI Visual Workstation interrupt controller
9 *
10 * The Cobalt system ASIC in the Visual Workstation contains a "Cobalt" APIC
11 * which serves as the main interrupt controller in the system. Non-legacy
12 * hardware in the system uses this controller directly. Legacy devices
13 * are connected to the PIIX4 which in turn has its 8259(s) connected to
14 * one of the Cobalt APIC entries.
15 *
16 * 09/02/2000 - Updated for 2.4 by jbarnes@sgi.com
17 *
18 * 25/11/2002 - Updated for 2.5 by Andrey Panin <pazke@orbita1.ru>
19 */
20#include <linux/interrupt.h>
21#include <linux/module.h>
22#include <linux/init.h>
23#include <linux/smp.h>
24
25#include <asm/visws/cobalt.h>
26#include <asm/visws/piix4.h>
27#include <asm/io_apic.h>
28#include <asm/fixmap.h>
29#include <asm/reboot.h>
30#include <asm/setup.h>
31#include <asm/apic.h>
32#include <asm/e820.h>
33#include <asm/time.h>
34#include <asm/io.h>
35
36#include <linux/kernel_stat.h>
37
38#include <asm/i8259.h>
39#include <asm/irq_vectors.h>
40#include <asm/visws/lithium.h>
41
42#include <linux/sched.h>
43#include <linux/kernel.h>
44#include <linux/pci.h>
45#include <linux/pci_ids.h>
46
47extern int no_broadcast;
48
49char visws_board_type = -1;
50char visws_board_rev = -1;
51
52static void __init visws_time_init(void)
53{
54 printk(KERN_INFO "Starting Cobalt Timer system clock\n");
55
56 /* Set the countdown value */
57 co_cpu_write(CO_CPU_TIMEVAL, CO_TIME_HZ/HZ);
58
59 /* Start the timer */
60 co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) | CO_CTRL_TIMERUN);
61
62 /* Enable (unmask) the timer interrupt */
63 co_cpu_write(CO_CPU_CTRL, co_cpu_read(CO_CPU_CTRL) & ~CO_CTRL_TIMEMASK);
64
65 setup_default_timer_irq();
66}
67
68/* Replaces the default init_ISA_irqs in the generic setup */
69static void __init visws_pre_intr_init(void)
70{
71 init_VISWS_APIC_irqs();
72}
73
74/* Quirk for machine specific memory setup. */
75
76#define MB (1024 * 1024)
77
78unsigned long sgivwfb_mem_phys;
79unsigned long sgivwfb_mem_size;
80EXPORT_SYMBOL(sgivwfb_mem_phys);
81EXPORT_SYMBOL(sgivwfb_mem_size);
82
83long long mem_size __initdata = 0;
84
85static char * __init visws_memory_setup(void)
86{
87 long long gfx_mem_size = 8 * MB;
88
89 mem_size = boot_params.alt_mem_k;
90
91 if (!mem_size) {
92 printk(KERN_WARNING "Bootloader didn't set memory size, upgrade it !\n");
93 mem_size = 128 * MB;
94 }
95
96 /*
97 * this hardcodes the graphics memory to 8 MB
98 * it really should be sized dynamically (or at least
99 * set as a boot param)
100 */
101 if (!sgivwfb_mem_size) {
102 printk(KERN_WARNING "Defaulting to 8 MB framebuffer size\n");
103 sgivwfb_mem_size = 8 * MB;
104 }
105
106 /*
107 * Trim to nearest MB
108 */
109 sgivwfb_mem_size &= ~((1 << 20) - 1);
110 sgivwfb_mem_phys = mem_size - gfx_mem_size;
111
112 e820_add_region(0, LOWMEMSIZE(), E820_RAM);
113 e820_add_region(HIGH_MEMORY, mem_size - sgivwfb_mem_size - HIGH_MEMORY, E820_RAM);
114 e820_add_region(sgivwfb_mem_phys, sgivwfb_mem_size, E820_RESERVED);
115
116 return "PROM";
117}
118
119static void visws_machine_emergency_restart(void)
120{
121 /*
122 * Visual Workstations restart after this
123 * register is poked on the PIIX4
124 */
125 outb(PIIX4_RESET_VAL, PIIX4_RESET_PORT);
126}
127
128static void visws_machine_power_off(void)
129{
130 unsigned short pm_status;
131/* extern unsigned int pci_bus0; */
132
133 while ((pm_status = inw(PMSTS_PORT)) & 0x100)
134 outw(pm_status, PMSTS_PORT);
135
136 outw(PM_SUSPEND_ENABLE, PMCNTRL_PORT);
137
138 mdelay(10);
139
140#define PCI_CONF1_ADDRESS(bus, devfn, reg) \
141 (0x80000000 | (bus << 16) | (devfn << 8) | (reg & ~3))
142
143/* outl(PCI_CONF1_ADDRESS(pci_bus0, SPECIAL_DEV, SPECIAL_REG), 0xCF8); */
144 outl(PIIX_SPECIAL_STOP, 0xCFC);
145}
146
147static void __init visws_get_smp_config(unsigned int early)
148{
149}
150
151/*
152 * The Visual Workstation is Intel MP compliant in the hardware
153 * sense, but it doesn't have a BIOS(-configuration table).
154 * No problem for Linux.
155 */
156
157static void __init MP_processor_info(struct mpc_cpu *m)
158{
159 int ver, logical_apicid;
160 physid_mask_t apic_cpus;
161
162 if (!(m->cpuflag & CPU_ENABLED))
163 return;
164
165 logical_apicid = m->apicid;
166 printk(KERN_INFO "%sCPU #%d %u:%u APIC version %d\n",
167 m->cpuflag & CPU_BOOTPROCESSOR ? "Bootup " : "",
168 m->apicid, (m->cpufeature & CPU_FAMILY_MASK) >> 8,
169 (m->cpufeature & CPU_MODEL_MASK) >> 4, m->apicver);
170
171 if (m->cpuflag & CPU_BOOTPROCESSOR)
172 boot_cpu_physical_apicid = m->apicid;
173
174 ver = m->apicver;
175 if ((ver >= 0x14 && m->apicid >= 0xff) || m->apicid >= 0xf) {
176 printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
177 m->apicid, MAX_APICS);
178 return;
179 }
180
181 apic->apicid_to_cpu_present(m->apicid, &apic_cpus);
182 physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus);
183 /*
184 * Validate version
185 */
186 if (ver == 0x0) {
187 printk(KERN_ERR "BIOS bug, APIC version is 0 for CPU#%d! "
188 "fixing up to 0x10. (tell your hw vendor)\n",
189 m->apicid);
190 ver = 0x10;
191 }
192 apic_version[m->apicid] = ver;
193}
194
195static void __init visws_find_smp_config(void)
196{
197 struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS);
198 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
199
200 if (ncpus > CO_CPU_MAX) {
201 printk(KERN_WARNING "find_visws_smp: got cpu count of %d at %p\n",
202 ncpus, mp);
203
204 ncpus = CO_CPU_MAX;
205 }
206
207 if (ncpus > setup_max_cpus)
208 ncpus = setup_max_cpus;
209
210#ifdef CONFIG_X86_LOCAL_APIC
211 smp_found_config = 1;
212#endif
213 while (ncpus--)
214 MP_processor_info(mp++);
215
216 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
217}
218
219static void visws_trap_init(void);
220
221void __init visws_early_detect(void)
222{
223 int raw;
224
225 visws_board_type = (char)(inb_p(PIIX_GPI_BD_REG) & PIIX_GPI_BD_REG)
226 >> PIIX_GPI_BD_SHIFT;
227
228 if (visws_board_type < 0)
229 return;
230
231 /*
232 * Override the default platform setup functions
233 */
234 x86_init.resources.memory_setup = visws_memory_setup;
235 x86_init.mpparse.get_smp_config = visws_get_smp_config;
236 x86_init.mpparse.find_smp_config = visws_find_smp_config;
237 x86_init.irqs.pre_vector_init = visws_pre_intr_init;
238 x86_init.irqs.trap_init = visws_trap_init;
239 x86_init.timers.timer_init = visws_time_init;
240 x86_init.pci.init = pci_visws_init;
241 x86_init.pci.init_irq = x86_init_noop;
242
243 /*
244 * Install reboot quirks:
245 */
246 pm_power_off = visws_machine_power_off;
247 machine_ops.emergency_restart = visws_machine_emergency_restart;
248
249 /*
250 * Do not use broadcast IPIs:
251 */
252 no_broadcast = 0;
253
254#ifdef CONFIG_X86_IO_APIC
255 /*
256 * Turn off IO-APIC detection and initialization:
257 */
258 skip_ioapic_setup = 1;
259#endif
260
261 /*
262 * Get Board rev.
263 * First, we have to initialize the 307 part to allow us access
264 * to the GPIO registers. Let's map them at 0x0fc0 which is right
265 * after the PIIX4 PM section.
266 */
267 outb_p(SIO_DEV_SEL, SIO_INDEX);
268 outb_p(SIO_GP_DEV, SIO_DATA); /* Talk to GPIO regs. */
269
270 outb_p(SIO_DEV_MSB, SIO_INDEX);
271 outb_p(SIO_GP_MSB, SIO_DATA); /* MSB of GPIO base address */
272
273 outb_p(SIO_DEV_LSB, SIO_INDEX);
274 outb_p(SIO_GP_LSB, SIO_DATA); /* LSB of GPIO base address */
275
276 outb_p(SIO_DEV_ENB, SIO_INDEX);
277 outb_p(1, SIO_DATA); /* Enable GPIO registers. */
278
279 /*
280 * Now, we have to map the power management section to write
281 * a bit which enables access to the GPIO registers.
282 * What lunatic came up with this shit?
283 */
284 outb_p(SIO_DEV_SEL, SIO_INDEX);
285 outb_p(SIO_PM_DEV, SIO_DATA); /* Talk to GPIO regs. */
286
287 outb_p(SIO_DEV_MSB, SIO_INDEX);
288 outb_p(SIO_PM_MSB, SIO_DATA); /* MSB of PM base address */
289
290 outb_p(SIO_DEV_LSB, SIO_INDEX);
291 outb_p(SIO_PM_LSB, SIO_DATA); /* LSB of PM base address */
292
293 outb_p(SIO_DEV_ENB, SIO_INDEX);
294 outb_p(1, SIO_DATA); /* Enable PM registers. */
295
296 /*
297 * Now, write the PM register which enables the GPIO registers.
298 */
299 outb_p(SIO_PM_FER2, SIO_PM_INDEX);
300 outb_p(SIO_PM_GP_EN, SIO_PM_DATA);
301
302 /*
303 * Now, initialize the GPIO registers.
304 * We want them all to be inputs which is the
305 * power on default, so let's leave them alone.
306 * So, let's just read the board rev!
307 */
308 raw = inb_p(SIO_GP_DATA1);
309 raw &= 0x7f; /* 7 bits of valid board revision ID. */
310
311 if (visws_board_type == VISWS_320) {
312 if (raw < 0x6) {
313 visws_board_rev = 4;
314 } else if (raw < 0xc) {
315 visws_board_rev = 5;
316 } else {
317 visws_board_rev = 6;
318 }
319 } else if (visws_board_type == VISWS_540) {
320 visws_board_rev = 2;
321 } else {
322 visws_board_rev = raw;
323 }
324
325 printk(KERN_INFO "Silicon Graphics Visual Workstation %s (rev %d) detected\n",
326 (visws_board_type == VISWS_320 ? "320" :
327 (visws_board_type == VISWS_540 ? "540" :
328 "unknown")), visws_board_rev);
329}
330
331#define A01234 (LI_INTA_0 | LI_INTA_1 | LI_INTA_2 | LI_INTA_3 | LI_INTA_4)
332#define BCD (LI_INTB | LI_INTC | LI_INTD)
333#define ALLDEVS (A01234 | BCD)
334
335static __init void lithium_init(void)
336{
337 set_fixmap(FIX_LI_PCIA, LI_PCI_A_PHYS);
338 set_fixmap(FIX_LI_PCIB, LI_PCI_B_PHYS);
339
340 if ((li_pcia_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) ||
341 (li_pcia_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) {
342 printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'A');
343/* panic("This machine is not SGI Visual Workstation 320/540"); */
344 }
345
346 if ((li_pcib_read16(PCI_VENDOR_ID) != PCI_VENDOR_ID_SGI) ||
347 (li_pcib_read16(PCI_DEVICE_ID) != PCI_DEVICE_ID_SGI_LITHIUM)) {
348 printk(KERN_EMERG "Lithium hostbridge %c not found\n", 'B');
349/* panic("This machine is not SGI Visual Workstation 320/540"); */
350 }
351
352 li_pcia_write16(LI_PCI_INTEN, ALLDEVS);
353 li_pcib_write16(LI_PCI_INTEN, ALLDEVS);
354}
355
356static __init void cobalt_init(void)
357{
358 /*
359 * On normal SMP PC this is used only with SMP, but we have to
360 * use it and set it up here to start the Cobalt clock
361 */
362 set_fixmap(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE);
363 setup_local_APIC();
364 printk(KERN_INFO "Local APIC Version %#x, ID %#x\n",
365 (unsigned int)apic_read(APIC_LVR),
366 (unsigned int)apic_read(APIC_ID));
367
368 set_fixmap(FIX_CO_CPU, CO_CPU_PHYS);
369 set_fixmap(FIX_CO_APIC, CO_APIC_PHYS);
370 printk(KERN_INFO "Cobalt Revision %#lx, APIC ID %#lx\n",
371 co_cpu_read(CO_CPU_REV), co_apic_read(CO_APIC_ID));
372
373 /* Enable Cobalt APIC being careful to NOT change the ID! */
374 co_apic_write(CO_APIC_ID, co_apic_read(CO_APIC_ID) | CO_APIC_ENABLE);
375
376 printk(KERN_INFO "Cobalt APIC enabled: ID reg %#lx\n",
377 co_apic_read(CO_APIC_ID));
378}
379
380static void __init visws_trap_init(void)
381{
382 lithium_init();
383 cobalt_init();
384}
385
386/*
387 * IRQ controller / APIC support:
388 */
389
390static DEFINE_SPINLOCK(cobalt_lock);
391
392/*
393 * Set the given Cobalt APIC Redirection Table entry to point
394 * to the given IDT vector/index.
395 */
396static inline void co_apic_set(int entry, int irq)
397{
398 co_apic_write(CO_APIC_LO(entry), CO_APIC_LEVEL | (irq + FIRST_EXTERNAL_VECTOR));
399 co_apic_write(CO_APIC_HI(entry), 0);
400}
401
402/*
403 * Cobalt (IO)-APIC functions to handle PCI devices.
404 */
405static inline int co_apic_ide0_hack(void)
406{
407 extern char visws_board_type;
408 extern char visws_board_rev;
409
410 if (visws_board_type == VISWS_320 && visws_board_rev == 5)
411 return 5;
412 return CO_APIC_IDE0;
413}
414
415static int is_co_apic(unsigned int irq)
416{
417 if (IS_CO_APIC(irq))
418 return CO_APIC(irq);
419
420 switch (irq) {
421 case 0: return CO_APIC_CPU;
422 case CO_IRQ_IDE0: return co_apic_ide0_hack();
423 case CO_IRQ_IDE1: return CO_APIC_IDE1;
424 default: return -1;
425 }
426}
427
428
429/*
430 * This is the SGI Cobalt (IO-)APIC:
431 */
432
433static void enable_cobalt_irq(unsigned int irq)
434{
435 co_apic_set(is_co_apic(irq), irq);
436}
437
438static void disable_cobalt_irq(unsigned int irq)
439{
440 int entry = is_co_apic(irq);
441
442 co_apic_write(CO_APIC_LO(entry), CO_APIC_MASK);
443 co_apic_read(CO_APIC_LO(entry));
444}
445
446/*
447 * "irq" really just serves to identify the device. Here is where we
448 * map this to the Cobalt APIC entry where it's physically wired.
449 * This is called via request_irq -> setup_irq -> irq_desc->startup()
450 */
451static unsigned int startup_cobalt_irq(unsigned int irq)
452{
453 unsigned long flags;
454 struct irq_desc *desc = irq_to_desc(irq);
455
456 spin_lock_irqsave(&cobalt_lock, flags);
457 if ((desc->status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING)))
458 desc->status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING);
459 enable_cobalt_irq(irq);
460 spin_unlock_irqrestore(&cobalt_lock, flags);
461 return 0;
462}
463
464static void ack_cobalt_irq(unsigned int irq)
465{
466 unsigned long flags;
467
468 spin_lock_irqsave(&cobalt_lock, flags);
469 disable_cobalt_irq(irq);
470 apic_write(APIC_EOI, APIC_EIO_ACK);
471 spin_unlock_irqrestore(&cobalt_lock, flags);
472}
473
474static void end_cobalt_irq(unsigned int irq)
475{
476 unsigned long flags;
477 struct irq_desc *desc = irq_to_desc(irq);
478
479 spin_lock_irqsave(&cobalt_lock, flags);
480 if (!(desc->status & (IRQ_DISABLED | IRQ_INPROGRESS)))
481 enable_cobalt_irq(irq);
482 spin_unlock_irqrestore(&cobalt_lock, flags);
483}
484
485static struct irq_chip cobalt_irq_type = {
486 .name = "Cobalt-APIC",
487 .startup = startup_cobalt_irq,
488 .shutdown = disable_cobalt_irq,
489 .enable = enable_cobalt_irq,
490 .disable = disable_cobalt_irq,
491 .ack = ack_cobalt_irq,
492 .end = end_cobalt_irq,
493};
494
495
496/*
497 * This is the PIIX4-based 8259 that is wired up indirectly to Cobalt
498 * -- not the manner expected by the code in i8259.c.
499 *
500 * there is a 'master' physical interrupt source that gets sent to
501 * the CPU. But in the chipset there are various 'virtual' interrupts
502 * waiting to be handled. We represent this to Linux through a 'master'
503 * interrupt controller type, and through a special virtual interrupt-
504 * controller. Device drivers only see the virtual interrupt sources.
505 */
506static unsigned int startup_piix4_master_irq(unsigned int irq)
507{
508 legacy_pic->init(0);
509
510 return startup_cobalt_irq(irq);
511}
512
513static void end_piix4_master_irq(unsigned int irq)
514{
515 unsigned long flags;
516
517 spin_lock_irqsave(&cobalt_lock, flags);
518 enable_cobalt_irq(irq);
519 spin_unlock_irqrestore(&cobalt_lock, flags);
520}
521
522static struct irq_chip piix4_master_irq_type = {
523 .name = "PIIX4-master",
524 .startup = startup_piix4_master_irq,
525 .ack = ack_cobalt_irq,
526 .end = end_piix4_master_irq,
527};
528
529
530static struct irq_chip piix4_virtual_irq_type = {
531 .name = "PIIX4-virtual",
532};
533
534
535/*
536 * PIIX4-8259 master/virtual functions to handle interrupt requests
537 * from legacy devices: floppy, parallel, serial, rtc.
538 *
539 * None of these get Cobalt APIC entries, neither do they have IDT
540 * entries. These interrupts are purely virtual and distributed from
541 * the 'master' interrupt source: CO_IRQ_8259.
542 *
543 * When the 8259 interrupts its handler figures out which of these
544 * devices is interrupting and dispatches to its handler.
545 *
546 * CAREFUL: devices see the 'virtual' interrupt only. Thus disable/
547 * enable_irq gets the right irq. This 'master' irq is never directly
548 * manipulated by any driver.
549 */
550static irqreturn_t piix4_master_intr(int irq, void *dev_id)
551{
552 int realirq;
553 struct irq_desc *desc;
554 unsigned long flags;
555
556 raw_spin_lock_irqsave(&i8259A_lock, flags);
557
558 /* Find out what's interrupting in the PIIX4 master 8259 */
559 outb(0x0c, 0x20); /* OCW3 Poll command */
560 realirq = inb(0x20);
561
562 /*
563 * Bit 7 == 0 means invalid/spurious
564 */
565 if (unlikely(!(realirq & 0x80)))
566 goto out_unlock;
567
568 realirq &= 7;
569
570 if (unlikely(realirq == 2)) {
571 outb(0x0c, 0xa0);
572 realirq = inb(0xa0);
573
574 if (unlikely(!(realirq & 0x80)))
575 goto out_unlock;
576
577 realirq = (realirq & 7) + 8;
578 }
579
580 /* mask and ack interrupt */
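	/*
	 * 0x60 | n is an OCW2 specific EOI for line n; a slave IRQ also
	 * requires a specific EOI for the cascade line (IRQ2) on the master.
	 */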
581 cached_irq_mask |= 1 << realirq;
582 if (unlikely(realirq > 7)) {
583 inb(0xa1);
584 outb(cached_slave_mask, 0xa1);
585 outb(0x60 + (realirq & 7), 0xa0);
586 outb(0x60 + 2, 0x20);
587 } else {
588 inb(0x21);
589 outb(cached_master_mask, 0x21);
590 outb(0x60 + realirq, 0x20);
591 }
592
593 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
594
595 desc = irq_to_desc(realirq);
596
597 /*
598 * handle this 'virtual interrupt' as a Cobalt one now.
599 */
600 kstat_incr_irqs_this_cpu(realirq, desc);
601
602 if (likely(desc->action != NULL))
603 handle_IRQ_event(realirq, desc->action);
604
605 if (!(desc->status & IRQ_DISABLED))
606 legacy_pic->chip->unmask(realirq);
607
608 return IRQ_HANDLED;
609
610out_unlock:
611 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
612 return IRQ_NONE;
613}
614
615static struct irqaction master_action = {
616 .handler = piix4_master_intr,
617 .name = "PIIX4-8259",
618};
619
620static struct irqaction cascade_action = {
621 .handler = no_action,
622 .name = "cascade",
623};
624
625static inline void set_piix4_virtual_irq_type(void)
626{
627 piix4_virtual_irq_type.shutdown = i8259A_chip.mask;
628 piix4_virtual_irq_type.enable = i8259A_chip.unmask;
629 piix4_virtual_irq_type.disable = i8259A_chip.mask;
630}
631
632void init_VISWS_APIC_irqs(void)
633{
634 int i;
635
636 for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) {
637 struct irq_desc *desc = irq_to_desc(i);
638
639 desc->status = IRQ_DISABLED;
640 desc->action = 0;
641 desc->depth = 1;
642
643 if (i == 0) {
644 desc->chip = &cobalt_irq_type;
645 }
646 else if (i == CO_IRQ_IDE0) {
647 desc->chip = &cobalt_irq_type;
648 }
649 else if (i == CO_IRQ_IDE1) {
650 desc->chip = &cobalt_irq_type;
651 }
652 else if (i == CO_IRQ_8259) {
653 desc->chip = &piix4_master_irq_type;
654 }
655 else if (i < CO_IRQ_APIC0) {
656 set_piix4_virtual_irq_type();
657 desc->chip = &piix4_virtual_irq_type;
658 }
659 else if (IS_CO_APIC(i)) {
660 desc->chip = &cobalt_irq_type;
661 }
662 }
663
664 setup_irq(CO_IRQ_8259, &master_action);
665 setup_irq(2, &cascade_action);
666}
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 5ffb5622f793..863f8753ab0a 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -179,6 +179,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
179 if (pud_none_or_clear_bad(pud)) 179 if (pud_none_or_clear_bad(pud))
180 goto out; 180 goto out;
181 pmd = pmd_offset(pud, 0xA0000); 181 pmd = pmd_offset(pud, 0xA0000);
182 split_huge_page_pmd(mm, pmd);
182 if (pmd_none_or_clear_bad(pmd)) 183 if (pmd_none_or_clear_bad(pmd))
183 goto out; 184 goto out;
184 pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); 185 pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
@@ -551,8 +552,14 @@ cannot_handle:
551int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno) 552int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno)
552{ 553{
553 if (VMPI.is_vm86pus) { 554 if (VMPI.is_vm86pus) {
554 if ((trapno == 3) || (trapno == 1)) 555 if ((trapno == 3) || (trapno == 1)) {
555 return_to_32bit(regs, VM86_TRAP + (trapno << 8)); 556 KVM86->regs32->ax = VM86_TRAP + (trapno << 8);
557 /* setting this flag forces the code in entry_32.S to
558 call save_v86_state() and change the stack pointer
559 to KVM86->regs32 */
560 set_thread_flag(TIF_IRET);
561 return 0;
562 }
556 do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs)); 563 do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs));
557 return 0; 564 return 0;
558 } 565 }
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
deleted file mode 100644
index ce9fbacb7526..000000000000
--- a/arch/x86/kernel/vmi_32.c
+++ /dev/null
@@ -1,893 +0,0 @@
1/*
2 * VMI specific paravirt-ops implementation
3 *
4 * Copyright (C) 2005, VMware, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
14 * NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * Send feedback to zach@vmware.com
22 *
23 */
24
25#include <linux/module.h>
26#include <linux/cpu.h>
27#include <linux/bootmem.h>
28#include <linux/mm.h>
29#include <linux/highmem.h>
30#include <linux/sched.h>
31#include <linux/gfp.h>
32#include <asm/vmi.h>
33#include <asm/io.h>
34#include <asm/fixmap.h>
35#include <asm/apicdef.h>
36#include <asm/apic.h>
37#include <asm/pgalloc.h>
38#include <asm/processor.h>
39#include <asm/timer.h>
40#include <asm/vmi_time.h>
41#include <asm/kmap_types.h>
42#include <asm/setup.h>
43
44/* Convenient for calling VMI functions indirectly in the ROM */
45typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void);
46typedef u64 __attribute__((regparm(2))) (VROMLONGFUNC)(int);
47
48#define call_vrom_func(rom,func) \
49 (((VROMFUNC *)(rom->func))())
50
51#define call_vrom_long_func(rom,func,arg) \
52 (((VROMLONGFUNC *)(rom->func)) (arg))
53
54static struct vrom_header *vmi_rom;
55static int disable_pge;
56static int disable_pse;
57static int disable_sep;
58static int disable_tsc;
59static int disable_mtrr;
60static int disable_noidle;
61static int disable_vmi_timer;
62
63/* Cached VMI operations */
64static struct {
65 void (*cpuid)(void /* non-c */);
66 void (*_set_ldt)(u32 selector);
67 void (*set_tr)(u32 selector);
68 void (*write_idt_entry)(struct desc_struct *, int, u32, u32);
69 void (*write_gdt_entry)(struct desc_struct *, int, u32, u32);
70 void (*write_ldt_entry)(struct desc_struct *, int, u32, u32);
71 void (*set_kernel_stack)(u32 selector, u32 sp0);
72 void (*allocate_page)(u32, u32, u32, u32, u32);
73 void (*release_page)(u32, u32);
74 void (*set_pte)(pte_t, pte_t *, unsigned);
75 void (*update_pte)(pte_t *, unsigned);
76 void (*set_linear_mapping)(int, void *, u32, u32);
77 void (*_flush_tlb)(int);
78 void (*set_initial_ap_state)(int, int);
79 void (*halt)(void);
80 void (*set_lazy_mode)(int mode);
81} vmi_ops;
82
83/* Cached VMI operations */
84struct vmi_timer_ops vmi_timer_ops;
85
86/*
87 * VMI patching routines.
88 */
89#define MNEM_CALL 0xe8
90#define MNEM_JMP 0xe9
91#define MNEM_RET 0xc3
92
93#define IRQ_PATCH_INT_MASK 0
94#define IRQ_PATCH_DISABLE 5
95
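/*
 * A 5-byte CALL/JMP encodes a rel32 displacement measured from the end of
 * the instruction, hence the "dest - ip - 5" computed below.
 */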
96static inline void patch_offset(void *insnbuf,
97 unsigned long ip, unsigned long dest)
98{
99 *(unsigned long *)(insnbuf+1) = dest-ip-5;
100}
101
102static unsigned patch_internal(int call, unsigned len, void *insnbuf,
103 unsigned long ip)
104{
105 u64 reloc;
106 struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc;
107 reloc = call_vrom_long_func(vmi_rom, get_reloc, call);
108 switch(rel->type) {
109 case VMI_RELOCATION_CALL_REL:
110 BUG_ON(len < 5);
111 *(char *)insnbuf = MNEM_CALL;
112 patch_offset(insnbuf, ip, (unsigned long)rel->eip);
113 return 5;
114
115 case VMI_RELOCATION_JUMP_REL:
116 BUG_ON(len < 5);
117 *(char *)insnbuf = MNEM_JMP;
118 patch_offset(insnbuf, ip, (unsigned long)rel->eip);
119 return 5;
120
121 case VMI_RELOCATION_NOP:
122 /* obliterate the whole thing */
123 return 0;
124
125 case VMI_RELOCATION_NONE:
126 /* leave native code in place */
127 break;
128
129 default:
130 BUG();
131 }
132 return len;
133}
134
135/*
136 * Apply patch if appropriate, return length of new instruction
137 * sequence. The callee does nop padding for us.
138 */
139static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
140 unsigned long ip, unsigned len)
141{
142 switch (type) {
143 case PARAVIRT_PATCH(pv_irq_ops.irq_disable):
144 return patch_internal(VMI_CALL_DisableInterrupts, len,
145 insns, ip);
146 case PARAVIRT_PATCH(pv_irq_ops.irq_enable):
147 return patch_internal(VMI_CALL_EnableInterrupts, len,
148 insns, ip);
149 case PARAVIRT_PATCH(pv_irq_ops.restore_fl):
150 return patch_internal(VMI_CALL_SetInterruptMask, len,
151 insns, ip);
152 case PARAVIRT_PATCH(pv_irq_ops.save_fl):
153 return patch_internal(VMI_CALL_GetInterruptMask, len,
154 insns, ip);
155 case PARAVIRT_PATCH(pv_cpu_ops.iret):
156 return patch_internal(VMI_CALL_IRET, len, insns, ip);
157 case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit):
158 return patch_internal(VMI_CALL_SYSEXIT, len, insns, ip);
159 default:
160 break;
161 }
162 return len;
163}
164
165/* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */
166static void vmi_cpuid(unsigned int *ax, unsigned int *bx,
167 unsigned int *cx, unsigned int *dx)
168{
169 int override = 0;
170 if (*ax == 1)
171 override = 1;
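	/*
	 * The "0" and "2" constraints below tie the inputs to the eax/ecx
	 * outputs; the ROM entry point follows the hardware CPUID register
	 * convention rather than the regular C calling convention.
	 */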
172 asm volatile ("call *%6"
173 : "=a" (*ax),
174 "=b" (*bx),
175 "=c" (*cx),
176 "=d" (*dx)
177 : "0" (*ax), "2" (*cx), "r" (vmi_ops.cpuid));
178 if (override) {
179 if (disable_pse)
180 *dx &= ~X86_FEATURE_PSE;
181 if (disable_pge)
182 *dx &= ~X86_FEATURE_PGE;
183 if (disable_sep)
184 *dx &= ~X86_FEATURE_SEP;
185 if (disable_tsc)
186 *dx &= ~X86_FEATURE_TSC;
187 if (disable_mtrr)
188 *dx &= ~X86_FEATURE_MTRR;
189 }
190}
191
192static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new)
193{
194 if (gdt[nr].a != new->a || gdt[nr].b != new->b)
195 write_gdt_entry(gdt, nr, new, 0);
196}
197
198static void vmi_load_tls(struct thread_struct *t, unsigned int cpu)
199{
200 struct desc_struct *gdt = get_cpu_gdt_table(cpu);
201 vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 0, &t->tls_array[0]);
202 vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 1, &t->tls_array[1]);
203 vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 2, &t->tls_array[2]);
204}
205
206static void vmi_set_ldt(const void *addr, unsigned entries)
207{
208 unsigned cpu = smp_processor_id();
209 struct desc_struct desc;
210
211 pack_descriptor(&desc, (unsigned long)addr,
212 entries * sizeof(struct desc_struct) - 1,
213 DESC_LDT, 0);
214 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, &desc, DESC_LDT);
215 vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0);
216}
217
218static void vmi_set_tr(void)
219{
220 vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct));
221}
222
223static void vmi_write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
224{
225 u32 *idt_entry = (u32 *)g;
226 vmi_ops.write_idt_entry(dt, entry, idt_entry[0], idt_entry[1]);
227}
228
229static void vmi_write_gdt_entry(struct desc_struct *dt, int entry,
230 const void *desc, int type)
231{
232 u32 *gdt_entry = (u32 *)desc;
233 vmi_ops.write_gdt_entry(dt, entry, gdt_entry[0], gdt_entry[1]);
234}
235
236static void vmi_write_ldt_entry(struct desc_struct *dt, int entry,
237 const void *desc)
238{
239 u32 *ldt_entry = (u32 *)desc;
240 vmi_ops.write_ldt_entry(dt, entry, ldt_entry[0], ldt_entry[1]);
241}
242
243static void vmi_load_sp0(struct tss_struct *tss,
244 struct thread_struct *thread)
245{
246 tss->x86_tss.sp0 = thread->sp0;
247
248 /* This can only happen when SEP is enabled, no need to test "SEP"arately */
249 if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
250 tss->x86_tss.ss1 = thread->sysenter_cs;
251 wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
252 }
253 vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.sp0);
254}
255
256static void vmi_flush_tlb_user(void)
257{
258 vmi_ops._flush_tlb(VMI_FLUSH_TLB);
259}
260
261static void vmi_flush_tlb_kernel(void)
262{
263 vmi_ops._flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL);
264}
265
266/* Stub to do nothing at all; used for delays and unimplemented calls */
267static void vmi_nop(void)
268{
269}
270
271static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
272{
273 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
274}
275
276static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn)
277{
278 /*
279 * This call comes in very early, before mem_map is set up.
280 * It is called only for swapper_pg_dir, which already has
281 * data on it.
282 */
283 vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
284}
285
286static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count)
287{
288 vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
289}
290
291static void vmi_release_pte(unsigned long pfn)
292{
293 vmi_ops.release_page(pfn, VMI_PAGE_L1);
294}
295
296static void vmi_release_pmd(unsigned long pfn)
297{
298 vmi_ops.release_page(pfn, VMI_PAGE_L2);
299}
300
301/*
302 * We use the pgd_free hook for releasing the pgd page:
303 */
304static void vmi_pgd_free(struct mm_struct *mm, pgd_t *pgd)
305{
306 unsigned long pfn = __pa(pgd) >> PAGE_SHIFT;
307
308 vmi_ops.release_page(pfn, VMI_PAGE_L2);
309}
310
311/*
312 * Helper macros for MMU update flags. We can defer updates until a flush
313 * or page invalidation only if the update is to the current address space
314 * (otherwise, there is no flush). We must check against init_mm, since
315 * this could be a kernel update, which usually passes init_mm, although
316 * sometimes this check can be skipped if we know the particular function
317 * is only called on user mode PTEs. We could change the kernel to pass
318 * current->active_mm here, but in particular, I was unsure if changing
319 * mm/highmem.c to do this would still be correct on other architectures.
320 */
321#define is_current_as(mm, mustbeuser) ((mm) == current->active_mm || \
322 (!mustbeuser && (mm) == &init_mm))
323#define vmi_flags_addr(mm, addr, level, user) \
324 ((level) | (is_current_as(mm, user) ? \
325 (VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
326#define vmi_flags_addr_defer(mm, addr, level, user) \
327 ((level) | (is_current_as(mm, user) ? \
328 (VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
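/*
 * e.g. vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0) evaluates to
 * VMI_PAGE_PT | VMI_PAGE_CURRENT_AS | (addr & VMI_PAGE_VA_MASK) when mm is
 * the current (or init) address space, and to plain VMI_PAGE_PT otherwise.
 */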
329
330static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
331{
332 vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
333}
334
335static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
336{
337 vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0));
338}
339
340static void vmi_set_pte(pte_t *ptep, pte_t pte)
341{
342 /* XXX because of set_pmd_pte, this can be called on PT or PD layers */
343 vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT);
344}
345
346static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
347{
348 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
349}
350
351static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval)
352{
353#ifdef CONFIG_X86_PAE
354 const pte_t pte = { .pte = pmdval.pmd };
355#else
356 const pte_t pte = { pmdval.pud.pgd.pgd };
357#endif
358 vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD);
359}
360
361#ifdef CONFIG_X86_PAE
362
363static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
364{
365 /*
366 * XXX This is called from set_pmd_pte, but at both PT
367 * and PD layers so the VMI_PAGE_PT flag is wrong. But
368 * it is only called for large page mapping changes,
369 * the Xen backend doesn't support large pages, and the
370 * ESX backend doesn't depend on the flag.
371 */
372 set_64bit((unsigned long long *)ptep,pte_val(pteval));
373 vmi_ops.update_pte(ptep, VMI_PAGE_PT);
374}
375
376static void vmi_set_pud(pud_t *pudp, pud_t pudval)
377{
378 /* Um, eww */
379 const pte_t pte = { .pte = pudval.pgd.pgd };
380 vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP);
381}
382
383static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
384{
385 const pte_t pte = { .pte = 0 };
386 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
387}
388
389static void vmi_pmd_clear(pmd_t *pmd)
390{
391 const pte_t pte = { .pte = 0 };
392 vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD);
393}
394#endif
395
396#ifdef CONFIG_SMP
397static void __devinit
398vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
399 unsigned long start_esp)
400{
401 struct vmi_ap_state ap;
402
403 /* Default everything to zero. This is fine for most GPRs. */
404 memset(&ap, 0, sizeof(struct vmi_ap_state));
405
406 ap.gdtr_limit = GDT_SIZE - 1;
407 ap.gdtr_base = (unsigned long) get_cpu_gdt_table(phys_apicid);
408
409 ap.idtr_limit = IDT_ENTRIES * 8 - 1;
410 ap.idtr_base = (unsigned long) idt_table;
411
412 ap.ldtr = 0;
413
414 ap.cs = __KERNEL_CS;
415 ap.eip = (unsigned long) start_eip;
416 ap.ss = __KERNEL_DS;
417 ap.esp = (unsigned long) start_esp;
418
419 ap.ds = __USER_DS;
420 ap.es = __USER_DS;
421 ap.fs = __KERNEL_PERCPU;
422 ap.gs = __KERNEL_STACK_CANARY;
423
424 ap.eflags = 0;
425
426#ifdef CONFIG_X86_PAE
427 /* efer should match BSP efer. */
428 if (cpu_has_nx) {
429 unsigned l, h;
430 rdmsr(MSR_EFER, l, h);
431 ap.efer = (unsigned long long) h << 32 | l;
432 }
433#endif
434
435 ap.cr3 = __pa(swapper_pg_dir);
436 /* Protected mode, paging, AM, WP, NE, MP. */
437 ap.cr0 = 0x80050023;
438 ap.cr4 = mmu_cr4_features;
439 vmi_ops.set_initial_ap_state((u32)&ap, phys_apicid);
440}
441#endif
442
443static void vmi_start_context_switch(struct task_struct *prev)
444{
445 paravirt_start_context_switch(prev);
446 vmi_ops.set_lazy_mode(2);
447}
448
449static void vmi_end_context_switch(struct task_struct *next)
450{
451 vmi_ops.set_lazy_mode(0);
452 paravirt_end_context_switch(next);
453}
454
455static void vmi_enter_lazy_mmu(void)
456{
457 paravirt_enter_lazy_mmu();
458 vmi_ops.set_lazy_mode(1);
459}
460
461static void vmi_leave_lazy_mmu(void)
462{
463 vmi_ops.set_lazy_mode(0);
464 paravirt_leave_lazy_mmu();
465}
466
467static inline int __init check_vmi_rom(struct vrom_header *rom)
468{
469 struct pci_header *pci;
470 struct pnp_header *pnp;
471 const char *manufacturer = "UNKNOWN";
472 const char *product = "UNKNOWN";
473 const char *license = "unspecified";
474
475 if (rom->rom_signature != 0xaa55)
476 return 0;
477 if (rom->vrom_signature != VMI_SIGNATURE)
478 return 0;
479 if (rom->api_version_maj != VMI_API_REV_MAJOR ||
480 rom->api_version_min+1 < VMI_API_REV_MINOR+1) {
481 printk(KERN_WARNING "VMI: Found mismatched rom version %d.%d\n",
482 rom->api_version_maj,
483 rom->api_version_min);
484 return 0;
485 }
486
487 /*
488 * Relying on the VMI_SIGNATURE field is not 100% safe, so check
489 * the PCI header and device type to make sure this is really a
490 * VMI device.
491 */
492 if (!rom->pci_header_offs) {
493 printk(KERN_WARNING "VMI: ROM does not contain PCI header.\n");
494 return 0;
495 }
496
497 pci = (struct pci_header *)((char *)rom+rom->pci_header_offs);
498 if (pci->vendorID != PCI_VENDOR_ID_VMWARE ||
499 pci->deviceID != PCI_DEVICE_ID_VMWARE_VMI) {
500 /* Allow it to run... anyway, but warn */
501 printk(KERN_WARNING "VMI: ROM from unknown manufacturer\n");
502 }
503
504 if (rom->pnp_header_offs) {
505 pnp = (struct pnp_header *)((char *)rom+rom->pnp_header_offs);
506 if (pnp->manufacturer_offset)
507 manufacturer = (const char *)rom+pnp->manufacturer_offset;
508 if (pnp->product_offset)
509 product = (const char *)rom+pnp->product_offset;
510 }
511
512 if (rom->license_offs)
513 license = (char *)rom+rom->license_offs;
514
515 printk(KERN_INFO "VMI: Found %s %s, API version %d.%d, ROM version %d.%d\n",
516 manufacturer, product,
517 rom->api_version_maj, rom->api_version_min,
518 pci->rom_version_maj, pci->rom_version_min);
519
520 /* Don't allow BSD/MIT here for now because we don't want to end up
521 with any binary-only shim layers */
522 if (strcmp(license, "GPL") && strcmp(license, "GPL v2")) {
523 printk(KERN_WARNING "VMI: Non GPL license `%s' found for ROM. Not used.\n",
524 license);
525 return 0;
526 }
527
528 return 1;
529}
530
531/*
532 * Probe for the VMI option ROM
533 */
534static inline int __init probe_vmi_rom(void)
535{
536 unsigned long base;
537
538 /* VMI ROM is in option ROM area, check signature */
539 for (base = 0xC0000; base < 0xE0000; base += 2048) {
540 struct vrom_header *romstart;
541 romstart = (struct vrom_header *)isa_bus_to_virt(base);
542 if (check_vmi_rom(romstart)) {
543 vmi_rom = romstart;
544 return 1;
545 }
546 }
547 return 0;
548}
549
550/*
551 * VMI setup common to all processors
552 */
553void vmi_bringup(void)
554{
555 /* We must establish the lowmem mapping for MMU ops to work */
556 if (vmi_ops.set_linear_mapping)
557 vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, MAXMEM_PFN, 0);
558}
559
560/*
561 * Return a pointer to a VMI function or NULL if unimplemented
562 */
563static void *vmi_get_function(int vmicall)
564{
565 u64 reloc;
566 const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
567 reloc = call_vrom_long_func(vmi_rom, get_reloc, vmicall);
568 BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL);
569 if (rel->type == VMI_RELOCATION_CALL_REL)
570 return (void *)rel->eip;
571 else
572 return NULL;
573}
574
575/*
576 * Helper macro for making the VMI paravirt-ops fill code readable.
577 * For unimplemented operations, fall back to default, unless nop
578 * is returned by the ROM.
579 */
580#define para_fill(opname, vmicall) \
581do { \
582 reloc = call_vrom_long_func(vmi_rom, get_reloc, \
583 VMI_CALL_##vmicall); \
584 if (rel->type == VMI_RELOCATION_CALL_REL) \
585 opname = (void *)rel->eip; \
586 else if (rel->type == VMI_RELOCATION_NOP) \
587 opname = (void *)vmi_nop; \
588 else if (rel->type != VMI_RELOCATION_NONE) \
589 printk(KERN_WARNING "VMI: Unknown relocation " \
590 "type %d for " #vmicall"\n",\
591 rel->type); \
592} while (0)
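/*
 * e.g. para_fill(pv_cpu_ops.clts, CLTS) looks up VMI_CALL_CLTS in the ROM
 * and points pv_cpu_ops.clts at the ROM entry, at vmi_nop, or leaves the
 * native implementation in place, depending on the relocation type.
 */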
593
594/*
595 * Helper macro for making the VMI paravirt-ops fill code readable.
596 * For cached operations which do not match the VMI ROM ABI and must
597 * go through a translation stub. Ignore NOPs, since it is not clear that
598 * a NOP VMI function corresponds to a NOP paravirt-op when the
599 * functions are not in 1-1 correspondence.
600 */
601#define para_wrap(opname, wrapper, cache, vmicall) \
602do { \
603 reloc = call_vrom_long_func(vmi_rom, get_reloc, \
604 VMI_CALL_##vmicall); \
605 BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL); \
606 if (rel->type == VMI_RELOCATION_CALL_REL) { \
607 opname = wrapper; \
608 vmi_ops.cache = (void *)rel->eip; \
609 } \
610} while (0)
611
612/*
613 * Activate the VMI interface and switch into paravirtualized mode
614 */
615static inline int __init activate_vmi(void)
616{
617 short kernel_cs;
618 u64 reloc;
619 const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
620
621 /*
622 * Prevent page tables from being allocated in highmem, even if
623 * CONFIG_HIGHPTE is enabled.
624 */
625 __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
626
627 if (call_vrom_func(vmi_rom, vmi_init) != 0) {
628 printk(KERN_ERR "VMI ROM failed to initialize!");
629 return 0;
630 }
631 savesegment(cs, kernel_cs);
632
633 pv_info.paravirt_enabled = 1;
634 pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
635 pv_info.name = "vmi [deprecated]";
636
637 pv_init_ops.patch = vmi_patch;
638
639 /*
640 * Many of these operations are ABI compatible with VMI.
641 * This means we can fill in the paravirt-ops with direct
642 * pointers into the VMI ROM. If the calling convention for
643 * these operations changes, this code needs to be updated.
644 *
645 * Exceptions
646 * CPUID paravirt-op uses pointers, not the native ISA
647 * halt has no VMI equivalent; all VMI halts are "safe"
648 * no MSR support yet - just trap and emulate. VMI uses the
649 * same ABI as the native ISA, but Linux wants exceptions
650 * from bogus MSR read / write handled
651 * rdpmc is not yet used in Linux
652 */
653
654 /* CPUID is special, so very special it gets wrapped like a present */
655 para_wrap(pv_cpu_ops.cpuid, vmi_cpuid, cpuid, CPUID);
656
657 para_fill(pv_cpu_ops.clts, CLTS);
658 para_fill(pv_cpu_ops.get_debugreg, GetDR);
659 para_fill(pv_cpu_ops.set_debugreg, SetDR);
660 para_fill(pv_cpu_ops.read_cr0, GetCR0);
661 para_fill(pv_mmu_ops.read_cr2, GetCR2);
662 para_fill(pv_mmu_ops.read_cr3, GetCR3);
663 para_fill(pv_cpu_ops.read_cr4, GetCR4);
664 para_fill(pv_cpu_ops.write_cr0, SetCR0);
665 para_fill(pv_mmu_ops.write_cr2, SetCR2);
666 para_fill(pv_mmu_ops.write_cr3, SetCR3);
667 para_fill(pv_cpu_ops.write_cr4, SetCR4);
668
669 para_fill(pv_irq_ops.save_fl.func, GetInterruptMask);
670 para_fill(pv_irq_ops.restore_fl.func, SetInterruptMask);
671 para_fill(pv_irq_ops.irq_disable.func, DisableInterrupts);
672 para_fill(pv_irq_ops.irq_enable.func, EnableInterrupts);
673
674 para_fill(pv_cpu_ops.wbinvd, WBINVD);
675 para_fill(pv_cpu_ops.read_tsc, RDTSC);
676
677 /* The following we emulate with trap and emulate for now */
678 /* paravirt_ops.read_msr = vmi_rdmsr */
679 /* paravirt_ops.write_msr = vmi_wrmsr */
680 /* paravirt_ops.rdpmc = vmi_rdpmc */
681
682 /* TR interface doesn't pass TR value, wrap */
683 para_wrap(pv_cpu_ops.load_tr_desc, vmi_set_tr, set_tr, SetTR);
684
685 /* LDT is special, too */
686 para_wrap(pv_cpu_ops.set_ldt, vmi_set_ldt, _set_ldt, SetLDT);
687
688 para_fill(pv_cpu_ops.load_gdt, SetGDT);
689 para_fill(pv_cpu_ops.load_idt, SetIDT);
690 para_fill(pv_cpu_ops.store_gdt, GetGDT);
691 para_fill(pv_cpu_ops.store_idt, GetIDT);
692 para_fill(pv_cpu_ops.store_tr, GetTR);
693 pv_cpu_ops.load_tls = vmi_load_tls;
694 para_wrap(pv_cpu_ops.write_ldt_entry, vmi_write_ldt_entry,
695 write_ldt_entry, WriteLDTEntry);
696 para_wrap(pv_cpu_ops.write_gdt_entry, vmi_write_gdt_entry,
697 write_gdt_entry, WriteGDTEntry);
698 para_wrap(pv_cpu_ops.write_idt_entry, vmi_write_idt_entry,
699 write_idt_entry, WriteIDTEntry);
700 para_wrap(pv_cpu_ops.load_sp0, vmi_load_sp0, set_kernel_stack, UpdateKernelStack);
701 para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask);
702 para_fill(pv_cpu_ops.io_delay, IODelay);
703
704 para_wrap(pv_cpu_ops.start_context_switch, vmi_start_context_switch,
705 set_lazy_mode, SetLazyMode);
706 para_wrap(pv_cpu_ops.end_context_switch, vmi_end_context_switch,
707 set_lazy_mode, SetLazyMode);
708
709 para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu,
710 set_lazy_mode, SetLazyMode);
711 para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy_mmu,
712 set_lazy_mode, SetLazyMode);
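	/*
	 * All four SetLazyMode wrappers above share one ROM call and differ
	 * only in the mode argument: 2 while switching context, 1 for batched
	 * MMU updates, 0 to leave lazy mode.
	 */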
713
714 /* user and kernel flush are just handled with different flags to FlushTLB */
715 para_wrap(pv_mmu_ops.flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB);
716 para_wrap(pv_mmu_ops.flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB);
717 para_fill(pv_mmu_ops.flush_tlb_single, InvalPage);
718
719 /*
720 * Until a standard flag format can be agreed on, we need to
721 * implement these as wrappers in Linux. Get the VMI ROM
722 * function pointers for the two backend calls.
723 */
724#ifdef CONFIG_X86_PAE
725 vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxELong);
726 vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxELong);
727#else
728 vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxE);
729 vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxE);
730#endif
731
732 if (vmi_ops.set_pte) {
733 pv_mmu_ops.set_pte = vmi_set_pte;
734 pv_mmu_ops.set_pte_at = vmi_set_pte_at;
735 pv_mmu_ops.set_pmd = vmi_set_pmd;
736#ifdef CONFIG_X86_PAE
737 pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic;
738 pv_mmu_ops.set_pud = vmi_set_pud;
739 pv_mmu_ops.pte_clear = vmi_pte_clear;
740 pv_mmu_ops.pmd_clear = vmi_pmd_clear;
741#endif
742 }
743
744 if (vmi_ops.update_pte) {
745 pv_mmu_ops.pte_update = vmi_update_pte;
746 pv_mmu_ops.pte_update_defer = vmi_update_pte_defer;
747 }
748
749 vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage);
750 if (vmi_ops.allocate_page) {
751 pv_mmu_ops.alloc_pte = vmi_allocate_pte;
752 pv_mmu_ops.alloc_pmd = vmi_allocate_pmd;
753 pv_mmu_ops.alloc_pmd_clone = vmi_allocate_pmd_clone;
754 }
755
756 vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage);
757 if (vmi_ops.release_page) {
758 pv_mmu_ops.release_pte = vmi_release_pte;
759 pv_mmu_ops.release_pmd = vmi_release_pmd;
760 pv_mmu_ops.pgd_free = vmi_pgd_free;
761 }
762
763 /* Set linear is needed in all cases */
764 vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
765
766 /*
767 * These MUST always be patched. Don't support indirect jumps
768 * through these operations, as the VMI interface may use either
769 * a jump or a call to get to these operations, depending on
770 * the backend. They are performance critical anyway, so requiring
771 * a patch is not a big problem.
772 */
773 pv_cpu_ops.irq_enable_sysexit = (void *)0xfeedbab0;
774 pv_cpu_ops.iret = (void *)0xbadbab0;
775
776#ifdef CONFIG_SMP
777 para_wrap(pv_apic_ops.startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState);
778#endif
779
780#ifdef CONFIG_X86_LOCAL_APIC
781 para_fill(apic->read, APICRead);
782 para_fill(apic->write, APICWrite);
783#endif
784
785 /*
786 * Check for VMI timer functionality by probing for a cycle frequency method
787 */
788 reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_GetCycleFrequency);
789 if (!disable_vmi_timer && rel->type != VMI_RELOCATION_NONE) {
790 vmi_timer_ops.get_cycle_frequency = (void *)rel->eip;
791 vmi_timer_ops.get_cycle_counter =
792 vmi_get_function(VMI_CALL_GetCycleCounter);
793 vmi_timer_ops.get_wallclock =
794 vmi_get_function(VMI_CALL_GetWallclockTime);
795 vmi_timer_ops.wallclock_updated =
796 vmi_get_function(VMI_CALL_WallclockUpdated);
797 vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm);
798 vmi_timer_ops.cancel_alarm =
799 vmi_get_function(VMI_CALL_CancelAlarm);
800 x86_init.timers.timer_init = vmi_time_init;
801#ifdef CONFIG_X86_LOCAL_APIC
802 x86_init.timers.setup_percpu_clockev = vmi_time_bsp_init;
803 x86_cpuinit.setup_percpu_clockev = vmi_time_ap_init;
804#endif
805 pv_time_ops.sched_clock = vmi_sched_clock;
806 x86_platform.calibrate_tsc = vmi_tsc_khz;
807 x86_platform.get_wallclock = vmi_get_wallclock;
808 x86_platform.set_wallclock = vmi_set_wallclock;
809
810 /* We have true wallclock functions; disable CMOS clock sync */
811 no_sync_cmos_clock = 1;
812 } else {
813 disable_noidle = 1;
814 disable_vmi_timer = 1;
815 }
816
817 para_fill(pv_irq_ops.safe_halt, Halt);
818
819 /*
820 * Alternative instruction rewriting doesn't happen soon enough
821 * to convert VMI_IRET to a call instead of a jump; so we have
822 * to do this before IRQs get reenabled. Fortunately, it is
823 * idempotent.
824 */
825 apply_paravirt(__parainstructions, __parainstructions_end);
826
827 vmi_bringup();
828
829 return 1;
830}
831
832#undef para_fill
833
834void __init vmi_init(void)
835{
836 if (!vmi_rom)
837 probe_vmi_rom();
838 else
839 check_vmi_rom(vmi_rom);
840
841 /* In case probing for or validating the ROM failed, bail */
842 if (!vmi_rom)
843 return;
844
845 reserve_top_address(-vmi_rom->virtual_top);
846
847#ifdef CONFIG_X86_IO_APIC
848 /* This is virtual hardware; timer routing is wired correctly */
849 no_timer_check = 1;
850#endif
851}
852
853void __init vmi_activate(void)
854{
855 unsigned long flags;
856
857 if (!vmi_rom)
858 return;
859
860 local_irq_save(flags);
861 activate_vmi();
862 local_irq_restore(flags & X86_EFLAGS_IF);
863}
864
865static int __init parse_vmi(char *arg)
866{
867 if (!arg)
868 return -EINVAL;
869
870 if (!strcmp(arg, "disable_pge")) {
871 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
872 disable_pge = 1;
873 } else if (!strcmp(arg, "disable_pse")) {
874 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PSE);
875 disable_pse = 1;
876 } else if (!strcmp(arg, "disable_sep")) {
877 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP);
878 disable_sep = 1;
879 } else if (!strcmp(arg, "disable_tsc")) {
880 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC);
881 disable_tsc = 1;
882 } else if (!strcmp(arg, "disable_mtrr")) {
883 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_MTRR);
884 disable_mtrr = 1;
885 } else if (!strcmp(arg, "disable_timer")) {
886 disable_vmi_timer = 1;
887 disable_noidle = 1;
888 } else if (!strcmp(arg, "disable_noidle"))
889 disable_noidle = 1;
890 return 0;
891}
892
893early_param("vmi", parse_vmi);
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
deleted file mode 100644
index 5e1ff66ecd73..000000000000
--- a/arch/x86/kernel/vmiclock_32.c
+++ /dev/null
@@ -1,317 +0,0 @@
1/*
2 * VMI paravirtual timer support routines.
3 *
4 * Copyright (C) 2007, VMware, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
14 * NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 */
22
23#include <linux/smp.h>
24#include <linux/interrupt.h>
25#include <linux/cpumask.h>
26#include <linux/clocksource.h>
27#include <linux/clockchips.h>
28
29#include <asm/vmi.h>
30#include <asm/vmi_time.h>
31#include <asm/apicdef.h>
32#include <asm/apic.h>
33#include <asm/timer.h>
34#include <asm/i8253.h>
35#include <asm/irq_vectors.h>
36
37#define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
38#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
39
40static DEFINE_PER_CPU(struct clock_event_device, local_events);
41
42static inline u32 vmi_counter(u32 flags)
43{
44 /* Given VMI_ONESHOT or VMI_PERIODIC, return the corresponding
45 * cycle counter. */
46 return flags & VMI_ALARM_COUNTER_MASK;
47}
48
49/* paravirt_ops.get_wallclock = vmi_get_wallclock */
50unsigned long vmi_get_wallclock(void)
51{
52 unsigned long long wallclock;
53 wallclock = vmi_timer_ops.get_wallclock(); // nsec
54 (void)do_div(wallclock, 1000000000); // sec
55
56 return wallclock;
57}
58
59/* paravirt_ops.set_wallclock = vmi_set_wallclock */
60int vmi_set_wallclock(unsigned long now)
61{
62 return 0;
63}
64
65/* paravirt_ops.sched_clock = vmi_sched_clock */
66unsigned long long vmi_sched_clock(void)
67{
68 return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));
69}
70
71/* x86_platform.calibrate_tsc = vmi_tsc_khz */
72unsigned long vmi_tsc_khz(void)
73{
74 unsigned long long khz;
75 khz = vmi_timer_ops.get_cycle_frequency();
76 (void)do_div(khz, 1000);
77 return khz;
78}
79
80static inline unsigned int vmi_get_timer_vector(void)
81{
82 return IRQ0_VECTOR;
83}
84
85/** vmi clockchip */
86#ifdef CONFIG_X86_LOCAL_APIC
87static unsigned int startup_timer_irq(unsigned int irq)
88{
89 unsigned long val = apic_read(APIC_LVTT);
90 apic_write(APIC_LVTT, vmi_get_timer_vector());
91
92 return (val & APIC_SEND_PENDING);
93}
94
95static void mask_timer_irq(unsigned int irq)
96{
97 unsigned long val = apic_read(APIC_LVTT);
98 apic_write(APIC_LVTT, val | APIC_LVT_MASKED);
99}
100
101static void unmask_timer_irq(unsigned int irq)
102{
103 unsigned long val = apic_read(APIC_LVTT);
104 apic_write(APIC_LVTT, val & ~APIC_LVT_MASKED);
105}
106
107static void ack_timer_irq(unsigned int irq)
108{
109 ack_APIC_irq();
110}
111
112static struct irq_chip vmi_chip __read_mostly = {
113 .name = "VMI-LOCAL",
114 .startup = startup_timer_irq,
115 .mask = mask_timer_irq,
116 .unmask = unmask_timer_irq,
117 .ack = ack_timer_irq
118};
119#endif
120
121/** vmi clockevent */
122#define VMI_ALARM_WIRED_IRQ0 0x00000000
123#define VMI_ALARM_WIRED_LVTT 0x00010000
124static int vmi_wiring = VMI_ALARM_WIRED_IRQ0;
125
126static inline int vmi_get_alarm_wiring(void)
127{
128 return vmi_wiring;
129}
130
131static void vmi_timer_set_mode(enum clock_event_mode mode,
132 struct clock_event_device *evt)
133{
134 cycle_t now, cycles_per_hz;
135 BUG_ON(!irqs_disabled());
136
137 switch (mode) {
138 case CLOCK_EVT_MODE_ONESHOT:
139 case CLOCK_EVT_MODE_RESUME:
140 break;
141 case CLOCK_EVT_MODE_PERIODIC:
142 cycles_per_hz = vmi_timer_ops.get_cycle_frequency();
143 (void)do_div(cycles_per_hz, HZ);
144 now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_PERIODIC));
145 vmi_timer_ops.set_alarm(VMI_PERIODIC, now, cycles_per_hz);
146 break;
147 case CLOCK_EVT_MODE_UNUSED:
148 case CLOCK_EVT_MODE_SHUTDOWN:
149 switch (evt->mode) {
150 case CLOCK_EVT_MODE_ONESHOT:
151 vmi_timer_ops.cancel_alarm(VMI_ONESHOT);
152 break;
153 case CLOCK_EVT_MODE_PERIODIC:
154 vmi_timer_ops.cancel_alarm(VMI_PERIODIC);
155 break;
156 default:
157 break;
158 }
159 break;
160 default:
161 break;
162 }
163}
164
165static int vmi_timer_next_event(unsigned long delta,
166 struct clock_event_device *evt)
167{
168 /* Unfortunately, set_next_event interface only passes relative
169 * expiry, but we want absolute expiry. It'd be better if we
170 * were passed an absolute expiry, since a bunch of time may
171 * have been stolen between the time the delta is computed and
172 * when we set the alarm below. */
173 cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT));
174
175 BUG_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
176 vmi_timer_ops.set_alarm(VMI_ONESHOT, now + delta, 0);
177 return 0;
178}
179
180static struct clock_event_device vmi_clockevent = {
181 .name = "vmi-timer",
182 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
183 .shift = 22,
184 .set_mode = vmi_timer_set_mode,
185 .set_next_event = vmi_timer_next_event,
186 .rating = 1000,
187 .irq = 0,
188};
189
190static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id)
191{
192 struct clock_event_device *evt = &__get_cpu_var(local_events);
193 evt->event_handler(evt);
194 return IRQ_HANDLED;
195}
196
197static struct irqaction vmi_clock_action = {
198 .name = "vmi-timer",
199 .handler = vmi_timer_interrupt,
200 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER,
201};
202
203static void __devinit vmi_time_init_clockevent(void)
204{
205 cycle_t cycles_per_msec;
206 struct clock_event_device *evt;
207
208 int cpu = smp_processor_id();
209 evt = &__get_cpu_var(local_events);
210
211 /* Use cycles_per_msec since div_sc params are 32-bits. */
212 cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
213 (void)do_div(cycles_per_msec, 1000);
214
215 memcpy(evt, &vmi_clockevent, sizeof(*evt));
216 /* Must pick .shift such that .mult fits in 32-bits. Choosing
217 * .shift to be 22 allows 2^(32-22) cycles per nanosecond
218 * before overflow. */
219 evt->mult = div_sc(cycles_per_msec, NSEC_PER_MSEC, evt->shift);
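	/*
	 * e.g. for a hypothetical 1 GHz cycle counter cycles_per_msec is 10^6,
	 * so mult = (10^6 << 22) / NSEC_PER_MSEC = 1 << 22, well within 32 bits.
	 */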
220 /* Upper bound is clockevent's use of ulong for cycle deltas. */
221 evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt);
222 evt->min_delta_ns = clockevent_delta2ns(1, evt);
223 evt->cpumask = cpumask_of(cpu);
224
225 printk(KERN_WARNING "vmi: registering clock event %s. mult=%u shift=%u\n",
226 evt->name, evt->mult, evt->shift);
227 clockevents_register_device(evt);
228}
229
230void __init vmi_time_init(void)
231{
232 unsigned int cpu;
233 /* Disable PIT: BIOSes start PIT CH0 with an 18.2 Hz periodic tick. */
234 outb_pit(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
235
236 vmi_time_init_clockevent();
237 setup_irq(0, &vmi_clock_action);
238 for_each_possible_cpu(cpu)
239 per_cpu(vector_irq, cpu)[vmi_get_timer_vector()] = 0;
240}
241
242#ifdef CONFIG_X86_LOCAL_APIC
243void __devinit vmi_time_bsp_init(void)
244{
245 /*
246 * On APIC systems, we want local timers to fire on each cpu. We do
247 * this by programming LVTT to deliver timer events to the IRQ handler
248 * for IRQ-0, since we can't re-use the APIC local timer handler
249 * without interfering with that code.
250 */
251 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
252 local_irq_disable();
253#ifdef CONFIG_SMP
254 /*
255 * XXX handle_percpu_irq only defined for SMP; we need to switch over
256 * to using it, since this is a local interrupt, which each CPU must
257 * handle individually without locking out or dropping simultaneous
258 * local timers on other CPUs. We also don't want to trigger the
259 * quirk workaround code for interrupts which gets invoked from
260 * handle_percpu_irq via eoi, so we use our own IRQ chip.
261 */
262 set_irq_chip_and_handler_name(0, &vmi_chip, handle_percpu_irq, "lvtt");
263#else
264 set_irq_chip_and_handler_name(0, &vmi_chip, handle_edge_irq, "lvtt");
265#endif
266 vmi_wiring = VMI_ALARM_WIRED_LVTT;
267 apic_write(APIC_LVTT, vmi_get_timer_vector());
268 local_irq_enable();
269 clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
270}
271
272void __devinit vmi_time_ap_init(void)
273{
274 vmi_time_init_clockevent();
275 apic_write(APIC_LVTT, vmi_get_timer_vector());
276}
277#endif
278
279/** vmi clocksource */
280static struct clocksource clocksource_vmi;
281
282static cycle_t read_real_cycles(struct clocksource *cs)
283{
284 cycle_t ret = (cycle_t)vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
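	/*
	 * Never return a value behind cycle_last; this keeps the clocksource
	 * monotonic even if the cycle counters seen on different CPUs drift
	 * slightly apart.
	 */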
285 return max(ret, clocksource_vmi.cycle_last);
286}
287
288static struct clocksource clocksource_vmi = {
289 .name = "vmi-timer",
290 .rating = 450,
291 .read = read_real_cycles,
292 .mask = CLOCKSOURCE_MASK(64),
293 .mult = 0, /* to be set */
294 .shift = 22,
295 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
296};
297
298static int __init init_vmi_clocksource(void)
299{
300 cycle_t cycles_per_msec;
301
302 if (!vmi_timer_ops.get_cycle_frequency)
303 return 0;
304 /* Use khz2mult rather than hz2mult since hz arg is only 32-bits. */
305 cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
306 (void)do_div(cycles_per_msec, 1000);
307
308 /* Note that clocksource.{mult, shift} converts in the opposite direction
309 * from clockevents: cycles to nanoseconds rather than nanoseconds to cycles. */
310 clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec,
311 clocksource_vmi.shift);
312
313 printk(KERN_WARNING "vmi: registering clock source khz=%lld\n", cycles_per_msec);
314 return clocksource_register(&clocksource_vmi);
315
316}
317module_init(init_vmi_clocksource);
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index d0bb52296fa3..89aed99aafce 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -69,7 +69,7 @@ jiffies_64 = jiffies;
69 69
70PHDRS { 70PHDRS {
71 text PT_LOAD FLAGS(5); /* R_E */ 71 text PT_LOAD FLAGS(5); /* R_E */
72 data PT_LOAD FLAGS(7); /* RWE */ 72 data PT_LOAD FLAGS(6); /* RW_ */
73#ifdef CONFIG_X86_64 73#ifdef CONFIG_X86_64
74 user PT_LOAD FLAGS(5); /* R_E */ 74 user PT_LOAD FLAGS(5); /* R_E */
75#ifdef CONFIG_SMP 75#ifdef CONFIG_SMP
@@ -105,6 +105,7 @@ SECTIONS
105 SCHED_TEXT 105 SCHED_TEXT
106 LOCK_TEXT 106 LOCK_TEXT
107 KPROBES_TEXT 107 KPROBES_TEXT
108 ENTRY_TEXT
108 IRQENTRY_TEXT 109 IRQENTRY_TEXT
109 *(.fixup) 110 *(.fixup)
110 *(.gnu.warning) 111 *(.gnu.warning)
@@ -116,6 +117,10 @@ SECTIONS
116 117
117 EXCEPTION_TABLE(16) :text = 0x9090 118 EXCEPTION_TABLE(16) :text = 0x9090
118 119
120#if defined(CONFIG_DEBUG_RODATA)
121 /* .text should occupy whole number of pages */
122 . = ALIGN(PAGE_SIZE);
123#endif
119 X64_ALIGN_DEBUG_RODATA_BEGIN 124 X64_ALIGN_DEBUG_RODATA_BEGIN
120 RO_DATA(PAGE_SIZE) 125 RO_DATA(PAGE_SIZE)
121 X64_ALIGN_DEBUG_RODATA_END 126 X64_ALIGN_DEBUG_RODATA_END
@@ -156,6 +161,12 @@ SECTIONS
156 161
157#define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0) 162#define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0)
158#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) 163#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
164#define EMIT_VVAR(x, offset) .vsyscall_var_ ## x \
165 ADDR(.vsyscall_0) + offset \
166 : AT(VLOAD(.vsyscall_var_ ## x)) { \
167 *(.vsyscall_var_ ## x) \
168 } \
169 x = VVIRT(.vsyscall_var_ ## x);
159 170
160 . = ALIGN(4096); 171 . = ALIGN(4096);
161 __vsyscall_0 = .; 172 __vsyscall_0 = .;
@@ -170,18 +181,6 @@ SECTIONS
170 *(.vsyscall_fn) 181 *(.vsyscall_fn)
171 } 182 }
172 183
173 . = ALIGN(L1_CACHE_BYTES);
174 .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
175 *(.vsyscall_gtod_data)
176 }
177
178 vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
179 .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) {
180 *(.vsyscall_clock)
181 }
182 vsyscall_clock = VVIRT(.vsyscall_clock);
183
184
185 .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { 184 .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) {
186 *(.vsyscall_1) 185 *(.vsyscall_1)
187 } 186 }
@@ -189,21 +188,14 @@ SECTIONS
189 *(.vsyscall_2) 188 *(.vsyscall_2)
190 } 189 }
191 190
192 .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) {
193 *(.vgetcpu_mode)
194 }
195 vgetcpu_mode = VVIRT(.vgetcpu_mode);
196
197 . = ALIGN(L1_CACHE_BYTES);
198 .jiffies : AT(VLOAD(.jiffies)) {
199 *(.jiffies)
200 }
201 jiffies = VVIRT(.jiffies);
202
203 .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { 191 .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) {
204 *(.vsyscall_3) 192 *(.vsyscall_3)
205 } 193 }
206 194
195#define __VVAR_KERNEL_LDS
196#include <asm/vvar.h>
197#undef __VVAR_KERNEL_LDS
198
207 . = __vsyscall_0 + PAGE_SIZE; 199 . = __vsyscall_0 + PAGE_SIZE;
208 200
209#undef VSYSCALL_ADDR 201#undef VSYSCALL_ADDR
@@ -211,6 +203,7 @@ SECTIONS
211#undef VLOAD 203#undef VLOAD
212#undef VVIRT_OFFSET 204#undef VVIRT_OFFSET
213#undef VVIRT 205#undef VVIRT
206#undef EMIT_VVAR
214 207
215#endif /* CONFIG_X86_64 */ 208#endif /* CONFIG_X86_64 */
216 209
@@ -226,7 +219,7 @@ SECTIONS
226 * output PHDR, so the next output section - .init.text - should 219 * output PHDR, so the next output section - .init.text - should
227 * start another segment - init. 220 * start another segment - init.
228 */ 221 */
229 PERCPU_VADDR(0, :percpu) 222 PERCPU_VADDR(INTERNODE_CACHE_BYTES, 0, :percpu)
230#endif 223#endif
231 224
232 INIT_TEXT_SECTION(PAGE_SIZE) 225 INIT_TEXT_SECTION(PAGE_SIZE)
@@ -236,12 +229,30 @@ SECTIONS
236 229
237 INIT_DATA_SECTION(16) 230 INIT_DATA_SECTION(16)
238 231
232 /*
233 * Code and data for a variety of lowlevel trampolines, to be
234 * copied into base memory (< 1 MiB) during initialization.
235 * Since it is copied early, the main copy can be discarded
236 * afterwards.
237 */
238 .x86_trampoline : AT(ADDR(.x86_trampoline) - LOAD_OFFSET) {
239 x86_trampoline_start = .;
240 *(.x86_trampoline)
241 x86_trampoline_end = .;
242 }
243
239 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { 244 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
240 __x86_cpu_dev_start = .; 245 __x86_cpu_dev_start = .;
241 *(.x86_cpu_dev.init) 246 *(.x86_cpu_dev.init)
242 __x86_cpu_dev_end = .; 247 __x86_cpu_dev_end = .;
243 } 248 }
244 249
250 /*
251 * start address and size of operations which during runtime
252 * can be patched with virtualization friendly instructions or
253 * baremetal native ones. Think page table operations.
254 * Details in paravirt_types.h
255 */
245 . = ALIGN(8); 256 . = ALIGN(8);
246 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { 257 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
247 __parainstructions = .; 258 __parainstructions = .;
@@ -249,6 +260,11 @@ SECTIONS
249 __parainstructions_end = .; 260 __parainstructions_end = .;
250 } 261 }
251 262
263 /*
264 * struct alt_inst entries. From the header (alternative.h):
265 * "Alternative instructions for different CPU types or capabilities"
266 * Think locking instructions on spinlocks.
267 */
252 . = ALIGN(8); 268 . = ALIGN(8);
253 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { 269 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
254 __alt_instructions = .; 270 __alt_instructions = .;
@@ -256,11 +272,36 @@ SECTIONS
256 __alt_instructions_end = .; 272 __alt_instructions_end = .;
257 } 273 }
258 274
275 /*
276 * And here are the replacement instructions. The linker sticks
277 * them as binary blobs. The .altinstructions has enough data to
278 * get the address and the length of them to patch the kernel safely.
279 */
259 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { 280 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
260 *(.altinstr_replacement) 281 *(.altinstr_replacement)
261 } 282 }
262 283
263 /* 284 /*
285 * struct iommu_table_entry entries are injected in this section.
286 * It is an array of IOMMUs which during run time gets sorted depending
287 * on its dependency order. After rootfs_initcall is complete
288 * this section can be safely removed.
289 */
290 .iommu_table : AT(ADDR(.iommu_table) - LOAD_OFFSET) {
291 __iommu_table = .;
292 *(.iommu_table)
293 __iommu_table_end = .;
294 }
295
296 . = ALIGN(8);
297 .apicdrivers : AT(ADDR(.apicdrivers) - LOAD_OFFSET) {
298 __apicdrivers = .;
299 *(.apicdrivers);
300 __apicdrivers_end = .;
301 }
302
303 . = ALIGN(8);
304 /*
264 * .exit.text is discard at runtime, not link time, to deal with 305 * .exit.text is discard at runtime, not link time, to deal with
265 * references from .altinstructions and .eh_frame 306 * references from .altinstructions and .eh_frame
266 */ 307 */
@@ -273,7 +314,7 @@ SECTIONS
273 } 314 }
274 315
275#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) 316#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP)
276 PERCPU(PAGE_SIZE) 317 PERCPU_SECTION(INTERNODE_CACHE_BYTES)
277#endif 318#endif
278 319
279 . = ALIGN(PAGE_SIZE); 320 . = ALIGN(PAGE_SIZE);
@@ -307,7 +348,7 @@ SECTIONS
307 __bss_start = .; 348 __bss_start = .;
308 *(.bss..page_aligned) 349 *(.bss..page_aligned)
309 *(.bss) 350 *(.bss)
310 . = ALIGN(4); 351 . = ALIGN(PAGE_SIZE);
311 __bss_stop = .; 352 __bss_stop = .;
312 } 353 }
313 354
diff --git a/arch/x86/kernel/vread_tsc_64.c b/arch/x86/kernel/vread_tsc_64.c
new file mode 100644
index 000000000000..a81aa9e9894c
--- /dev/null
+++ b/arch/x86/kernel/vread_tsc_64.c
@@ -0,0 +1,36 @@
1/* This code runs in userspace. */
2
3#define DISABLE_BRANCH_PROFILING
4#include <asm/vgtod.h>
5
6notrace cycle_t __vsyscall_fn vread_tsc(void)
7{
8 cycle_t ret;
9 u64 last;
10
11 /*
12 * Empirically, a fence (of type that depends on the CPU)
13 * before rdtsc is enough to ensure that rdtsc is ordered
14 * with respect to loads. The various CPU manuals are unclear
15 * as to whether rdtsc can be reordered with later loads,
16 * but no one has ever seen it happen.
17 */
18 rdtsc_barrier();
19 ret = (cycle_t)vget_cycles();
20
21 last = VVAR(vsyscall_gtod_data).clock.cycle_last;
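	/*
	 * cycle_last is the most recent TSC value the timekeeping code has
	 * accounted for; clamping to it below keeps the vsyscall clock
	 * monotonic.
	 */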
22
23 if (likely(ret >= last))
24 return ret;
25
26 /*
27 * GCC likes to generate cmov here, but this branch is extremely
28 * predictable (it's just a function of time and the likely is
29 * very likely) and there's a data dependence, so force GCC
30 * to generate a branch instead. I don't barrier() because
31 * we don't actually need a barrier, and if this function
32 * ever gets inlined it will generate worse code.
33 */
34 asm volatile ("");
35 return last;
36}
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index dcbb28c4b694..3e682184d76c 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -49,17 +49,10 @@
49 __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace 49 __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
50#define __syscall_clobber "r11","cx","memory" 50#define __syscall_clobber "r11","cx","memory"
51 51
52/* 52DEFINE_VVAR(int, vgetcpu_mode);
53 * vsyscall_gtod_data contains data that is : 53DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
54 * - readonly from vsyscalls
55 * - written by timer interrupt or systcl (/proc/sys/kernel/vsyscall64)
56 * Try to keep this structure as small as possible to avoid cache line ping pongs
57 */
58int __vgetcpu_mode __section_vgetcpu_mode;
59
60struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
61{ 54{
62 .lock = SEQLOCK_UNLOCKED, 55 .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock),
63 .sysctl_enabled = 1, 56 .sysctl_enabled = 1,
64}; 57};
65 58
@@ -97,7 +90,7 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
97 */ 90 */
98static __always_inline void do_get_tz(struct timezone * tz) 91static __always_inline void do_get_tz(struct timezone * tz)
99{ 92{
100 *tz = __vsyscall_gtod_data.sys_tz; 93 *tz = VVAR(vsyscall_gtod_data).sys_tz;
101} 94}
102 95
103static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) 96static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
@@ -126,23 +119,24 @@ static __always_inline void do_vgettimeofday(struct timeval * tv)
126 unsigned long mult, shift, nsec; 119 unsigned long mult, shift, nsec;
127 cycle_t (*vread)(void); 120 cycle_t (*vread)(void);
128 do { 121 do {
129 seq = read_seqbegin(&__vsyscall_gtod_data.lock); 122 seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock);
130 123
131 vread = __vsyscall_gtod_data.clock.vread; 124 vread = VVAR(vsyscall_gtod_data).clock.vread;
132 if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) { 125 if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled ||
126 !vread)) {
133 gettimeofday(tv,NULL); 127 gettimeofday(tv,NULL);
134 return; 128 return;
135 } 129 }
136 130
137 now = vread(); 131 now = vread();
138 base = __vsyscall_gtod_data.clock.cycle_last; 132 base = VVAR(vsyscall_gtod_data).clock.cycle_last;
139 mask = __vsyscall_gtod_data.clock.mask; 133 mask = VVAR(vsyscall_gtod_data).clock.mask;
140 mult = __vsyscall_gtod_data.clock.mult; 134 mult = VVAR(vsyscall_gtod_data).clock.mult;
141 shift = __vsyscall_gtod_data.clock.shift; 135 shift = VVAR(vsyscall_gtod_data).clock.shift;
142 136
143 tv->tv_sec = __vsyscall_gtod_data.wall_time_sec; 137 tv->tv_sec = VVAR(vsyscall_gtod_data).wall_time_sec;
144 nsec = __vsyscall_gtod_data.wall_time_nsec; 138 nsec = VVAR(vsyscall_gtod_data).wall_time_nsec;
145 } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); 139 } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq));
146 140
147 /* calculate interval: */ 141 /* calculate interval: */
148 cycle_delta = (now - base) & mask; 142 cycle_delta = (now - base) & mask;
@@ -171,15 +165,15 @@ time_t __vsyscall(1) vtime(time_t *t)
171{ 165{
172 unsigned seq; 166 unsigned seq;
173 time_t result; 167 time_t result;
174 if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) 168 if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled))
175 return time_syscall(t); 169 return time_syscall(t);
176 170
177 do { 171 do {
178 seq = read_seqbegin(&__vsyscall_gtod_data.lock); 172 seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock);
179 173
180 result = __vsyscall_gtod_data.wall_time_sec; 174 result = VVAR(vsyscall_gtod_data).wall_time_sec;
181 175
182 } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); 176 } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq));
183 177
184 if (t) 178 if (t)
185 *t = result; 179 *t = result;
@@ -208,9 +202,9 @@ vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
208 We do this here because otherwise user space would do it on 202 We do this here because otherwise user space would do it on
209 its own in a likely inferior way (no access to jiffies). 203 its own in a likely inferior way (no access to jiffies).
210 If you don't like it pass NULL. */ 204 If you don't like it pass NULL. */
211 if (tcache && tcache->blob[0] == (j = __jiffies)) { 205 if (tcache && tcache->blob[0] == (j = VVAR(jiffies))) {
212 p = tcache->blob[1]; 206 p = tcache->blob[1];
213 } else if (__vgetcpu_mode == VGETCPU_RDTSCP) { 207 } else if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) {
214 /* Load per CPU data from RDTSCP */ 208 /* Load per CPU data from RDTSCP */
215 native_read_tscp(&p); 209 native_read_tscp(&p);
216 } else { 210 } else {
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 1b950d151e58..9796c2f3d074 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -52,6 +52,7 @@ extern void *__memcpy(void *, const void *, __kernel_size_t);
 EXPORT_SYMBOL(memset);
 EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(__memcpy);
+EXPORT_SYMBOL(memmove);
 
 EXPORT_SYMBOL(empty_zero_page);
 #ifndef CONFIG_PARAVIRT
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index cd6da6bf3eca..6f164bd5e14d 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -6,10 +6,12 @@
 #include <linux/init.h>
 #include <linux/ioport.h>
 #include <linux/module.h>
+#include <linux/pci.h>
 
 #include <asm/bios_ebda.h>
 #include <asm/paravirt.h>
 #include <asm/pci_x86.h>
+#include <asm/pci.h>
 #include <asm/mpspec.h>
 #include <asm/setup.h>
 #include <asm/apic.h>
@@ -33,7 +35,7 @@ void iommu_shutdown_noop(void) { }
 struct x86_init_ops x86_init __initdata = {
 
 	.resources = {
-		.probe_roms = x86_init_noop,
+		.probe_roms = probe_roms,
 		.reserve_resources = reserve_standard_io_resources,
 		.memory_setup = default_machine_specific_memory_setup,
 	},
@@ -59,6 +61,10 @@ struct x86_init_ops x86_init __initdata = {
 		.banner = default_banner,
 	},
 
+	.mapping = {
+		.pagetable_reserve = native_pagetable_reserve,
+	},
+
 	.paging = {
 		.pagetable_setup_start = native_pagetable_setup_start,
 		.pagetable_setup_done = native_pagetable_setup_done,
@@ -68,6 +74,7 @@ struct x86_init_ops x86_init __initdata = {
 		.setup_percpu_clockev = setup_boot_APIC_clock,
 		.tsc_pre_init = x86_init_noop,
 		.timer_init = hpet_time_init,
+		.wallclock_init = x86_init_noop,
 	},
 
 	.iommu = {
@@ -99,3 +106,8 @@ struct x86_platform_ops x86_platform = {
 };
 
 EXPORT_SYMBOL_GPL(x86_platform);
+struct x86_msi_ops x86_msi = {
+	.setup_msi_irqs = native_setup_msi_irqs,
+	.teardown_msi_irq = native_teardown_msi_irq,
+	.teardown_msi_irqs = default_teardown_msi_irqs,
+};
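The x86_init.c hunks above add new default hooks (probe_roms, pagetable_reserve, wallclock_init) and introduce an x86_msi ops table whose entries default to the native MSI routines. The value of such a table is that a platform can replace individual function pointers at boot without patching the call sites. A minimal stand-alone sketch of that pattern follows (illustrative only, not kernel code; every name in it is invented):

/* Ops-table pattern: native defaults that a platform may override. */
#include <stdio.h>

struct demo_msi_ops {
	int  (*setup_msi_irqs)(int dev, int nvec);
	void (*teardown_msi_irq)(int irq);
};

static int native_setup(int dev, int nvec)
{
	printf("native setup: dev=%d nvec=%d\n", dev, nvec);
	return 0;
}

static void native_teardown(int irq)
{
	printf("native teardown: irq=%d\n", irq);
}

/* Defaults, analogous to the native_* entries installed above. */
static struct demo_msi_ops demo_msi = {
	.setup_msi_irqs   = native_setup,
	.teardown_msi_irq = native_teardown,
};

static int para_setup(int dev, int nvec)
{
	printf("overridden setup: dev=%d nvec=%d\n", dev, nvec);
	return 0;
}

int main(void)
{
	demo_msi.setup_msi_irqs(1, 2);		/* default path */
	demo_msi.setup_msi_irqs = para_setup;	/* a platform swaps in its own hook */
	demo_msi.setup_msi_irqs(1, 2);		/* overridden path */
	demo_msi.teardown_msi_irq(42);
	return 0;
}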
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 9c253bd65e24..a3911343976b 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -53,7 +53,7 @@ void __sanitize_i387_state(struct task_struct *tsk)
 
 	/*
 	 * None of the feature bits are in init state. So nothing else
-	 * to do for us, as the memory layout is upto date.
+	 * to do for us, as the memory layout is up to date.
 	 */
 	if ((xstate_bv & pcntxt_mask) == pcntxt_mask)
 		return;
@@ -394,7 +394,8 @@ static void __init setup_xstate_init(void)
 	 * Setup init_xstate_buf to represent the init state of
 	 * all the features managed by the xsave
 	 */
-	init_xstate_buf = alloc_bootmem(xstate_size);
+	init_xstate_buf = alloc_bootmem_align(xstate_size,
+					      __alignof__(struct xsave_struct));
 	init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT;
 
 	clts();
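The xsave.c hunk above replaces a plain alloc_bootmem() with alloc_bootmem_align(), so that init_xstate_buf is allocated with the alignment that struct xsave_struct itself declares rather than whatever the allocator happens to return. The stand-alone C sketch below (illustrative only, not from the patch; the type and sizes are invented) shows the same idea in user space, allocating a buffer with the alignment required by its type:

/* Allocate storage that satisfies the type's declared alignment. */
#define _POSIX_C_SOURCE 200112L
#include <stdalign.h>
#include <stdio.h>
#include <stdlib.h>

/* Stand-in for a structure with a strict alignment requirement. */
struct demo_xsave {
	alignas(64) unsigned char area[832];
};

int main(void)
{
	void *buf;

	/* posix_memalign(): alignment must be a power-of-two multiple of sizeof(void *). */
	if (posix_memalign(&buf, alignof(struct demo_xsave), sizeof(struct demo_xsave)))
		return 1;

	printf("buffer at %p, alignment %zu\n", buf, (size_t)alignof(struct demo_xsave));
	free(buf);
	return 0;
}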